In [262]:
import pandas as pd
import xml.etree.ElementTree as etree
import os
from glob import glob
from nltk.corpus import stopwords
import re, string
from nltk.tag.stanford import StanfordPOSTagger as POS_Tag #Download the tagger from the site: https://nlp.stanford.edu/software/tagger.shtml Extract the file:stanford-postagger.jar
from nltk import FreqDist

## Import classified tweets to train the model

In [353]:
# http://tass.sepln.org/tass_data/download.php?auth=4tNaxs9su4VeTvJejrj

In [352]:
# Load every TASS 2019 XML file found under data/raw into one DataFrame with
# the tweet text ("content") and its polarity label ("sentiment").
columns = ["content", "sentiment"]

raw_data_path = r'data/raw/*TASS2019*.xml'
# glob() returns files in arbitrary order; sort so the row order is stable.
xml_files = sorted(glob(raw_data_path))

# Collect plain records and build the frame once: growing a DataFrame row by
# row is quadratic, and DataFrame.append no longer exists in pandas >= 2.0.
records = []
for xml_file in xml_files:
    root = etree.parse(xml_file).getroot()

    for node in root:
        # find() returns None when the child element is missing, so the guard
        # has to be on the found element rather than on `node`.
        content_node = node.find("content")
        sentiment_node = node.find("sentiment/polarity/value")
        records.append({
            "content": content_node.text if content_node is not None else None,
            "sentiment": sentiment_node.text if sentiment_node is not None else None,
        })

df_tweets_classified = pd.DataFrame(records, columns=columns)
df_tweets_classified

Unnamed: 0,content,sentiment
0,@NoilyMV yo soy totalmente puntual,NONE
1,@SandraCauffman Hola Sandrita. No le habia des...,P
2,Si andan haciendo eso mejor se quedaran callad...,N
3,Que pereza quiero choco banano,N
4,"@robertobrenes Bueno, no es tanto lo mayor com...",N
...,...,...
4795,@AmorAKilates @Roocio_Mk si me pasa lo mismo!,NONE
4796,@clauchoarrionda pquno ladra y ls demas retwitean,NEU
4797,A mi desayuno le hizo falta un alfajor podrida...,N
4798,Viste cuando necesitas que alguien te escuche ...,N


## Removing Noise from the Data

In [354]:
# Spanish stop words with the lower-case acute vowels (á, é, í, ó, ú) replaced
# by their unaccented forms, matching the accent stripping applied to tweets.
stop_words = [
    word.translate(str.maketrans('áéíóú', 'aeiou'))
    for word in stopwords.words('spanish')
]

In [355]:
# Start from a lower-cased copy of the raw text; re-running the cell rebuilds
# "clean_content" from "content" every time.
df_tweets_classified["clean_content"] = df_tweets_classified["content"].str.lower()

# Change accents: replace lower-case acute vowels in every column of the frame.
# As in the original cell, this also rewrites the "content" column.
for accented, plain in (('á', 'a'), ('é', 'e'), ('í', 'i'), ('ó', 'o'), ('ú', 'u')):
    df_tweets_classified = df_tweets_classified.replace(accented, plain, regex=True)

# Ordered (pattern, replacement) steps applied to each tweet.
_CLEANING_STEPS = (
    (r'http\S+', ''),        # URLs
    (r'@\S+', ' '),          # user mentions
    (r'#\S+', ' '),          # hashtags
    (r'\b\w{1}\s', ' '),     # one-character words followed by whitespace
    (r'\b\w{2}\s', ' '),     # two-character words followed by whitespace
    (r'[^a-zñ]+', ' '),      # anything that is not a lower-case letter or ñ
)

# Set membership is O(1); the stop-word list would be scanned once per word.
_stop_word_set = set(stop_words)


def _clean_tweet(text):
    """Apply the cleaning steps in order, then drop stop words."""
    for pattern, replacement in _CLEANING_STEPS:
        text = re.sub(pattern, replacement, text)
    return ' '.join(word for word in text.split() if word not in _stop_word_set)


# Assign the cleaned column explicitly. The previous
# `df['clean_content'].replace(..., inplace=True)` was a chained in-place update,
# which does not modify the frame when pandas Copy-on-Write is active.
# fillna('') keeps tweets with missing text from crashing re.sub.
df_tweets_classified['clean_content'] = (
    df_tweets_classified['clean_content'].fillna('').map(_clean_tweet)
)
df_tweets_classified[['content','clean_content']]

Unnamed: 0,content,clean_content
0,@NoilyMV yo soy totalmente puntual,totalmente puntual
1,@SandraCauffman Hola Sandrita. No le habia des...,hola sandrita deseado feliz dia madre tarde se...
2,Si andan haciendo eso mejor se quedaran callad...,andan haciendo mejor quedaran calladitas jaja ...
3,Que pereza quiero choco banano,pereza quiero choco banano
4,"@robertobrenes Bueno, no es tanto lo mayor com...",bueno mayor cuanto campo usted sos cartaguito ...
...,...,...
4795,@AmorAKilates @Roocio_Mk si me pasa lo mismo!,pasa mismo
4796,@clauchoarrionda pquno ladra y ls demas retwitean,pquno ladra demas retwitean
4797,A mi desayuno le hizo falta un alfajor podrida...,desayuno hizo falta alfajor podrida galletas a...
4798,Viste cuando necesitas que alguien te escuche ...,viste necesitas alguien escuche alguien bueno


## Splitting Data by Positive and Negative Sentiments

In [356]:
# Distinct sentiment labels present in the data set.
df_tweets_classified["sentiment"].unique()

array(['NONE', 'P', 'N', 'NEU'], dtype=object)

In [357]:
# Rows whose sentiment label is 'P'.
positive_tweets = df_tweets_classified.loc[lambda frame: frame["sentiment"] == 'P']
positive_tweets

Unnamed: 0,content,sentiment,clean_content
1,@SandraCauffman Hola Sandrita. No le habia des...,P,hola sandrita deseado feliz dia madre tarde se...
6,@doriamdiaz El de Halfon de Germinal se ve mor...,P,halfon germinal mortal mary bang volando todav...
8,"El amor es paciente, es bondadoso, no es envid...",P,amor paciente bondadoso envidioso orgulloso eg...
9,"El amanecer respirando o2 puro, es mas que un ...",P,amanecer respirando puro regalo frente mar pre...
14,"Buenas noches papus, que descansen",P,buenas noches papus descansen
...,...,...,...
4787,@BenjaCandado eres una de las mejores personas...,P,mejores personas conocido vida
4790,Nos separan mas de 9KM y quieres JAJAJAJAJAJAJ...,P,separan km quieres jajajajajajajajajajajajajaj...
4791,@CuadradoAndres @grazianopascale @adeladubra j...,P,jaja muchas gracias buena onda implicados
4793,@Niaso01 @LuisSuarez9 @neymarjr El futbol es h...,P,futbol hermoso cdo juntan talentos


In [358]:
# Rows whose sentiment label is 'N'.
negative_tweets = df_tweets_classified.loc[lambda frame: frame["sentiment"] == 'N']
negative_tweets

Unnamed: 0,content,sentiment,clean_content
2,Si andan haciendo eso mejor se quedaran callad...,N,andan haciendo mejor quedaran calladitas jaja ...
3,Que pereza quiero choco banano,N,pereza quiero choco banano
4,"@robertobrenes Bueno, no es tanto lo mayor com...",N,bueno mayor cuanto campo usted sos cartaguito ...
10,Acabo de ver una chamaca con un piercing de mo...,N,acabo ver chamaca piercing mota ombligo peor c...
12,Acabo de ver a una excompañera de trabajo y me...,N,acabo ver excompañera trabajo senti tentado sa...
...,...,...,...
4783,Queria un juguito y no puedo por el colorante,N,queria juguito puedo colorante
4786,@NinetalesGD NO ME JODAS... que gran lastima,N,jodas gran lastima
4789,"Que lindo, los aliens eligieron visitar a Urug...",N,lindo aliens eligieron visitar uruguay proxima...
4797,A mi desayuno le hizo falta un alfajor podrida...,N,desayuno hizo falta alfajor podrida galletas a...


<h3>Tokenizing the Data</h3>

In [359]:
# Split each cleaned tweet on whitespace, giving one token list per tweet.
pos_token_tweets_list = positive_tweets['clean_content'].str.split().tolist()
neg_token_tweets_list = negative_tweets['clean_content'].str.split().tolist()

In [368]:
# Show the tokens of the tenth positive and the tenth negative tweet.
print(f"{pos_token_tweets_list[9]} \n")
print(neg_token_tweets_list[9])

['gracias', 'dios', 'cerrando', 'broche', 'oro', 'dios', 'siempre', 'dando', 'mejor', 'hijos', 'semana', 'mejor', 'pasada'] 

['horario', 'semestre', 'tan', 'tan', 'feo', 'despedir', 'poca', 'vida', 'social', 'estreno', 'suicide', 'squad']


<h3>Normalizing the Data</h3>

In [167]:
# Locations of the Java runtime and the Stanford POS tagger assets.
# Each one can be overridden with an environment variable so the notebook is
# not tied to a single machine; the defaults are the original hard-coded paths.
java_path = os.environ.get('JAVAHOME', "C:/Program Files/Java/jre1.8.0_201/bin/java.exe")
# Export JAVAHOME before the tagger is built so it is in place when NLTK looks for Java.
os.environ['JAVAHOME'] = java_path

spanish_tagger_model = os.environ.get(
    'STANFORD_SPANISH_MODEL',
    'C:\\Users\\gdlsajor\\AppData\\Roaming\\nltk_data\\models\\spanish.tagger',
)
stanford_tagger_jar = os.environ.get(
    'STANFORD_POSTAGGER_JAR',
    'C:\\Users\\gdlsajor\\AppData\\Roaming\\nltk_data\\stanford-postagger.jar',
)
spanish_postagger = POS_Tag(spanish_tagger_model, stanford_tagger_jar, encoding='utf8')

In [453]:
def normalization(words):
    """Filter a tokenised tweet by part-of-speech tag.

    The words are tagged with the module-level ``spanish_postagger``; every
    word whose tag is in the exclusion list below is dropped.

    Args:
        words: list of tokens belonging to one tweet.

    Returns:
        List of the remaining words, in tagger output order.
    """
    excluded_tags = ('np00000', 'word', 'nc0n000', 'di0000', 'pr000000',
                     'vaip000', 'sp000', 'z0', 'i')
    return [
        word
        for word, tag in spanish_postagger.tag(words)
        if tag not in excluded_tags
    ]
#https://stackoverflow.com/questions/14732465/nltk-tagging-spanish-words-using-a-corpus

In [454]:
# POS-filter every tokenised tweet. The previous version had its loop bodies
# commented out, which is a syntax error and left both lists empty for the
# cells below. Note: normalization() calls the tagger once per tweet, so this
# cell can take a long time on the full data set.
pos_tokens_normalized = [normalization(words) for words in pos_token_tweets_list]
neg_tokens_normalized = [normalization(words) for words in neg_token_tweets_list]

In [515]:
# Spot-check one tweet per class: the raw text followed by its normalized tokens.
x = 130
print(positive_tweets['content'].iloc[x])
print(f"{pos_tokens_normalized[x]} \n")

print(negative_tweets['content'].iloc[x])
print(neg_tokens_normalized[x])

Esperando que sea un buen jueves por la noche
['esperando', 'buen', 'noche'] 

Ya no tengo libro para leer ni serie para ver, mis noches se estan volviendo aburridas
['libro', 'leer', 'serie', 'ver', 'noches', 'volviendo', 'aburridas']


In [511]:
# Export each token list to a text file, one tweet per line, written as the
# string form of its Python list.
# NOTE(review): the files are opened with the platform default encoding and
# Windows-style path separators; the import cell must keep matching both.
token_exports = (
    (r'data\clean\pos_tokens_normalized.txt', pos_tokens_normalized),
    (r'data\clean\neg_tokens_normalized.txt', neg_tokens_normalized),
)

for export_path, token_lists in token_exports:
    with open(export_path, "w") as f:
        f.writelines(str(s) + "\n" for s in token_lists)

#pos_tokens_normalized.to_csv(r'data\clean\pos_tokens_normalized.csv', index = None, header=True)
#neg_tokens_normalized.to_csv(r'data\clean\neg_tokens_normalized.csv', index = None, header=True)

In [512]:
# Read the exported negative tokens back: one stripped line per tweet.
# Each entry is the text of a list (e.g. "['a', 'b']"), not a list object.
with open(r'data\clean\neg_tokens_normalized.txt', "r") as f:
    score = [str(line.strip()) for line in f]
score[1]

"['pereza', 'quiero', 'choco', 'banano']"

<h3>Determining Word Density</h3>

In [483]:
def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order.

    Args:
        cleaned_tokens_list: iterable of token iterables (one per tweet).

    Yields:
        Each token of each tweet, tweet by tweet.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield from tweet_tokens

In [484]:
# Word frequencies per class, built from the flattened token streams.
all_pos_words = get_all_words(pos_tokens_normalized)
freq_dist_pos = FreqDist(all_pos_words)

all_neg_words = get_all_words(neg_tokens_normalized)
freq_dist_neg = FreqDist(all_neg_words)

# The 20 most frequent words of each class.
print(f"{freq_dist_pos.most_common(20)} \n")
print(freq_dist_neg.most_common(20))

[('feliz', 110), ('dia', 100), ('mejor', 97), ('hoy', 90), ('gracias', 77), ('bien', 75), ('buen', 62), ('buena', 62), ('dias', 62), ('asi', 60), ('siempre', 58), ('ser', 55), ('tan', 53), ('año', 51), ('ver', 48), ('quiero', 47), ('solo', 47), ('bonito', 47), ('lindo', 46), ('vida', 44)] 

[('triste', 90), ('solo', 83), ('quiero', 82), ('tan', 76), ('hoy', 75), ('asi', 73), ('dia', 70), ('ser', 68), ('mal', 67), ('ahora', 63), ('vida', 62), ('ver', 60), ('hacer', 56), ('voy', 56), ('puedo', 55), ('mejor', 53), ('año', 52), ('hace', 49), ('extraño', 48), ('cosas', 48)]


<h3>Preparing Data for the Model</h3>

Converting Tokens to a Dictionary

In [485]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily convert token lists into feature dictionaries.

    Args:
        cleaned_tokens_list: iterable of token lists (one per tweet).

    Yields:
        One dict per tweet mapping each of its tokens to ``True``.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield dict.fromkeys(tweet_tokens, True)

# Materialise the feature dicts as lists. A bare generator can be consumed only
# once, so re-running the dataset cell without re-running this one would
# silently produce empty datasets.
positive_tokens_for_model = list(get_tweets_for_model(pos_tokens_normalized))
negative_tokens_for_model = list(get_tweets_for_model(neg_tokens_normalized))

<h3>Splitting the Dataset for Training and Testing the Model</h3>

In [491]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and the
# reported accuracy, are reproducible across runs.
SPLIT_SEED = 42
# Number of shuffled examples used for training; the rest is the test set.
TRAIN_SIZE = 2000

# Label each feature dict with its class.
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# A dedicated Random instance avoids touching the global random state.
random.Random(SPLIT_SEED).shuffle(dataset)

assert len(dataset) > TRAIN_SIZE, "not enough labelled tweets to leave a test set"

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

<h3>Building and Testing the Model</h3>

In [492]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Train a Naive Bayes classifier on the training split and score it on the
# held-out split.
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" to the output.
classifier.show_most_informative_features(20)

Accuracy is: 0.7392325763508223
Most Informative Features
                    peor = True           Negati : Positi =     13.7 : 1.0
                  triste = True           Negati : Positi =     11.6 : 1.0
                    dios = True           Positi : Negati =      9.4 : 1.0
                    odio = True           Negati : Positi =      8.2 : 1.0
                 encanta = True           Positi : Negati =      7.6 : 1.0
                   buena = True           Positi : Negati =      7.3 : 1.0
                 hermoso = True           Positi : Negati =      7.2 : 1.0
                 gracias = True           Positi : Negati =      7.0 : 1.0
               esperando = True           Positi : Negati =      6.7 : 1.0
                  llegar = True           Positi : Negati =      6.7 : 1.0
                   exito = True           Positi : Negati =      6.7 : 1.0
                    pais = True           Negati : Positi =      6.2 : 1.0
                  genial = True           

In [498]:
from nltk.tokenize import word_tokenize
# Requires the NLTK "punkt" tokenizer data: nltk.download('punkt')

# NOTE(review): the sample text is English while the classifier is trained on
# Spanish tweets; confirm this is the intended smoke test.
custom_tweet = "Bad day today"

# The training text is lower-cased before tokenising, so lower-case the custom
# tokens as well; otherwise capitalised words can never match a learned feature.
custom_tokens = [token.lower() for token in word_tokenize(custom_tweet)]

print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative


In [None]:
# To save:
import pickle

# The context manager closes the file even if pickle.dump raises.
with open('sentiment_classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

In [None]:
# To load:
import pickle

# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load a pickle that was produced by this notebook or another trusted source.
# The context manager closes the file even if loading fails.
with open('sentiment_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)