In [None]:
import pandas as pd
#pip install googletrans
from googletrans import Translator
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
import pickle
from nltk.tokenize import word_tokenize
import datetime as dt
import plotly.graph_objects as go
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt

# Import Spanish tweets

In [None]:
## Load the cleaned Spanish-language tweets
es_twitts = pd.read_csv('data/clean/es_twitts.csv')

## Keep only the rows whose country_code is 'MX'. The explicit copy makes the
## result an independent frame, so adding columns to it later does not raise
## SettingWithCopyWarning.
es_twitts = es_twitts.query("country_code=='MX'").copy()

In [None]:
# Preview the first rows of the MX-only frame
es_twitts.head(5)

# Translating tweets from Spanish to English

In [None]:
# `Translator` is imported at the top of the notebook but was never
# instantiated, so `translator` raised NameError on a fresh kernel.
translator = Translator()


def translate_to_english(text):
    """Return the English translation of one Spanish tweet text."""
    return translator.translate(text, src='es', dest='en').text


# `assign` returns a new frame instead of writing into a filtered slice, so
# there is no SettingWithCopyWarning to silence through a global pandas option.
es_twitts = es_twitts.assign(
    text_english=es_twitts['text'].apply(translate_to_english)
)

In [None]:
## Export the tweets translated into English to csv
#es_twitts.to_csv (r'C:\input\english_twitts.csv', index = None, header=True)

In [None]:
## Reload the tweets from the csv that holds their English translations
translated_csv = 'C:\\input\\english_twitts.csv'
es_twitts = pd.read_csv(translated_csv)

# Removing Noise from the Data

In [None]:
# Patterns stripped from every token, applied in this order. Raw strings avoid
# the invalid "\/" escape sequences of the original literals.
_NOISE_PATTERNS = (
    re.compile(r'https?://.*[\r\n]*'),   # http:// and https:// links
    re.compile(r'http?://.*[\r\n]*'),    # additionally catches "htt://"
    re.compile(r'(@[A-Za-z0-9_]+)'),     # @mentions
)


def remove_noise(tweet_tokens, stop_words = ()):
    """Clean and normalise the tokens of one tweet.

    For every token: links and @mentions are removed, the token is lemmatized
    with a WordNet part of speech derived from its POS tag (noun for ``NN*``,
    verb for ``VB*``, adjective otherwise), and the lower-cased result is kept
    unless it is empty, punctuation, or a stop word.

    Args:
        tweet_tokens: sequence of token strings, passed to ``pos_tag``.
        stop_words: container of lower-case words to drop (default: none).

    Returns:
        list of cleaned, lower-cased tokens in their original order.

    NOTE(review): presumably the pickled classifier was trained on tokens
    produced by exactly these rules - confirm before changing any of them.
    """
    # One lemmatizer for the whole tweet instead of a new one per token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        for pattern in _NOISE_PATTERNS:
            token = pattern.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test: it drops any token that
        # is a contiguous run of that string, not only single characters.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

# Load the Trained and Tested Sentiment Model

In [None]:
# Load the previously trained sentiment classifier from disk. The `with`
# block closes the file even when unpickling fails.
# NOTE: unpickling can execute arbitrary code from the file, so only load a
# pickle that you created yourself or otherwise trust.
with open('sentiment_classifier.pickle', 'rb') as f:
    classifier = pickle.load(f)

# Run the model to classify tweets

In [None]:
def classify_tweet(text):
    """Return the classifier's label for one English tweet text.

    The text is tokenised, cleaned with `remove_noise`, and turned into the
    ``{token: True}`` feature dict that is passed to `classifier.classify`.
    """
    custom_tokens = remove_noise(word_tokenize(text))
    return classifier.classify({token: True for token in custom_tokens})


# A single column assignment replaces writing cell-by-cell with `.at` inside
# `iterrows()`; the 'classified' column now exists even for an empty frame.
es_twitts['classified'] = es_twitts['text_english'].apply(classify_tweet)

In [None]:
# Display the tweets together with their new 'classified' column
es_twitts

In [None]:
## Export the classified tweets to csv
#es_twitts.to_csv (r'C:\input\classified_twitts.csv', index = None, header=True)

## Reload the classified tweets from csv
es_twitts=pd.read_csv('C:\\input\\classified_twitts.csv')

In [None]:
# Build the frame used for plotting: the tweet timestamp plus one marker
# column per sentiment.
es_twitts_plot = pd.DataFrame({"created_at": es_twitts.created_at})
#es_twitts_plot["spa"]=es_twitts.text
#es_twitts_plot["eng"]=es_twitts.text_english

# Rows whose label contains the word get 'yes' in the matching column; all
# other rows are left as NaN there.
is_positive = es_twitts['classified'].str.contains('Positive')
is_negative = es_twitts['classified'].str.contains('Negative')
es_twitts_plot.loc[is_positive, 'pos'] = 'yes'
es_twitts_plot.loc[is_negative, 'neg'] = 'yes'

In [None]:
# Preview the plotting frame: created_at plus the 'pos' / 'neg' markers
es_twitts_plot.head(5)

In [None]:
# Convert the timestamp strings (format '%Y-%m-%dT%H:%M:%SZ') to plain dates.
# One whole-column assignment replaces the original per-row chained
# assignment, which needed SettingWithCopyWarning silenced, only worked with a
# 0..n-1 index, and does not write through under pandas copy-on-write.
es_twitts_plot['created_at'] = pd.to_datetime(
    es_twitts_plot['created_at'], format='%Y-%m-%dT%H:%M:%SZ'
).dt.date

In [None]:
# For every distinct created_at value, count the positive and the negative
# markers ('count' skips NaN), then drop the marker columns and collapse the
# now-identical rows of each day into one.
per_day = es_twitts_plot.groupby('created_at')
es_twitts_plot = (
    es_twitts_plot
    .assign(
        count_pos=per_day['pos'].transform('count'),
        count_neg=per_day['neg'].transform('count'),
    )
    .drop(columns=['pos', 'neg'])
    .drop_duplicates()
)

In [None]:
# Preview the per-day positive / negative counts
es_twitts_plot.head(10)

# Plotting the Classified Tweets

In [None]:
# One line per sentiment: (count column, legend label, line colour).
trace_specs = (
    ('count_pos', "Positive", 'deepskyblue'),
    ('count_neg', "Negative", 'dimgray'),
)

fig = go.Figure()
for count_column, label, colour in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=es_twitts_plot['created_at'],
            y=es_twitts_plot[count_column],
            name=label,
            line_color=colour,
        )
    )

# Title the chart and show a range slider under the x axis.
fig.update_layout(
    title_text='MX Twitts Positive/Negative Per Day',
    xaxis_rangeslider_visible=True,
)
fig.show()

# Word Clouds of the Classified Tweets

In [None]:
# Split the classified tweets into one frame per sentiment label.
pos_twitts, neg_twitts = (
    es_twitts.query(expression)
    for expression in ('classified=="Positive"', 'classified=="Negative"')
)

In [None]:
# Merge the `text` of every positive tweet into one string for the cloud
text = " ".join(pos_twitts.text.tolist())

# Build the word cloud from that string
wordcloud = WordCloud().generate(text)

# Draw the cloud as an image on the current axes, with the axes hidden
cloud_ax = plt.gca()
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis("off")
plt.show()

In [None]:
# Merge the `text` of every negative tweet into one string for the cloud
text = " ".join(neg_twitts.text.tolist())

# Build the word cloud from that string
wordcloud = WordCloud().generate(text)

# Draw the cloud as an image on the current axes, with the axes hidden
cloud_ax = plt.gca()
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis("off")
plt.show()