## Sentiment Analisys

### Load Data

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
data = pickle.load(open('data/df_clean_izq.pkl','rb'))

In [3]:
inf_izq = data.loc[data.topics == 'inflation']
inf_izq.shape

(36, 7)

In [4]:
data = pickle.load(open('data/df_clean_der.pkl','rb'))

In [5]:
inf_der = data.loc[data.topics == 'inflation']
inf_der.shape

(42, 7)

### Parts of Speech (POS) Tagging and Sentiment Analysis

- CC coordinating conjunction
- CD cardinal digit
- DT determiner
- EX existential there
- FW foreign word
- IN preposition/subordinating conjunction
- JJ This NLTK POS Tag is an adjective (large)
- JJR adjective, comparative (larger)
- JJS adjective, superlative (largest)
- LS list market
- MD modal (could, will)
- NN noun, singular (cat, tree)
- NNS noun plural (desks)
- NNP proper noun, singular (sarah)
- NNPS proper noun, plural (indians or americans)
- PDT predeterminer (all, both, half)
- POS possessive ending (parent\ 's)
- PRP personal pronoun (hers, herself, him,himself)
- PRPS possessive pronoun (her, his, mine, my, our )
- RB adverb (occasionally, swiftly)
- RBR adverb, comparative (greater)
- RBS adverb, superlative (biggest)
- RP particle (about)
- TO infinite marker (to)
- UH interjection (goodbye)
- VB verb (ask)
- VBG verb gerund (judging)
- VBD verb past tense (pleaded)
- VBN verb past participle (reunified)
- VBP verb, present tense not 3rd person singular(wrap)
- VBZ verb, present tense with 3rd person singular (bases)
- WDT wh-determiner (that, what)
- WP wh- pronoun (who)
- WRB wh- adverb (how) 

In [6]:
import nltk

In [7]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [8]:
# nltk.download('vader_lexicon')

In [9]:
from textblob import TextBlob
from googletrans import Translator

### NLTK vs TextBlob (tagging)

Import tokens for nltk tagging

In [10]:
token_df = pd.read_pickle('tokens/topic_tokens_izq.pkl')

In [11]:
token_df = token_df[token_df.topics == 'inflation']

In [12]:
token_df.drop(['topics'], axis=1, inplace=True)

In [13]:
# made a corpus from tokens in df
corpus = []
for row in range(len(token_df)):
    corpus.append(token_df.iloc[row,(token_df.iloc[row] == 1).values].index.tolist())
len(corpus)

39

nltk (tokens) vs textblob (text)

In [14]:
display( dict(nltk.pos_tag(corpus[0])).get('comision'),
        dict(nltk.pos_tag(corpus[0])).get('nacional'),
        dict(nltk.pos_tag(corpus[0])).get('valores'),
        dict(nltk.pos_tag(corpus[0])).get('cantidad'),
        dict(nltk.pos_tag(corpus[0])).get('tener'),
        dict(nltk.pos_tag(corpus[0])).get('venderlo'),
       )

None

'JJ'

None

None

None

None

In [15]:
text = TextBlob(inf_izq.iloc[0, -2])
print(text.tags[:60])

[('La', 'NNP'), ('inflación', 'VBZ'), ('no', 'DT'), ('se', 'JJ'), ('detiene', 'NN'), ('y', 'NN'), ('el', 'FW'), ('fin', 'NN'), ('de', 'IN'), ('año', 'FW'), ('para', 'NN'), ('la', 'FW'), ('mayoría', 'FW'), ('de', 'FW'), ('las', 'FW'), ('familias', 'FW'), ('se', 'FW'), ('presenta', 'FW'), ('difícil', 'FW'), ('de', 'FW'), ('afrontar', 'FW'), ('El', 'NNP'), ('índice', 'NNP'), ('de', 'FW'), ('precios', 'FW'), ('mayoristas', 'NNS'), ('aumentó', 'VBP'), ('durante', 'JJ'), ('noviembre', 'RB'), ('4,2', 'CD'), ('%', 'NN'), ('mientras', 'FW'), ('que', 'NN'), ('el', 'FW'), ('costo', 'NN'), ('de', 'IN'), ('la', 'FW'), ('construcción', 'FW'), ('registró', 'FW'), ('un', 'JJ'), ('incremento', 'NN'), ('de', 'IN'), ('12,4', 'CD'), ('%', 'NN'), ('según', 'JJ'), ('informó', 'NN'), ('este', 'NN'), ('lunes', 'NNS'), ('el', 'VBP'), ('Instituto', 'NNP'), ('Nacional', 'NNP'), ('de', 'FW'), ('estadística', 'FW'), ('y', 'FW'), ('Censos', 'NNP'), ('Indec', 'NNP'), ('De', 'NNP'), ('esta', 'FW'), ('forma', 'NN'), (

### Text Blob and language
Translation with TextBlob and googletrans

#### Spanish VS English (Example)
There are evident issues in the tagging process before translation that lead to the imposibility of getting the sentiment scores.

In [16]:
# Spanish
print(f'Input en español:\n{inf_izq.iloc[0].lead}\n')

text = TextBlob(inf_izq.iloc[0].lead)
print(f'POS:\n{text.tags}\n')

print(f'Sustantivos:\n{text.noun_phrases}\n')

print(text.sentiment,'\n')

Input en español:
Al mismo tiempo los precios mayoristas del conjunto de los ramas crecieron un 4,2 % consolidando la suba de la inflación. Los salarios cada vez alcanzan para menos y la vivienda propia se torna cada vez más inaccesible.

POS:
[('Al', 'NNP'), ('mismo', 'NNP'), ('tiempo', 'VBD'), ('los', 'JJ'), ('precios', 'NNS'), ('mayoristas', 'VBP'), ('del', 'FW'), ('conjunto', 'NN'), ('de', 'FW'), ('los', 'FW'), ('ramas', 'FW'), ('crecieron', 'NN'), ('un', 'JJ'), ('4,2', 'CD'), ('%', 'NN'), ('consolidando', 'NN'), ('la', 'NN'), ('suba', 'FW'), ('de', 'FW'), ('la', 'FW'), ('inflación', 'NN'), ('Los', 'NNP'), ('salarios', 'NNS'), ('cada', 'VBP'), ('vez', 'JJ'), ('alcanzan', 'NN'), ('para', 'NN'), ('menos', 'FW'), ('y', 'NN'), ('la', 'FW'), ('vivienda', 'FW'), ('propia', 'FW'), ('se', 'FW'), ('torna', 'FW'), ('cada', 'NN'), ('vez', 'NN'), ('más', 'NN'), ('inaccesible', 'JJ')]

Sustantivos:
['al', 'mismo tiempo los precios mayoristas del conjunto', 'los ramas crecieron un', '% consolida

In [17]:
# English
translator = Translator()
translation = translator.translate(inf_izq.iloc[0].lead, src='es', dest='en')

print(f'Traducción:\n{translation.text}\n')

text = TextBlob(translation.text)
print(f'POS:\n{text.tags}\n')

print(f'Sustantivos:\n{text.noun_phrases}\n') 

print(text.sentiment)

Traducción:
At the same time the wholesale prices of the branches grew by 4.2% consolidating the increase in inflation.Salaries increasingly reach and their own housing becomes increasingly inaccessible.

POS:
[('At', 'IN'), ('the', 'DT'), ('same', 'JJ'), ('time', 'NN'), ('the', 'DT'), ('wholesale', 'JJ'), ('prices', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('branches', 'NNS'), ('grew', 'VBD'), ('by', 'IN'), ('4.2', 'CD'), ('%', 'NN'), ('consolidating', 'VBG'), ('the', 'DT'), ('increase', 'NN'), ('in', 'IN'), ('inflation.Salaries', 'NNS'), ('increasingly', 'RB'), ('reach', 'VBP'), ('and', 'CC'), ('their', 'PRP$'), ('own', 'JJ'), ('housing', 'NN'), ('becomes', 'NNS'), ('increasingly', 'RB'), ('inaccessible', 'JJ')]

Sustantivos:
['wholesale prices']

Sentiment(polarity=0.3, subjectivity=0.5625)


In [18]:
print(f'Input en español: {inf_der.iloc[0].lead}\n')
text = TextBlob(inf_der.iloc[0].lead)

print(f'POS:\n{text.tags}\n')

print(f'Sustantivos:\n{text.noun_phrases}\n')

print(text.sentiment,'\n')

Input en español: La famosa multinacional financiera alerta sobre los peligros inflacionarios a raíz de la caída en la demanda de pesos en cara a 2021. Además, el economista Steve Hanke asegura que la inflación argentina está fuertemente reprimida, y la real podría llegar a los tres dígitos.

POS:
[('La', 'NNP'), ('famosa', 'VBD'), ('multinacional', 'JJ'), ('financiera', 'NN'), ('alerta', 'NN'), ('sobre', 'NN'), ('los', 'NN'), ('peligros', 'JJ'), ('inflacionarios', 'VBZ'), ('a', 'DT'), ('raíz', 'NN'), ('de', 'IN'), ('la', 'FW'), ('caída', 'FW'), ('en', 'FW'), ('la', 'FW'), ('demanda', 'FW'), ('de', 'FW'), ('pesos', 'FW'), ('en', 'FW'), ('cara', 'NN'), ('a', 'DT'), ('2021', 'CD'), ('Además', 'NNP'), ('el', 'NN'), ('economista', 'NN'), ('Steve', 'NNP'), ('Hanke', 'NNP'), ('asegura', 'NN'), ('que', 'NN'), ('la', 'NN'), ('inflación', 'JJ'), ('argentina', 'NN'), ('está', 'NN'), ('fuertemente', 'NN'), ('reprimida', 'NN'), ('y', 'RB'), ('la', 'JJ'), ('real', 'JJ'), ('podría', 'NN'), ('llegar'

In [19]:
translator = Translator()
translation = translator.translate(inf_der.iloc[0].lead, src='es', dest='en')

print(f'Traducción: {translation.text}\n')
text = TextBlob(translation.text)

print(f'POS:\n{text.tags}\n')

print(f'Sustantivos:\n{text.noun_phrases}\n') 

print(text.sentiment)

Traducción: The famous financial multinational alert on inflationary dangers as a result of the fall in demand for pesos in face 2021. In addition, economist Steve Hanke ensures that Argentine inflation is strongly repressed, and the real could reach the three digits.

POS:
[('The', 'DT'), ('famous', 'JJ'), ('financial', 'JJ'), ('multinational', 'JJ'), ('alert', 'NN'), ('on', 'IN'), ('inflationary', 'JJ'), ('dangers', 'NNS'), ('as', 'IN'), ('a', 'DT'), ('result', 'NN'), ('of', 'IN'), ('the', 'DT'), ('fall', 'NN'), ('in', 'IN'), ('demand', 'NN'), ('for', 'IN'), ('pesos', 'NN'), ('in', 'IN'), ('face', 'NN'), ('2021', 'CD'), ('In', 'IN'), ('addition', 'NN'), ('economist', 'NN'), ('Steve', 'NNP'), ('Hanke', 'NNP'), ('ensures', 'VBZ'), ('that', 'IN'), ('Argentine', 'NNP'), ('inflation', 'NN'), ('is', 'VBZ'), ('strongly', 'RB'), ('repressed', 'VBN'), ('and', 'CC'), ('the', 'DT'), ('real', 'JJ'), ('could', 'MD'), ('reach', 'VB'), ('the', 'DT'), ('three', 'CD'), ('digits', 'NNS')]

Sustantivos

### Datasets - Sentiment Analisys

#### NLTK
with nltk VADER (Valence Aware Dictionary and sEntiment Reasoner)

In [20]:
# nltk sentiment
sia = SentimentIntensityAnalyzer()
print(f'Example: {sia.polarity_scores(inf_izq.iloc[0, :].body)}')

Example: {'neg': 0.04, 'neu': 0.96, 'pos': 0.0, 'compound': -0.9287}


In [21]:
def polarity(df):
    nltk_pol = lambda x: sia.polarity_scores(x)
    df_pol = df.body.apply(nltk_pol)
    neg = 0; pos = 0; neu = 0
    for row in list(df_pol.values):
        pos += row['pos']
        neg += row['neg']
        neu += row['neu']
        return {'neg': neg,
                'pos': pos,
                'neu': neu
               }

#### 1. "derecha diario"

nltk

In [22]:
print(f'NLTK\n{polarity(inf_der)}')

NLTK
{'neg': 0.025, 'pos': 0.008, 'neu': 0.967}


Text blob with google translator

In [23]:
translator = Translator()
pol_t = lambda x: np.around(TextBlob(translator.translate(x, src='es', dest='en').text).sentiment.polarity, 3)
sub_t = lambda x: np.around(TextBlob(translator.translate(x, src='es', dest='en').text).sentiment.subjectivity, 3)

In [24]:
inf_der['polarity'] = inf_der.lead.apply(pol_t)
inf_der['subjectivity'] = inf_der.lead.apply(sub_t)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [25]:
print(f'TextBlob\nPolarity: {inf_der.polarity.mean()}\nSubjectivity: {inf_der.subjectivity.mean()}')

TextBlob
Polarity: 0.10252380952380957
Subjectivity: 0.4447142857142857


Text blob with textblob translator

In [26]:
# pol = lambda x: np.around(TextBlob(x).translate(to='en').sentiment.polarity, 3)
# sub = lambda x: np.around(TextBlob(x).translate(to='en').sentiment.subjectivity, 3)

In [27]:
# inf_der['polarity'] = inf_der.lead.apply(pol)
# inf_der['subjectivity'] = inf_der.lead.apply(sub)

In [28]:
# print(f'Polarity: {inf_der.polarity.mean()}\nSubjectivity: {inf_der.subjectivity.mean()}')

#### 2. "izquierda diario"

nltk

In [29]:
print(f'NLTK\n{polarity(inf_izq)}')

NLTK
{'neg': 0.04, 'pos': 0.0, 'neu': 0.96}


Text blob with google translator

In [30]:
#inf_izq['polarity'] = inf_izq.lead.apply(pol)
#inf_izq['subjectivity'] = inf_izq.lead.apply(sub)

In [31]:
# print(f'Polarity: {inf_izq.polarity.mean()}\nSubjectivity: {inf_izq.subjectivity.mean()}')

In [32]:
inf_izq['polarity'] = inf_izq.lead.apply(pol_t)
inf_izq['subjectivity'] = inf_izq.lead.apply(sub_t)

In [33]:
print(f'TextBlob\nPolarity: {inf_izq.polarity.mean()}\nSubjectivity: {inf_izq.subjectivity.mean()}')

TextBlob
Polarity: 0.11661111111111112
Subjectivity: 0.4015833333333334
