In [1]:
import pandas as pd
import numpy as np
import pickle

### Data

Izquierda Diario

In [2]:
df_izq = pickle.load(open('data/df_clean_izq.pkl','rb'))

In [3]:
df_inflation_izq = df_izq[df_izq.topics == 'inflation']

In [4]:
df_exchange_izq = df_izq[df_izq.topics == 'exchange']

In [5]:
corpus_izq = df_inflation_izq.append(df_exchange_izq)

Derecha Diario

In [6]:
df_der = pickle.load(open('data/df_clean_der.pkl','rb'))

In [7]:
df_inflation_der = df_der[df_der.topics == 'inflation']

In [8]:
df_exchange_der = df_der[df_der.topics == 'exchange']

In [9]:
corpus_der = df_inflation_der.append(df_exchange_der)

### Tokenization

In [10]:
import re
import unidecode

In [11]:
from nltk.corpus import stopwords
import spacy
from spacy.lang.es import Spanish

In [12]:
parser = Spanish()
stopwords_sp = stopwords.words('spanish') + ['si','dia', 'vez', 'fin','dias', 'dos', 'mas','ano', 'asi','puede', 'mes','incluso']

In [13]:
def tokenizer(texto):

    alphanumeric = re.sub(r'([^\s\w]|_)+', '', texto).lower()
    no_accents = unidecode.unidecode(alphanumeric)
    
    tokens = parser(no_accents)
    
    tokens_list = [str(token) for token in tokens]
    
    tokens_clean = [token for token in tokens_list if token not in stopwords_sp and token.isalpha()]

    return tokens_clean

In [14]:
tokens_izq = [tokenizer(text) for text in corpus_izq.body]

In [15]:
tokens_der = [tokenizer(text) for text in corpus_der.body]

### Gensim

Give ids to tokens with gensim Dictionary

In [16]:
# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [17]:
from gensim.corpora import Dictionary

In [18]:
# tokens input
dictionary_izq = Dictionary(tokens_izq)
dictionary_izq.filter_extremes(no_below=2, no_above=0.97, keep_n=None)

# convert each document into the gensim bag-of-words
corpus_izq = [dictionary_izq.doc2bow(text) for text in tokens_izq]

In [19]:
# tokens input
dictionary_der = Dictionary(tokens_der)
dictionary_der.filter_extremes(no_below=2, no_above=0.97, keep_n=None)

# convert each document into the gensim bag-of-words
corpus_der = [dictionary_der.doc2bow(text) for text in tokens_der]

### Latent Dirichlet Allocation (LDA)
Check the unsupervised word classification using LDA

In [20]:
from gensim import matutils, models
import scipy.sparse

Izquierda Diario

In [21]:
# make a dictionary with the ids of each word assigned by Gensim dictionary method

id_word_izq = dict((v, k) for k, v in dictionary_izq.token2id.items())

In [22]:
# we already know that there are two topics

lda_izq = models.LdaModel(corpus=corpus_izq, id2word=id_word_izq, num_topics=2, passes=20, random_state= 30)

In [23]:
lda_izq.print_topics()

[(0,
  '0.020*"precios" + 0.014*"inflacion" + 0.014*"aumento" + 0.011*"productos" + 0.011*"suba" + 0.010*"alimentos" + 0.009*"salarios" + 0.009*"trabajadores" + 0.008*"canasta" + 0.008*"indec"'),
 (1,
  '0.037*"dolar" + 0.013*"dolares" + 0.012*"millones" + 0.011*"banco" + 0.011*"us" + 0.010*"central" + 0.009*"deuda" + 0.009*"guzman" + 0.008*"gobierno" + 0.008*"bonos"')]

In [24]:
import pyLDAvis.gensim

lda_display = pyLDAvis.gensim.prepare(lda_izq, corpus_izq, dictionary_izq, sort_topics=False)

pyLDAvis.display(lda_display)

Derecha Diario

In [25]:
# make a dictionary with the ids of each word assigned by Gensim dictionary method

id_word_der = dict((v, k) for k, v in dictionary_der.token2id.items())

  and should_run_async(code)


In [26]:
# we already know that there are two topics

lda_der = models.LdaModel(corpus=corpus_der, id2word=id_word_der, num_topics=2, passes=20, random_state= 30)

  and should_run_async(code)


In [27]:
lda_der.print_topics()

  and should_run_async(code)


[(0,
  '0.016*"dolar" + 0.012*"mercado" + 0.010*"gobierno" + 0.009*"tipo" + 0.008*"pesos" + 0.007*"banco" + 0.007*"dolares" + 0.007*"cambio" + 0.006*"medidas" + 0.006*"guzman"'),
 (1,
  '0.016*"inflacion" + 0.014*"precios" + 0.008*"economia" + 0.008*"aumento" + 0.008*"gobierno" + 0.007*"monetaria" + 0.007*"argentina" + 0.007*"si" + 0.006*"dinero" + 0.005*"nivel"')]

In [28]:
lda_display = pyLDAvis.gensim.prepare(lda_der, corpus_der, dictionary_der, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)


### Filtering by tags

In [29]:
import nltk

  and should_run_async(code)


In [30]:
# tag is not accurate
nltk.pos_tag(tokens_izq[0][:10])

  and should_run_async(code)


[('trata', 'NNS'),
 ('mismo', 'NNS'),
 ('aumento', 'IN'),
 ('aplico', 'JJ'),
 ('massalin', 'NN'),
 ('particulares', 'NNS'),
 ('pasado', 'VBP'),
 ('noviembre', 'JJ'),
 ('toca', 'NN'),
 ('marcas', 'NN')]

In [31]:
from polyglot.text import Text
!polyglot download embeddings2.es pos2.es

  and should_run_async(code)


[polyglot_data] Downloading package embeddings2.es to
[polyglot_data]     /home/guido/polyglot_data...
[polyglot_data]   Package embeddings2.es is already up-to-date!
[polyglot_data] Downloading package pos2.es to
[polyglot_data]     /home/guido/polyglot_data...
[polyglot_data]   Package pos2.es is already up-to-date!


In [32]:
def noun_only(tokens):
    nouns = []
    for token in tokens:
        word_t = Text(token, hint_language_code='es')
        if word_t.pos_tags[0][1] == 'NOUN':
            nouns.append(word_t.pos_tags[0][0])
    return nouns

  and should_run_async(code)


In [33]:
tokens_nouns_izq = [noun_only(token) for token in tokens_izq]
tokens_nouns_der = [noun_only(token) for token in tokens_der]

  and should_run_async(code)


### New Noun Dictionary + LDA

In [34]:
# tokens input
dictionary_izq = Dictionary(tokens_nouns_izq)
dictionary_izq.filter_extremes(no_below=2, no_above=0.97, keep_n=None)

# convert each document into the gensim bag-of-words
corpus_izq = [dictionary_izq.doc2bow(text) for text in tokens_nouns_izq]

  and should_run_async(code)


In [35]:
# tokens input
dictionary_der = Dictionary(tokens_nouns_der)
dictionary_der.filter_extremes(no_below=2, no_above=0.97, keep_n=None)

# convert each document into the gensim bag-of-words
corpus_der = [dictionary_der.doc2bow(text) for text in tokens_nouns_der]

  and should_run_async(code)


Izquierda Diario

In [36]:
id_word_izq = dict((v, k) for k, v in dictionary_izq.token2id.items())

  and should_run_async(code)


In [37]:
lda_izq = models.LdaModel(corpus=corpus_izq, id2word=id_word_izq, num_topics=2, passes=20, random_state= 30)

  and should_run_async(code)


In [38]:
lda_izq.print_topics()

  and should_run_async(code)


[(0,
  '0.064*"precios" + 0.035*"productos" + 0.032*"alimentos" + 0.029*"salarios" + 0.022*"incremento" + 0.018*"gobierno" + 0.017*"aumentos" + 0.017*"bebidas" + 0.017*"meses" + 0.016*"indice"'),
 (1,
  '0.039*"banco" + 0.031*"deuda" + 0.030*"gobierno" + 0.029*"bonos" + 0.026*"economia" + 0.025*"medidas" + 0.020*"brecha" + 0.019*"mercado" + 0.017*"tipo" + 0.016*"presion"')]

In [39]:
lda_display = pyLDAvis.gensim.prepare(lda_izq, corpus_izq, dictionary_izq, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)


Derecha Diario

In [40]:
id_word_der = dict((v, k) for k, v in dictionary_der.token2id.items())

  and should_run_async(code)


In [41]:
lda_der = models.LdaModel(corpus=corpus_der, id2word=id_word_der, num_topics=2, passes=20, random_state= 30)

  and should_run_async(code)


In [42]:
lda_der.print_topics()

  and should_run_async(code)


[(0,
  '0.033*"gobierno" + 0.030*"mercado" + 0.024*"banco" + 0.023*"pais" + 0.019*"tipo" + 0.017*"medidas" + 0.015*"brecha" + 0.014*"demanda" + 0.013*"semana" + 0.012*"ministro"'),
 (1,
  '0.041*"precios" + 0.029*"economia" + 0.024*"gobierno" + 0.018*"mercado" + 0.016*"tipo" + 0.016*"medidas" + 0.016*"nivel" + 0.015*"dinero" + 0.011*"cuarentena" + 0.011*"tasa"')]

In [43]:
lda_display = pyLDAvis.gensim.prepare(lda_der, corpus_der, dictionary_der, sort_topics=False)
pyLDAvis.display(lda_display)

  and should_run_async(code)
