In [1]:
import os
import warnings

from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from plotly import tools
import pandas as pd
import plotly.offline as pyo
import plotly.graph_objs as go
import pyLDAvis
import pyLDAvis.gensim
import spacy

from helpers import MiCorpus
import helpers as hp

In [2]:
# Load the spaCy pipeline registered under the name 'es_md'
# (presumably a medium-sized Spanish model — confirm against the local spaCy install).
nlp = spacy.load('es_md')

In [3]:
def create_models(corpus, topics, params, ngrams, directory, lang, other=None):
    """
    Train one LDA model per requested number of topics and score each one.

    Parameters
    ----------
    corpus: MiCorpus
        Corpus passed to ``LdaModel``; its ``diccionario`` attribute is used
        as the id-to-word mapping.
    topics: iterable of int
        Numbers of topics to try; one model is trained for each.
    params: dict
        Extra keyword arguments forwarded to ``LdaModel``.
    ngrams: dict (bigrams, trigrams)
    directory: str
    lang: spacy.lang
    other: dict, optional (stopwords, postags, entities, stemmer)
        ``ngrams``, ``directory``, ``lang`` and ``other`` are handed unchanged
        to ``hp.iter_documents`` to obtain the texts used for the coherence
        score.

    Returns
    -------
    dict
        Maps each number of topics to a dict with the trained model under
        ``'lda'`` and its ``c_v`` coherence score under ``'coherence'``.
        Empty when ``topics`` is empty.
    """
    topic_counts = list(topics)
    if not topic_counts:
        # Nothing to train: skip reading the corpus texts altogether.
        return {}

    # Neither the dictionary nor the texts depend on the number of topics,
    # so fetch them once instead of re-reading the corpus for every model.
    id2word = corpus.diccionario
    texts = list(hp.iter_documents(ngrams, directory, lang, other))

    models = {}
    for num_topics in topic_counts:
        with warnings.catch_warnings():
            # Keep training warnings out of the notebook output.
            warnings.simplefilter('ignore')
            lda = LdaModel(corpus, num_topics=num_topics, id2word=id2word, **params)

        cm = CoherenceModel(model=lda, texts=texts, dictionary=id2word, coherence='c_v')
        models[num_topics] = {'lda': lda, 'coherence': cm.get_coherence()}

    return models

In [4]:
# Input/output locations. Each one can be overridden through an environment
# variable so the notebook runs on other machines without editing this cell;
# the defaults are the original local paths.
dircorpus = os.environ.get(
    'CONSULTIVOS_CORPUS_DIR',
    '/Users/tombito/Dropbox/datasets/banrep/consultivos/corpus',
)
pathstops = os.environ.get(
    'CONSULTIVOS_STOPWORDS_PATH',
    '/Users/tombito/Dropbox/datasets/wordlists/stopwords/stopwords.xlsx',
)
dirmodels = os.environ.get('CONSULTIVOS_MODELS_DIR', 'modelos')
os.makedirs(dirmodels, exist_ok=True)

stops = hp.load_stopwords(pathstops, 'spanish', col='word')
tags = ['NOUN', 'VERB', 'ADJ', 'ADV', 'ADP', 'AUX', 'DET', 'PRON']
ents = ['PER', 'ORG']

# Optional settings bundled under the stopwords/postags/entities keys.
# A stemmer can be added too, e.g. stemmer=SnowballStemmer('spanish'),
# after importing it with `from nltk.stem import SnowballStemmer`.
extra = dict(stopwords=stops, postags=tags, entities=ents)

In [5]:
# Build the corpus object for the documents under dircorpus and persist
# its dictionary next to the models.
consultivos = MiCorpus(dircorpus, nlp, extra)

diccionario = consultivos.diccionario
diccionario.save(os.path.join(dirmodels, 'consultivos.dict'))

# Persist the bigram and trigram models exposed by the corpus.
ngramas = consultivos.ngramas
bigramas, trigramas = (ngramas[kind] for kind in ('bigrams', 'trigrams'))
bigramas.save(os.path.join(dirmodels, 'bigramas'))
trigramas.save(os.path.join(dirmodels, 'trigramas'))

In [6]:
%%time
# Candidate numbers of topics and the LdaModel settings shared by every run.
n = (5, 10, 20, 35, 55, 80, 110)
lda_params = {
    'chunksize': 100,
    'passes': 2,
    'alpha': 'auto',
    'eta': 'auto',
    'random_state': 100,
}
modelos = create_models(
    corpus=consultivos,
    topics=n,
    params=lda_params,
    ngrams=ngramas,
    directory=dircorpus,
    lang=nlp,
    other=extra,
)

CPU times: user 30min 6s, sys: 5min 47s, total: 35min 54s
Wall time: 19min 10s


In [7]:
# Coherence score of each trained model, in the same order as n.
scores = [modelos[i]['coherence'] for i in n]

# Position of the highest score. list.index returns the first match, so when
# several models tie the one listed earliest in n is chosen.
best_is = scores.index(max(scores))
best = n[best_is]

In [15]:
# Bar chart of the coherence score per number of topics; the bar for the
# selected number of topics is drawn in red, the rest in grey.
colors = ['rgba(222,45,38,0.8)' if i == best else 'rgba(204,204,204,1)' for i in n]

trace = go.Bar(x=n, y=scores, marker={'color': colors})
layout = {
    'title': 'Coherence Score para cada número de tópicos',
    'xaxis': {'title': 'Número de tópicos'},
    'yaxis': {'title': 'Coherence Score (c_v)', 'hoverformat': '.3f'},
}

fig = {'data': [trace], 'layout': layout}
filename = os.path.join(dirmodels, 'coherence.html')
cohfile = pyo.plot(fig, show_link=False, filename=filename)

In [16]:
# Keep the model with the best coherence score and write it to disk.
# To save a different model, look up another entry of n instead of best.
ldamodel = modelos[best]['lda']
ldamodel.save(
    os.path.join(dirmodels, 'topicos-{:0>2}.lda'.format(best))
)

In [17]:
# One row per document (labelled with hp.get_docnames) and one column per
# topic id returned by the model for that document.
df = pd.DataFrame(
    data=[dict(doc_topics) for doc_topics in ldamodel[consultivos]],
    index=hp.get_docnames(dircorpus),
)
# Column label holding the largest value in each row.
df['dominante'] = df.idxmax(axis=1)
df.to_csv(
    os.path.join(dirmodels, 'doctopics-{:0>2}.csv'.format(best)),
    encoding='utf-8',
)

In [19]:
# Prepare the pyLDAvis view of the selected model with warnings muted,
# keeping the model's own topic order (sort_topics=False).
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    vis = pyLDAvis.gensim.prepare(
        ldamodel,
        list(consultivos),
        diccionario,
        sort_topics=False,
    )

# Export the view as an HTML file next to the saved model.
pyLDAvis.save_html(
    vis,
    os.path.join(dirmodels, 'topicos-{:0>2}.html'.format(best)),
)

In [59]:
# Share of documents whose dominant topic is each topic, rounded to four
# decimals; keep as many topics as there are cells in a rows x cols grid.
rows, cols = 3, 2

topic_counts = df['dominante'].value_counts()
topic_contribution = (topic_counts / topic_counts.sum()).round(4)
head_topics = topic_contribution.head(rows * cols)
head_topics

47    0.1507
55    0.0822
84    0.0685
75    0.0685
93    0.0685
53    0.0411
Name: dominante, dtype: float64

In [60]:
# One horizontal bar chart per topic in head_topics, laid out on the
# rows x cols grid and titled with the topic id.
fig = tools.make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=['Tópico {}'.format(t) for t in head_topics.index],
    print_grid=False,
)

for pos, t in enumerate(head_topics.index):
    # The 15 highest-weighted terms of the topic, indexed by term.
    dfg = pd.DataFrame(ldamodel.show_topic(t, 15), columns=['term', 'prob']).set_index('term')

    trace = go.Bar(x=dfg['prob'], y=dfg.index, orientation='h')

    # Fill the grid row by row. Deriving the cell from cols (instead of
    # assuming two columns) keeps the placement correct if the grid size
    # is changed.
    fig.add_trace(trace, row=pos // cols + 1, col=pos % cols + 1)

fig.layout.update(title='Principales palabras de tópicos más dominantes',
                  showlegend=False, yaxis=dict(automargin=True),
                  height=1200, width=1200)

f = os.path.join(dirmodels, 'word_topics-{:0>2}.html'.format(best))
headfile = pyo.plot(fig, show_link=False, filename=f)