Modelling notebook

In [1]:
import pandas as pd
import numpy as np
import gensim
import nltk

from decouple import config
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gsdmm import MovieGroupProcess

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danipinho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/danipinho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
DATA_PATH = config("DATA_PATH")

In [3]:
data = pd.read_json(DATA_PATH + "/Data/tweets_preprocessed_json.json")

In [4]:
data.head()

Unnamed: 0,query,text,clean_text,text_token
0,Operador Nacional do Sistema Eletrico,O ONS - Operador Nacional do Sistema Elétrico ...,ons chesf iniciaram imediatamente retomada car...,"[ons, chesf, iniciaram, imediatamente, retomad..."
1,Operador Nacional do Sistema Eletrico,O Nordeste registrou no domingo um recorde de ...,nordeste registrou domingo recorde geracao ene...,"[nordeste, registrou, domingo, recorde, geraca..."
2,Operador Nacional do Sistema Eletrico,É melhor o Operador Nacional do Sistema Elétri...,melhor aumentar producao hoje tende ficar inst...,"[melhor, aumentar, producao, hoje, tende, fica..."
3,Operador Nacional do Sistema Eletrico,Visita Técnica ao Operador Nacional do Sistema...,visita tecnica ons brasilia ons,"[visita, tecnica, ons, brasilia, ons]"
4,Operador Nacional do Sistema Eletrico,New post in Forex Consultor Expert BR News: Mo...,new post forex consultor expert news money tim...,"[new, post, forex, consultor, expert, news, mo..."


### LDA

In [5]:
docs = data["text_token"].to_numpy() 

In [6]:
docs

array([list(['ons', 'chesf', 'iniciaram', 'imediatamente', 'retomada', 'cargas', 'concessionaria', 'servico', 'normalizado', 'gradativamente', 'pouco', 'tempo', 'todos', 'clientes', 'fornecimento', 'regularizado']),
       list(['nordeste', 'registrou', 'domingo', 'recorde', 'geracao', 'energia', 'solar', 'fotovoltaica', 'instantanea', 'gigawatts', 'gerados', '10h45', 'informou', 'nesta', 'segunda', 'feira', 'ons']),
       list(['melhor', 'aumentar', 'producao', 'hoje', 'tende', 'ficar', 'instavel', 'tanto', 'secador', 'ligado']),
       ...,
       list(['parcelado', 'secador', 'cabelos', 'britania', 'sp3300v', 'emissao', 'ions']),
       list(['nordeste', 'fez', 'ligeiro', 'ajuste', 'cima', 'estimativa', 'chuvas', 'media', 'historica', 'operador', 'estimou', 'reservatorios', 'hidreletricas', 'sudeste', 'centro', 'oeste', 'devem', 'alcancar', 'capacidade', 'final', 'junho']),
       list(['fsp', 'diz', 'operador', 'nacional', 'sistema', 'eletrico', 'passou', 'esperar', 'queda', 'carg

In [7]:
dictionary = Dictionary(docs)

In [8]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [9]:
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

In [10]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=10, 
                                         id2word=dictionary, 
                                         passes=4, 
                                         workers=2,
                                         random_state=42)

In [11]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.040*"eletrica" + 0.010*"reajuste" + 0.009*"preciso" + 0.008*"transporte" + 0.008*"geracao" + 0.008*"eletrico" + 0.008*"anime" + 0.007*"educacao" + 0.007*"parece" + 0.007*"mundo"
Topic: 1 
Words: 0.046*"serena" + 0.040*"jabeur" + 0.032*"wta" + 0.027*"situacao" + 0.024*"acabei" + 0.022*"reservatorios" + 0.016*"volume" + 0.015*"williams" + 0.015*"video" + 0.014*"2021"
Topic: 2 
Words: 0.025*"eletrico" + 0.024*"eletrica" + 0.023*"publicar" + 0.017*"foto" + 0.015*"subsistemas" + 0.015*"pacote" + 0.014*"reservatoriosbr" + 0.013*"linha" + 0.012*"energia" + 0.009*"devido"
Topic: 3 
Words: 0.048*"energia" + 0.036*"aumento" + 0.029*"eletrica" + 0.028*"carga" + 0.027*"janeiro" + 0.019*"geracao" + 0.017*"estima" + 0.017*"nacional" + 0.013*"aneel" + 0.012*"brasil"
Topic: 4 
Words: 0.033*"eletrica" + 0.011*"geracao" + 0.011*"ver" + 0.011*"ainda" + 0.010*"eletrico" + 0.008*"gas" + 0.007*"nada" + 0.007*"conta" + 0.007*"todo" + 0.007*"bem"
Topic: 5 
Words: 0.036*"geracao" + 0.020*"el

### GSDMM

In [12]:
vocab_length = len(dictionary)

In [13]:
gsdmm = MovieGroupProcess(K=50, alpha=0.1, beta=0.1, n_iters=15)

In [14]:
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 11418 clusters with 50 clusters populated
In stage 1: transferred 5738 clusters with 50 clusters populated
In stage 2: transferred 3938 clusters with 50 clusters populated
In stage 3: transferred 3240 clusters with 50 clusters populated
In stage 4: transferred 2805 clusters with 50 clusters populated
In stage 5: transferred 2537 clusters with 50 clusters populated
In stage 6: transferred 2413 clusters with 50 clusters populated
In stage 7: transferred 2327 clusters with 50 clusters populated
In stage 8: transferred 2198 clusters with 50 clusters populated
In stage 9: transferred 2191 clusters with 50 clusters populated
In stage 10: transferred 2108 clusters with 50 clusters populated
In stage 11: transferred 2080 clusters with 50 clusters populated
In stage 12: transferred 2052 clusters with 50 clusters populated
In stage 13: transferred 2005 clusters with 50 clusters populated
In stage 14: transferred 2052 clusters with 50 clusters populated


In [15]:
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [223 185 301 227 265 155 262 222 313 150 172 105 560 302 216 653 281 303
 495 249 289 286 187 300 347 143 306 253 142 112 122 214 183 332 380 178
 251 515 331 200 205 159 220 178 305 228 271 132 312 309]


In [16]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [15 12 37 18 34 24 33 38  8 48 49 26 44 17 13]


In [17]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

In [18]:
top_words(gsdmm.cluster_word_distribution, top_index, 20)


Cluster 15 : [('eletrica', 408), ('eletrico', 119), ('ano', 86), ('casa', 83), ('dia', 75), ('horas', 63), ('luz', 60), ('agora', 53), ('cidade', 44), ('chuva', 39), ('energia', 38), ('hoje', 35), ('todo', 34), ('vez', 33), ('falta', 33), ('gente', 33), ('problema', 32), ('novo', 31), ('bem', 27), ('ainda', 26)]

Cluster 12 : [('reservatorios', 239), ('ons', 211), ('hidreletricas', 191), ('energia', 184), ('nivel', 149), ('usinas', 142), ('carga', 123), ('diz', 103), ('eletrico', 97), ('nacional', 93), ('sudeste', 89), ('melhora', 85), ('aumento', 77), ('mes', 76), ('ano', 75), ('pais', 74), ('operador', 74), ('brasil', 72), ('centro', 72), ('oeste', 71)]

Cluster 37 : [('minas', 87), ('ministro', 74), ('ministerio', 54), ('sabe', 47), ('melhor', 36), ('eletrica', 34), ('parte', 34), ('acho', 31), ('governo', 30), ('anime', 30), ('ver', 29), ('presidente', 28), ('cara', 27), ('ainda', 27), ('manga', 27), ('bem', 25), ('tarcisio', 23), ('brasil', 23), ('seguranca', 22), ('tava', 22)]

