Modelling notebook

In [42]:
import pandas as pd
import numpy as np
import gensim
import nltk

from decouple import config
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore
from gsdmm import MovieGroupProcess

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danipinho/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/danipinho/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
DATA_PATH = config("DATA_PATH")

In [7]:
data = pd.read_csv(DATA_PATH + "/Data/tweets_preprocessed.csv", index_col=[0])

In [8]:
data.head()

Unnamed: 0,query,id,text,author_id,conversation_id,entities,geo,lang,referenced_tweets,clean_text
0,Operador Nacional do Sistema Eletrico,1208027618970361856,O ONS - Operador Nacional do Sistema Elétrico ...,2873781071,1208027617108082689,"{'annotations': [{'start': 29, 'end': 44, 'pro...",,pt,[<ReferencedTweet id=1208027617108082689 type=...,o ons operador nacional do sistema elétrico e ...
1,Operador Nacional do Sistema Eletrico,1202138471432937472,O Nordeste registrou no domingo um recorde de ...,1104827298279288833,1202138471432937472,"{'urls': [{'start': 215, 'end': 238, 'url': 'h...",,pt,,o nordeste registrou no domingo um recorde de ...
2,Operador Nacional do Sistema Eletrico,1198261545194668033,É melhor o Operador Nacional do Sistema Elétri...,71368901,1198261545194668033,,,pt,,é melhor o operador nacional do sistema elétri...
3,Operador Nacional do Sistema Eletrico,1198012280329375744,Visita Técnica ao Operador Nacional do Sistema...,33377191,1198012280329375744,"{'urls': [{'start': 171, 'end': 194, 'url': 'h...","{'place_id': '5722ff20ba67083b', 'coordinates'...",pt,,visita técnica ao operador nacional do sistema...
4,Operador Nacional do Sistema Eletrico,1197929769402339330,New post in Forex Consultor Expert BR News: Mo...,95810074,1197929769402339330,"{'urls': [{'start': 256, 'end': 279, 'url': 'h...",,pt,,new post in forex consultor expert br news mon...


In [14]:
data["text_token"] = data["clean_text"].apply(lambda x: nltk.word_tokenize(x))

In [24]:
stopwords = nltk.corpus.stopwords.words("portuguese")

In [26]:
def remove_stopwords(row):
    list_token = []
    for token in row:
        if token not in stopwords:
            list_token.append(token)
    return list_token


In [27]:
data["text_token"] = data["text_token"].apply(lambda row: remove_stopwords(row))

In [28]:
data["text_token"]

0        [ons, operador, nacional, sistema, elétrico, c...
1        [nordeste, registrou, domingo, recorde, geraçã...
2        [melhor, operador, nacional, sistema, elétrico...
3        [visita, técnica, operador, nacional, sistema,...
4        [new, post, in, forex, consultor, expert, br, ...
                               ...                        
13024                             [cali, es, enferma, ons]
13025                            [wn, veamos, ons, juntos]
13026    [parcelado, secador, cabelos, britânia, sp3300...
13027    [nordeste, ons, fez, ligeiro, ajuste, cima, es...
13028    [fsp, diz, q, operador, nacional, sistema, elé...
Name: text_token, Length: 13029, dtype: object

### LDA

In [29]:
docs = data["text_token"].to_numpy() 

In [30]:
docs

array([list(['ons', 'operador', 'nacional', 'sistema', 'elétrico', 'chesf', 'iniciaram', 'imediatamente', 'retomada', 'cargas', 'concessionária', 'serviço', 'normalizado', 'gradativamente', 'pouco', 'tempo', 'todos', 'clientes', 'fornecimento', 'regularizado']),
       list(['nordeste', 'registrou', 'domingo', 'recorde', 'geração', 'energia', 'solar', 'fotovoltaica', 'instantânea', '1', '16', 'gigawatts', 'gw', 'gerados', '10h45', 'informou', 'nesta', 'segunda', 'feira', 'operador', 'nacional', 'sistema', 'elétrico', 'ons']),
       list(['melhor', 'operador', 'nacional', 'sistema', 'elétrico', 'aumentar', 'produção', 'hoje', 'pq', 'tende', 'ficar', 'instável', 'tanto', 'secador', 'ligado']),
       ...,
       list(['parcelado', 'secador', 'cabelos', 'britânia', 'sp3300v', 'emissão', 'íons', 'r', '79', '90']),
       list(['nordeste', 'ons', 'fez', 'ligeiro', 'ajuste', 'cima', 'estimativa', 'chuvas', '63', '64', 'média', 'histórica', 'operador', 'estimou', 'reservatórios', 'hidrelétri

In [31]:
dictionary = Dictionary(docs)

In [36]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [37]:
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

In [40]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=10, 
                                         id2word=dictionary, 
                                         passes=4, 
                                         workers=2,
                                         random_state=42)

In [41]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.032*"ons" + 0.027*"elétrica" + 0.018*"infraestrutura" + 0.017*"acabei" + 0.016*"6" + 0.016*"2" + 0.014*"3" + 0.013*"ano" + 0.013*"água" + 0.013*"sul"
Topic: 1 
Words: 0.119*"ons" + 0.055*"hidrelétricas" + 0.052*"reservatórios" + 0.051*"usinas" + 0.048*"diz" + 0.046*"nível" + 0.036*"melhora" + 0.022*"serena" + 0.017*"jabeur" + 0.014*"sistema"
Topic: 2 
Words: 0.040*"elétrica" + 0.017*"infraestrutura" + 0.016*"sistema" + 0.013*"elétrico" + 0.013*"conta" + 0.012*"pra" + 0.011*"nacional" + 0.011*"luz" + 0.010*"aneel" + 0.009*"água"
Topic: 3 
Words: 0.093*"ons" + 0.036*"sistema" + 0.035*"nacional" + 0.032*"elétrico" + 0.031*"operador" + 0.025*"reservatórios" + 0.016*"dados" + 0.016*"2022" + 0.013*"12" + 0.013*"situação"
Topic: 4 
Words: 0.028*"sistema" + 0.026*"elétrico" + 0.022*"elétrica" + 0.018*"ons" + 0.015*"infraestrutura" + 0.015*"q" + 0.012*"bravo" + 0.012*"aqui" + 0.009*"linha" + 0.008*"tô"
Topic: 5 
Words: 0.081*"ons" + 0.031*"deve" + 0.031*"aumento" + 0.031*"ter

### GSDMM

In [43]:
vocab_length = len(dictionary)

In [44]:
gsdmm = MovieGroupProcess(K=50, alpha=0.1, beta=0.1, n_iters=15)

In [45]:
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 11421 clusters with 50 clusters populated
In stage 1: transferred 5784 clusters with 50 clusters populated
In stage 2: transferred 3661 clusters with 50 clusters populated
In stage 3: transferred 2791 clusters with 50 clusters populated
In stage 4: transferred 2477 clusters with 50 clusters populated
In stage 5: transferred 2253 clusters with 50 clusters populated
In stage 6: transferred 2079 clusters with 50 clusters populated
In stage 7: transferred 1853 clusters with 50 clusters populated
In stage 8: transferred 1815 clusters with 50 clusters populated
In stage 9: transferred 1687 clusters with 50 clusters populated
In stage 10: transferred 1666 clusters with 50 clusters populated
In stage 11: transferred 1677 clusters with 50 clusters populated
In stage 12: transferred 1643 clusters with 50 clusters populated
In stage 13: transferred 1643 clusters with 50 clusters populated
In stage 14: transferred 1667 clusters with 50 clusters populated


In [46]:
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

Number of documents per topic : [111 212 110  92 116 113 280 183 253 294 282 292 595 231 314 169 226 126
 140 141 350 656 232 153 371 281 260 262 273 370 428 215 515 150 213 225
 119 193 513 275 208 207 285 133 259 399 213 369 306 316]


In [47]:
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

Most important clusters (by number of docs inside): [21 12 32 38 30 45 24 29 47 20 49 14 48  9 11]


In [48]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

In [49]:
top_words(gsdmm.cluster_word_distribution, top_index, 20)


Cluster 21 : [('ons', 373), ('pra', 96), ('q', 91), ('sistema', 77), ('elétrico', 73), ('vai', 46), ('pq', 43), ('n', 43), ('vou', 43), ('energia', 38), ('melhor', 35), ('ver', 34), ('tava', 34), ('aqui', 33), ('parte', 33), ('ter', 31), ('hoje', 30), ('tudo', 29), ('tá', 28), ('anime', 28)]

Cluster 12 : [('energia', 547), ('elétrica', 461), ('pra', 94), ('aqui', 90), ('infraestrutura', 85), ('sistema', 72), ('dia', 72), ('ano', 72), ('elétrico', 70), ('casa', 65), ('horas', 60), ('luz', 59), ('agora', 55), ('vai', 47), ('falta', 44), ('chuva', 42), ('tá', 40), ('cidade', 35), ('vez', 34), ('pq', 34)]

Cluster 32 : [('energia', 473), ('elétrica', 335), ('infraestrutura', 117), ('vai', 111), ('aumento', 98), ('sistema', 79), ('conta', 74), ('elétrico', 73), ('gás', 63), ('pra', 62), ('país', 61), ('preço', 56), ('geração', 54), ('governo', 52), ('brasil', 51), ('empresa', 50), ('povo', 47), ('cara', 44), ('ter', 43), ('gasolina', 40)]

Cluster 38 : [('sistema', 372), ('ons', 363), ('e