# LDA

In [54]:
import pandas as pd
import numpy as np
import random

In [77]:
df_copom = pd.read_csv('df_copom.csv')

In [78]:
df_copom = df_copom[df_copom['Type'] == 'Minutes']
df_copom

Unnamed: 0,Date,Selic,Meeting_Number,Decision,Decision_txt,Text,Type
114,2020/08/05,2.0,232.0,-0.25,decrease,"1. Regarding the global outlook, the Covid-19 ...",Minutes
116,2020/09/16,2.0,233.0,0.0,mantain,A) Update of economic outlook and Copom’s base...,Minutes
118,2020/10/28,2.0,234.0,0.0,mantain,A) Update of economic outlook and Copom’s base...,Minutes
120,2020/12/09,2.0,235.0,0.0,mantain,A) Update of economic outlook and Copom’s base...,Minutes
122,2021/01/20,2.0,236.0,0.0,mantain,A) Update of economic outlook and Copom’s base...,Minutes
123,2021/03/17,2.75,237.0,0.75,increase,A) Update of economic outlook and Copom’s base...,Minutes
125,2021/05/05,3.5,238.0,0.75,increase,A) Update of economic outlook and Copom’s base...,Minutes
128,2021/06/16,4.25,239.0,0.75,increase,A) Update of economic outlook and Copom’s base...,Minutes
129,2021/08/04,5.25,240.0,1.0,increase,A) Update of economic outlook and Copom’s base...,Minutes
131,2021/09/22,6.25,241.0,1.0,increase,A) Update of economic outlook and Copom’s base...,Minutes


In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [80]:
cv = CountVectorizer(max_df=0.95, min_df=1, stop_words='english')

In [81]:
dtm = cv.fit_transform(df_copom['Text'])

In [82]:
dtm

<23x1910 sparse matrix of type '<class 'numpy.int64'>'
	with 9352 stored elements in Compressed Sparse Row format>

In [83]:
from sklearn.decomposition import LatentDirichletAllocation

In [84]:
LDA = LatentDirichletAllocation(n_components=3,random_state=123)

In [85]:
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=3, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [86]:
len(cv.get_feature_names())

1910

In [87]:
for i in range(10):
    random_word_id = random.randint(0,2403)
    print(cv.get_feature_names()[random_word_id])

departmentgilneu
strongly
compromise
especially
manoel
implicit


IndexError: list index out of range

In [88]:
len(LDA.components_)

3

In [89]:
len(LDA.components_[0])

1910

In [90]:
LDA.components_

array([[ 3.15464938,  0.33334159,  0.33333611, ...,  0.33333562,
         6.33332101,  2.33093657],
       [ 9.50801129,  0.55932442,  0.33333866, ...,  0.33333749,
         0.33333938,  0.33572165],
       [15.33733933,  2.10733399,  1.33332523, ...,  4.33332689,
         0.33333961,  0.33334178]])

In [91]:
single_topic = LDA.components_[0]

In [92]:
single_topic.argsort()

array([ 962,  954, 1209, ...,   37,   35,  943], dtype=int64)

In [93]:
top_word_indices = single_topic.argsort()[-10:]

In [94]:
for index in top_word_indices:
    print(cv.get_feature_names()[index])

cycle
period
greater
scenarios
tightening
pressures
uncertainty
2024
2023
inflationary


In [95]:
for index,topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['emphasizes', 'labor', 'deceleration', 'strategy', 'remain', 'quarter', 'impacts', 'longer', 'financial', 'continues', 'cycle', 'period', 'greater', 'scenarios', 'tightening', 'pressures', 'uncertainty', '2024', '2023', 'inflationary']


THE TOP 15 WORDS FOR TOPIC #1
['result', 'reference', 'pace', 'pandemic', 'upward', 'usd', 'scenarios', 'future', 'evolution', 'price', 'recovery', 'second', 'normalization', 'adjustment', '2021', 'regarding', 'tightening', 'appropriate', '2023', 'baseline']


THE TOP 15 WORDS FOR TOPIC #2
['sectors', 'transfer', 'remain', 'necessary', 'conditions', 'continue', 'financial', 'uncertainty', 'programs', 'regarding', '2020', 'slack', 'guidance', 'forward', 'evolution', 'stimulus', 'pandemic', 'recovery', '2021', 'baseline']




In [96]:
topic_results = LDA.transform(dtm)

In [97]:
topic_results[0].argmax()

2

In [98]:
topic_results.argmax(axis=1)

array([2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0], dtype=int64)

In [99]:
df_copom['LDA'] = topic_results.argmax(axis=1)
df_copom[['Decision_txt','LDA']]

Unnamed: 0,Decision_txt,LDA
114,decrease,2
116,mantain,2
118,mantain,2
120,mantain,2
122,mantain,2
123,increase,2
125,increase,1
128,increase,1
129,increase,1
131,increase,1


# Non-Negative Matric Factorization

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=1, stop_words='english')

In [60]:
dtm2 = tfidf.fit_transform(df_copom['Text'])

In [61]:
from sklearn.decomposition import NMF

In [62]:
nmf_model = NMF(n_components=3,random_state=123)

In [64]:
nmf_model.fit(dtm2)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=3, random_state=123, shuffle=False, solver='cd', tol=0.0001,
  verbose=0)

In [65]:
for i in range(10):
    random_word_id = random.randint(0,2403)
    print(tfidf.get_feature_names()[random_word_id])

understanding
expand
inherently
strongly
andgiven
implemented
agriculture
20th
eugênio
end


In [66]:
nmf_model.components_

array([[9.00084836e-02, 3.69296219e-03, 1.70899824e-03, ...,
        3.09937891e-02, 5.95255438e-03, 2.74706313e-03],
       [8.10096301e-02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [3.36432965e-02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 1.97450392e-05]])

In [67]:
single_topic2 = nmf_model.components_[0]

In [68]:
top_word_indices2 = single_topic2.argsort()[-10:]

In [69]:
for index in top_word_indices2:
    print(tfidf.get_feature_names()[index])

expectations
baseline
risks
rate
economic
monetary
policy
scenario
committee
inflation


In [70]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['2023', 'meeting', 'balance', '2022', 'projections', 'expectations', 'baseline', 'risks', 'rate', 'economic', 'monetary', 'policy', 'scenario', 'committee', 'inflation']


THE TOP 15 WORDS FOR TOPIC #1
['edson', 'feltrim', 'lopes', 'anthero', 'moraes', 'meirelles', 'awazu', 'alexandre', 'antonio', 'released', 'aldo', 'tombini', 'altamir', 'following', 'luiz']


THE TOP 15 WORDS FOR TOPIC #2
['reduce', 'action', 'target', 'monetary', 'march', 'released', 'macroeconomic', 'brasília', 'minutes', 'bias', 'points', 'percent', 'thursday', 'basis', 'votes']




In [72]:
topic_results2 = nmf_model.transform(dtm2)

In [74]:
topic_results2.argmax(axis=1)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0], dtype=int64)

In [73]:
df_copom['NMF'] = topic_results2.argmax(axis=1)
df_copom[['Decision_txt','NMF']]

Unnamed: 0,Decision_txt,NMF
0,decrease,2
1,decrease,2
2,decrease,2
3,decrease,2
4,decrease,2
...,...,...
154,mantain,0
155,mantain,0
156,mantain,0
157,mantain,0
