In [1]:
# This notebook exemplifies topic modeling
# using latent dirichlet allocation.
#
# Author: Fabrício Galende Marques de Carvalho

from pandas import DataFrame
from sklearn.preprocessing import normalize
from  sklearn.feature_extraction.text import TfidfVectorizer


# Toy corpus (Portuguese): six short sentences about weather, food and work.
documents = [
    "O tempo está chuvoso",
    "O dia está ensolarado sem nuvens",
    "A comida está muito saborosa",
    "Dia chuvoso na cidade",
    "Gostei da comida do restaurante, muito saborosa",
    "Começamos o trabalho cedo no escritório",
]

# Step 1: turn the raw sentences into TF-IDF feature vectors.
# Each row of the resulting sparse matrix is one document and each column is
# one term of the learned vocabulary.
tfidf_vectorizer = TfidfVectorizer(norm='l2', use_idf=True)
documents_tfidf_repr = tfidf_vectorizer.fit_transform(documents)

# Vocabulary terms, in the same order as the matrix columns.
repr_lexicon = tfidf_vectorizer.get_feature_names_out()

# Densify only for display; the corpus is tiny, so this is cheap here.
dense_feature_vectors = documents_tfidf_repr.toarray()
feature_vector_size = len(repr_lexicon)

print("Representation lexicon: ")
print(repr_lexicon, "\n")
print("Feature vectors: ")
print(dense_feature_vectors)
print("\nFeature vetor size: ", feature_vector_size)


Representation lexicon: 
['cedo' 'chuvoso' 'cidade' 'começamos' 'comida' 'da' 'dia' 'do'
 'ensolarado' 'escritório' 'está' 'gostei' 'muito' 'na' 'no' 'nuvens'
 'restaurante' 'saborosa' 'sem' 'tempo' 'trabalho'] 

Feature vectors: 
[[0.         0.55902156 0.         0.         0.         0.
  0.         0.         0.         0.         0.47196441 0.
  0.         0.         0.         0.         0.         0.
  0.         0.68172171 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.4024458  0.         0.490779   0.         0.3397724  0.
  0.         0.         0.         0.490779   0.         0.
  0.490779   0.         0.        ]
 [0.         0.         0.         0.         0.51897905 0.
  0.         0.         0.         0.         0.43815778 0.
  0.51897905 0.         0.         0.         0.         0.51897905
  0.         0.         0.        ]
 [0.         0.44836665 0.54677906 0.         0.         0.
  0.44836665 0.         0.         0.         0.     

In [2]:
# Step 2: fit a Latent Dirichlet Allocation model on the TF-IDF matrix and
# show, for every document, how it is distributed over the topics.
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 3

# 1-based, human-readable labels for the table columns (topics) and rows
# (documents).
topic_labels = [f"Topic_{topic_no}" for topic_no in range(1, number_of_topics + 1)]
doc_labels = [f"Document_{doc_no}" for doc_no in range(1, len(documents) + 1)]

# random_state is fixed so that re-running the cell yields the same topics.
lda = LatentDirichletAllocation(
    n_components=number_of_topics,
    max_iter=50000,
    random_state=0,
)

# fit_transform returns one row per document and one column per topic.
topic_matrix = lda.fit_transform(documents_tfidf_repr)
topic_document_dataframe = DataFrame(
    topic_matrix,
    index=doc_labels,
    columns=topic_labels,
)

print("Document to topic matrix: ")
print(topic_document_dataframe)


Document to topic matrix: 
             Topic_1   Topic_2   Topic_3
Document_1  0.128829  0.126367  0.744804
Document_2  0.108303  0.106227  0.785470
Document_3  0.112570  0.764960  0.122471
Document_4  0.762866  0.113233  0.123901
Document_5  0.093218  0.813525  0.093257
Document_6  0.790266  0.104807  0.104927


In [3]:
# Step 3: inspect which words carry the most weight inside each topic.
#
# lda.components_ holds one row per topic and one column per vocabulary term.
# Dividing a row by its own sum turns it into a distribution over the words
# of that topic (the row then sums to 1).
topic_word_matrix = lda.components_
topic_word_dict = {}              # topic label -> normalized word weights
topic_word_dataframe = []         # one single-column frame per topic, lexicon order
topic_word_dataframe_sorted = []  # same frames, highest probability first

# topic_labels and the rows of topic_word_matrix are in the same topic order.
for topic_label, topic_distr in zip(topic_labels, topic_word_matrix):
    topic_distr_prob = topic_distr / topic_distr.sum()
    topic_word_dict[topic_label] = topic_distr_prob

    topic_df = DataFrame(topic_distr_prob, index=repr_lexicon, columns=["probability"])
    topic_word_dataframe.append(topic_df)

    # Sort the frame directly by its column. A column-wise apply() of
    # Series.sort_values only appears to work while the frame has exactly one
    # column, because the per-column results get re-aligned on the index.
    topic_word_dataframe_sorted.append(
        topic_df.sort_values(by="probability", ascending=False)
    )

print("Most frequent words per topic: ")
for topic_df in topic_word_dataframe_sorted:
    print('\n')
    print(topic_df.head(5))


Most frequent words per topic: 


            probability
cidade         0.078211
na             0.078211
cedo           0.069400
escritório     0.069400
começamos      0.069400


          probability
comida       0.101980
saborosa     0.101980
muito        0.101980
está         0.064597
da           0.063644


            probability
está           0.106178
tempo          0.092315
chuvoso        0.081816
ensolarado     0.074963
nuvens         0.074963


In [5]:
# Let's examine, for each word, how its weight is split across the topics.
# Rows are topics and columns are vocabulary terms.
df = DataFrame(topic_word_matrix, columns=repr_lexicon, index=topic_labels)
# Column normalization: df.sum(axis=0) adds up the topic rows for every word,
# so after the division each word's column sums to 1. The values are therefore
# the share of each topic for a given word (topic probability per word),
# not the word probability inside a topic.
topic_per_word_df = df / df.sum(axis=0)
print("Topic probability per word:")
print(topic_per_word_df)

Word probability per topic:
             cedo   chuvoso    cidade  começamos    comida        da  \
Topic_1  0.537682  0.385804  0.566937   0.537682  0.180507  0.237547   
Topic_2  0.231132  0.167468  0.216352   0.231132  0.638829  0.524887   
Topic_3  0.231186  0.446728  0.216712   0.231186  0.180664  0.237566   

              dia        do  ensolarado  escritório  ...    gostei     muito  \
Topic_1  0.419861  0.237547    0.224468    0.537682  ...  0.237547  0.180507   
Topic_2  0.181443  0.524887    0.224384    0.231132  ...  0.524887  0.638829   
Topic_3  0.398697  0.237566    0.551148    0.231186  ...  0.237566  0.180664   

               na        no    nuvens  restaurante  saborosa       sem  \
Topic_1  0.566937  0.537682  0.224468     0.237547  0.180507  0.224468   
Topic_2  0.216352  0.231132  0.224384     0.524887  0.638829  0.224384   
Topic_3  0.216712  0.231186  0.551148     0.237566  0.180664  0.551148   

            tempo  trabalho  
Topic_1  0.199217  0.537682  
Topic