In [1]:
import gensim #pip install gensim
import pprint
from gensim import corpora, models
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import pyLDAvis.gensim #pip install pyLDAvis
import pyLDAvis.gensim_models
# Switch pyLDAvis into notebook mode before any visualisation is displayed.
pyLDAvis.enable_notebook()


# NOTE(review): the bare string below is never executed as code. Judging by its
# "#FIX pyLDAVIS" marker it presumably records a manual patch to apply inside the
# pyLDAvis sources (the `drop(labels='saliency', axis=1)` call) — confirm where
# it belongs and whether it is still needed with the installed version.
''' 
 default_term_info = default_term_info.sort_values(
        by='saliency', ascending=False).head(R).drop(labels='saliency', axis=1) #FIX pyLDAVIS
'''

In [2]:
# 1. Build the TF-IDF (Term Frequency - Inverse Document Frequency) representation,
#    which assigns a weight to every word:
#  - TF: frequency of the term in the document:
#        number of occurrences / total number of words in the document
#  - IDF: inverse frequency of the term across documents:
#        log(total number of documents / number of documents containing the term)
#  - TF-IDF: product of the two (high when the word is frequent in this document
#        and rare in the other documents)

# Read the cleaned dataset.
source = './doc/cleaned.csv'
df = pd.read_csv(source)

# Tokenization: one list of tokens per row of the 'lemmatized_text' column.
tweets = df['lemmatized_text'].apply(word_tokenize)

# Vocabulary mapping every distinct token to an integer id.
dictionary = corpora.Dictionary(tweets)

# Bag-of-words corpus: each document converted with dictionary.doc2bow.
corpus = list(map(dictionary.doc2bow, tweets))

# Fit the TF-IDF model on the bag-of-words corpus ...
tfidf = models.TfidfModel(corpus)

# ... and apply it to the same corpus to obtain the TF-IDF vectors.
tfidf_corpus = tfidf[corpus]

In [3]:
# Preview: print the TF-IDF weights of (at most) the first three documents.
for doc_no, weighted_terms in zip(range(3), tfidf_corpus):
    print(f"TF-IDF values for document {doc_no}:")
    for term_id, weight in weighted_terms:
        # dictionary[term_id] maps the integer token id back to its word.
        print(f"Token ID: {term_id}, Word: {dictionary[term_id]}, TF-IDF Value: {weight}")
    print("\n")

TF-IDF values for document 0:
Token ID: 0, Word: abuse, TF-IDF Value: 0.08351972565693129
Token ID: 1, Word: abused, TF-IDF Value: 0.10561463665339407
Token ID: 2, Word: action, TF-IDF Value: 0.05909593110836793
Token ID: 3, Word: agency, TF-IDF Value: 0.3698914715979529
Token ID: 4, Word: along, TF-IDF Value: 0.07650986140193991
Token ID: 5, Word: call, TF-IDF Value: 0.11628774127377883
Token ID: 6, Word: came, TF-IDF Value: 0.07522756557898667
Token ID: 7, Word: can, TF-IDF Value: 0.10543304589762349
Token ID: 8, Word: clearly, TF-IDF Value: 0.08667131008946559
Token ID: 9, Word: comment, TF-IDF Value: 0.08179051475953035
Token ID: 10, Word: commission, TF-IDF Value: 0.09571524670882778
Token ID: 11, Word: comprehensive, TF-IDF Value: 0.09356122424444609
Token ID: 12, Word: either, TF-IDF Value: 0.07937580029708532
Token ID: 13, Word: elon, TF-IDF Value: 0.0884529890869541
Token ID: 14, Word: estimate, TF-IDF Value: 0.10184207648270188
Token ID: 15, Word: forcing, TF-IDF Value: 0.095

In [9]:
# Logging: INFO level so that gensim reports its training progress.
# Alternative format with timestamps, kept for reference:
#   logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.basicConfig(level=logging.INFO)

# LDA training configuration, gathered in one place.
lda_params = dict(
    corpus=tfidf_corpus,    # the TF-IDF weighted corpus
    id2word=dictionary,     # dictionary (vocabulary of the documents)
    # chunksize=2000,       # documents processed at each update (left unset)
    num_topics=10,
    passes=40,
    alpha='auto',
    eta='auto',
    random_state=42,        # for reproducibility
    per_word_topics=True,   # adds more information
)

# LDA training.
lda_model = LdaModel(**lda_params)

# Persist the trained model, then display the topics as (word, weight) pairs.
lda_model.save('lda_model')
lda_model.show_topics(formatted=False)

INFO:gensim.models.ldamodel:using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 10 topics, 40 passes over the supplied corpus of 70000 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #2000/70000
INFO:gensim.models.ldamodel:optimized alpha [0.089398384, 0.091010675, 0.09062004, 0.08825481, 0.08823691, 0.09017202, 0.08979661, 0.09082118, 0.09243118, 0.08907737]
INFO:gensim.models.ldamodel:merging changes from 2000 documents into a model of 70000 documents
INFO:gensim.models.ldamodel:topic #4 (0.088): 0.008*"biden" + 0.004*"president" + 0.004*"republican" + 0.004*"medicare" + 0.004*"trump" + 0.003*"american" + 0.003*"harris" + 0.003*"administration" + 0.003*"h

[(0,
  [('god', 0.011319469),
   ('faith', 0.011066476),
   ('transportation', 0.010756059),
   ('hey', 0.009411766),
   ('facility', 0.008248098),
   ('study', 0.006883903),
   ('greed', 0.006773527),
   ('rise', 0.0063835005),
   ('spring', 0.0063322256),
   ('gold', 0.006261106)]),
 (1,
  [('biden', 0.005606483),
   ('amp', 0.0054543475),
   ('american', 0.005339086),
   ('trump', 0.005243356),
   ('people', 0.004777965),
   ('year', 0.004697832),
   ('president', 0.004614741),
   ('republican', 0.0044469857),
   ('state', 0.0042390795),
   ('right', 0.004199476)]),
 (2,
  [('deserve', 0.01075291),
   ('water', 0.009379721),
   ('billionaire', 0.0062786783),
   ('growth', 0.005573682),
   ('grant', 0.0053440444),
   ('living', 0.005320508),
   ('incredible', 0.0052812747),
   ('emergency', 0.005243022),
   ('article', 0.0047259326),
   ('ceiling', 0.004672427)]),
 (3,
  [('via', 0.021805989),
   ('writes', 0.011525905),
   ('mike', 0.01102325),
   ('episode', 0.010117837),
   ('hous

In [3]:

# Load the saved LDA model
lda_model_up = LdaModel.load("lda_model")

# Formatted topics, one (topic_id, 'weight*"word" + ...') pair per topic.
# Kept under its original name in case other cells rely on it.
topics = lda_model_up.show_topics(num_topics=10, num_words=10)  # You can adjust the number of words as needed

# Print the topics and associated words.
# The words are read from the structured (word, weight) pairs returned with
# formatted=False rather than by splitting the formatted string on "+" and "*":
# that parsing raises IndexError or yields a wrong word as soon as a vocabulary
# token itself contains one of those characters.
print("\n----- TOPIC -----")
for _topic_id, word_weights in lda_model_up.show_topics(num_topics=10, num_words=10, formatted=False):
    # Re-add the double quotes so the printed line matches the formatted representation.
    topic_str = ', '.join(f'"{word}"' for word, _weight in word_weights)
    print(f"Topic: {topic_str}")


----- TOPIC -----
Topic: "god", "faith", "transportation", "hey", "facility", "study", "greed", "rise", "spring", "gold"
Topic: "biden", "amp", "american", "trump", "people", "year", "president", "republican", "state", "right"
Topic: "deserve", "water", "billionaire", "growth", "grant", "living", "incredible", "emergency", "article", "ceiling"
Topic: "via", "writes", "mike", "episode", "housing", "rest", "throughout", "guest", "tour", "application"
Topic: "jersey", "resign", "birthday", "mom", "legislature", "training", "bribery", "ai", "tuberville", "crossing"
Topic: "improve", "strike", "loan", "relief", "rail", "rural", "starting", "ny", "honored", "fun"
Topic: "sen", "solution", "extremist", "healthcare", "colleague", "mandate", "lawsuit", "priority", "caucus", "nj"
Topic: "act", "care", "community", "woman", "health", "child", "economy", "cost", "covid", "law"
Topic: "colorado", "full", "local", "save", "penny", "texas", "word", "education", "air", "stage"
Topic: "pm", "et", "def

In [5]:
# pyLDAvis walkthrough:
# https://siqi-zhu.medium.com/ldavis-a-deep-dive-into-the-popular-topic-modeling-tool-d0c61a03e969

# Prepare the visualisation data from the reloaded model, the TF-IDF corpus and
# the dictionary, then render it in the notebook.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model_up,
    corpus=tfidf_corpus,
    dictionary=dictionary,
    sort_topics=True,
)
pyLDAvis.display(vis)