In [7]:
import gensim #pip install gensim
import pprint
from gensim import corpora, models
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging

In [2]:
# 1. Build the TF-IDF (Term Frequency - Inverse Document Frequency) representation: every word gets a weight
#  - TF: frequency of the term in the document: number of occurrences / total number of words in the document
#  - IDF: inverse frequency of the term across documents: log(total number of documents / number of documents containing the term)
#  - TF-IDF: product of the two (high when the word is frequent in this document and rare in the other documents)

source = './doc/cleaned.csv'
df = pd.read_csv(source)

# Tokenization.
# Rows whose text is missing are read by pandas as NaN (a float), which word_tokenize
# cannot process. Map them to an empty string so they become empty documents and the
# corpus stays aligned row-for-row with `df`.
tweets = df['lemmatized_text'].fillna('').astype(str).apply(word_tokenize)

# Create a dictionary (token <-> integer id mapping)
dictionary = corpora.Dictionary(tweets)

# Create a corpus: one bag-of-words vector per document
corpus = [dictionary.doc2bow(doc) for doc in tweets]

# Train the TF-IDF model
tfidf = models.TfidfModel(corpus)

# Transform the corpus into TF-IDF vectors
tfidf_corpus = tfidf[corpus]

In [3]:
# Preview the TF-IDF weights of the first three documents only.
for doc_index, weighted_doc in zip(range(3), tfidf_corpus):
    print(f"TF-IDF values for document {doc_index}:")
    for term_id, weight in weighted_doc:
        # dictionary[term_id] maps the integer id back to its word
        print(f"Token ID: {term_id}, Word: {dictionary[term_id]}, TF-IDF Value: {weight}")
    print("\n")

TF-IDF values for document 0:
Token ID: 0, Word: abuse, TF-IDF Value: 0.08351972565693129
Token ID: 1, Word: abused, TF-IDF Value: 0.10561463665339407
Token ID: 2, Word: action, TF-IDF Value: 0.05909593110836793
Token ID: 3, Word: agency, TF-IDF Value: 0.3698914715979529
Token ID: 4, Word: along, TF-IDF Value: 0.07650986140193991
Token ID: 5, Word: call, TF-IDF Value: 0.11628774127377883
Token ID: 6, Word: came, TF-IDF Value: 0.07522756557898667
Token ID: 7, Word: can, TF-IDF Value: 0.10543304589762349
Token ID: 8, Word: clearly, TF-IDF Value: 0.08667131008946559
Token ID: 9, Word: comment, TF-IDF Value: 0.08179051475953035
Token ID: 10, Word: commission, TF-IDF Value: 0.09571524670882778
Token ID: 11, Word: comprehensive, TF-IDF Value: 0.09356122424444609
Token ID: 12, Word: either, TF-IDF Value: 0.07937580029708532
Token ID: 13, Word: elon, TF-IDF Value: 0.0884529890869541
Token ID: 14, Word: estimate, TF-IDF Value: 0.10184207648270188
Token ID: 15, Word: forcing, TF-IDF Value: 0.095

In [8]:
# Set up logging so that gensim reports training progress.
# The timestamped layout matches the log lines captured below this cell.
# `force=True` (Python 3.8+) replaces any handlers already attached to the root logger;
# without it basicConfig is silently ignored once logging has been configured, so the
# output format would depend on which cells were executed earlier in the session.
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
    force=True,
)

# LDA training
# NOTE(review): the model is fitted on TF-IDF weights, not on the raw term counts in
# `corpus`. gensim accepts weighted vectors, but LDA is formulated over counts --
# consider comparing both inputs (e.g. by topic coherence) before relying on these topics.
# chunksize is left at its default; the training log reports one update every 2000 documents.
lda_model = LdaModel(
    corpus=tfidf_corpus,   # TF-IDF weighted corpus
    id2word=dictionary,    # dictionary (vocabulary of the documents)
    alpha='auto',
    eta='auto',
    num_topics=10,
    random_state=42,       # for reproducibility
    passes=20,
    per_word_topics=True,  # adds extra (per-word topic) information
)

# Persist the trained model, then display the topics as the cell output.
lda_model.save('lda_model')
lda_model.show_topics(formatted=False)

2023-10-07 15:56:30,244 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2023-10-07 15:56:30,251 : INFO : using serial LDA version on this node
2023-10-07 15:56:30,289 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 70000 documents, updating model once every 2000 documents, evaluating perplexity every 0 documents, iterating 100x with a convergence threshold of 0.001000
2023-10-07 15:56:30,383 : INFO : PROGRESS: pass 0, at document #2000/70000
2023-10-07 15:56:30,773 : INFO : optimized alpha [0.09201823, 0.08914799, 0.09011962, 0.09120183, 0.089642875, 0.08952327, 0.08950704, 0.09045545, 0.088908836, 0.08964733]
2023-10-07 15:56:30,786 : INFO : merging changes from 2000 documents into a model of 70000 documents
2023-10-07 15:56:30,807 : INFO : topic #8 (0.089): 0.005*"biden" + 0.004*"bidenomics" + 0.004*"abortion" + 0.004*"trump" + 0.004*"president" + 0.003*"republican" + 0.003*"job" + 0.

In [14]:

# Load the saved LDA model
lda_model_up = LdaModel.load("lda_model")

# Formatted topic strings ('weight*"word" + ...'). Not used below, but still assigned
# in case another cell reads `topics`.
topics = lda_model_up.show_topics(num_topics=10, num_words=10)  # You can adjust the number of words as needed

# Print the words of each topic.
# The words are read from the structured (topic_id, [(word, weight), ...]) form instead of
# splitting the formatted string on "+" and "*", which would break for any token that
# contains one of those characters. Each word is wrapped in double quotes so the printed
# lines are the same as those produced from the formatted string.
topic_terms = lda_model_up.show_topics(num_topics=10, num_words=10, formatted=False)
print("\n----- TOPIC -----")
for topic_id, word_weights in topic_terms:
    topic_str = ', '.join(f'"{word}"' for word, weight in word_weights)
    print(f"Topic: {topic_str}")

2023-10-07 16:13:40,740 : INFO : loading LdaModel object from lda_model
2023-10-07 16:13:40,745 : INFO : loading expElogbeta from lda_model.expElogbeta.npy with mmap=None
2023-10-07 16:13:40,747 : INFO : setting ignored attribute dispatcher to None
2023-10-07 16:13:40,748 : INFO : setting ignored attribute state to None
2023-10-07 16:13:40,749 : INFO : setting ignored attribute id2word to None
2023-10-07 16:13:40,750 : INFO : LdaModel lifecycle event {'fname': 'lda_model', 'datetime': '2023-10-07T16:13:40.750183', 'gensim': '4.3.2', 'python': '3.10.13 (main, Aug 25 2023, 13:20:03) [GCC 9.4.0]', 'platform': 'Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'loaded'}
2023-10-07 16:13:40,750 : INFO : loading LdaState object from lda_model.state
2023-10-07 16:13:40,753 : INFO : LdaState lifecycle event {'fname': 'lda_model.state', 'datetime': '2023-10-07T16:13:40.753551', 'gensim': '4.3.2', 'python': '3.10.13 (main, Aug 25 2023, 13:20:03) [GCC 9.4.0]', 'platform': '


----- TOPIC -----
Topic: "join", "passed", "believe", "discus", "pm", "deserve", "learn", "force", "deal", "hear"
Topic: "writes", "ron", "limit", "ramaswamy", "caucus", "airline", "deserves", "va", "nomination", "hurt"
Topic: "ready", "extremist", "bridge", "lol", "racist", "apply", "focused", "immigrant", "ensuring", "stuff"
Topic: "amp", "american", "trump", "biden", "u", "people", "president", "year", "republican", "state"
Topic: "colorado", "energy", "bipartisan", "union", "clean", "legislation", "critical", "sen", "save", "affordable"
Topic: "airport", "celebrating", "advocate", "birthday", "attention", "wishing", "wonderful", "fetterman", "legislature", "decline"
Topic: "default", "destroying", "mom", "sacrifice", "wake", "champion", "nazi", "toxic", "fellow", "emission"
Topic: "via", "mayor", "labor", "excited", "push", "billionaire", "faith", "congratulation", "god", "global"
Topic: "town", "hall", "afford", "ally", "training", "university", "kick", "attacking", "george", "ea