In [10]:
import gensim #pip install gensim
import pprint
from gensim import corpora, models
from gensim.models import LdaModel, LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import logging
import pyLDAvis.gensim #pip install pyLDAvis
import pyLDAvis.gensim_models
# Turn on pyLDAvis notebook integration for the visualization cell later in this notebook.
pyLDAvis.enable_notebook()


# NOTE: the triple-quoted block below is a bare string literal, not executable code.
# It records a snippet tagged "#FIX pyLDAVIS" — presumably a manual patch applied to the
# installed pyLDAvis package; TODO confirm where it belongs and whether it is still needed.
# Because it is the last expression of the cell, the notebook echoes it as the cell output.
''' 
 default_term_info = default_term_info.sort_values(
        by='saliency', ascending=False).head(R).drop(labels='saliency', axis=1) #FIX pyLDAVIS
'''

" \n default_term_info = default_term_info.sort_values(\n        by='saliency', ascending=False).head(R).drop(labels='saliency', axis=1) #FIX pyLDAVIS\n"

In [2]:
# 1. Build the TF-IDF (Term Frequency - Inverse Document Frequency) representation,
#    which assigns a weight to every word:
#  - TF: frequency of the term in the document: occurrences / total number of words in the document
#  - IDF: inverse frequency of the term across documents: log(total documents / documents containing the term)
#  - TF-IDF: product of the two (high when the word is frequent in this document and rare in the others)

source = './doc/cleaned.csv'
df = pd.read_csv(source)

# Tokenization.
# pandas reads an empty CSV field as NaN (a float), which word_tokenize cannot handle.
# Replacing missing text with '' yields an empty token list for that row and keeps
# `tweets` aligned with the rows of `df`.
tweets = df['lemmatized_text'].fillna('').apply(word_tokenize)

# Create a dictionary (token <-> integer id mapping)
dictionary = corpora.Dictionary(tweets)

# Create a corpus: one bag-of-words vector per document
corpus = [dictionary.doc2bow(doc) for doc in tweets]

# Train the TF-IDF model
tfidf = models.TfidfModel(corpus)

# Transform the corpus into TF-IDF vectors
tfidf_corpus = tfidf[corpus]

In [3]:
# Preview the TF-IDF weights of the first three documents
for doc_index, weighted_terms in zip(range(3), tfidf_corpus):
    print(f"TF-IDF values for document {doc_index}:")
    for term_id, weight in weighted_terms:
        term = dictionary[term_id]  # look up the token string for this id
        print(f"Token ID: {term_id}, Word: {term}, TF-IDF Value: {weight}")
    print("\n")

TF-IDF values for document 0:
Token ID: 0, Word: abuse, TF-IDF Value: 0.08351972565693129
Token ID: 1, Word: abused, TF-IDF Value: 0.10561463665339407
Token ID: 2, Word: action, TF-IDF Value: 0.05909593110836793
Token ID: 3, Word: agency, TF-IDF Value: 0.3698914715979529
Token ID: 4, Word: along, TF-IDF Value: 0.07650986140193991
Token ID: 5, Word: call, TF-IDF Value: 0.11628774127377883
Token ID: 6, Word: came, TF-IDF Value: 0.07522756557898667
Token ID: 7, Word: can, TF-IDF Value: 0.10543304589762349
Token ID: 8, Word: clearly, TF-IDF Value: 0.08667131008946559
Token ID: 9, Word: comment, TF-IDF Value: 0.08179051475953035
Token ID: 10, Word: commission, TF-IDF Value: 0.09571524670882778
Token ID: 11, Word: comprehensive, TF-IDF Value: 0.09356122424444609
Token ID: 12, Word: either, TF-IDF Value: 0.07937580029708532
Token ID: 13, Word: elon, TF-IDF Value: 0.0884529890869541
Token ID: 14, Word: estimate, TF-IDF Value: 0.10184207648270188
Token ID: 15, Word: forcing, TF-IDF Value: 0.095

In [13]:
# Set up logging so gensim reports training progress
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.basicConfig(level=logging.INFO)

# LDA training with the multi-process implementation
lda_model = LdaMulticore(
    corpus=tfidf_corpus,    # TF-IDF weighted corpus
    id2word=dictionary,     # dictionary (vocabulary of the documents)
    num_topics=10,
    random_state=42,        # for reproducibility
    passes=20,
    per_word_topics=True,
    workers=3,
)

# Alternative: single-process LdaModel, kept for reference.
# alpha='auto' is only available here: LdaMulticore raises
# "NotImplementedError: auto-tuning alpha not implemented in LdaMulticore; use plain LdaModel."
#
# lda_model = LdaModel(
#     corpus=tfidf_corpus,    # TF-IDF weighted corpus
#     id2word=dictionary,     # dictionary (vocabulary of the documents)
#     #chunksize=2000,        # documents processed at each iteration
#     alpha='auto',
#     eta='auto',
#     num_topics=10,
#     random_state=42,        # for reproducibility
#     passes=40,
#     per_word_topics=True)   # adds more information

# Persist the trained model, then show the topics as the cell output
lda_model.save('lda_model')
lda_model.show_topics(formatted=False)

NotImplementedError: auto-tuning alpha not implemented in LdaMulticore; use plain LdaModel.

In [14]:

# Load the saved LDA model
lda_model_up = LdaModel.load("lda_model")

# Print the topics and associated words
topics = lda_model_up.show_topics(num_topics=10, num_words=10)  # You can adjust the number of words as needed
print("\n----- TOPIC -----")
for topic_id, _formatted in topics:
    # Read the (word, probability) pairs from the model instead of splitting the
    # formatted 'prob*"word" + prob*"word"' string on '+' and '*': a vocabulary
    # token that itself contains '+' or '*' would be mis-split or raise IndexError.
    top_terms = lda_model_up.show_topic(topic_id, topn=10)
    # Keep each word wrapped in double quotes, matching the formatted representation.
    topic_str = ', '.join(f'"{word}"' for word, _prob in top_terms)
    print(f"Topic: {topic_str}")

INFO:gensim.utils:loading LdaModel object from lda_model
INFO:gensim.utils:loading expElogbeta from lda_model.expElogbeta.npy with mmap=None
INFO:gensim.utils:setting ignored attribute id2word to None
INFO:gensim.utils:setting ignored attribute dispatcher to None
INFO:gensim.utils:setting ignored attribute state to None
INFO:gensim.utils:LdaMulticore lifecycle event {'fname': 'lda_model', 'datetime': '2023-10-07T17:54:20.419243', 'gensim': '4.3.2', 'python': '3.10.13 (main, Aug 25 2023, 13:20:03) [GCC 9.4.0]', 'platform': 'Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'loaded'}
INFO:gensim.utils:loading LdaState object from lda_model.state
INFO:gensim.utils:LdaState lifecycle event {'fname': 'lda_model.state', 'datetime': '2023-10-07T17:54:20.424723', 'gensim': '4.3.2', 'python': '3.10.13 (main, Aug 25 2023, 13:20:03) [GCC 9.4.0]', 'platform': 'Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.31', 'event': 'loaded'}



----- TOPIC -----
Topic: "american", "job", "border", "cost", "tax", "inflation", "biden", "energy", "year", "million"
Topic: "trump", "biden", "house", "impeachment", "joe", "president", "former", "say", "mccarthy", "election"
Topic: "musk", "cia", "channel", "compete", "elon", "yorkers", "mar", "nancy", "politico", "cruel"
Topic: "newsom", "de", "gavin", "rubio", "lean", "reassignment", "damn", "judicial", "marco", "modernize"
Topic: "amp", "community", "care", "gun", "help", "veteran", "health", "today", "work", "family"
Topic: "airline", "flight", "airport", "faa", "passenger", "hey", "rsvp", "hillary", "floridian", "yeah"
Topic: "et", "pm", "tune", "join", "tonight", "live", "elder", "episode", "watch", "hall"
Topic: "abortion", "right", "court", "woman", "supreme", "reproductive", "decision", "ban", "voting", "doctor"
Topic: "trump", "vote", "election", "people", "like", "get", "republican", "democrat", "would", "biden"
Topic: "thank", "happy", "day", "great", "today", "woman", 

In [15]:
# Score the loaded model with the 'u_mass' coherence measure,
# using the tokenized tweets and the shared dictionary.
coherence_model_lda = CoherenceModel(
    coherence='u_mass',
    dictionary=dictionary,
    texts=tweets,
    model=lda_model_up,
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 1000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 2000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 3000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 4000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 5000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 6000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 7000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 8000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 9000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 10000 documents
INFO:gensim.topic_c


Coherence Score:  -7.981395765992124


In [16]:
#pyLDAvis (https://siqi-zhu.medium.com/ldavis-a-deep-dive-into-the-popular-topic-modeling-tool-d0c61a03e969)

# Prepare the visualization data from the loaded model, the TF-IDF corpus and the dictionary,
# then render it as the cell output.
vis = pyLDAvis.gensim_models.prepare(
    lda_model_up,
    tfidf_corpus,
    dictionary,
    sort_topics=True,
)
pyLDAvis.display(vis)