In [1]:
#TUTORIAL: https://pyro.ai/examples/prodlda.html
import math
import torch
import torch.nn as nn #pip install torch torchvision torchaudio
import torch.nn.functional as F
import pyro #pip3 install pyro-ppl 
from pyro.infer import SVI, TraceMeanField_ELBO,  MCMC, NUTS 
import pyro.distributions as dist
from tqdm import trange #pip install ipywidgets
import os
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary


In [2]:
#Class to execute prodLDAModel
class Encoder(nn.Module):
    """Inference network used in the guide.

    Maps a batch of bag-of-words rows to the location and scale of a diagonal
    Normal over the (unnormalised) log topic proportions.

    Args:
        vocab_size: width of the input layer.
        num_topics: width of both output heads.
        hidden: width of the two hidden layers.
        dropout: drop probability applied to the hidden representation.
    """

    def __init__(self, vocab_size, num_topics, hidden, dropout):
        super().__init__()
        # Attribute names double as state_dict keys for saved checkpoints,
        # so they are left exactly as they were.
        self.drop = nn.Dropout(dropout)  # guards against component collapse
        self.fc1 = nn.Linear(vocab_size, hidden)
        self.fc2 = nn.Linear(hidden, hidden)
        self.fcmu = nn.Linear(hidden, num_topics)
        self.fclv = nn.Linear(hidden, num_topics)
        # `affine=False` keeps the batch-norm layers free of learnable
        # parameters; see
        # https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html
        self.bnmu = nn.BatchNorm1d(num_topics, affine=False)  # guards against component collapse
        self.bnlv = nn.BatchNorm1d(num_topics, affine=False)  # guards against component collapse

    def forward(self, inputs):
        """Return ``(logtheta_loc, logtheta_scale)`` for a batch of documents."""
        first = F.softplus(self.fc1(inputs))
        second = F.softplus(self.fc2(first))
        features = self.drop(second)
        # Two heads on the shared features: mean and log-variance
        loc = self.bnmu(self.fcmu(features))
        log_var = self.bnlv(self.fclv(features))
        # exp(log_var / 2) is the standard deviation, positive by construction
        scale = (0.5 * log_var).exp()
        return loc, scale


class Decoder(nn.Module):
    """Generative network used in the model.

    Turns a batch of topic proportions into a distribution over the vocabulary.

    Args:
        vocab_size: width of the output layer.
        num_topics: width of the input layer.
        dropout: drop probability applied to the incoming topic proportions.
    """

    def __init__(self, vocab_size, num_topics, dropout):
        super().__init__()
        # Bias-free linear map from topics to vocabulary terms
        self.beta = nn.Linear(num_topics, vocab_size, bias=False)
        self.bn = nn.BatchNorm1d(vocab_size, affine=False)
        self.drop = nn.Dropout(dropout)

    def forward(self, inputs):
        """Return softmax(batch_norm(beta(dropout(inputs)))) along dim 1."""
        kept = self.drop(inputs)
        word_scores = self.bn(self.beta(kept))
        return F.softmax(word_scores, dim=1)


class ProdLDA(nn.Module):
    """ProdLDA topic model: a Pyro model/guide pair built on Decoder/Encoder.

    Args:
        vocab_size: number of columns of the document-term matrix.
        num_topics: number of topics.
        hidden: hidden width passed to the encoder.
        dropout: drop probability passed to both networks.
    """

    def __init__(self, vocab_size, num_topics, hidden, dropout):
        super().__init__()
        self.vocab_size = vocab_size
        self.num_topics = num_topics
        self.encoder = Encoder(vocab_size, num_topics, hidden, dropout)
        self.decoder = Decoder(vocab_size, num_topics, dropout)

    def model(self, docs):
        """Generative process for a batch of documents (rows of `docs`)."""
        pyro.module("decoder", self.decoder)
        n_docs = docs.shape[0]
        with pyro.plate("documents", n_docs):
            # The Dirichlet prior p(theta | alpha) is replaced by a
            # logistic-normal: a standard Normal on log-theta, then softmax.
            prior_shape = (n_docs, self.num_topics)
            prior = dist.Normal(
                docs.new_zeros(prior_shape),
                docs.new_ones(prior_shape),
            ).to_event(1)
            logtheta = pyro.sample("logtheta", prior)
            theta = F.softmax(logtheta, -1)

            # Word probabilities given the topic proportions:
            # w_n | beta, theta ~ Categorical(softmax(beta * theta))
            word_probs = self.decoder(theta)
            # PyTorch's Multinomial needs one homogeneous `total_count`.
            # Document lengths vary, so the largest one in the batch is used;
            # Multinomial.log_prob does not need `total_count`, so the
            # result is unaffected.
            largest_doc = int(docs.sum(-1).max())
            pyro.sample(
                'obs',
                dist.Multinomial(largest_doc, word_probs),
                obs=docs
            )

    def guide(self, docs):
        """Variational posterior over log-theta, parameterised by the encoder."""
        pyro.module("encoder", self.encoder)
        with pyro.plate("documents", docs.shape[0]):
            # Location and scale of the logistic-normal come from the encoder
            loc, scale = self.encoder(docs)
            posterior = dist.Normal(loc, scale).to_event(1)
            pyro.sample("logtheta", posterior)

    def beta(self):
        """Return the decoder's linear weights, detached, on CPU, transposed."""
        # The beta matrix is the weight of the decoder's fully connected layer
        weight = self.decoder.beta.weight
        return weight.cpu().detach().T

In [9]:
#List of all topics
def get_topics(model, vocab, num_topics, topn=None):
    """Return, for each topic, the vocabulary words ranked by decreasing weight.

    Args:
        model: object whose ``beta()`` returns a 2-D tensor indexed as
            ``[topic, vocabulary index]``.
        vocab: DataFrame with an ``'index'`` column (vocabulary index) and a
            ``'word'`` column; ``'index'`` values are expected to be unique.
        num_topics: number of topics (leading rows of ``beta()``) to extract.
        topn: optional cap on the number of words kept per topic. ``None``
            (the default) keeps the whole ranked vocabulary, as before.

    Returns:
        A list of ``num_topics`` lists of words.
    """
    # Fetch the weight matrix once rather than once per topic
    beta = model.beta()
    # Index -> word lookup built once, replacing one DataFrame merge per topic
    word_by_index = vocab.set_index('index')['word']

    topics = []
    for i in range(num_topics):
        _, ranked = torch.sort(beta[i], descending=True)
        if topn is not None:
            ranked = ranked[:topn]
        # reindex yields NaN for an index absent from `vocab`, like the
        # left merge it replaces
        topics.append(word_by_index.reindex(ranked.numpy()).tolist())

    return topics

def compute_coherence(model, n_topic, texts, vocab):
    """Score a model's topics with four gensim coherence measures.

    Args:
        model: model handed to ``get_topics`` to obtain the ranked topic words.
        n_topic: number of topics to extract from the model.
        texts: the documents (tweets) as a list of lists of tokens.
        vocab: vocabulary table handed to ``get_topics``.

    Returns:
        Tuple ``(u_mass, c_v, c_uci, c_npmi)`` of coherence values.
    """
    # One list of words per topic
    topic_list = get_topics(model, vocab, n_topic)
    # gensim dictionary built from the topic word lists
    dictionary = Dictionary(topic_list)

    def _score(measure):
        # Same arguments for every measure; only `coherence` varies
        scorer = CoherenceModel(
            model=None,
            texts=texts,
            topics=topic_list,
            dictionary=dictionary,
            coherence=measure,
        )
        return scorer.get_coherence()

    # Evaluated in this order: c_v, u_mass, c_uci, c_npmi
    scores = {measure: _score(measure) for measure in ('c_v', 'u_mass', 'c_uci', 'c_npmi')}

    # Returned with u_mass first, which is what callers unpack
    return scores['u_mass'], scores['c_v'], scores['c_uci'], scores['c_npmi']

In [4]:
def run_prodLDA(num_topics, batch_size, learning_rate, num_epochs, docs):
    """Train a ProdLDA model with SVI, save its weights and print its topics.

    Reads the notebook-level globals ``device`` (where the model is placed)
    and ``vocab`` (index -> word table used to print the topics).

    Args:
        num_topics: number of topics of the model.
        batch_size: number of documents per SVI step.
        learning_rate: learning rate of the Adam optimizer.
        num_epochs: number of passes over ``docs``.
        docs: 2-D document-term tensor; ``docs.shape[1]`` is the vocabulary size.

    Side effects:
        Clears the Pyro parameter store, writes the model's state_dict to
        ``models_prodLDA/prodLDA_{num_topics}_{num_epochs}.pth`` and prints the
        ten highest-weighted words of every topic.
    """
    # training
    pyro.clear_param_store()

    prodLDA = ProdLDA(
        vocab_size=docs.shape[1],
        num_topics=num_topics,
        hidden=100,
        dropout=0.2
    )
    prodLDA.to(device)

    optimizer = pyro.optim.Adam({"lr": learning_rate})
    svi = SVI(prodLDA.model, prodLDA.guide, optimizer, loss=TraceMeanField_ELBO())
    num_batches = int(math.ceil(docs.shape[0] / batch_size))

    bar = trange(num_epochs)
    for epoch in bar:
        # Sum over batches of the per-document loss of each batch
        running_loss = 0.0
        for i in range(num_batches):
            # NOTE(review): the last batch may be smaller than batch_size;
            # BatchNorm1d in training mode is expected to reject a batch that
            # holds a single document. Confirm the corpus size never leaves a
            # remainder of 1.
            batch_docs = docs[i * batch_size:(i + 1) * batch_size, :]
            loss = svi.step(batch_docs)
            running_loss += loss / batch_docs.size(0)

        bar.set_postfix(epoch_loss='{:.2e}'.format(running_loss))

    # Saving the model. The target directory is created first so that a
    # missing folder cannot discard a finished training run.
    path = f"models_prodLDA/prodLDA_{num_topics}_{num_epochs}.pth"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(prodLDA.state_dict(), path)

    # Show the topics (10 words each); the weight matrix is fetched once
    beta = prodLDA.beta()
    for i in range(num_topics):
        topic = beta[i]  # weights of topic i over the vocabulary
        _, indices = torch.sort(topic, descending=True)
        top_indices = pd.DataFrame(indices[:10].numpy(), columns=['index'])
        words = pd.merge(top_indices, vocab[['index', 'word']], how='left', on='index')['word'].values.tolist()
        print(f"topic{i+1} with {words}")

In [5]:
# Prepare the dataset used by the model
source = '../doc/cleaned.csv'
df = pd.read_csv(source)

# read_csv parses empty cells as NaN (a float), on which `.split()` raises.
# Such rows are kept as empty documents so that row positions still match `df`.
lemmatized_text = df['lemmatized_text'].fillna('').astype(str)

# Split the text data into words based on whitespace
tweets = lemmatized_text.apply(lambda text: text.split())

# Create a CountVectorizer
# max_df=0.5 removes terms that appear in more than half of the documents
#   ("corpus-specific stop words")
# min_df=20 removes terms that appear in fewer than 20 documents
vectorizer = CountVectorizer(max_df=0.5, min_df=20)

# Convert the tokenized text data into a dense document-term matrix
docs = torch.from_numpy(vectorizer.fit_transform([" ".join(tweet) for tweet in tweets]).toarray())

# Vocabulary table: one row per term, 'index' being the term's column in `docs`
vocab = pd.DataFrame({'word': vectorizer.get_feature_names_out()})
vocab['index'] = vocab.index


In [8]:
# Sanity check: report the vocabulary size and the shape of the document-term matrix

print('Dictionary size: %d' % len(vocab)) # number of rows of `vocab` (vocab_size)
print('Corpus size: {}'.format(docs.shape)) # (num_docs, vocab_size)

Dictionary size: 5590
Corpus size: torch.Size([70000, 5590])


In [6]:
# Global settings: fixed RNG seeds and the compute device
seed = 0
torch.manual_seed(seed)
pyro.set_rng_seed(seed)
# `device` is read as a global by run_prodLDA and by the cell that moves `docs`
device = torch.device("cpu")
#device = torch.device("cuda:0") # Uncomment this to run on GPU

In [7]:
# Cast the document-term matrix to float and move it to the selected device.
# Note: this rebinds `docs`, so later cells see the float tensor.
docs = docs.float().to(device)

In [9]:
# Experiment: 5 topics, 50 epochs, batch size 32, learning rate 1e-3.
# run_prodLDA saves the weights to models_prodLDA/prodLDA_5_50.pth and prints 10 words per topic.
run_prodLDA(num_topics=5, batch_size=32, learning_rate=1e-3, num_epochs=50, docs=docs)

100%|██████████| 50/50 [32:41<00:00, 39.23s/it, epoch_loss=1.98e+05]

topic1 with ['biden', 'president', 'state', 'job', 'joe', 'infrastructure', 'business', 'hunter', 'federal', 'america']
topic2 with ['independence', 'sent', 'july', 'playing', 'october', 'file', 'pres', 'earlier', 'ticket', 'involvement']
topic3 with ['colorado', 'vaccine', 'tool', 'fire', 'emergency', 'area', 'road', 'read', 'california', 'accountable']
topic4 with ['right', 'republican', 'woman', 'day', 'today', 'vote', 'house', 'freedom', 'abortion', 'democrat']
topic5 with ['cost', 'american', 'care', 'family', 'act', 'health', 'tax', 'year', 'prescription', 'need']





In [10]:
# Experiment: 6 topics, 50 epochs, batch size 32, learning rate 1e-3.
# run_prodLDA saves the weights to models_prodLDA/prodLDA_6_50.pth and prints 10 words per topic.
run_prodLDA(num_topics=6, batch_size=32, learning_rate=1e-3, num_epochs=50, docs=docs)

100%|██████████| 50/50 [33:44<00:00, 40.49s/it, epoch_loss=1.98e+05]

topic1 with ['buck', 'certain', 'collectively', 'manchin', 'windfall', 'treating', 'undermines', 'stunt', 'abusing', 'politicized']
topic2 with ['trump', 'house', 'right', 'election', 'republican', 'abortion', 'vote', 'woman', 'gun', 'court']
topic3 with ['biden', 'health', 'care', 'child', 'border', 'joe', 'million', 'veteran', 'access', 'hunter']
topic4 with ['year', 'cost', 'inflation', 'insulin', 'price', 'drug', 'war', 'medicare', 'energy', 'act']
topic5 with ['join', 'event', 'live', 'watch', 'honor', 'day', 'tomorrow', 'celebrate', 'tonight', 'loved']
topic6 with ['people', 'work', 'working', 'country', 'job', 'worker', 'american', 'like', 'america', 'economy']





In [11]:
# Experiment: 7 topics, 50 epochs, batch size 32, learning rate 1e-3.
# run_prodLDA saves the weights to models_prodLDA/prodLDA_7_50.pth and prints 10 words per topic.
run_prodLDA(num_topics=7, batch_size=32, learning_rate=1e-3, num_epochs=50, docs=docs)

100%|██████████| 50/50 [32:28<00:00, 38.98s/it, epoch_loss=1.98e+05]

topic1 with ['discus', 'debate', 'week', 'join', 'watch', 'news', 'county', 'presidential', 'desantis', 'campaign']
topic2 with ['lost', 'pact', 'shooting', 'story', 'officer', 'killed', 'police', 'one', 'service', 'loved']
topic3 with ['people', 'country', 'america', 'work', 'like', 'nation', 'american', 'fight', 'world', 'know']
topic4 with ['giant', 'normal', 'populist', 'authoritarian', 'withhold', 'divisive', 'direct', 'slogan', 'stuff', 'elite']
topic5 with ['republican', 'vote', 'house', 'democrat', 'law', 'right', 'abortion', 'election', 'court', 'party']
topic6 with ['border', 'child', 'health', 'care', 'crisis', 'help', 'need', 'funding', 'security', 'program']
topic7 with ['biden', 'president', 'job', 'trump', 'cost', 'joe', 'price', 'economy', 'american', 'big']





In [12]:
# Experiment: 8 topics (currently disabled).
# NOTE(review): the coherence cell further down loads models_prodLDA/prodLDA_8_50.pth,
# the file this call would write. With the call commented out, that file must already
# exist from an earlier session; re-enable the line to regenerate it.
#run_prodLDA(num_topics=8, batch_size=32, learning_rate=1e-3, num_epochs=50, docs=docs)

In [13]:
# Experiment: 9 topics, 50 epochs, batch size 32, learning rate 1e-3.
# run_prodLDA saves the weights to models_prodLDA/prodLDA_9_50.pth and prints 10 words per topic.
run_prodLDA(num_topics=9, batch_size=32, learning_rate=1e-3, num_epochs=50, docs=docs)

100%|██████████| 50/50 [32:25<00:00, 38.91s/it, epoch_loss=1.99e+05]

topic1 with ['fox', 'apply', 'check', 'read', 'haley', 'interview', 'news', 'hit', 'deadline', 'learn']
topic2 with ['join', 'water', 'town', 'live', 'discus', 'district', 'hall', 'team', 'looking', 'senator']
topic3 with ['right', 'trump', 'election', 'abortion', 'vote', 'woman', 'candidate', 'voter', 'republican', 'reproductive']
topic4 with ['day', 'today', 'life', 'black', 'honor', 'nation', 'celebrate', 'year', 'love', 'history']
topic5 with ['family', 'health', 'cost', 'child', 'pay', 'care', 'student', 'million', 'tax', 'drug']
topic6 with ['democrat', 'republican', 'house', 'energy', 'job', 'bill', 'act', 'party', 'gop', 'senate']
topic7 with ['biden', 'president', 'gun', 'border', 'joe', 'hunter', 'administration', 'national', 'community', 'safety']
topic8 with ['worker', 'america', 'business', 'work', 'support', 'like', 'working', 'world', 'china', 'economy']
topic9 with ['meantime', 'withhold', 'repealing', 'tackling', 'smarter', 'enact', 'accept', 'overwhelmingly', 'promisi




In [14]:
# Experiment: 10 topics, 50 epochs, batch size 32, learning rate 1e-3.
# run_prodLDA saves the weights to models_prodLDA/prodLDA_10_50.pth and prints 10 words per topic.
run_prodLDA(num_topics=10, batch_size=32, learning_rate=1e-3, num_epochs=50, docs=docs)

100%|██████████| 50/50 [32:33<00:00, 39.06s/it, epoch_loss=2.00e+05]

topic1 with ['cost', 'tax', 'price', 'drug', 'insulin', 'prescription', 'government', 'spending', 'medicare', 'money']
topic2 with ['supported', 'withhold', 'ohioan', 'meantime', 'illusion', 'outsider', 'percentage', 'implemented', 'lifting', 'elderly']
topic3 with ['republican', 'right', 'trump', 'democrat', 'house', 'election', 'abortion', 'vote', 'state', 'gop']
topic4 with ['gun', 'security', 'military', 'national', 'ukraine', 'violence', 'war', 'israel', 'weapon', 'stop']
topic5 with ['health', 'care', 'school', 'veteran', 'child', 'access', 'student', 'need', 'benefit', 'like']
topic6 with ['wishing', 'ohioan', 'meantime', 'wish', 'unified', 'outsider', 'remark', 'largely', 'punished', 'percentage']
topic7 with ['day', 'today', 'year', 'life', 'black', 'honor', 'ago', 'woman', 'nation', 'remember']
topic8 with ['border', 'energy', 'investment', 'clean', 'infrastructure', 'crisis', 'climate', 'funding', 'water', 'migrant']
topic9 with ['biden', 'president', 'people', 'joe', 'ameri




In [10]:
# Coherence of every saved model (5 to 10 topics, 50 epochs each).
# A single loop replaces one copy-pasted stanza per topic count. The rebuilt
# models and their scores stay available in the two dicts below, keyed by the
# number of topics (instead of separate model5 ... model10 variables).
trained_models = {}
coherence_scores = {}

for n_topics in range(5, 11):
    checkpoint = f"models_prodLDA/prodLDA_{n_topics}_50.pth"
    if not os.path.exists(checkpoint):
        # A checkpoint only exists if the matching training run was executed
        # (the 8-topic run is commented out in this notebook); report it and
        # carry on instead of aborting the whole cell.
        print(f"Model {n_topics} topics: checkpoint {checkpoint} not found, skipped\n")
        continue

    # Same architecture as used for training in run_prodLDA
    lda = ProdLDA(vocab_size=docs.shape[1], num_topics=n_topics, hidden=100, dropout=0.2)
    # map_location lets weights trained on GPU be loaded on the current device
    lda.load_state_dict(torch.load(checkpoint, map_location=device))

    umass, cv, cuci, cnpmi = compute_coherence(lda, n_topics, tweets, vocab)
    trained_models[n_topics] = lda
    coherence_scores[n_topics] = {'umass': umass, 'cv': cv, 'cuci': cuci, 'cnpmi': cnpmi}
    print(f"Model {n_topics} topics: umass {umass}, cv {cv}, cuci {cuci}, cnpmi {cnpmi}\n")

Model 5 topics: umass -6.442153045738394, cv 0.4885890115641576, cuci -2.889770153163904, cnpmi -0.07811261595651942

Model 6 topics: umass -6.413647822049828, cv 0.5275921933368924, cuci -2.1169069550891573, cnpmi -0.0434046053669024

Model 7 topics: umass -5.76732096299721, cv 0.545204573964516, cuci -1.59275476550026, cnpmi -0.020834793564301846

Model 8 topics: umass -6.094559950416134, cv 0.5716934331399013, cuci -2.054536039432442, cnpmi -0.03250586457527792

Model 9 topics: umass -5.961885053765611, cv 0.5103509865089653, cuci -1.809696119943097, cnpmi -0.028238792136889297

Model 10 topics: umass -6.961382892863424, cv 0.5624399578927939, cuci -2.3985323530216536, cnpmi -0.03918545963545757



: 