In [None]:
!pip install pyLDAvis

In [1]:
import pandas as pd
import numpy as np
import contractions
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
nltk.download('averaged_perceptron_tagger')
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim 
from gensim import corpora, models
import numpy as np
from scipy.special import psi, polygamma

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/cep4u/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)

In [3]:
df.tail()

Unnamed: 0,link,headline,category,short_description,authors,date
209522,https://www.huffingtonpost.com/entry/rim-ceo-t...,RIM CEO Thorsten Heins' 'Significant' Plans Fo...,TECH,Verizon Wireless and AT&T are already promotin...,"Reuters, Reuters",2012-01-28
209523,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28
209524,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28
209525,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28
209526,https://www.huffingtonpost.com/entry/dwight-ho...,Dwight Howard Rips Teammates After Magic Loss ...,SPORTS,The five-time all-star center tore into his te...,,2012-01-28


In [4]:
class DataCleaner:
    def __init__(self, df, column_name, stop_words, wnl ):
        self.df = df
        self.column_name = column_name
        self.stop_words = stop_words
        self.wnl = wnl
    
    def remove_null(self, df, column_name):
        df = df[df[column_name].notnull()]
        return df

    def remove_contractions(self, df, column_name):
        df[f'RemoveContractions_{column_name}'] = df[column_name].apply(lambda x: [contractions.fix(word) for word in x.split()])
        return df

    def rebuild_string(self, df, column_name):
        df[f'{column_name}_string_nocont'] = [' '.join(map(str, l)) for l in df[f'RemoveContractions_{column_name}']]
        return df

    def tokenize(self, df, column_name):
        df[f'tokenized_{column_name}'] = df[f'{column_name}_string_nocont'].apply(word_tokenize)
        return df
    
    def token_cleanup(self, df, column_name):
        edge_cases = ["``", "’", "''", "image", "title", "alt", "src", "width", "img", "http", "cbc", "jpg", "16x9_460", "buzzfeed", "com", "h1", "href", "href=", 'p', '/p', '/a' "rel", "www", "reuters", "timesofindia", "indiatimes", "margin", "nofollow", '8217', '8230']
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word.lower() for word in x])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if word not in string.punctuation])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if word not in self.stop_words])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if '/' not in word])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if word not in edge_cases])
        return df

    def make_bigrams(self, df, column_name):
        bigram = gensim.models.Phrases(df[f'tokenized_{column_name}'], min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)

        def get_bigrams(tokens_list):
            return bigram_mod[tokens_list]

        df[f'bigrams_{column_name}'] = df[f'tokenized_{column_name}'].apply(get_bigrams)
        
        return df
        
    def lemmatize_tokens(self, df, column_name):
        clean_up = ["'s", "--"]
        df[f'lemmatized_{column_name}'] = df[f'bigrams_{column_name}'].apply(lambda x: [self.wnl.lemmatize(word) for word in x])
        df[f'lemmatized_{column_name}'] = df[f'lemmatized_{column_name}'].apply(lambda x: [word for word in x if word not in clean_up])
        return df

    def clean(self):
        df = self.remove_null(self.df, self.column_name)
        df = self.remove_contractions(df, self.column_name)
        df = self.rebuild_string(df, self.column_name)
        df = self.tokenize(df, self.column_name)
        df = self.token_cleanup(df, self.column_name)
        df = self.make_bigrams(df, self.column_name)
        df = self.lemmatize_tokens(df, self.column_name)
        return df

In [5]:
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

desc_cleaner_news = DataCleaner(df, 'short_description', stop_words, wnl)
cleaned_df_desc = desc_cleaner_news.clean()

In [6]:
cleaned_df_desc['lemmatized_short_description'][13]

['one',
 'man',
 'claim',
 'scammed',
 'people',
 'platform',
 'caused',
 'several',
 'popular',
 'streamer',
 'consider',
 'twitch',
 'boycott']

In [7]:
class LDAModeler:
    def __init__(self, df, column_name):
        self.df = df
        self.column_name = column_name
    
    def create_corpus(self, df, column_name):
        
        id2word = corpora.Dictionary(df[f'lemmatized_{self.column_name}'])

        texts = df[f'lemmatized_{self.column_name}']

        corpus = [id2word.doc2bow(text) for text in texts]
        
        return corpus

In [8]:
# Create Dictionary
id2word = corpora.Dictionary(cleaned_df_desc['lemmatized_short_description'])

# Create Corpus
texts = cleaned_df_desc['lemmatized_short_description']

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View the first entry in the corpus
print(corpus[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1)]


In [9]:
bigram_lda_model_5 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [10]:
bigram_lda_model_5.save("bigram_lda_model_5")

In [11]:
print(bigram_lda_model_5.print_topics())
doc_lda = bigram_lda_model_5[corpus]

[(0, '0.029*"one" + 0.017*"u" + 0.016*"people" + 0.015*"week" + 0.012*"take" + 0.011*"thing" + 0.008*"much" + 0.008*"good" + 0.008*"go" + 0.008*"something"'), (1, '0.019*"day" + 0.018*"like" + 0.014*"love" + 0.011*"fashion" + 0.010*"child" + 0.009*"two" + 0.009*"woman" + 0.008*"home" + 0.007*"best" + 0.007*"photo"'), (2, '0.025*"year" + 0.022*"make" + 0.018*"know" + 0.015*"want" + 0.015*"need" + 0.013*"think" + 0.012*"come" + 0.012*"say" + 0.012*"last" + 0.010*"today"'), (3, '0.030*"look" + 0.023*"many" + 0.017*"even" + 0.016*"show" + 0.014*"every" + 0.012*"work" + 0.011*"check" + 0.011*"wedding" + 0.010*"new_york" + 0.009*"might"'), (4, '0.030*"time" + 0.022*"new" + 0.020*"life" + 0.020*"get" + 0.019*"way" + 0.018*"would" + 0.018*"may" + 0.013*"back" + 0.013*"see" + 0.013*"world"')]


In [12]:
# Compute Perplexity
print('\nPerplexity: ', bigram_lda_model_5.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
bigram_coherence_model_lda_5 = CoherenceModel(model=bigram_lda_model_5, texts= cleaned_df_desc['lemmatized_short_description'], dictionary=id2word, coherence='c_v')
bigram_coherence_lda_5 = bigram_coherence_model_lda_5.get_coherence()
print('\nCoherence Score: ', bigram_coherence_lda_5)


Perplexity:  -9.584815517078322

Coherence Score:  0.40842829804234393


In [13]:
pyLDAvis.enable_notebook()
bigram_vis_5 = pyLDAvis.gensim.prepare(bigram_lda_model_5, corpus, id2word)
bigram_vis_5

In [14]:
import numpy as np
import random

# Initialization
D = len(texts)  # Number of documents
V = len(id2word)  # Number of unique words
T = 10  # Number of topics

# Initialize topic assignment randomly
topic_assignment = [[random.randint(0, T-1) for _ in document] for document in texts]

# Initialize Count matrices
# N_{d, t}: count of words in document d assigned to topic t
ndt = np.zeros((D, T))  
# N_{t, v}: count of assignments to topic t of word v
ntv = np.zeros((T, V))
# N_{t}: total count of words assigned to topic t
nt = np.zeros(T)

# Iterate over corpus to fill Count matrices
for d in range(D):
    for i, v in enumerate(texts[d]):
        t = topic_assignment[d][i]
        ntv[t, id2word.token2id[v]] += 1
        ndt[d, t] += 1
        nt[t] += 1

# Iteratively update topic assignments
for d in range(D):
    for i, v in enumerate(texts[d]):
        t = topic_assignment[d][i]
        
        # Decrement count matrices for old assignment
        ntv[t, id2word.token2id[v]] -= 1
        ndt[d, t] -= 1
        nt[t] -= 1
        
        # Compute conditional distribution for new assignment
        p = ((ndt[d, :] + 0.001) / (ndt[d, :].sum() + 0.001 * T)) * ((ntv[:, id2word.token2id[v]] + 0.001) / (nt.sum() + 0.001 * V))
        assert np.all(p >= 0), "Negative probabilities found!"
        assert np.all(p <= 1), "Probabilities above 1 found!"
        assert not np.isnan(p).any(), "Probabilities are NaN!"
        t = np.random.multinomial(1, p / p.sum()).argmax()
        
        # Increment count matrices for new assignment
        ntv[t, id2word.token2id[v]] += 1
        ndt[d, t] += 1
        nt[t] += 1

        # Update topic assignment
        topic_assignment[d][i] = t

In [None]:
N = 10  # Number of words to display per topic

for t in range(T):
    print(f"Topic {t}:")
    
    # Get the top N word indices for this topic
    top_word_indices = ntv[t].argsort()[::-1][:N]
    
    # Print the words
    for i in top_word_indices:
        print(f"\t{id2word[i]}")


In [None]:


# Initialization
N = sum(len(doc) for doc in texts)  # Total number of words in all documents
D = len(texts)  # Number of documents
V = len(id2word)  # Number of unique words
T = 10  # Number of topics
alpha = 0.1  # Prior on the topic distributions per document

# Initialize phi and gamma
phi = np.ones((N, T)) / T
gamma = np.full((T,), alpha + N / T)

# Initialize term-topic matrix beta (you might use different method to initialize it)
beta = np.random.dirichlet(alpha=np.ones(len(id2word)), size=T)

# Map from word to its index
word2id = {word: i for doc in texts for word in doc}


# Expectation step
def e_step():
    global gamma  # Ensure we're using the global gamma variable
    for n, doc in enumerate(texts):
        for i, word in enumerate(doc):
            for t in range(T):
                phi[n][t] = beta[t][id2word.token2id[word]] * np.exp(psi(gamma[t]))
            # Normalize phi
            phi[n] /= phi[n].sum()
        # Update gamma after processing each document
        gamma = alpha + phi.sum(axis=0)

# Maximization step
def m_step():
    for t in range(T):
        for n, doc in enumerate(texts):
            for i, word in enumerate(doc):
                beta[t][word2id[word]] += phi[n][t]
        # Normalize beta
        beta[t] /= beta[t].sum()

# Iterate until convergence
max_iter = 100
for iteration in range(max_iter):
    e_step()
    m_step()


In [None]:
def display_topics(beta, id2word, n_top_words):
    for i, topic_dist in enumerate(beta):
        topic_words = np.array(id2word)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

# Call the function with your beta and id2word dictionary:
n_top_words = 10
display_topics(beta, list(id2word.values()), n_top_words)

In [None]:
# Perplexity
def compute_perplexity():
    log_likelihood = 0
    for n, doc in enumerate(texts):
        for i, word in enumerate(doc):
            log_likelihood += np.log(np.sum(phi[n, :] * beta[:, word2id[word]]))
    perplexity = np.exp(-1. * log_likelihood / N)
    return perplexity

print("Perplexity: ", compute_perplexity())

# Coherence (UMass measure)
def compute_coherence():
    topic_words = beta.argsort(axis=-1)[:, :10]  # Top 10 words per topic
    total_score = 0
    for t in range(T):
        for i in range(len(topic_words[t]) - 1):
            for j in range(i + 1, len(topic_words[t])):
                score = np.log((beta[t, topic_words[t, j]] + 1.) / beta[t, topic_words[t, i]])
                total_score += score
    coherence = total_score / T
    return coherence

print("Coherence: ", compute_coherence())


In [None]:
import numpy as np
import random

# Initialization
D = len(texts)  # Number of documents
V = len(id2word)  # Number of unique words
T = 10  # Number of topics
alpha = 0.1  # Prior on the topic distributions per document
beta = 0.01  # Prior on the word distributions per topic

# Initialize topic assignment randomly
topic_assignment = [[random.randint(0, T-1) for _ in document] for document in texts]

# Initialize Count matrices
# N_{d, t}: count of words in document d assigned to topic t
ndt = np.zeros((D, T))  
# N_{t, v}: count of assignments to topic t of word v
ntv = np.zeros((T, V))
# N_{t}: total count of words assigned to topic t
nt = np.zeros(T)

# Iterate over corpus to fill Count matrices
for d in range(D):
    for i, v in enumerate(texts[d]):
        t = topic_assignment[d][i]
        ntv[t, id2word.token2id[v]] += 1
        ndt[d, t] += 1
        nt[t] += 1

# Collapsed Gibbs Sampling
for iteration in range(100):  # Choose the number of iterations
    for d in range(D):
        for i, v in enumerate(texts[d]):
            t = topic_assignment[d][i]
            
            # Decrement count matrices for old assignment
            ntv[t, id2word.token2id[v]] -= 1
            ndt[d, t] -= 1
            nt[t] -= 1
            
            # Sample new topic assignment from conditional distribution
            p = ((ndt[d, :] + alpha) / (np.sum(ndt[d, :]) + T * alpha)) * ((ntv[:, id2word.token2id[v]] + beta) / (nt + V * beta))
            t = np.random.multinomial(1, p / p.sum()).argmax()
            
            # Increment count matrices for new assignment
            ntv[t, id2word.token2id[v]] += 1
            ndt[d, t] += 1
            nt[t] += 1
            
            # Update topic assignment
            topic_assignment[d][i] = t

# Get the word distributions for each topic
phi = (ntv + beta) / (nt[:, None] + V * beta)

In [None]:
bigram_lda_model_10 = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

bigram_lda_model_10.save("bigram_lda_model_10")

In [None]:
print(bigram_lda_model_10.print_topics())
doc_lda = bigram_lda_model_10[corpus]

# Compute Perplexity
print('\nPerplexity: ', bigram_lda_model_10.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_10 = CoherenceModel(model=bigram_lda_model_10, texts= cleaned_df_desc['lemmatized_short_description'], dictionary=id2word, coherence='c_v')
coherence_lda_10 = coherence_model_lda_10.get_coherence()
print('\nCoherence Score: ', coherence_lda_10)

In [None]:
pyLDAvis.enable_notebook()
bigram_vis_10 = pyLDAvis.gensim.prepare(bigram_lda_model_10, corpus, id2word)
bigram_vis_10

In [None]:
class DataCleaner:
    def __init__(self, df, column_name, stop_words, wnl ):
        self.df = df
        self.column_name = column_name
        self.stop_words = stop_words
        self.wnl = wnl
    
    def remove_null(self, df, column_name):
        df = df[df[column_name].notnull()]
        return df

    def remove_contractions(self, df, column_name):
        df[f'RemoveContractions_{column_name}'] = df[column_name].apply(lambda x: [contractions.fix(word) for word in x.split()])
        return df

    def rebuild_string(self, df, column_name):
        df[f'{column_name}_string_nocont'] = [' '.join(map(str, l)) for l in df[f'RemoveContractions_{column_name}']]
        return df

    def tokenize(self, df, column_name):
        df[f'tokenized_{column_name}'] = df[f'{column_name}_string_nocont'].apply(word_tokenize)
        return df
    
    def token_cleanup(self, df, column_name):
        edge_cases = ["``", "’", "''", "image", "title", "alt", "src", "width", "img", "http", "cbc", "jpg", "16x9_460", "buzzfeed", "com", "h1", "href", "href=", 'p', '/p', '/a' "rel", "www", "reuters", "timesofindia", "indiatimes", "margin", "nofollow", '8217', '8230']
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word.lower() for word in x])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if word not in string.punctuation])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if word not in self.stop_words])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if '/' not in word])
        df[f'tokenized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [word for word in x if word not in edge_cases])
        return df

    def make_bigrams(self, df, column_name):
        bigram = gensim.models.Phrases(df[f'tokenized_{column_name}'], min_count=5, threshold=100)
        bigram_mod = gensim.models.phrases.Phraser(bigram)

        def get_bigrams(tokens_list):
            return bigram_mod[tokens_list]

        df[f'bigrams_{column_name}'] = df[f'tokenized_{column_name}'].apply(get_bigrams)
        
        return df
        
    def lemmatize_tokens(self, df, column_name):
        clean_up = ["'s", "--"]
        df[f'lemmatized_{column_name}'] = df[f'tokenized_{column_name}'].apply(lambda x: [self.wnl.lemmatize(word) for word in x])
        df[f'lemmatized_{column_name}'] = df[f'lemmatized_{column_name}'].apply(lambda x: [word for word in x if word not in clean_up])
        return df


    def clean(self):
        df = self.remove_null(self.df, self.column_name)
        df = self.remove_contractions(df, self.column_name)
        df = self.rebuild_string(df, self.column_name)
        df = self.tokenize(df, self.column_name)
        df = self.token_cleanup(df, self.column_name)
       # df = self.make_bigrams(df, self.column_name)
        df = self.lemmatize_tokens(df, self.column_name)
        return df

In [None]:
df = pd.read_json('News_Category_Dataset_v3.json', lines=True)
wnl = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

desc_cleaner_news_uni = DataCleaner(df, 'short_description', stop_words, wnl)
cleaned_df_desc_uni = desc_cleaner_news_uni.clean()

In [None]:
# Create Dictionary
id2word_uni = corpora.Dictionary(cleaned_df_desc_uni['lemmatized_short_description'])

# Create Corpus
texts_uni = cleaned_df_desc_uni['lemmatized_short_description']

# Term Document Frequency
corpus_uni = [id2word_uni.doc2bow(text) for text in texts_uni]

# View the first entry in the corpus
print(corpus_uni[0])

In [None]:
lda_model_uni = gensim.models.ldamodel.LdaModel(corpus=corpus_uni,
                                           id2word=id2word_uni,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
lda_model_uni.save("unigram_lda_model")

In [None]:
print(lda_model_uni.print_topics())
doc_lda_uni = lda_model_uni[corpus_uni]

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model_uni.log_perplexity(corpus_uni))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda_uni = CoherenceModel(model=lda_model_uni, texts= cleaned_df_desc['lemmatized_short_description'], dictionary=id2word_uni, coherence='c_v')
coherence_lda_uni = coherence_model_lda_uni.get_coherence()
print('\nCoherence Score: ', coherence_lda_uni)

In [None]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model_uni, corpus_uni, id2word_uni)
vis

In [None]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
lda_model.save("bigram_lda_model_5")

In [None]:
print(lda_model.print_topics())
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts= cleaned_df_desc['lemmatized_short_description'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)