# Topic Modeling

In [2]:
# Load libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime, json, os, re, zstandard
from nltk.corpus import stopwords
from zst_processor import read_lines_zst, write_line_zst


In [3]:
subreddits = ["Conservative", "progressive",
              "democrats", "Republican",
              "NeutralPolitics", "PoliticalDiscussion", "politics"]

Find main topics being discussed in the subreddits.

Apply topic modeling techniques such as Latent Dirichlet Allocation (LDA) or Non-negative Matrix Factorization (NMF) to extract the main topics discussed within each subreddit. This will help you identify the prevalent themes and subjects.


In [4]:
input_submissions = [f"data/{s}/{s}_submissions_clean.zst" for s in subreddits]

**Latent Dirichlet Allocation (LDA)**

In [5]:
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.utils import tokenize
from nltk.corpus import stopwords

In [15]:
def clean_submission(text: str, stop_words: list) -> str:
    """Clean text by removing non-alphabetical characters, stop words,
    and other words"""
    
    # Lowercase
    text = text.lower()

    # Remove stop words
    text = ' '.join([word for word in text.split() if word not in stop_words])

    # Remove 's 
    text = text.replace("'s ", ' ')

    # Remove non-alphabetical characters
    text = re.sub(r'[^a-z ]+', '', text)



    return text

In [16]:
# Function to find topics across multiple texts using LDA model
def create_corpus(input_paths: list) -> None:
    """Find topics across multiple texts using LDA model"""

    # Create empty list to store texts
    texts = []

    # Load stop words using nltk
    stop_words = stopwords.words('english')

    # Custom stop words
    custom_stop_words = ['biden', 'trump', 'republican', 'democrat', 'politics']
    stop_words.extend(custom_stop_words)

    # Loop through input paths
    for path in input_paths:

        # Read lines
        lines = read_lines_zst(path)

        # Loop through lines
        for line, _ in lines:

            # Convert the line to a json object
            obj = json.loads(line)

            # Get text and title
            text = obj['selftext']
            title = obj['title']

            # Skip if text is deleted, or removed
            if (text == 'deleted') or (text == 'removed'):
                continue

            # Combine title and text
            full_text = title + ' ' + text

            # Clean text
            full_text = clean_submission(full_text, stop_words)

            # Skip if text is empty
            if len(full_text) == 0:
                continue

            # Add to list
            texts.append(full_text)

    # Tokenize texts
    tokenized_texts = [list(tokenize(text, lowercase=True)) for text in texts]
 
    # Create dictionary
    dictionary = Dictionary(tokenized_texts)

    # Filter extremes (remove words that appear in more than 30% of documents and less than 10 documents)
    dictionary.filter_extremes(no_below=10, no_above=0.3)

    # Create corpus
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_texts]

    return corpus, dictionary, tokenized_texts


In [17]:
# Create corpus
corpus, dictionary, tokenized_texts = create_corpus(input_submissions)

In [None]:
# Find the optimal number of topics using coherence scores
def find_optimal_num_topics(corpus: list, dictionary: Dictionary, texts: list, limit: int, start: int=2, step: int=3) -> None:
    """Find the optimal number of topics"""

    # Create empty list to store models
    models = []

    # Create empty list to store coherence scores
    coherence_scores = []

    # Loop through number of topics
    for num_topics in range(start, limit, step):

        # Create model
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)

        # Save model
        models.append(model)

        # Create coherence model
        coherence_model = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')

        # Save coherence score
        coherence_scores.append(coherence_model.get_coherence())

    # Create dataframe of coherence scores
    df = pd.DataFrame({'num_topics': range(start, limit, step), 'coherence_score': coherence_scores})

    return models, coherence_scores, df
    

In [None]:
# Find optimal number of topics
lda_models, lda_coherence_scores, lda_df = find_optimal_num_topics(corpus, dictionary, tokenized_texts, limit=10, start=4, step=1)

In [None]:
# Save results
lda_df.to_csv('analysis/topic_modeling/lda_coherence_scores.csv', index=False)

# Save models
for i, model in enumerate(lda_models):
    model.save(f'analysis/topic_modeling/lda_model_{i+3}.model')
    

In [29]:
lda_df

Unnamed: 0,num_topics,coherence_score
0,3,0.566669
1,4,0.589104
2,5,0.558101
3,6,0.500125
4,7,0.526317
5,8,0.523894
6,9,0.4957


**Hierarchical Dirichlet Process (HDP)**

In [9]:
from gensim.models import HdpModel

In [42]:
# Run HDP model
hdp_model = HdpModel(corpus=corpus, id2word=dictionary)


In [43]:
for idx, topic in hdp_model.show_topics(formatted=False, num_words=10):
    print('Topic: {} \nWords: {}'.format(idx, ', '.join([w[0] for w in topic])))

Topic: 0 
Words: us, would, president, people, house, election, like, state, one, states
Topic: 1 
Words: thread, happened, state, news, week, like, local, people, political, think
Topic: 2 
Words: cartoons, political, cartoon, toplevel, thread, means, comment, must, share, saturday
Topic: 3 
Words: im, like, people, dont, one, white, us, know, get, would
Topic: 4 
Words: epstein, ms, page, defendant, girls, jeffrey, sex, epsteins, sexual, testified
Topic: 5 
Words: people, dont, us, im, week, like, know, time, right, liberals
Topic: 6 
Words: cnn, us, government, would, caught, gt, cnns, claims, hillary, people
Topic: 7 
Words: sen, former, cnn, pm, rep, night, mayor, gov, apps, new
Topic: 8 
Words: security, negligence, attorney, general, resignation, deputy, care, us, people, letter
Topic: 9 
Words: gun, year, deaths, people, die, per, number, estimates, population, million
Topic: 10 
Words: us, one, left, im, think, people, american, ive, economy, time
Topic: 11 
Words: party, peop

**Classify submissions**

In [68]:
# Select lda model with highest coherence score
optimal_lda_model = LdaModel.load('analysis/topic_modeling/lda_model_7.model')

In [75]:
for idx, topic in optimal_lda_model.show_topics(formatted=False, num_words=10):
    print('Topic: {} \nWords: {}'.format(idx, ', '.join([w[0] for w in topic])))

Topic: 0 
Words: house, court, supreme, white, news, committee, judge, state, georgia, senate
Topic: 1 
Words: jan, says, s, us, desantis, election, maralago, donald, fbi, joe
Topic: 2 
Words: us, ukraine, covid, arizona, war, says, health, years, china, care
Topic: 3 
Words: new, bill, s, texas, states, law, tax, governor, plan, state
Topic: 4 
Words: s, senate, vote, capitol, gop, says, race, doj, us, general
Topic: 5 
Words: s, abortion, election, gop, voting, voters, poll, climate, party, ic
Topic: 6 
Words: people, one, dont, opinion, time, like, student, right, get, president


In [71]:
# Write function to classify submissions using lda model
def classify_submissions(input_paths: list,
                         output_paths: list,
                         lda_model: LdaModel,
                         dictionary: Dictionary) -> None:
    """Classify submissions using lda model"""

    # Loop through input paths
    for in_path, out_path in zip(input_paths, output_paths):

        # Create the zst handler
        handle = zstandard.ZstdCompressor().stream_writer(open(out_path, 'wb'))

        # Save the data to zst file
        with open(out_path, mode="w", newline="") as file:

            for line, file_bytes_processed in read_lines_zst(in_path):
                obj = json.loads(line)

                # Get text and title
                text = obj['selftext']
                title = obj['title']

                # Skip if text is deleted, or removed
                if (text == 'deleted') or (text == 'removed'):
                    continue

                # Combine title and text
                full_text = title + ' ' + text

                # Skip if text is empty
                if len(full_text) == 0:
                    continue

                # Get topic distribution
                topic_dist = lda_model.get_document_topics(dictionary.doc2bow(full_text.split()), minimum_probability=0.0)

                # Add topic distribution to object (make it serializable)
                obj['topic_dist'] = str(topic_dist)
                
                # Write the data to the zst file
                new_line = json.dumps(obj)
                write_line_zst(handle, new_line)

In [56]:
output_submissions = [f"data/topic_modeling/{s}_submissions_classified.zst" for s in subreddits]

In [80]:
# Classify submissions
classify_submissions(input_submissions, output_submissions, optimal_lda_model, dictionary)