# Topic Modelling using Latent Dirichlet Allocation

### Data Preparation and Preprocessing

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from itertools import combinations
import numpy as np

# Download NLTK resources if not already present
nltk.download('stopwords')
nltk.download('wordnet')

# Load your data
df = pd.read_csv('../globalwarming_posts.csv')

# Data preprocessing
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return ' '.join(tokens)

df['clean_text'] = df['text'].astype(str).apply(preprocess)

# Vectorize
vectorizer = CountVectorizer(max_df=0.95, min_df=2)
X = vectorizer.fit_transform(df['clean_text'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jpads\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jpads\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Model Training, Topic Extraction, and Model Evaluation

In [4]:
# LDA model
lda = LatentDirichletAllocation(n_components=3, random_state=42)
lda.fit(X)

# Display topics
def print_topics(model, feature_names, n_top_words=10):
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx + 1}: ", " ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

print_topics(lda, vectorizer.get_feature_names_out())

# Perplexity
perplexity = lda.perplexity(X)
print(f"\nModel Perplexity: {perplexity:.2f}")

# Simple coherence score: average pairwise word co-occurrence for top words in each topic
def simple_coherence_score(X, feature_names, lda_model, top_n=10):
    scores = []
    Xc = (X.T * X)  # term-term co-occurrence matrix
    Xc.setdiag(0)
    for topic_idx, topic in enumerate(lda_model.components_):
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # Compute average pairwise co-occurrence
        pairs = list(combinations(top_indices, 2))
        if not pairs:
            continue
        pair_scores = [Xc[i, j] for i, j in pairs]
        if pair_scores:
            scores.append(np.mean(pair_scores))
    return np.mean(scores) if scores else 0

coherence = simple_coherence_score(X, vectorizer.get_feature_names_out(), lda)
print(f"Simple Coherence Score (higher is better): {coherence:.2f}")

# from gensim.models.coherencemodel import CoherenceModel
# from gensim.corpora.dictionary import Dictionary
# # Evaluate: Coherence Score (using Gensim)
# # Prepare data for Gensim
# texts = [t.split() for t in df['clean_text']]
# dictionary = Dictionary(texts)
# corpus = [dictionary.doc2bow(text) for text in texts]
#
# # Get topics in Gensim format
# topics = []
# for topic_weights in lda.components_:
#     top_words = [vectorizer.get_feature_names_out()[i] for i in topic_weights.argsort()[:-11:-1]]
#     topics.append(top_words)
#
# # Compute coherence
# coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
# coherence = coherence_model.get_coherence()
# print(f"Model Coherence Score: {coherence:.2f}")

Topic 1:  warming global px think one climate carbon co scientist people
Topic 2:  gif smilies year weather one like co see dont earth
Topic 3:  warming vp global http fuel ice change energy fossil bear

Model Perplexity: 817.59
Simple Coherence Score (higher is better): 20.09
