In [14]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel

# Download the NLTK resources this notebook relies on:
#   - 'stopwords': the English stop-word list read via nltk.corpus.stopwords
#   - 'wordnet':   data for WordNetLemmatizer
#   - 'punkt_tab': tokenizer data (presumably what word_tokenize needs —
#                  confirm against the installed NLTK version)
# The last call is the cell's trailing expression, so its return value is
# what the notebook shows as the cell output.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\markl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\markl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\markl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
# Build the stop-word set, cleaning patterns and lemmatizer once at module
# level so preprocess_text() does not rebuild them for every document.
stop_words = set(stopwords.words('english'))
custom_stopwords = {
    # Domain-specific noise
    'http', 'https', 'www', 'com', 'org', 'smilies', 'gif', 'hi', 'hey',
    # Forum-specific terms
    'vp', 'ps', 'like', 'think', 'know', 'would', 'get', 'go', 'say',
    'need', 'want', 'real', 'said', 'yes', 'man', 'make', 'term', 'agw',
    'one', 'really', 'even', 'come', 'made', 'link', 'based', 'nothing',
    'right', 'mind', 'problem', 'since', 'week', 'sure', 'good', 'kind',
    'last', 'true', 'keep', 'also', 'help', 'give', 'mean', 'case', 'best',
    'free', 'must', 'else', 'soon', 'read', 'part', 'fact', 'may', 'could',
    'might',
    # Vague terms
    'thing', 'something', 'anything', 'someone', 'way', 'much', 'many',
    # Contraction fragments left behind once apostrophes are stripped.
    # The trailing comma after 'nt' matters: without it Python concatenates
    # the adjacent literals 'nt' 'well' into the single entry 'ntwell'.
    'don', 't', 'll', 've', 're', 'm', 's', 'd', 'nt',
    # Weak signal words observed in earlier topic output
    'well', 'claim', 'wrong', 'work',
    # Question words
    'question',
    # Temporal words that dilute topics
    'time', 'year',
}
stop_words.update(custom_stopwords)

# Special handling for forum text: (regex, replacement) pairs.
# NOTE(review): nothing in the visible cells applies these patterns —
# confirm whether they should be wired into preprocess_text() or removed.
forum_patterns = [
    (r'smilies?\w+\s?\w*\s?\d+', ''),  # Remove smiley codes
    (r'//.*', ''),                      # Remove code-like comments
    (r'\.{2,}', ' '),                   # Replace multiple dots
    (r'[^\w\s]', ' ')                   # Keep only words and spaces
]

lemmatizer = WordNetLemmatizer()

In [16]:
# Load the CSV file and draw a reproducible random sample of posts.
# NOTE(review): absolute, machine-specific path — consider a configurable
# data directory so the notebook runs on other machines.
file_path = "C:\\Users\\markl\\Downloads\\archive\\climate_posts_clean.csv"
sample_size = 250  # maximum number of posts fed to the topic model
try:
    df = pd.read_csv(file_path)
    # Sample only when the file has more rows than requested; the fixed
    # random_state keeps the sample identical across runs.
    if len(df) > sample_size:
        df = df.sample(n=sample_size, random_state=42)
    # Drop missing posts before casting: astype(str) would otherwise turn
    # NaN into the literal text "nan".
    documents = df['text'].dropna().astype(str).tolist()
except Exception as e:
    print(f"Error loading file: {e}")
    # Re-raise instead of calling exit(): exit() shuts the notebook kernel
    # down, whereas an exception stops execution and shows the traceback
    # while keeping the kernel state available for debugging.
    raise

In [None]:
def preprocess_text(text, bigram_model=None):
    """
    Clean one document and turn it into a list of lemmatized tokens for LDA.

    Steps, in order:
    1. Lowercase, flatten line breaks and expand common contractions. This
       happens before punctuation is stripped; afterwards the apostrophes
       the contraction table matches on no longer exist.
    2. Strip URLs, e-mail-like tokens, punctuation and digits.
    3. Drop modal verbs, words of 1-3 characters, a few filler words, and
       every 5-20 character word that is not in the topic whitelist.
    4. Tokenize, filter (stop words, length, alphabetic), lemmatize, then
       filter the lemmas again so a lemma that is itself a stop word
       (e.g. "says" -> "say") does not slip through.
    5. Optionally apply a bigram model.

    Relies on the module-level ``stop_words`` set and ``lemmatizer``.

    Args:
        text: Raw document; converted with ``str()``.
        bigram_model: Optional object that is indexed with the token list
            and returns the transformed tokens. ``None`` skips this step.

    Returns:
        list[str]: The processed tokens, or an empty list if tokenization,
        lemmatization or the bigram step raises (the error is printed).
    """
    # Convert to string and lowercase
    text = str(text).lower()
    text = text.replace('\n', ' ').replace('\r', ' ')  # Remove line breaks

    # Expand contractions while apostrophes are still present. Typographic
    # apostrophes are normalized first so they match the table too.
    text = text.replace('\u2019', "'")
    contractions = {
        "don't": "do not", "can't": "cannot", "won't": "will not",
        "it's": "it is", "i'm": "i am", "you're": "you are",
        "they're": "they are", "that's": "that is", "there's": "there is",
        "he's": "he is", "she's": "she is", "what's": "what is"
    }
    for cont, expanded in contractions.items():
        text = text.replace(cont, expanded)

    # Advanced cleaning
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # URLs
    text = re.sub(r'\S*@\S*\s?', '', text)             # Emails
    text = re.sub(r'[^\w\s]|\d', ' ', text)            # Remove punctuation/numbers
    text = re.sub(r'\s+', ' ', text).strip()           # Trim whitespace
    text = re.sub(r'\b(may|would|could|might)\b', '', text)  # Remove modals
    text = re.sub(r'\b\w{1,3}\b', '', text)  # Remove very short words
    text = re.sub(r'\b(?:look|interesting|using|well|find)\b', '', text)

    # NOTE(review): this discards EVERY 5-20 character word that is not in
    # the whitelist, so the vocabulary reaching LDA is essentially 4-letter
    # words plus these terms. Kept as-is because the reported topics depend
    # on it — confirm this restriction is intended.
    keep_long_words = {
        'climate', 'change', 'global', 'warming',
        'science', 'data', 'research'
    }
    text = re.sub(
        r'\b\w{5,20}\b',
        lambda x: x.group() if x.group() in keep_long_words else '',
        text,
    )

    # Tokenization and lemmatization
    try:
        tokens = []
        for token in word_tokenize(text):
            if (token in stop_words or
                    not 3 <= len(token) <= 25 or
                    not token.isalpha()):
                continue
            lemma = lemmatizer.lemmatize(token)
            # Re-check after lemmatizing: the lemma may be a stop word or
            # fall below the minimum length even though the token was not.
            if lemma in stop_words or len(lemma) < 3:
                continue
            tokens.append(lemma)

        # Apply bigram model if provided. Compare against None explicitly so
        # a supplied model is never skipped because it evaluates as falsy.
        if bigram_model is not None:
            tokens = list(bigram_model[tokens])

        return tokens
    except Exception as e:
        # Best-effort: report the failure and treat the document as empty.
        print(f"Error processing text: {e}")
        return []

# Tokenize every document, discarding those that end up with no tokens.
processed_docs = [
    tokens
    for tokens in (preprocess_text(raw) for raw in documents)
    if len(tokens) > 0
]

# Collapse related single terms onto one shared label per concept.
term_merges = {
    **dict.fromkeys(('climate', 'science'), 'climate_science'),
    **dict.fromkeys(('global', 'warming'), 'global_warming'),
}

# Rewrite each token through the merge table; unmapped tokens pass through.
processed_docs = [
    [term_merges.get(token, token) for token in tokens]
    for tokens in processed_docs
]

# Build the vocabulary, prune it with filter_extremes(no_below=5,
# no_above=0.5), then encode each document as a bag of words.
dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_above=0.5, no_below=5)
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

# Train the 3-topic LDA model (seeded, 50 passes, asymmetric alpha).
lda_model = LdaModel(
    bow_corpus,
    num_topics=3,
    id2word=dictionary,
    passes=50,
    alpha='asymmetric',
    random_state=42,
)

# Score the trained model with c_v topic coherence.
coherence_model = CoherenceModel(
    coherence='c_v',
    dictionary=dictionary,
    texts=processed_docs,
    model=lda_model,
)
coherence_score = coherence_model.get_coherence()

# log_perplexity() yields a log-scale bound rather than the perplexity
# itself; raising 2 to its negation converts it.
perplexity = lda_model.log_perplexity(bow_corpus)
actual_perplexity = 2 ** -perplexity

# Report the topics followed by both metrics.
print("\nLDA Topics:")
for idx, topic in lda_model.print_topics(-1, num_words=10):
    print(f"Topic {idx}: {topic}")
print(f"\nCoherence Score: {coherence_score:.4f}")
print(f"Perplexity: {actual_perplexity:.4f}")

# Reading guide for the two metrics.
print(
    "\nModel Evaluation Notes:",
    "- Higher coherence scores (closer to 1) indicate better topic quality",
    "- Lower perplexity scores indicate better model performance",
    "- Ideal model has high coherence and low perplexity",
    sep="\n",
)

import matplotlib.pyplot as plt

# Test multiple topic numbers: train one LDA model per candidate topic
# count and record its c_v coherence.
# The range is defined once so the training loop and the plot's x-axis
# cannot drift apart.
topic_counts = range(2, 8)
coherence_scores = []
for num_topics in topic_counts:
    lda = LdaModel(
        corpus=bow_corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=30,
        # Seeded like the main model; without a seed every run produces a
        # different coherence curve.
        random_state=42
    )
    # NOTE(review): the sweep uses passes=30 and the default alpha, while the
    # main model uses passes=50 and alpha='asymmetric' — confirm whether the
    # sweep should mirror those settings.
    cm = CoherenceModel(
        model=lda,
        texts=processed_docs,
        dictionary=dictionary,
        coherence='c_v'
    )
    coherence_scores.append(cm.get_coherence())

# Plot results
plt.plot(topic_counts, coherence_scores, marker='o')
plt.xticks(list(topic_counts))  # topic counts are integers; avoid fractional ticks
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score")
plt.title("LDA coherence (c_v) by number of topics")
plt.show()


LDA Topics:
Topic 0: 0.448*"data" + 0.210*"change" + 0.164*"climate_science" + 0.142*"fuel" + 0.025*"global_warming" + 0.006*"nasa" + 0.005*"research"
Topic 1: 0.713*"climate_science" + 0.149*"change" + 0.088*"nasa" + 0.042*"research" + 0.003*"global_warming" + 0.002*"fuel" + 0.002*"data"
Topic 2: 0.828*"global_warming" + 0.076*"research" + 0.072*"change" + 0.011*"climate_science" + 0.004*"fuel" + 0.004*"nasa" + 0.004*"data"

Coherence Score: 0.5077
Perplexity: 3.2539

Model Evaluation Notes:
- Higher coherence scores (closer to 1) indicate better topic quality
- Lower perplexity scores indicate better model performance
- Ideal model has high coherence and low perplexity
