---
title: "Unsupervised Learning"
format:
    html: 
        code-fold: false
---

<!-- After digesting the instructions, you can delete this cell. These are assignment instructions and do not need to be included in your final submission. -->

{{< include unsupervised.qmd >}} 

# Code

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.manifold import TSNE
import seaborn as sns
import spacy
import gensim
from gensim import corpora, models
import pyLDAvis.gensim_models

In [None]:
# Load data
file_path_text_clean = "data/processed-data/text_clean.csv"
df = pd.read_csv(file_path_text_clean)

# Additional domain-specific stopwords: lemmas that are dropped during
# preprocessing in addition to the part-of-speech filter.
domain_stopwords = {
    'trump', 'biden', 'republican', 'democrat', 'like', 'im', 'dont', 'people',
    'think', 'know', 'would', 'said', 'one', 'year', 'state', 'time'
}

# Load SpaCy model for better text processing.
# Only part-of-speech tags and lemmas are read from the parsed documents, so
# the dependency parser and the named-entity recogniser are switched off to
# avoid running them on every document.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def improved_text_preprocessing(text):
    """
    Reduce a raw document to a space-separated string of content-word lemmas.

    Parameters
    ----------
    text : object
        The document. Missing values (as detected by ``pd.isna``) produce an
        empty string; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        Lower-cased lemmas of the tokens tagged NOUN, ADJ, VERB or ADV whose
        lemma is longer than two characters and is not listed in
        ``domain_stopwords``, joined by single spaces.
    """
    # Normalise the input to a string (or bail out early on missing values)
    if isinstance(text, str):
        raw_text = text
    elif pd.isna(text):
        return ""
    else:
        raw_text = str(text)

    content_pos_tags = ['NOUN', 'ADJ', 'VERB', 'ADV']
    kept_lemmas = []
    for token in nlp(raw_text):
        # Keep only nouns, adjectives, verbs, and adverbs
        if token.pos_ not in content_pos_tags:
            continue
        lemma = token.lemma_.lower()
        if lemma in domain_stopwords:
            continue
        # Very short lemmas are discarded
        if len(token.lemma_) <= 2:
            continue
        kept_lemmas.append(lemma)

    return ' '.join(kept_lemmas)

# Convert text column to string type and handle NaN values
df['text'] = df['text'].fillna("").astype(str)

# Replace every document by its filtered, lemmatised form
df['text'] = df['text'].apply(improved_text_preprocessing)

# Keep only the submission rows for topic modelling.
# na=False treats rows with a missing `type` as non-submissions; without it the
# mask would contain NaN and pandas would refuse to index with it.
submissions = df[df['type'].str.contains('submission', na=False)].copy()

In [None]:
def perform_dimensionality_reduction(tfidf_matrix):
    """
    Perform and compare PCA and t-SNE dimensionality reduction.

    The function plots the cumulative explained variance of a full PCA, plots
    t-SNE embeddings for several perplexity values, and returns the reduced
    representations.

    Parameters
    ----------
    tfidf_matrix : sparse matrix
        Document-term matrix; it is densified with ``.toarray()``.

    Returns
    -------
    X_pca_scaled : ndarray
        PCA scores using the smallest number of components whose cumulative
        explained variance ratio reaches 0.8, min-max scaled to [0, 1].
    X_tsne_final : ndarray
        Two-dimensional t-SNE embedding (perplexity 30 when the sample count
        allows it) computed on the PCA-reduced data.
    pca_optimal : PCA
        The fitted PCA model that produced ``X_pca_scaled`` before scaling.
    """
    # Convert sparse matrix to dense
    X_dense = tfidf_matrix.toarray()
    n_samples, n_features = X_dense.shape

    # 1. PCA Analysis
    print("\nPerforming PCA analysis...")
    pca = PCA()
    # Only the explained variance is needed from the full model
    pca.fit(X_dense)

    # Calculate explained variance ratio
    cumsum_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

    # Plot explained variance ratio
    fig_var, ax_var = plt.subplots(figsize=(10, 5))
    ax_var.plot(range(1, len(cumsum_variance_ratio) + 1), cumsum_variance_ratio)
    ax_var.set_xlabel('Number of Components')
    ax_var.set_ylabel('Cumulative Explained Variance Ratio')
    ax_var.set_title('PCA: Explained Variance Ratio vs Number of Components')
    ax_var.axhline(y=0.8, color='r', linestyle='--', label='80% Threshold')
    ax_var.legend()
    plt.show()

    # Find optimal number of components for 80% variance
    # (argmax on a boolean array returns the first True position)
    n_components_80 = int(np.argmax(cumsum_variance_ratio >= 0.8)) + 1
    print(f"Number of components needed for 80% variance: {n_components_80}")

    # 2. t-SNE Analysis with different perplexity values
    final_perplexity = 30
    # t-SNE needs perplexity < n_samples; skip values that cannot be used
    perplexities = [perp for perp in [5, 30, 50] if perp < n_samples]
    if final_perplexity >= n_samples:
        final_perplexity = max(1, n_samples - 1)

    # First reduce dimensionality with PCA to (at most) 50 components for
    # efficiency; PCA cannot return more components than samples or features.
    pca_50 = PCA(n_components=min(50, n_samples, n_features))
    X_pca_50 = pca_50.fit_transform(X_dense)

    embeddings = {}
    if perplexities:
        fig_tsne, axes = plt.subplots(
            1, len(perplexities), figsize=(15, 5), squeeze=False
        )
        for ax, perp in zip(axes[0], perplexities):
            print(f"\nPerforming t-SNE with perplexity {perp}...")
            tsne = TSNE(n_components=2, perplexity=perp, random_state=42)
            embeddings[perp] = tsne.fit_transform(X_pca_50)

            ax.scatter(embeddings[perp][:, 0], embeddings[perp][:, 1], alpha=0.5)
            ax.set_title(f't-SNE (perplexity={perp})')
            ax.set_xlabel('t-SNE 1')
            ax.set_ylabel('t-SNE 2')

        fig_tsne.tight_layout()
        plt.show()

    # Get optimal PCA and scale to non-negative values
    pca_optimal = PCA(n_components=n_components_80)
    X_pca_optimal = pca_optimal.fit_transform(X_dense)

    # Scale PCA results to [0, 1] range for NMF
    scaler = MinMaxScaler()
    X_pca_scaled = scaler.fit_transform(X_pca_optimal)

    # Final t-SNE: reuse the embedding already computed in the comparison loop
    # (same input, same perplexity, same random_state) instead of fitting the
    # identical model a second time.
    X_tsne_final = embeddings.get(final_perplexity)
    if X_tsne_final is None:
        tsne_final = TSNE(
            n_components=2, perplexity=final_perplexity, random_state=42
        )
        X_tsne_final = tsne_final.fit_transform(X_pca_50)

    return X_pca_scaled, X_tsne_final, pca_optimal


In [None]:
def perform_nmf_analysis(texts, n_topics=5):
    """
    Fit an NMF topic model on TF-IDF features and plot the topic assignments.

    NMF is fitted directly on the (non-negative) TF-IDF matrix so that the
    columns of ``nmf_model.components_`` line up with the vocabulary of the
    returned vectorizer. Fitting on PCA scores instead would make each column
    a principal component, and looking those columns up in the vectorizer's
    feature names would yield unrelated terms. PCA and t-SNE are used only to
    draw the documents in two dimensions.

    Parameters
    ----------
    texts : iterable of str
        Preprocessed documents.
    n_topics : int, default 5
        Number of NMF components (topics).

    Returns
    -------
    topic_assignments : ndarray
        For each document, the index of the topic with the largest weight.
        Topic indices carry no fixed meaning; inspect the top terms after
        fitting before attaching labels to them.
    nmf_model : NMF
        The fitted model; ``components_`` has one column per TF-IDF feature.
    tfidf_vectorizer : TfidfVectorizer
        The fitted vectorizer (at most 1000 uni-/bi-gram features).
    """
    # Create TF-IDF vectors
    tfidf_vectorizer = TfidfVectorizer(
        max_features=1000,
        ngram_range=(1, 2)
    )
    tfidf = tfidf_vectorizer.fit_transform(texts)

    # Perform dimensionality reduction (used for the plots below only)
    X_pca, X_tsne, _ = perform_dimensionality_reduction(tfidf)

    # Apply NMF on the TF-IDF matrix so topics are expressed over terms
    nmf_model = NMF(
        n_components=n_topics,
        random_state=42
    )
    doc_topic_weights = nmf_model.fit_transform(tfidf)
    topic_assignments = np.argmax(doc_topic_weights, axis=1)

    # Visualize topics in reduced dimensional space
    fig, (ax_pca, ax_tsne) = plt.subplots(1, 2, figsize=(15, 5))

    # Plot PCA; if only one component was retained, draw it against zero
    pc1 = X_pca[:, 0]
    pc2 = X_pca[:, 1] if X_pca.shape[1] > 1 else np.zeros_like(pc1)
    ax_pca.scatter(pc1, pc2, c=topic_assignments, cmap='tab10')
    ax_pca.set_title('Topics visualized in PCA space')
    ax_pca.set_xlabel('First Principal Component')
    ax_pca.set_ylabel('Second Principal Component')

    # Plot t-SNE
    ax_tsne.scatter(X_tsne[:, 0], X_tsne[:, 1], c=topic_assignments, cmap='tab10')
    ax_tsne.set_title('Topics visualized in t-SNE space')
    ax_tsne.set_xlabel('t-SNE 1')
    ax_tsne.set_ylabel('t-SNE 2')

    fig.tight_layout()
    plt.show()

    return topic_assignments, nmf_model, tfidf_vectorizer


In [None]:
def perform_dbscan_validation(texts, tfidf_vectorizer, eps=0.5, min_samples=5):
    """
    Cluster the documents with DBSCAN in t-SNE space and plot the result.

    Parameters
    ----------
    texts : iterable of str
        Preprocessed documents.
    tfidf_vectorizer : TfidfVectorizer
        An already fitted vectorizer; it is used to transform ``texts``.
    eps : float, default 0.5
        DBSCAN neighbourhood radius, measured in t-SNE coordinates.
    min_samples : int, default 5
        DBSCAN minimum neighbourhood size for a core point.

    Returns
    -------
    ndarray
        Cluster label per document; DBSCAN marks noise points with -1.
    """
    print("\nPerforming DBSCAN validation...")

    # Create TF-IDF vectors
    tfidf = tfidf_vectorizer.transform(texts)

    # Perform dimensionality reduction (only the t-SNE embedding is used here)
    _, X_tsne, _ = perform_dimensionality_reduction(tfidf)

    # Apply DBSCAN on t-SNE results
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    dbscan_labels = dbscan.fit_predict(X_tsne)

    # Visualize clustering results
    fig, ax = plt.subplots(figsize=(10, 5))
    points = ax.scatter(X_tsne[:, 0], X_tsne[:, 1], c=dbscan_labels, cmap='tab10')
    ax.set_title('DBSCAN Clustering Results (t-SNE space)')
    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')
    fig.colorbar(points, ax=ax, label='Cluster')
    plt.show()

    # Count number of clusters (the noise label -1 is not a cluster)
    n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
    print(f"DBSCAN found {n_clusters} clusters")
    print(f"Number of samples in each cluster: {pd.Series(dbscan_labels).value_counts().sort_index()}")

    return dbscan_labels

In [None]:
# Run the topic-modelling pipeline on the submission texts
print("Performing topic analysis...")

# Fit NMF (this also draws the dimensionality-reduction plots) and keep the
# fitted model and vectorizer for the cells below
nmf_result = perform_nmf_analysis(submissions['text'])
nmf_topics, nmf_model, tfidf_vectorizer = nmf_result

# Store the dominant topic index of every submission
submissions['nmf_topic'] = nmf_topics

# Print the strongest terms of each topic
def display_top_terms(model, feature_names, n_top_words=10):
    """
    Print one line per topic listing its highest-weighted features.

    Parameters
    ----------
    model : object
        Fitted decomposition model exposing ``components_`` (one row per topic).
    feature_names : sequence of str
        Names indexed by the column positions of ``components_``.
    n_top_words : int, default 10
        Number of features to list per topic, strongest first.
    """
    topic_idx = 0
    for weights in model.components_:
        # Ascending argsort, reversed, then truncated -> strongest features first
        ranked_columns = weights.argsort()[::-1][:n_top_words]
        top_terms = []
        for column in ranked_columns:
            top_terms.append(feature_names[column])
        print(f"Topic {topic_idx}: {', '.join(top_terms)}")
        topic_idx += 1

# Print the top terms of every NMF topic
print("\nNMF Topics:")
vocabulary = tfidf_vectorizer.get_feature_names_out()
display_top_terms(nmf_model, vocabulary)

# DBSCAN clustering of the same submissions, reusing the fitted vectorizer
dbscan_labels = perform_dbscan_validation(submissions['text'], tfidf_vectorizer)
submissions['dbscan_cluster'] = dbscan_labels

# Cross-tabulate NMF topics (rows) against DBSCAN clusters (columns)
print("\nComparison of NMF topics and DBSCAN clusters:")
topic_vs_cluster = pd.crosstab(submissions['nmf_topic'], submissions['dbscan_cluster'])
print(topic_vs_cluster)

In [None]:
# Create simplified topic labels (select the top term)
# NOTE(review): these labels are hard-coded; presumably they were read off the
# top terms of an earlier fit. Confirm they still match the printed NMF topics.
topic_labels = {
    0: "abortion",
    1: "guns",
    2: "tax",
    3: "climate",
    4: "politics"
}

# Add topic labels to submissions.
# The mapping is applied only while the column still holds numeric topic ids,
# so re-running this cell does not map the labels a second time (which would
# turn every label into NaN).
if pd.api.types.is_numeric_dtype(submissions['nmf_topic']):
    submissions['nmf_topic'] = submissions['nmf_topic'].map(topic_labels)

# Lookup table: submission id -> topic label (first occurrence wins), built
# once instead of filtering the submissions frame for every row.
topic_by_submission_id = (
    submissions.drop_duplicates(subset='id')
    .set_index('id')['nmf_topic']
    .to_dict()
)

# Variables to store the current submission ID and its topic
current_submission_id = None
current_topic = None

# Walk the dataset in order and give each comment the topic of the most recent
# submission row that precedes it.
row_topics = []
for row_type, row_id in zip(df['type'], df['id']):
    # A missing `type` matches neither branch below
    if not isinstance(row_type, str):
        row_type = ""

    row_topic = None
    if 'submission' in row_type:  # If the current row is a submission
        current_submission_id = row_id
        # None when the id is unknown, rather than an empty array
        current_topic = topic_by_submission_id.get(row_id)
        row_topic = current_topic
    # If the row is a comment, assign the parent submission's topic
    if 'comment' in row_type and current_submission_id is not None:
        row_topic = current_topic
    row_topics.append(row_topic)

# Store the topic label for each row (None where no topic applies)
df['nmf_topic'] = row_topics

In [None]:
# Write the topic-annotated frame to disk
file_path_text_topic = "data/processed-data/text_topic.csv"
df_text_topic = df  # same object as df, not a copy
df_text_topic.to_csv(file_path_text_topic, index=False)
print(f"\nModeling complete. Results saved to {file_path_text_topic}")

# Display sample results (the last expression of the cell is rendered)
print("\nSample results:")
df_text_topic.head(6)