In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import silhouette_score
from typing import List, Dict, Tuple
import logging
from dataclasses import dataclass
from pathlib import Path
import json
from collections import defaultdict
import re

# Mount Google Drive at /content/drive; the Config paths used later in this
# notebook live underneath this mount point, so this must run first.
drive.mount('/content/drive')

# Set up logging: INFO level on the root logger, plus a module-level `logger`
# that the classes and main() defined in later cells write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Mounted at /content/drive


In [None]:
import os
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk import pos_tag
from pathlib import Path

def setup_nltk():
    """Download the NLTK resources used by this notebook and smoke-test them.

    Downloads the tokenizer/tagger packages into ``/root/nltk_data``, makes
    sure NLTK will actually search that directory, then tokenizes and
    POS-tags one sample sentence and prints the result.

    Returns:
        bool: always True when the function runs to completion.

    Raises:
        LookupError: propagated from ``pos_tag`` when the tagger resource is
            not installed (for example because its download failed).
    """
    print(f"Using NLTK version: {nltk.__version__}")

    # Step 1: Set up directory
    nltk_data_dir = Path('/root/nltk_data')

    # Step 2: Set environment variable (picked up by any later/child process)
    os.environ['NLTK_DATA'] = str(nltk_data_dir)

    # nltk has already been imported at this point, so its search path was
    # built before NLTK_DATA was set; register the directory explicitly.
    if str(nltk_data_dir) not in nltk.data.path:
        nltk.data.path.append(str(nltk_data_dir))

    # Step 3: Download correct packages
    print("\nDownloading NLTK packages...")
    packages = [
        'punkt',
        'averaged_perceptron_tagger_eng',  # Use the correct package name
        'universal_tagset'
    ]

    for package in packages:
        # nltk.download returns False on failure instead of raising, so the
        # result has to be checked before reporting success.
        if nltk.download(package, download_dir=str(nltk_data_dir)):
            print(f"✓ Downloaded {package}")
        else:
            print(f"✗ Failed to download {package}")

    # Step 4: Verify installation
    print("\nVerifying installation...")
    test_sentence = "Students struggle with exams and worry about grades."

    # Test tokenization
    tokenizer = TreebankWordTokenizer()
    tokens = tokenizer.tokenize(test_sentence)
    print("✓ Tokenization successful:", tokens)

    # Test POS tagging
    tags = pos_tag(tokens)
    print("\n✓ POS tagging successful:")
    for word, tag in tags:
        print(f"{word}: {tag}")

    # Show nouns and verbs specifically (Penn Treebank tags NN* / VB*)
    nouns_verbs = [(word, tag) for word, tag in tags
                   if tag.startswith(('NN', 'VB'))]

    print("\nIdentified nouns and verbs:")
    for word, tag in nouns_verbs:
        print(f"{word}: {tag}")

    print("\n✓ Setup completed successfully!")
    return True

# Run the setup. Inside a notebook cell __name__ is "__main__", so the
# download and smoke test execute whenever this cell is run.
if __name__ == "__main__":
    setup_nltk()

Using NLTK version: 3.9.1

Downloading NLTK packages...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


✓ Downloaded punkt
✓ Downloaded averaged_perceptron_tagger_eng
✓ Downloaded universal_tagset

Verifying installation...
✓ Tokenization successful: ['Students', 'struggle', 'with', 'exams', 'and', 'worry', 'about', 'grades', '.']


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.



✓ POS tagging successful:
Students: NNS
struggle: VBP
with: IN
exams: NNS
and: CC
worry: VBP
about: IN
grades: NNS
.: .

Identified nouns and verbs:
Students: NNS
struggle: VBP
exams: NNS
worry: VBP
grades: NNS

✓ Setup completed successfully!


In [None]:
@dataclass
class Config:
    """Configuration parameters for the analysis.

    ``data_path`` and ``output_path`` default to sub-folders of ``base_path``
    and are resolved per instance, so ``Config(base_path=...)`` relocates all
    three together. Pass either one explicitly to override it.
    """
    model_name: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    min_posts: int = 1000  # Minimum posts per college
    year: str = "2019"
    max_sequence_length: int = 512
    num_clusters: int = 5
    batch_size: int = 32
    # Evaluated once, when the class is defined.
    device: str = "cuda" if torch.cuda.is_available() else "cpu"

    # Paths
    base_path: Path = Path('/content/drive/My Drive/CS470_GroupProject')
    data_path: Path = None    # None -> base_path / 'roberta2022'
    output_path: Path = None  # None -> base_path / 'results'

    def __post_init__(self):
        """Derive the dependent paths from this instance's ``base_path``."""
        # Accept plain strings as well as Path objects.
        self.base_path = Path(self.base_path)
        if self.data_path is None:
            self.data_path = self.base_path / 'roberta2022'
        else:
            self.data_path = Path(self.data_path)
        if self.output_path is None:
            self.output_path = self.base_path / 'results'
        else:
            self.output_path = Path(self.output_path)

# Initialize config
config = Config()
# parents=True: the project folder may not exist yet on a freshly mounted
# Drive, in which case a plain mkdir raises FileNotFoundError.
config.output_path.mkdir(parents=True, exist_ok=True)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/CS470_GroupProject/results'

In [None]:
class DataProcessor:
    """Handles data loading and preprocessing"""

    def __init__(self, config: Config):
        """Keep a reference to the run configuration (paths, year, min_posts)."""
        self.config = config

    def load_and_filter_data(self) -> pd.DataFrame:
        """Load and filter data for specific year and minimum posts.

        Reads every ``*.csv`` in ``config.data_path`` whose file name contains
        ``config.year``, keeps the files with at least ``config.min_posts``
        rows, and tags each row with a ``college`` column taken from the part
        of the file stem before the first underscore.

        Returns:
            pd.DataFrame: all kept files concatenated with a fresh index, or
            an empty DataFrame when nothing qualified.
        """
        frames = []

        # sorted(): glob order is filesystem-dependent; a stable order keeps
        # row positions reproducible between runs.
        for file in sorted(self.config.data_path.glob("*.csv")):
            if self.config.year not in file.name:
                continue

            # Only the read is guarded: one unreadable file should not abort
            # the whole load, but it must be reported.
            try:
                df = pd.read_csv(file)
            except Exception as e:
                logger.error(f"Error loading {file}: {str(e)}")
                continue

            if len(df) < self.config.min_posts:
                logger.info(
                    f"Skipped {file.name}: {len(df)} posts "
                    f"(minimum is {self.config.min_posts})"
                )
                continue

            df['college'] = file.stem.split('_')[0]  # Extract college name
            frames.append(df)
            logger.info(f"Loaded {len(df)} posts from {file.name}")

        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
import torch
# Report whether a CUDA device is visible and bind the module-level `device`
# (cuda when available, otherwise cpu) for use by later code.
print("CUDA available:", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class SentimentAnalyzer:
    """Handles sentiment analysis and attention weight extraction"""

    def __init__(self, config: Config):
        """Load the tokenizer and classification model named in ``config``.

        The device comes from ``config.device`` so the class does not depend
        on a module-level variable defined in another cell.
        """
        self.config = config
        self.device = torch.device(config.device)
        logger.info(f"Using device: {self.device}")

        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            config.model_name
        ).to(self.device)
        self.model.eval()
        logger.info(f"Model loaded and moved to {self.device}")

    def process_batch(self, texts: List[str]) -> np.ndarray:
        """Run the model on a batch and return last-layer attention weights.

        Args:
            texts: raw post texts; they are padded to the longest item and
                truncated to ``config.max_sequence_length`` tokens.

        Returns:
            np.ndarray: last-layer attention averaged over heads, one square
            matrix per text (expected shape (batch, seq, seq) -- TODO confirm
            against the installed transformers version).
        """
        with torch.no_grad():
            inputs = self.tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=self.config.max_sequence_length
            ).to(self.device)

            outputs = self.model(**inputs, output_attentions=True)

            # Get attention weights from last layer, averaged over heads
            attentions = outputs.attentions[-1].mean(dim=1).cpu().numpy()

            return attentions

    def analyze_posts(self, df: pd.DataFrame) -> Dict:
        """Analyze posts and extract attention patterns.

        Keeps the posts whose ``emo_pred_neg`` score beats both
        ``emo_pred_pos`` and ``emo_pred_neu``, runs them through the model in
        batches and records, per post, the mean attention each token receives.

        Args:
            df: posts with columns ``body``, ``college``, ``emo_pred_neg``,
                ``emo_pred_pos`` and ``emo_pred_neu``.

        Returns:
            Dict: parallel lists under ``'texts'``, ``'colleges'`` and
            ``'attention_patterns'`` (a list of ``(token, score)`` pairs per
            post). The keys are present even when no post is negative.
        """
        # Find all negative posts first
        negative_posts = df[
            (df['emo_pred_neg'] > df['emo_pred_pos']) &
            (df['emo_pred_neg'] > df['emo_pred_neu'])
        ]

        # Guard the percentage against an empty input frame.
        pct_negative = (len(negative_posts) / len(df)) * 100 if len(df) else 0.0

        print(f"\nTotal posts in dataset: {len(df)}")
        print(f"Total negative posts: {len(negative_posts)}")
        print(f"Percentage negative: {pct_negative:.2f}%\n")

        results = {'texts': [], 'colleges': [], 'attention_patterns': []}
        processed_count = 0
        batch_size = self.config.batch_size

        # Process only the negative posts
        for start in range(0, len(negative_posts), batch_size):
            batch = negative_posts.iloc[start:start + batch_size]
            batch_texts = batch['body'].tolist()
            batch_colleges = batch['college'].tolist()
            attentions = self.process_batch(batch_texts)

            for text, college, attention in zip(batch_texts, batch_colleges, attentions):
                # Same truncation settings as process_batch, so these tokens
                # line up with the rows/columns of the attention matrix.
                tokens = self.tokenizer.convert_ids_to_tokens(
                    self.tokenizer(
                        text,
                        truncation=True,
                        max_length=self.config.max_sequence_length
                    )['input_ids']
                )

                # The batch is padded to its longest post. Drop the padded
                # rows/columns before averaging, otherwise a post's scores
                # depend on what else is in its batch.
                # NOTE(review): assumes the tokenizer pads on the right -- confirm.
                n_tokens = min(len(tokens), attention.shape[0])
                real_attention = attention[:n_tokens, :n_tokens]

                # Mean attention received by each token (average over queries)
                token_attention = list(zip(tokens, real_attention.mean(axis=0).tolist()))

                results['texts'].append(text)
                results['colleges'].append(college)
                results['attention_patterns'].append(token_attention)

            processed_count += len(batch_texts)
            print(f"Processed {processed_count}/{len(negative_posts)} negative posts")

        logger.info(f"Attention analysis complete for {processed_count} negative posts")
        return results

CUDA available: True
Using device: cuda


In [None]:
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from scipy.sparse import csr_matrix
import pandas as pd  # Ensure pandas is imported
import re  # For token cleanup

class ThemeAnalyzer:
    """Builds a document/word attention matrix and clusters it into themes.

    ``create_matrix_from_raw_data`` keeps, per document, the highest-attention
    nouns and verbs; ``process_existing_matrix`` runs k-means on the result
    and summarises every cluster.
    """

    # Tokenizer control tokens carry no content. '<s>' is visible in the saved
    # patterns; the others are assumed from the RoBERTa vocabulary -- confirm.
    _SPECIAL_TOKENS = frozenset({'<s>', '</s>', '<pad>', '<unk>', '<mask>'})
    # Longest preview text printed, so the cell output stays readable.
    _PREVIEW_CHARS = 500
    # Seed shared by clustering and by the sample-post selection.
    _RANDOM_STATE = 42

    def __init__(self, config, n_clusters: int = 4):
        """Store the configuration and the number of clusters (default 4)."""
        self.config = config
        self.n_clusters = n_clusters

    @staticmethod
    def _clean_token(token):
        """Normalise one tokenizer token to a lowercase word ('' if nothing is left)."""
        # 'Ġ' / 'Ċ' are the byte-level BPE markers for a leading space and a
        # newline. Both count as letters for \w, so without this step
        # 'Ġwish' would survive as 'ġwish' and never merge with 'wish'.
        token = token.replace('Ġ', '').replace('Ċ', '')
        token = re.sub(r"[^\w]", "", token)  # Remove non-alphanumeric characters
        return token.lower().strip()

    @classmethod
    def _preview(cls, obj):
        """Return ``repr(obj)`` cut to ``_PREVIEW_CHARS`` characters."""
        text = repr(obj)
        if len(text) > cls._PREVIEW_CHARS:
            return text[:cls._PREVIEW_CHARS] + "..."
        return text

    def create_matrix_from_raw_data(self, attention_data, pos_tags):
        """Create matrix and dataframe from raw data.

        Args:
            attention_data: dict with parallel lists ``'texts'``,
                ``'colleges'`` and ``'attention_patterns'`` (per document a
                list of ``(token, weight)`` pairs).
            pos_tags: dict mapping ``"{college}_{index}"`` to a list of
                ``(token, pos_tag)`` pairs; documents without an entry are
                skipped.

        Returns:
            tuple: ``(X, df)`` where ``X`` is a scipy CSR matrix with one row
            per kept document and one column per distinct word, holding the
            attention weights, and ``df`` has the columns ``college``,
            ``text`` and ``important_words`` in the same row order.
        """
        print("Creating matrix from attention patterns and POS tags...\n")

        # Preview the JSON data for understanding (truncated: the raw entries
        # are whole posts with one weight per token)
        print("Preview of 'attention_data':")
        print(self._preview({
            key: attention_data[key][:2] if isinstance(attention_data[key], list) else attention_data[key]
            for key in list(attention_data.keys())[:3]
        }))

        print("\nPreview of 'pos_tags':")
        print(self._preview({key: pos_tags[key][:2] for key in list(pos_tags.keys())[:3]}))

        # Process documents
        processed_docs = []
        word_to_idx = {}
        rows = []
        cols = []
        data = []
        top_words_examples = []  # To store examples of top words for printing later
        total_docs = len(attention_data['texts'])

        for idx, (text, college, patterns) in enumerate(zip(
            attention_data['texts'],
            attention_data['colleges'],
            attention_data['attention_patterns']
        )):
            doc_id = f"{college}_{idx}"
            if doc_id not in pos_tags:
                continue

            # Filter for nouns and verbs with weights.
            # NOTE(review): zip pairs the i-th attention token with the i-th
            # POS tag, which assumes both lists share one tokenization -- confirm.
            noun_verb_weights = []
            for (token, weight), (_, pos) in zip(patterns, pos_tags[doc_id]):
                if token in self._SPECIAL_TOKENS:
                    continue
                cleaned_token = self._clean_token(token)
                # "s" is still rejected, as before (it is also what '<s>'
                # cleans down to).
                if pos.startswith(('NN', 'VB')) and cleaned_token and cleaned_token != "s":
                    noun_verb_weights.append((cleaned_token, float(weight)))

            if noun_verb_weights:
                # Sort by weight and keep top 5% (at least one word)
                noun_verb_weights.sort(key=lambda pair: pair[1], reverse=True)
                cutoff = max(1, int(len(noun_verb_weights) * 0.05))
                top_words = noun_verb_weights[:cutoff]

                # Collect examples for later printing
                if len(top_words_examples) < 5:  # Store examples for 5 documents
                    top_words_examples.append((idx, top_words))

                # Update vocabulary and matrix construction lists together
                row_idx = len(processed_docs)
                for word, weight in top_words:
                    if word not in word_to_idx:
                        word_to_idx[word] = len(word_to_idx)
                    rows.append(row_idx)
                    cols.append(word_to_idx[word])
                    data.append(weight)

                processed_docs.append({
                    'college': college,
                    'text': text,
                    'important_words': top_words
                })

            if idx % 1000 == 0:
                print(f"Processed {idx}/{total_docs} documents")

        # Create sparse matrix (a word repeated within a document is summed)
        X = csr_matrix((data, (rows, cols)),
                       shape=(len(processed_docs), len(word_to_idx)))

        # Create dataframe
        df = pd.DataFrame(processed_docs)

        # Print top words examples
        print("\nExamples of top-ranked nouns and verbs before clustering:")
        for doc_idx, words in top_words_examples:
            print(f"Document {doc_idx}:")
            print(", ".join([f"{word} ({weight:.2f})" for word, weight in words]))

        print(f"\nCreated matrix of shape {X.shape}")
        return X, df

    def process_existing_matrix(self, X, df):
        """Process existing sparse matrix with fixed clusters.

        Runs k-means with ``self.n_clusters`` clusters (cuML when it can be
        imported, scikit-learn otherwise) and summarises each cluster.

        Args:
            X: document/word matrix from ``create_matrix_from_raw_data``.
            df: the matching dataframe; a ``cluster`` column is added to it
                in place.

        Returns:
            dict: ``'cluster_themes'`` maps each cluster id to its size,
            percentage, top 15 words by summed weight, up to 5 sample posts
            and per-college counts; ``'metadata'`` holds the totals.

        Raises:
            ValueError: when there are fewer documents than clusters.
        """
        print(f"\nProcessing matrix of shape {X.shape} with {self.n_clusters} clusters")

        if X.shape[0] < self.n_clusters:
            raise ValueError(
                f"Cannot form {self.n_clusters} clusters from {X.shape[0]} documents"
            )

        # Check for CUDA
        try:
            import cupy as cp
            from cuml.cluster import KMeans as cuKMeans
            print("CUDA available - using GPU acceleration")

            # Convert to GPU
            X_gpu = cp.sparse.csr_matrix(X)
            kmeans = cuKMeans(n_clusters=self.n_clusters, random_state=self._RANDOM_STATE, n_init='auto')
            df['cluster'] = kmeans.fit_predict(X_gpu)
            cluster_centers = cp.asnumpy(kmeans.cluster_centers_)

        except ImportError:
            # ModuleNotFoundError is a subclass of ImportError, so one clause
            # covers both.
            print("CUDA not available - using CPU")
            from sklearn.cluster import KMeans

            kmeans = KMeans(n_clusters=self.n_clusters, random_state=self._RANDOM_STATE, n_init='auto')
            df['cluster'] = kmeans.fit_predict(X)
            cluster_centers = kmeans.cluster_centers_

        # Analyze clusters
        print("\nAnalyzing clusters...")
        themes = {}
        total_docs = len(df)

        for i in range(self.n_clusters):
            cluster_docs = df[df['cluster'] == i]

            # Aggregate word weights in cluster
            word_weights = defaultdict(float)
            for words in cluster_docs['important_words']:
                for word, weight in words:
                    word_weights[word] += weight

            # Get top words by total weight
            top_words = sorted(word_weights.items(), key=lambda item: item[1], reverse=True)[:15]

            # Seeded so the saved sample posts are the same on every run.
            sample_posts = cluster_docs['text'].sample(
                min(5, len(cluster_docs)), random_state=self._RANDOM_STATE
            ).tolist()

            themes[i] = {
                'size': len(cluster_docs),
                'percentage': len(cluster_docs) / total_docs * 100,
                'top_words': top_words,
                'sample_posts': sample_posts,
                'colleges': cluster_docs['college'].value_counts().to_dict()
            }

            print(f"\nCluster {i}:")
            print(f"Size: {len(cluster_docs)} posts ({themes[i]['percentage']:.1f}%)")
            print("Top words (weight):", ', '.join([f"{w}({s:.2f})" for w, s in top_words[:5]]))
            # A cluster can end up empty; there is then no post to show.
            if sample_posts:
                print("Sample post:", sample_posts[0][:200] + "...")
            else:
                print("Sample post: (none - empty cluster)")

        return {
            'cluster_themes': themes,
            'metadata': {
                'total_posts': total_docs,
                'n_clusters': self.n_clusters,
                'vocabulary_size': X.shape[1]
            }
        }


In [None]:
def main():
    """Main execution function starting from JSON files.

    Loads ``attention_patterns.json`` and ``pos_tags.json`` from
    ``Config().output_path``, builds the document/word matrix, clusters it
    and writes the summary to ``theme_clusters.json`` in the same folder.

    Raises:
        Exception: any failure is logged and then re-raised unchanged.
    """
    try:
        # Initialize config
        config = Config()

        # Load our saved files. The encoding is explicit because the posts
        # contain non-ASCII characters and the platform default may differ.
        logger.info("Loading saved data...")
        with open(config.output_path / 'attention_patterns.json', 'r', encoding='utf-8') as f:
            attention_data = json.load(f)
        with open(config.output_path / 'pos_tags.json', 'r', encoding='utf-8') as f:
            pos_tags = json.load(f)

        logger.info(f"Loaded data for {len(attention_data['texts'])} documents")

        # Create matrix and run clustering
        theme_analyzer = ThemeAnalyzer(config)

        # First create matrix
        X, df = theme_analyzer.create_matrix_from_raw_data(attention_data, pos_tags)

        # Then do clustering; report the count the analyzer will actually use
        logger.info(f"Starting clustering with {theme_analyzer.n_clusters} clusters...")
        cluster_results = theme_analyzer.process_existing_matrix(X, df)

        # Save results
        results_file = config.output_path / 'theme_clusters.json'
        with open(results_file, 'w', encoding='utf-8') as f:
            json.dump(cluster_results, f, indent=2)

        logger.info(f"Analysis complete! Results saved to {results_file}")

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

# Entry point: inside a notebook cell __name__ is "__main__", so the pipeline
# runs as soon as this cell is executed.
if __name__ == "__main__":
    # These names are (re)bound here; main() additionally needs Config and
    # ThemeAnalyzer, which are defined in earlier cells.
    import json
    import logging
    import pandas as pd

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    main()

Creating matrix from attention patterns and POS tags...

Preview of 'attention_data':
{'texts': ['I wish. Most of the classes they didn’t accept were either dual credit classes that I took in HS or courses I took P/F. Additionally I had an internship that was worth 4cr that didn’t transfer in, and a required freshman seminar from my last college didn’t translate to anything.\n\nIt’s fair. But it sucks.', 'The university no longer does guaranteed longevity pay due to annual increases of the minimum wage. :('], 'colleges': ['BostonU', 'BostonU'], 'attention_patterns': [[['<s>', 0.2755829691886902], ['I', 0.008264525793492794], ['Ġwish', 0.00782431848347187], ['.', 0.007175407372415066], ['ĠMost', 0.008431607857346535], ['Ġof', 0.005328712053596973], ['Ġthe', 0.006881313864141703], ['Ġclasses', 0.00990503840148449], ['Ġthey', 0.009608980268239975], ['Ġdidn', 0.005576973780989647], ['âĢ', 0.005375291220843792], ['Ļ', 0.006462315563112497], ['t', 0.004150718916207552], ['Ġaccept', 0.0067180

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

def visualize_clusters(cluster_results_path: Path = None):
    """Visualize the clusters using word clouds.

    Draws one word cloud per cluster and prints the colleges that contribute
    the most posts to it.

    Args:
        cluster_results_path: JSON file to read. Defaults to
            ``config.output_path / 'theme_clusters.json'``, resolved when the
            function is called rather than when it is defined.

    The file written by ``main()`` nests the clusters under
    ``'cluster_themes'`` and stores ``'top_words'`` as ``[word, weight]``
    pairs and ``'colleges'`` as a ``{college: count}`` mapping. A flat file
    with a ``'tokens'`` list and a ``'colleges'`` list per cluster is still
    accepted.
    """
    if cluster_results_path is None:
        cluster_results_path = config.output_path / 'theme_clusters.json'

    with open(cluster_results_path, encoding='utf-8') as f:
        results = json.load(f)

    # Skip the 'metadata' sibling when the nested layout is present.
    clusters = results.get('cluster_themes', results)

    for cluster_id, data in clusters.items():
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            colormap='viridis'
        )

        if 'top_words' in data:
            # Weighted words: size each word by its summed attention weight.
            frequencies = {word: weight for word, weight in data['top_words'] if weight > 0}
            if not frequencies:
                print(f"\nCluster {cluster_id} has no words to draw - skipped")
                continue
            wordcloud.generate_from_frequencies(frequencies)
        else:
            # Flat token list: let WordCloud count the words itself.
            wordcloud.generate(' '.join(data['tokens']))

        # Display
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Theme Cluster {cluster_id}')
        plt.show()

        # Print some statistics
        print(f"\nCluster {cluster_id} Statistics:")
        colleges = data['colleges']
        if isinstance(colleges, dict):
            # Already {college: count}; counting again would count the counts.
            college_counts = pd.Series(colleges).sort_values(ascending=False)
        else:
            college_counts = pd.Series(colleges).value_counts()
        print("\nTop colleges in this cluster:")
        print(college_counts.head())
        print("-" * 50)


# Render the word clouds from the default results file (theme_clusters.json
# under config.output_path).
visualize_clusters()