
source
try:
    import nltk
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('wordnet', quiet=True)
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer
    print("NLTK resources ready")
except Exception as e:
    print(f"NLTK resources unavailable or download failed: {e}")
    # Provide lightweight fallbacks so the notebook cell still runs
    def word_tokenize(text):
        return str(text).split()
    stopwords = set()
    class _DummyLemmatizer:
        def lemmatize(self, w):
            return w
    WordNetLemmatizer = lambda : _DummyLemmatizer()

# Advanced text preprocessing with tokenization, stopword removal, and lemmatization
def advanced_preprocess_text(text):
    """Advanced preprocessing with tokenization, stopword removal, and lemmatization."""
    # Basic cleaning first
    text = preprocess_text(text)

    # TOKENIZATION
    tokens = word_tokenize(text)

    # STOPWORD REMOVAL
    try:
        sw = set(stopwords.words('english')) if hasattr(stopwords, 'words') else set()
    except Exception:
        sw = set()
    tokens = [w for w in tokens if w not in sw and len(w) > 2]

    # LEMMATIZATION
    try:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(w) for w in tokens]
    except Exception:
        tokens = [w.lower() for w in tokens]

    return ' '.join(tokens)

print("=== ADVANCED TEXT PREPROCESSING ===")
print("Applying tokenization, stopword removal, and lemmatization...")

# Auto-detect the text column instead of hardcoding 'reviews.text'
candidates = ['reviews.text', 'reviewText', 'text', 'content']
text_column = None
for c in candidates:
    if c in df.columns:
        text_column = c
        break
if text_column is None:
    # Fallback: pick the first column that looks like text (contains 'review' or 'text')
    for c in df.columns:
        if 'review' in c.lower() or 'text' in c.lower():
            text_column = c
            break
# As a last resort, pick the first column
if text_column is None:
    text_column = df.columns[0]
print(f"Using text column: {text_column}")

# Apply advanced preprocessing to the cleaned text column if present, else to the detected text column
source_col = 'cleaned_text' if 'cleaned_text' in df_processed.columns else text_column
df_processed['processed_text'] = df_processed[source_col].apply(advanced_preprocess_text)

# Remove empty texts after advanced processing
df_processed = df_processed[df_processed['processed_text'].str.len() > 0].reset_index(drop=True)

print(f"Dataset size after advanced preprocessing: {len(df_processed)}")

# Show examples (if enough rows exist)
print("\n=== ADVANCED PREPROCESSING EXAMPLES ===\")
for i in range(min(3, len(df_processed))):
    cleaned = str(df_processed.iloc[i][source_col])[:80] + '...'
    processed = df_processed.iloc[i]['processed_text'][:80] + '...'
    print(f"Cleaned:   {cleaned}")
    print(f"Processed: {processed}\n")

# Final dataset statistics
print("=== FINAL PREPROCESSING STATISTICS ===")
avg_length_original = df_processed[text_column].astype(str).str.len().mean() if text_column in df_processed.columns else 0
avg_length_processed = df_processed['processed_text'].str.len().mean()
avg_words_processed = df_processed['processed_text'].str.split().str.len().mean()

print(f"Average original text length: {avg_length_original:.1f} characters")
print(f"Average processed text length: {avg_length_processed:.1f} characters")
print(f"Average words after processing: {avg_words_processed:.1f} words")

In [2]:
import sys
print(f"Python version: {sys.version}")
print(f"Python executable: {sys.executable}")

Python version: 3.10.11 (main, Oct 15 2025, 22:06:50) [Clang 17.0.0 (clang-1700.3.19.1)]
Python executable: /Users/enriqueestevezalvarez/Documents/Ironhack/Projects/NLP Automated customers/project-nlp-automated-customer-reviews/.venv/bin/python


In [3]:
# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Fix notebook visualization dependencies
import sys
import subprocess
import nbformat
import kaleido

# Traditional ML libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier

# NLP libraries
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Deep Learning libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Add these new imports for fine-tuning
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import os

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import time

# Dataset loading
from datasets import load_dataset

# Only add these missing imports in cell 22:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC  # You imported SVC but need LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score  # Individual metrics
from sklearn.preprocessing import LabelEncoder
import time

In [4]:
# Download required NLTK data resources for text preprocessing
try:
    # 'punkt' is used for tokenization (splitting text into words/sentences)
    nltk.download('punkt', quiet=True)
    # 'stopwords' provides lists of common words to filter out
    nltk.download('stopwords', quiet=True)
    # 'wordnet' is used for lemmatization (reducing words to their base form)
    nltk.download('wordnet', quiet=True)
    # 'omw-1.4' is a multilingual WordNet resource
    nltk.download('omw-1.4', quiet=True)
    print("NLTK data downloaded successfully!")
except:
    print("NLTK data download failed. Please check your internet connection.")

NLTK data downloaded successfully!


## STEP 2: Data Collection

Loading the Amazon customer reviews dataset from HuggingFace. We'll use a subset to ensure manageable computational requirements.

In [5]:
# Load Amazon US Reviews dataset from HuggingFace
# We'll use the "Electronics" category for manageable size
print("Loading Amazon US Reviews dataset...")

try:
    print("⚠️  NOTE: The original Amazon US Reviews dataset is no longer available on HuggingFace")
    print("Trying alternative datasets from HuggingFace...")
    
    # Try the newer Amazon reviews dataset first
    try:
        dataset = load_dataset("amazon_reviews_multi", "en", split="train")
        df = dataset.to_pandas()
        # Rename columns to match expected format
        df = df.rename(columns={
            'review_body': 'reviews.text',
            'stars': 'reviews.rating'
        })
        print("Successfully loaded Amazon Reviews Multi dataset")
    except Exception as e1:
        print(f"Amazon Reviews Multi not available: {e1}")
        print("Trying IMDB dataset as HuggingFace alternative...")
        
        try:
            # Alternative: Use IMDB dataset and adapt it
            dataset = load_dataset("imdb", split="train")
            df = dataset.to_pandas()
            # Convert IMDB labels (0=negative, 1=positive) to ratings (1-5 scale)
            df['reviews.rating'] = df['label'].map({0: 2, 1: 5})  # Map to low and high ratings
            df = df.rename(columns={'text': 'reviews.text'})
            df = df.drop('label', axis=1)
            print("Successfully loaded IMDB dataset as HuggingFace alternative")
        except Exception as e2:
            print(f"IMDB dataset also failed: {e2}")
            raise Exception("All HuggingFace datasets failed")
    
    # Take a sample to manage computational resources (adjust size based on your needs)
    sample_size = min(30000, len(df))  # Use up to 30k reviews from HuggingFace
    df_huggingface = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
    
    print(f"📊 Successfully loaded {len(df_huggingface)} reviews from HuggingFace dataset")
    
    # Now also load local archive data to combine with HuggingFace data
    print("🔄 Also loading local archive data to combine datasets...")
    df_local = None
    
except Exception as e:
    print(f"Error loading HuggingFace dataset: {e}")
    print("Loading dataset from local archive folder only...")
    df_huggingface = None
    
# Load dataset from archive folder (for combination or as fallback)
try:
    import os
    archive_path = "archive"
    
    # Look for CSV files in the archive folder
    if os.path.exists(archive_path):
        csv_files = [f for f in os.listdir(archive_path) if f.endswith('.csv')]
        print(f"📁 Found CSV files in archive: {csv_files}")
        
        if csv_files:
            # Load and combine all CSV files
            dataframes = []
            for csv_file in csv_files:
                file_path = os.path.join(archive_path, csv_file)
                try:
                    temp_df = pd.read_csv(file_path, encoding='utf-8')
                except UnicodeDecodeError:
                    temp_df = pd.read_csv(file_path, encoding='latin-1')
                print(f"📄 Loaded {len(temp_df)} rows from {csv_file}")
                dataframes.append(temp_df)
            
            # Combine all CSV dataframes
            df_local = pd.concat(dataframes, ignore_index=True)
            print(f"📊 Successfully combined all CSV files: {len(df_local)} total rows")
            
            # Take a sample from local data (leave room for HuggingFace data)
            local_sample_size = 30000 if df_huggingface is not None else 50000
            if len(df_local) > local_sample_size:
                df_local = df_local.sample(n=local_sample_size, random_state=42).reset_index(drop=True)
                print(f"📊 Sampled local data to {len(df_local)} rows")
            
            # Combine HuggingFace and local data if both available
            if df_huggingface is not None:
                print("🔗 Combining HuggingFace and local datasets...")
                
                # Standardize column names for both datasets
                # HuggingFace data already has 'reviews.text' and 'reviews.rating'
                # Local data might have different column names, so map them
                if 'reviews.text' not in df_local.columns:
                    # Find text column in local data
                    text_cols = ['reviews.text', 'review_body', 'review_text', 'text', 'body']
                    for col in text_cols:
                        if col in df_local.columns:
                            df_local = df_local.rename(columns={col: 'reviews.text'})
                            break
                
                if 'reviews.rating' not in df_local.columns:
                    # Find rating column in local data  
                    rating_cols = ['reviews.rating', 'star_rating', 'rating', 'stars']
                    for col in rating_cols:
                        if col in df_local.columns:
                            df_local = df_local.rename(columns={col: 'reviews.rating'})
                            break
                
                # Add source identifier
                df_huggingface['data_source'] = 'HuggingFace_IMDB'
                df_local['data_source'] = 'Local_Amazon'
                
                # Combine datasets
                df = pd.concat([df_huggingface, df_local], ignore_index=True)
                print(f"🎯 COMBINED DATASET: {len(df)} total reviews")
                print(f"   - HuggingFace (IMDB): {len(df_huggingface)} reviews")
                print(f"   - Local (Amazon): {len(df_local)} reviews")
                
            else:
                # Only local data available
                df = df_local
                df['data_source'] = 'Local_Amazon'
                print(f"📊 Using local dataset only: {len(df)} reviews")
                
        else:
            raise FileNotFoundError("No CSV files found in archive folder")
    else:
        raise FileNotFoundError("Archive folder not found")
        
except Exception as e2:
    print(f"Error loading from archive: {e2}")
    
    # If we have HuggingFace data but no local data, use HuggingFace only
    if 'df_huggingface' in locals() and df_huggingface is not None:
        df = df_huggingface
        df['data_source'] = 'HuggingFace_IMDB'
        print(f"Using HuggingFace dataset only: {len(df)} reviews")
    else:
        # Neither source worked, use dummy data
        print("Using dummy data for demonstration. Please check your archive folder path.")
        df = pd.DataFrame({
            'reviews.text': ['This product is amazing!', 'Poor quality, disappointed', 'Average product, okay'],
            'reviews.rating': [5, 2, 4],
            'data_source': ['Dummy', 'Dummy', 'Dummy']
        })
        print("📊 Using dummy data for demonstration.")

# Final dataset summary
print(f"\nFINAL DATASET LOADED:")
print(f"Total reviews: {len(df):,}")
if 'data_source' in df.columns:
    source_counts = df['data_source'].value_counts()
    for source, count in source_counts.items():
        print(f"{source}: {count:,} reviews")
print(f"Columns: {list(df.columns)}")

Loading Amazon US Reviews dataset...
⚠️  NOTE: The original Amazon US Reviews dataset is no longer available on HuggingFace
Trying alternative datasets from HuggingFace...
Amazon Reviews Multi not available: Dataset scripts are no longer supported, but found amazon_reviews_multi.py
Trying IMDB dataset as HuggingFace alternative...
Amazon Reviews Multi not available: Dataset scripts are no longer supported, but found amazon_reviews_multi.py
Trying IMDB dataset as HuggingFace alternative...
Successfully loaded IMDB dataset as HuggingFace alternative
📊 Successfully loaded 25000 reviews from HuggingFace dataset
🔄 Also loading local archive data to combine datasets...
📁 Found CSV files in archive: ['Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_May19.csv', '1429_1.csv', 'Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products.csv']
Successfully loaded IMDB dataset as HuggingFace alternative
📊 Successfully loaded 25000 reviews from HuggingFace dataset
🔄 Also loading local archive data 

## STEP 3: Data Understanding

Exploring the dataset structure, checking columns, and examining data distribution and quality.

In [6]:
# Basic dataset information
print("=== DATASET OVERVIEW ===")
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

"""
FINE-TUNING RESULTS SUMMARY


# Fine-tuning is now handled in fine-tuning.py
# To run fine-tuning, import and call run_fine_tuning from fine-tuning.py
# Example:
# from fine_tuning import run_fine_tuning
# fine_tuned_results = run_fine_tuning(transformer_data, models_to_finetune, device)
# If not running fine-tuning, ensure fine_tuned_results is an empty dict:
fine_tuned_results = {}  # Guard for evaluation cells
        models_to_finetune = {
            'DistilBERT': './offline_models/models--distilbert-base-uncased',
            'RoBERTa': './offline_models/models--roberta-base'
        }
        print(f"\n✅ Using locally cached models from: ./offline_models/")

        print(f"\n📋 Models selected for fine-tuning:")
        for model_name, model_path in models_to_finetune.items():
            print(f"   • {model_name}: {model_path}")

        print(f"\n✅ Setup complete! Ready for fine-tuning.")
        print(f"  {col}: {sample_val}...")
        
        I CANNOT ASUME FINE-TUNING WOULD RUN SUCCESSFULLY"""

=== DATASET OVERVIEW ===
Dataset shape: (55000, 28)
Columns: ['reviews.text', 'reviews.rating', 'data_source', 'id', 'dateAdded', 'dateUpdated', 'name', 'asins', 'brand', 'categories', 'primaryCategories', 'imageURLs', 'keys', 'manufacturer', 'manufacturerNumber', 'reviews.date', 'reviews.dateSeen', 'reviews.didPurchase', 'reviews.doRecommend', 'reviews.id', 'reviews.numHelpful', 'reviews.sourceURLs', 'reviews.title', 'reviews.username', 'sourceURLs', 'reviews.dateAdded', 'reviews.userCity', 'reviews.userProvince']


'\nFINE-TUNING RESULTS SUMMARY\n\n\n# Fine-tuning is now handled in fine-tuning.py\n# To run fine-tuning, import and call run_fine_tuning from fine-tuning.py\n# Example:\n# from fine_tuning import run_fine_tuning\n# fine_tuned_results = run_fine_tuning(transformer_data, models_to_finetune, device)\n# If not running fine-tuning, ensure fine_tuned_results is an empty dict:\nfine_tuned_results = {}  # Guard for evaluation cells\n        models_to_finetune = {\n            \'DistilBERT\': \'./offline_models/models--distilbert-base-uncased\',\n            \'RoBERTa\': \'./offline_models/models--roberta-base\'\n        }\n        print(f"\n✅ Using locally cached models from: ./offline_models/")\n\n        print(f"\n📋 Models selected for fine-tuning:")\n        for model_name, model_path in models_to_finetune.items():\n            print(f"   • {model_name}: {model_path}")\n\n        print(f"\n✅ Setup complete! Ready for fine-tuning.")\n        print(f"  {col}: {sample_val}...")\n        \n   

## STEP 4: Target Variable Creation

Transforming ratings into sentiment labels according to the specified logic:
- Scores 1, 2, 3 → "Negative"
- Score 4 → "Neutral" 
- Score 5 → "Positive"

In [7]:
# Create sentiment labels based on star ratings
def create_sentiment_labels(rating):
    """
    Transform numerical ratings to sentiment labels
    1, 2, 3 -> Negative
    4 -> Neutral
    5 -> Positive
    """
    if rating in [1, 2, 3]:
        return 'Negative'
    elif rating == 4:
        return 'Neutral'
    elif rating == 5:
        return 'Positive'
    else:
        return 'Unknown'  # For any unexpected values

# Apply the transformation
rating_column = 'reviews.rating'

if rating_column in df.columns:
    df['sentiment'] = df[rating_column].apply(create_sentiment_labels)
    
    print("=== SENTIMENT TRANSFORMATION RESULTS ===")
    print(f"Using rating column: '{rating_column}'")
    sentiment_counts = df['sentiment'].value_counts()
    print("Sentiment distribution:")
    print(sentiment_counts)
    
    # Calculate percentages
    sentiment_percentages = (sentiment_counts / len(df) * 100).round(2)
    print("\nSentiment percentages:")
    for sentiment, percentage in sentiment_percentages.items():
        print(f"{sentiment}: {percentage}%")
    
    # Visualize the new sentiment distribution
    try:
        fig = px.pie(values=sentiment_counts.values, names=sentiment_counts.index, 
                     title='Sentiment Distribution After Transformation')
        fig.show()
    except Exception as plot_error:
        print(f"Plotly visualization error: {plot_error}")
        print("Using matplotlib as fallback:")
        plt.figure(figsize=(8, 8))
        plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%')
        plt.title('Sentiment Distribution After Transformation')
        plt.show()
    
    # Show the mapping visually
    mapping_df = df.groupby([rating_column, 'sentiment']).size().reset_index(name='count')
    print(f"\n=== MAPPING VERIFICATION ===")
    display(mapping_df)
    
else:
    print(f"Rating column '{rating_column}' not found. Please check your dataset structure.")
    print("Available columns:", df.columns.tolist())
    
    # Try to find alternative rating columns as fallback
    possible_alternatives = ['rating', 'star_rating', 'score', 'stars', 'overall']
    found_alternative = None
    for alt_col in possible_alternatives:
        if alt_col in df.columns:
            found_alternative = alt_col
            break
    
    if found_alternative:
        print(f"Found alternative rating column: '{found_alternative}'. Using this instead.")
        df['sentiment'] = df[found_alternative].apply(create_sentiment_labels)
        rating_column = found_alternative  # Update for later use
    else:
        print("No suitable rating column found.")

=== SENTIMENT TRANSFORMATION RESULTS ===
Using rating column: 'reviews.rating'
Sentiment distribution:
sentiment
Positive    33252
Negative    14958
Neutral      6776
Unknown        14
Name: count, dtype: int64

Sentiment percentages:
Positive: 60.46%
Negative: 27.2%
Neutral: 12.32%
Unknown: 0.03%



=== MAPPING VERIFICATION ===


Unnamed: 0,reviews.rating,sentiment,count
0,1.0,Negative,673
1,2.0,Negative,12982
2,3.0,Negative,1303
3,4.0,Neutral,6776
4,5.0,Positive,33252


In [8]:
# Clean and prepare the final dataset
print("=== FINAL DATASET PREPARATION ===")

# Define the text column name
text_column = 'reviews.text'

# Remove rows with missing essential data
if text_column and 'sentiment' in df.columns:
    # Keep only rows with valid text and sentiment
    df_clean = df.dropna(subset=[text_column, 'sentiment']).copy()
    
    # Remove very short reviews (less than 10 characters)
    df_clean = df_clean[df_clean[text_column].str.len() >= 10].copy()
    
    # Remove 'Unknown' sentiment labels if any
    df_clean = df_clean[df_clean['sentiment'] != 'Unknown'].copy()
    
    print(f"Original dataset size: {len(df)}")
    print(f"Clean dataset size: {len(df_clean)}")
    print(f"Removed {len(df) - len(df_clean)} rows")
    
    # Update the main dataframe
    df = df_clean.reset_index(drop=True)
    
    print(f"\n=== FINAL DATASET SUMMARY ===")
    print(f"Total reviews: {len(df)}")
    print(f"Text column: '{text_column}'")
    print(f"Target column: 'sentiment'")
    print(f"Sentiment distribution:")
    sentiment_final = df['sentiment'].value_counts()
    display(sentiment_final)
    
else:
    print("Cannot proceed without valid text column and sentiment labels.")
    print("Available columns:", df.columns.tolist())


=== FINAL DATASET PREPARATION ===
Original dataset size: 55000
Clean dataset size: 54617
Removed 383 rows

=== FINAL DATASET SUMMARY ===
Total reviews: 54617
Text column: 'reviews.text'
Target column: 'sentiment'
Sentiment distribution:
Original dataset size: 55000
Clean dataset size: 54617
Removed 383 rows

=== FINAL DATASET SUMMARY ===
Total reviews: 54617
Text column: 'reviews.text'
Target column: 'sentiment'
Sentiment distribution:


sentiment
Positive    32953
Negative    14941
Neutral      6723
Name: count, dtype: int64

Imbalance handling strategy (3-class: ~60% Positive, 27% Negative, 12% Neutral)

What we changed:
- Class weights: Enabled class_weight='balanced' for Logistic Regression and Linear SVM; balanced_subsample for Random Forest; balanced for Extra Trees. This upweights under-represented classes during training without duplicating data.
- Mild over-sampling (train only): Upsampled Neutral to match Negative (did not fully balance to Positive) with RandomOverSampler. Test set remains untouched to keep evaluation honest.
- XGBoost weighting: Passed per-sample weights from class weights during fit (multi-class).
- Metrics: Added macro-F1 to monitor minority-class performance; still report weighted metrics and per-class precision/recall/F1.

Why this setup:
- Class weights shift the decision boundary toward minority classes with minimal risk of overfitting.
- Mild over-sampling gives Neutral more representation without over-amplifying noise (we avoid fully balancing to Positive).
- Macro-F1 counters accuracy inflation from the dominant Positive class and surfaces Neutral/Negative recall.
- This approach is robust for real-world imbalanced data, avoids overfitting, and keeps test evaluation honest.
- By not fully balancing to Positive, we avoid introducing too much synthetic data/noise for the majority class.
- Macro-F1 and per-class metrics help us track Neutral/Negative performance, not just overall accuracy.

Consequences:
- Models are less likely to ignore Neutral/Negative, but may still have lower recall for Neutral if the signal is weak.
- If Neutral recall is still low, consider threshold tuning, stronger regularization, or slightly more Neutral upsampling.
- For transformers, we use a class-weighted CrossEntropyLoss in fine-tuning (WeightedTrainer) instead of resampling.
- Always evaluate with macro-F1 and per-class recall, not just accuracy.
- This setup is easy to maintain and extend if class distributions shift in future data.

## STEP 5: Traditional NLP & ML Approach

Implementing traditional machine learning approach with text preprocessing, vectorization, and multiple ML algorithms.

### 5.1 Data Preprocessing for Traditional ML

Text cleaning, tokenization, lemmatization, and vectorization for traditional machine learning algorithms.

In [9]:
# Text preprocessing function
def preprocess_text(text):
    """
    Clean and preprocess text data for traditional ML
    """
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove special characters and digits, keep only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# Apply text preprocessing
print("=== TEXT PREPROCESSING ===")
print("Applying text cleaning and preprocessing...")

# Create a copy for processing
df_processed = df.copy()

# Apply preprocessing to text column
df_processed['cleaned_text'] = df_processed[text_column].apply(preprocess_text)

# Remove empty texts after cleaning
df_processed = df_processed[df_processed['cleaned_text'].str.len() > 0].reset_index(drop=True)

print(f"Dataset size after text cleaning: {len(df_processed)}")
print(f"Removed {len(df) - len(df_processed)} rows with empty text after cleaning")

# Show examples of cleaned text
print("\n=== PREPROCESSING EXAMPLES ===")
for i in range(3):
    original = str(df_processed.iloc[i][text_column])[:100] + "..."
    cleaned = df_processed.iloc[i]['cleaned_text'][:100] + "..."
    print(f"Original: {original}")
    print(f"Cleaned:  {cleaned}\n")

=== TEXT PREPROCESSING ===
Applying text cleaning and preprocessing...
Dataset size after text cleaning: 54616
Removed 1 rows with empty text after cleaning

=== PREPROCESSING EXAMPLES ===
Original: Dumb is as dumb does, in this thoroughly uninteresting, supposed black comedy. Essentially what star...
Cleaned:  dumb is as dumb does in this thoroughly uninteresting supposed black comedy essentially what starts ...

Original: I dug out from my garage some old musicals and this is another one of my favorites. It was written b...
Cleaned:  i dug out from my garage some old musicals and this is another one of my favorites it was written by...

Original: After watching this movie I was honestly disappointed - not because of the actors, story or directin...
Cleaned:  after watching this movie i was honestly disappointed not because of the actors story or directing i...

Dataset size after text cleaning: 54616
Removed 1 rows with empty text after cleaning

=== PREPROCESSING EXAMPLES ===
Origin

In [10]:
# Download additional NLTK resources needed for advanced preprocessing
try:
    nltk.download('punkt_tab', quiet=True)
    print("Downloaded punkt_tab tokenizer")
except:
    print("punkt_tab download failed, trying alternative...")

# Advanced text preprocessing with NLTK
def advanced_preprocess_text(text):
    """
    Advanced preprocessing with tokenization, stopword removal, and lemmatization
    
    This function performs three key NLP preprocessing steps:
    
    1. TOKENIZATION: Breaking text into individual words/tokens
       - Purpose: Converts sentences into lists of words for analysis
       - Example: "I love this product!" → ["I", "love", "this", "product"]
       - Why needed: ML algorithms work with individual features, not sentences
    
    2. STOPWORD REMOVAL: Filtering out common, non-informative words
       - Purpose: Remove words like "the", "and", "is" that don't carry sentiment
       - Example: ["I", "love", "this", "product"] → ["love", "product"]
       - Why needed: Focuses on meaningful words, reduces noise and dimensionality
    
    3. LEMMATIZATION: Converting words to their root/base form
       - Purpose: Groups related word forms together (running→run, better→good)
       - Example: ["running", "runs", "ran"] → ["run", "run", "run"]
       - Why needed: Reduces vocabulary size, improves feature consistency
    
    4. VECTORIZATION (happens later): Converting text to numerical vectors
       - Purpose: Transform words into numbers that ML algorithms can process
       - Methods: Count (word frequency) or TF-IDF (importance weighting)
       - Why needed: ML models require numerical input, not text
    """
    # Basic cleaning
    text = preprocess_text(text)
    
    try:
        # TOKENIZATION: Split text into individual words/tokens
        # Example: "great product quality" → ["great", "product", "quality"]
        tokens = word_tokenize(text)
    except LookupError:
        # Fallback to simple split if NLTK tokenizer fails
        tokens = text.split()
    
    # STOPWORD REMOVAL: Filter out common, non-informative words
    # Removes: "the", "and", "is", "in", "to", "of", etc.
    # Keeps: meaningful words that carry sentiment or content information
    try:
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    except LookupError:
        # If stopwords not available, just filter by length
        tokens = [word for word in tokens if len(word) > 2]
    
    # LEMMATIZATION: Convert words to their base/root form
    # Examples: "running" → "run", "better" → "good", "cats" → "cat"
    # This groups similar word forms together for better feature consistency
    try:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    except LookupError:
        # If lemmatizer not available, just lowercase
        tokens = [word.lower() for word in tokens]
    
    return ' '.join(tokens)

print("=== ADVANCED TEXT PREPROCESSING ===")
print("Applying tokenization, stopword removal, and lemmatization...")

# Fix text column identification - use the correct column name
text_column = 'reviews.text'
print(f"Using text column: {text_column}")

# Apply advanced preprocessing
# This will transform: "I really love this amazing product!" 
# Into: "really love amazing product" (tokenized, stopwords removed, lemmatized)
df_processed['processed_text'] = df_processed['cleaned_text'].apply(advanced_preprocess_text)

# Remove empty texts after advanced processing
df_processed = df_processed[df_processed['processed_text'].str.len() > 0].reset_index(drop=True)

print(f"Dataset size after advanced preprocessing: {len(df_processed)}")

# Show examples
print("\n=== ADVANCED PREPROCESSING EXAMPLES ===")
for i in range(3):
    cleaned = df_processed.iloc[i]['cleaned_text'][:80] + "..."
    processed = df_processed.iloc[i]['processed_text'][:80] + "..."
    print(f"Cleaned:   {cleaned}")
    print(f"Processed: {processed}\n")

# Final dataset statistics
print("=== FINAL PREPROCESSING STATISTICS ===")
avg_length_original = df_processed[text_column].astype(str).str.len().mean()
avg_length_processed = df_processed['processed_text'].str.len().mean()
avg_words_processed = df_processed['processed_text'].str.split().str.len().mean()

print(f"Average original text length: {avg_length_original:.1f} characters")
print(f"Average processed text length: {avg_length_processed:.1f} characters")
print(f"Average words after processing: {avg_words_processed:.1f} words")

Downloaded punkt_tab tokenizer
=== ADVANCED TEXT PREPROCESSING ===
Applying tokenization, stopword removal, and lemmatization...
Using text column: reviews.text
Dataset size after advanced preprocessing: 54615

=== ADVANCED PREPROCESSING EXAMPLES ===
Cleaned:   dumb is as dumb does in this thoroughly uninteresting supposed black comedy esse...
Processed: dumb dumb thoroughly uninteresting supposed black comedy essentially start chris...

Cleaned:   i dug out from my garage some old musicals and this is another one of my favorit...
Processed: dug garage old musical another one favorite written jay alan lerner directed vin...

Cleaned:   after watching this movie i was honestly disappointed not because of the actors ...
Processed: watching movie honestly disappointed actor story directing disappointed film adv...

=== FINAL PREPROCESSING STATISTICS ===
Average original text length: 688.7 characters
Average processed text length: 428.3 characters
Average words after processing: 62.2 words

### 5.2 Vectorization

Converting text data into numerical vectors using CountVectorizer and TF-IDF Vectorizer.

In [11]:
"""
VECTORIZATION AND DATA PREPARATION FOR MACHINE LEARNING

This cell converts preprocessed text data into numerical vectors that machine learning algorithms can understand.
It prepares the data in two different vectorization formats for model comparison.

KEY PURPOSES:
1. Data Splitting: Divide dataset into training and testing sets
2. Count Vectorization: Convert text to word frequency vectors
3. TF-IDF Vectorization: Convert text to importance-weighted vectors
4. Feature Engineering: Create numerical representations of text data

WHY THIS STEP IS ESSENTIAL:
- Machine learning algorithms only work with numbers, not text
- Vectorization transforms words into mathematical features
- Different vectorization methods capture different aspects of text meaning
- Proper train/test split ensures unbiased model evaluation
"""

# Prepare data for vectorization
print("=== VECTORIZATION SETUP ===")

# STEP 1: Prepare features (X) and target variable (y)
# Features: The processed text that will be converted to numbers
# Target: The sentiment labels we want to predict
X = df_processed['processed_text']  # Input features (text)
y = df_processed['sentiment']       # Target variable (Negative/Neutral/Positive)

print(f"Feature shape: {X.shape}")
print(f"Target distribution:")
print(y.value_counts())

# STEP 2: Train-Test Split
# Purpose: Separate data for training models and testing their performance
# - 80% for training (model learns from this)
# - 20% for testing (unbiased evaluation)
# - stratify=y ensures balanced sentiment distribution in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Train set distribution:")
print(y_train.value_counts())

# STEP 3: COUNT VECTORIZATION
# Purpose: Convert text to numerical vectors based on word frequency
# How it works: Each word becomes a feature, value = how many times it appears
# Example: "love product" → [0, 1, 0, 1, 0] (if vocabulary is [bad, love, hate, product, terrible])
print("\n=== COUNT VECTORIZATION ===")
count_vectorizer = CountVectorizer(
    max_features=5000,  # Limit vocabulary to top 5000 most frequent words
    ngram_range=(1, 2),  # Include single words (unigrams) and word pairs (bigrams)
    min_df=2,  # Ignore words that appear in less than 2 documents (remove rare words)
    max_df=0.8  # Ignore words that appear in more than 80% of documents (remove too common words)
)

# Transform training data (fit learns vocabulary, transform converts to numbers)
X_train_count = count_vectorizer.fit_transform(X_train)
# Transform test data (only transform, don't learn new vocabulary)
X_test_count = count_vectorizer.transform(X_test)

print(f"Count vectorizer vocabulary size: {len(count_vectorizer.vocabulary_)}")
print(f"Count matrix shape - Train: {X_train_count.shape}, Test: {X_test_count.shape}")

# STEP 4: TF-IDF VECTORIZATION
# Purpose: Convert text to numerical vectors based on word importance
# How it works: TF-IDF = Term Frequency × Inverse Document Frequency
# - TF: How often a word appears in a document
# - IDF: How rare a word is across all documents
# - Rare words in specific documents get higher weights
# Example: "love" in many reviews = lower weight, "exceptional" in few reviews = higher weight
print("\n=== TF-IDF VECTORIZATION ===")
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,     # Same parameters as CountVectorizer for fair comparison
    ngram_range=(1, 2),    # Include unigrams and bigrams
    min_df=2,              # Ignore rare terms
    max_df=0.8,            # Ignore too common terms
    sublinear_tf=True      # Apply sublinear tf scaling (dampens effect of very high frequencies)
)

# Transform training and test data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF vectorizer vocabulary size: {len(tfidf_vectorizer.vocabulary_)}")
print(f"TF-IDF matrix shape - Train: {X_train_tfidf.shape}, Test: {X_test_tfidf.shape}")

# STEP 5: Feature Analysis
# Show the vocabulary that was learned (most important words/phrases for analysis)
print("\n=== TOP FEATURES ===")
feature_names = count_vectorizer.get_feature_names_out()
print("Top 20 features by CountVectorizer:")
print(feature_names[:20])

"""
VECTORIZATION COMPARISON:
- Count Vectorizer: Simple word frequency counting
  * Pros: Simple, fast, good baseline
  * Cons: Doesn't consider word importance across documents
  
- TF-IDF Vectorizer: Importance-weighted word frequency
  * Pros: Considers word rarity, better for distinguishing documents
  * Cons: Slightly more complex, can be sensitive to document collection

NEXT STEPS:
Both vectorized datasets (X_train_count, X_train_tfidf) will be used to train
different machine learning models to compare which vectorization method works
better for sentiment analysis on this specific dataset.
"""

=== VECTORIZATION SETUP ===
Feature shape: (54615,)
Target distribution:
sentiment
Positive    32951
Negative    14941
Neutral      6723
Name: count, dtype: int64

Train set size: 43692
Test set size: 10923
Train set distribution:
sentiment
Positive    26361
Negative    11953
Neutral      5378
Name: count, dtype: int64

=== COUNT VECTORIZATION ===

Train set size: 43692
Test set size: 10923
Train set distribution:
sentiment
Positive    26361
Negative    11953
Neutral      5378
Name: count, dtype: int64

=== COUNT VECTORIZATION ===
Count vectorizer vocabulary size: 5000
Count matrix shape - Train: (43692, 5000), Test: (10923, 5000)

=== TF-IDF VECTORIZATION ===
Count vectorizer vocabulary size: 5000
Count matrix shape - Train: (43692, 5000), Test: (10923, 5000)

=== TF-IDF VECTORIZATION ===
TF-IDF vectorizer vocabulary size: 5000
TF-IDF matrix shape - Train: (43692, 5000), Test: (10923, 5000)

=== TOP FEATURES ===
Top 20 features by CountVectorizer:
['aaa' 'abandoned' 'abc' 'ability' 'a

"\nVECTORIZATION COMPARISON:\n- Count Vectorizer: Simple word frequency counting\n  * Pros: Simple, fast, good baseline\n  * Cons: Doesn't consider word importance across documents\n  \n- TF-IDF Vectorizer: Importance-weighted word frequency\n  * Pros: Considers word rarity, better for distinguishing documents\n  * Cons: Slightly more complex, can be sensitive to document collection\n\nNEXT STEPS:\nBoth vectorized datasets (X_train_count, X_train_tfidf) will be used to train\ndifferent machine learning models to compare which vectorization method works\nbetter for sentiment analysis on this specific dataset.\n"

### 5.3 Traditional ML Model Training

Training multiple traditional machine learning algorithms and comparing their performance.

In [33]:
"""
TRADITIONAL ML MODELS TRAINING AND EVALUATION
This cell trains multiple machine learning algorithms for sentiment classification, comparing their performance
on the vectorized text data. We use both basic and advanced ensemble methods to find the best approach.
ALGORITHM SELECTION RATIONALE:
- Covers different ML paradigms: probabilistic, linear, kernel-based, and ensemble methods
- Includes both traditional and modern high-performance algorithms
- Allows comprehensive comparison to identify optimal approach for sentiment analysis
"""

# --- Imbalance handling: class weights and mild over-sampling ---
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
try:
    from imblearn.over_sampling import RandomOverSampler
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'imbalanced-learn'])
    from imblearn.over_sampling import RandomOverSampler

# Neutral class is often under-predicted, so we address this with class weights and mild over-sampling
# Compute class weights from y_train
class_names = np.array(['Negative', 'Neutral', 'Positive'])
cw = compute_class_weight(class_weight='balanced', classes=class_names, y=y_train.values)
class_weight_dict = {cls: w for cls, w in zip(class_names, cw)}
# Penalize missing Neutral predictions by increasing its weight
class_weight_dict['Neutral'] *= 1.5
print('Class weights:', class_weight_dict)

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, multi_class='multinomial', solver='lbfgs', class_weight='balanced'),
    'SVM': LinearSVC(random_state=42, max_iter=10000, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=400, class_weight='balanced_subsample', max_depth=None, min_samples_leaf=1, min_samples_split=2),
    'XGBoost': XGBClassifier(random_state=42, n_estimators=200, max_depth=8, learning_rate=0.2, eval_metric='mlogloss', verbosity=0),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100, max_depth=6, learning_rate=0.1),
    'Extra Trees': ExtraTreesClassifier(random_state=42, n_estimators=100, max_depth=10, class_weight='balanced')
}

# Mild over-sampling: upsample Neutral to match Negative (keep Positive as is)
counts = Counter(y_train)
sampling_strategy = {
    'Negative': counts['Negative'],
    'Neutral': counts['Negative'],
    'Positive': counts['Positive']
}
print('Before ROS:', counts)
ros = RandomOverSampler(random_state=42, sampling_strategy=sampling_strategy)
X_train_count_bal, y_train_count_bal = ros.fit_resample(X_train_count, y_train)
X_train_tfidf_bal, y_train_tfidf_bal = ros.fit_resample(X_train_tfidf, y_train)
print('After ROS (Count):', Counter(y_train_count_bal))
print('After ROS (TF-IDF):', Counter(y_train_tfidf_bal))

# Precompute sample_weight for XGBoost only (others use class_weight internally)
xgb_sw_count = np.array([class_weight_dict[y] for y in y_train_count_bal])
xgb_sw_tfidf = np.array([class_weight_dict[y] for y in y_train_tfidf_bal])

results = {'Count': {}, 'TF-IDF': {}}

def evaluate_model(model, X_train, X_test, y_train, y_test, model_name, vectorizer_name, sample_weight_train=None):
    print(f"\n--- Training {model_name} with {vectorizer_name} ---")
    start_time = time.time()
    if model_name == 'XGBoost' and sample_weight_train is not None:
        # Encode y for XGB
        le_local = LabelEncoder().fit(['Negative','Neutral','Positive'])
        y_train_enc = le_local.transform(y_train)
        model.fit(X_train, y_train_enc, sample_weight=sample_weight_train)
        y_pred = le_local.inverse_transform(model.predict(X_test))
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
    training_time = time.time() - start_time
    accuracy = accuracy_score(y_test, y_pred)
    precision_w = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall_w = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1_w = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average='macro', zero_division=0)
    precision_per_class = precision_score(y_test, y_pred, average=None, labels=['Negative','Neutral','Positive'], zero_division=0)
    recall_per_class = recall_score(y_test, y_pred, average=None, labels=['Negative','Neutral','Positive'], zero_division=0)
    f1_per_class = f1_score(y_test, y_pred, average=None, labels=['Negative','Neutral','Positive'], zero_division=0)
    results[vectorizer_name][model_name] = {
        'accuracy': accuracy,
        'precision': precision_w,
        'recall': recall_w,
        'f1': f1_w,
        'f1_macro': f1_macro,
        'training_time': training_time,
        'precision_per_class': precision_per_class,
        'recall_per_class': recall_per_class,
        'f1_per_class': f1_per_class,
        'y_pred': y_pred
    }
    print(f"Training Time: {training_time:.2f}s  Acc: {accuracy:.4f}  F1(w): {f1_w:.4f}  F1(macro): {f1_macro:.4f}")
    return model

# Train models with Count Vectorizer (balanced train)
print("\n🔢 TRAINING WITH COUNT VECTORIZATION (balanced train)")
trained_models_count = {}
for model_name, model in models.items():
    sw = xgb_sw_count if model_name == 'XGBoost' else None
    trained_models_count[model_name] = evaluate_model(
        model, X_train_count_bal, X_test_count, y_train_count_bal, y_test, model_name, 'Count', sample_weight_train=sw
)

# Train models with TF-IDF Vectorizer (balanced train)
print("\n📊 TRAINING WITH TF-IDF VECTORIZATION (balanced train)")
trained_models_tfidf = {}
for model_name, model in models.items():
    model_copy = model.__class__(**model.get_params())
    sw = xgb_sw_tfidf if model_name == 'XGBoost' else None
    trained_models_tfidf[model_name] = evaluate_model(
        model_copy, X_train_tfidf_bal, X_test_tfidf, y_train_tfidf_bal, y_test, model_name, 'TF-IDF', sample_weight_train=sw
)

print("\n✅ All models trained successfully!")
print("Results stored in 'results' dictionary for further analysis.")

# --- Sanity check: print class distributions and macro-F1 for top model ---
print('Balanced train class distribution (TF-IDF):', Counter(y_train_tfidf_bal))
print('Test class distribution:', Counter(y_test))
# Find top model by macro-F1
top_model = max(results['TF-IDF'], key=lambda m: results['TF-IDF'][m]['f1_macro'])
print(f'Top model by macro-F1: {top_model}')
print(f"Macro-F1: {results['TF-IDF'][top_model]['f1_macro']:.4f}")

Class weights: {np.str_('Negative'): np.float64(1.2184388856354054), np.str_('Neutral'): np.float64(4.062104871699517), np.str_('Positive'): np.float64(0.5524828344903456)}
Before ROS: Counter({'Positive': 26361, 'Negative': 11953, 'Neutral': 5378})
After ROS (Count): Counter({'Positive': 26361, 'Negative': 11953, 'Neutral': 11953})
After ROS (TF-IDF): Counter({'Positive': 26361, 'Negative': 11953, 'Neutral': 11953})

🔢 TRAINING WITH COUNT VECTORIZATION (balanced train)

--- Training Naive Bayes with Count ---
Training Time: 0.06s  Acc: 0.5130  F1(w): 0.5299  F1(macro): 0.5310

--- Training Logistic Regression with Count ---
Training Time: 2.35s  Acc: 0.6955  F1(w): 0.7175  F1(macro): 0.6582

--- Training SVM with Count ---
Training Time: 12.84s  Acc: 0.7292  F1(w): 0.7427  F1(macro): 0.6716

--- Training Random Forest with Count ---
Training Time: 212.02s  Acc: 0.7985  F1(w): 0.7870  F1(macro): 0.7080

--- Training XGBoost with Count ---
Training Time: 4.01s  Acc: 0.5791  F1(w): 0.607

"""## TF-IDF vs Count Vectorization: Understanding the Difference

### What is TF-IDF?

**TF-IDF (Term Frequency-Inverse Document Frequency)** is a numerical statistic that reflects how important a word is to a document within a collection of documents (corpus).

### Mathematical Formula

```
TF-IDF(word, document) = TF(word, document) × IDF(word, corpus)
```

**Where:**

1. **TF (Term Frequency)** = (Number of times word appears in document) / (Total words in document)
   - Measures how frequently a word appears in a specific document
   - Higher TF = word appears more often in this document

2. **IDF (Inverse Document Frequency)** = log(Total documents / Documents containing the word)
   - Measures how rare or common a word is across all documents
   - Higher IDF = word is rare across the corpus (more distinctive)
   - Lower IDF = word is common across many documents (less distinctive)

### Practical Example

Consider the word "love" in a customer review:
- **Document**: "I love this product, love the quality, amazing!"
- **TF**: "love" appears 2 times out of 8 words = 2/8 = 0.25
- **IDF**: If "love" appears in 500 out of 1000 total reviews = log(1000/500) = 0.301
- **TF-IDF**: 0.25 × 0.301 = 0.075

Compare with word "exceptional":
- **TF**: "exceptional" appears 1 time out of 8 words = 1/8 = 0.125
- **IDF**: If "exceptional" appears in only 10 out of 1000 reviews = log(1000/10) = 2.0
- **TF-IDF**: 0.125 × 2.0 = 0.25 *(Higher score despite lower frequency!)*

### TF-IDF vs Count Vectorization

| **Count Vectorizer** | **TF-IDF Vectorizer** |
|---------------------|----------------------|
| Simple word frequency counting | Importance-weighted word frequency |
| Each word's value = how many times it appears | Considers both frequency AND rarity across documents |
| Example: "love" appears 3 times → value = 3 | Words common across all documents get lower weights |
| Problem: Common words like "the", "and" get high scores but carry little meaning | Words unique to specific documents get higher weights |
| | Better at identifying distinctive/meaningful words for classification |
| | Automatically reduces impact of stop words without explicitly removing them |

### Why Use TF-IDF for Sentiment Analysis?

1. **Noise Reduction**: Automatically downweights common words that don't contribute to sentiment
2. **Feature Importance**: Emphasizes words that are distinctive to specific sentiment categories
3. **Better Classification**: Often leads to improved model performance for text classification tasks
4. **Industry Standard**: Widely used baseline approach in NLP and information retrieval"""

### 5.4 Traditional ML Results Analysis

Analyzing and visualizing the performance of traditional ML models.

In [34]:
# Results analysis and comparison
print("=== TRADITIONAL ML RESULTS SUMMARY ===")

# Create results comparison DataFrame
comparison_data = []
for vectorizer in ['Count', 'TF-IDF']:
    for model_name in models.keys():
        result = results[vectorizer][model_name]
        comparison_data.append({
            'Vectorizer': vectorizer,
            'Model': model_name,
            'Accuracy': result['accuracy'],
            'Precision': result['precision'],
            'Recall': result['recall'],
            'F1-Score': result['f1']
        })

results_df = pd.DataFrame(comparison_data)

print("Performance Comparison:")
display(results_df.round(4))

# Find best performing model
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print(f"\nBest performing model: {best_model['Model']} with {best_model['Vectorizer']} vectorizer")
print(f"Best accuracy: {best_model['Accuracy']:.4f}")

# Detailed results for best model
best_vectorizer = best_model['Vectorizer']
best_model_name = best_model['Model']
best_result = results[best_vectorizer][best_model_name]

print(f"\n=== DETAILED RESULTS FOR BEST MODEL ===")
print(f"Model: {best_model_name} with {best_vectorizer} vectorizer")
print(f"Overall Accuracy: {best_result['accuracy']:.4f}")
print(f"Overall Precision: {best_result['precision']:.4f}")
print(f"Overall Recall: {best_result['recall']:.4f}")
print(f"Overall F1-Score: {best_result['f1']:.4f}")

print(f"\nPer-class metrics:")
classes = ['Negative', 'Neutral', 'Positive']
for i, class_name in enumerate(classes):
    print(f"{class_name}:")
    print(f"  Precision: {best_result['precision_per_class'][i]:.4f}")
    print(f"  Recall: {best_result['recall_per_class'][i]:.4f}")
    print(f"  F1-Score: {best_result['f1_per_class'][i]:.4f}")

# Confusion Matrix for best model
print(f"\n=== CONFUSION MATRIX FOR BEST MODEL ===")
best_y_pred = best_result['y_pred']
cm = confusion_matrix(y_test, best_y_pred, labels=classes)
cm_df = pd.DataFrame(cm, index=classes, columns=classes)
print("Confusion Matrix:")
display(cm_df)

# Visualize results
try:
    # Performance comparison plot
    fig = px.bar(results_df, x='Model', y='Accuracy', color='Vectorizer',
                 title='Traditional ML Models Performance Comparison',
                 barmode='group')
    fig.show()
except Exception as e:
    print(f"Plotly error: {e}")
    # Matplotlib fallback
    plt.figure(figsize=(12, 6))
    models_list = results_df['Model'].unique()
    x = np.arange(len(models_list))
    width = 0.35
    
    count_accuracies = [results_df[(results_df['Model'] == model) & (results_df['Vectorizer'] == 'Count')]['Accuracy'].iloc[0] for model in models_list]
    tfidf_accuracies = [results_df[(results_df['Model'] == model) & (results_df['Vectorizer'] == 'TF-IDF')]['Accuracy'].iloc[0] for model in models_list]
    
    plt.bar(x - width/2, count_accuracies, width, label='Count Vectorizer')
    plt.bar(x + width/2, tfidf_accuracies, width, label='TF-IDF Vectorizer')
    
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title('Traditional ML Models Performance Comparison')
    plt.xticks(x, models_list, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

=== TRADITIONAL ML RESULTS SUMMARY ===
Performance Comparison:


Unnamed: 0,Vectorizer,Model,Accuracy,Precision,Recall,F1-Score
0,Count,Naive Bayes,0.513,0.759,0.513,0.5299
1,Count,Logistic Regression,0.6955,0.7723,0.6955,0.7175
2,Count,SVM,0.7292,0.7665,0.7292,0.7427
3,Count,Random Forest,0.7985,0.7949,0.7985,0.787
4,Count,XGBoost,0.5791,0.778,0.5791,0.6071
5,Count,Gradient Boosting,0.7556,0.7521,0.7556,0.7477
6,Count,Extra Trees,0.3722,0.7131,0.3722,0.2803
7,TF-IDF,Naive Bayes,0.5923,0.7294,0.5923,0.6139
8,TF-IDF,Logistic Regression,0.7026,0.7886,0.7026,0.7248
9,TF-IDF,SVM,0.7398,0.781,0.7398,0.7543



Best performing model: Random Forest with TF-IDF vectorizer
Best accuracy: 0.8032

=== DETAILED RESULTS FOR BEST MODEL ===
Model: Random Forest with TF-IDF vectorizer
Overall Accuracy: 0.8032
Overall Precision: 0.7980
Overall Recall: 0.8032
Overall F1-Score: 0.7931

Per-class metrics:
Negative:
  Precision: 0.8377
  Recall: 0.7600
  F1-Score: 0.7970
Neutral:
  Precision: 0.6876
  Recall: 0.3911
  F1-Score: 0.4986
Positive:
  Precision: 0.8025
  Recall: 0.9068
  F1-Score: 0.8515

=== CONFUSION MATRIX FOR BEST MODEL ===
Confusion Matrix:


Unnamed: 0,Negative,Neutral,Positive
Negative,2271,59,658
Neutral,6,526,813
Positive,434,180,5976


In [35]:
# --- Grid Search for XGBoost hyperparameters ---
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import numpy as np
 
le_local = LabelEncoder().fit(['Negative','Neutral','Positive'])


# Reuse LabelEncoder from previous cells (assumed variable name: le_local)
y_train_tfidf_bal_encoded = le_local.transform(y_train_tfidf_bal)
y_test_encoded = le_local.transform(y_test)
 
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [4, 6, 8],
    'learning_rate': [0.05, 0.1, 0.2]
}
 
xgb_grid = GridSearchCV(
    XGBClassifier(random_state=42, eval_metric='mlogloss', verbosity=0),
    param_grid=xgb_param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1
 )
 
xgb_grid.fit(X_train_tfidf_bal, y_train_tfidf_bal_encoded, sample_weight=xgb_sw_tfidf)
 
print('Best XGBoost parameters:', xgb_grid.best_params_)
print('Best macro-F1:', xgb_grid.best_score_)
 
# Predict and print confusion matrix for best XGBoost model
best_xgb = xgb_grid.best_estimator_
y_pred_xgb_encoded = best_xgb.predict(X_test_tfidf)
y_pred_xgb = le_local.inverse_transform(y_pred_xgb_encoded)
from sklearn.metrics import confusion_matrix
classes = ['Negative', 'Neutral', 'Positive']
cm_xgb = confusion_matrix(y_test, y_pred_xgb, labels=classes)
import pandas as pd
cm_xgb_df = pd.DataFrame(cm_xgb, index=classes, columns=classes)
print('Confusion Matrix for best XGBoost (TF-IDF):')
display(cm_xgb_df)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

KeyboardInterrupt: 

In [36]:
# --- Grid Search for RandomForest hyperparameters (TF-IDF) ---
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd
 
rf_param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [None, 20, 40],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced_subsample']
}
 
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=rf_param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1
 )
 
rf_grid.fit(X_train_tfidf_bal, y_train_tfidf_bal)
 
print('Best RandomForest parameters:', rf_grid.best_params_)
print('Best macro-F1:', rf_grid.best_score_)
 
# Predict and print confusion matrix for best RandomForest model
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test_tfidf)
classes = ['Negative', 'Neutral', 'Positive']
cm_rf = confusion_matrix(y_test, y_pred_rf, labels=classes)
cm_rf_df = pd.DataFrame(cm_rf, index=classes, columns=classes)
print('Confusion Matrix for best RandomForest (TF-IDF):')
display(cm_rf_df)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Best RandomForest parameters: {'class_weight': 'balanced_subsample', 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 400}
Best macro-F1: 0.843941266537238
Confusion Matrix for best RandomForest (TF-IDF):


Unnamed: 0,Negative,Neutral,Positive
Negative,2271,59,658
Neutral,6,526,813
Positive,434,180,5976


## STEP 6: Transformer Approach (HuggingFace)

Implementing modern transformer-based models for sentiment classification using HuggingFace transformers.

### 6.1 Pre-trained Model Selection and Baseline

Testing pre-trained transformer models without fine-tuning to establish baseline performance.

In [37]:
"""
1. DATA PREPROCESSING - HUGGINGFACE TRANSFORMERS

This cell implements the complete HuggingFace transformer preprocessing pipeline as required:
1.1 Data Cleaning and Tokenization - Using HuggingFace tokenizers
1.2 Data Encoding - Converting text to numerical IDs

WHAT ARE TRANSFORMER MODELS?
Transformer models are a revolutionary deep learning architecture introduced in 2017 that use 
self-attention mechanisms to process sequential data like text. Unlike traditional ML approaches 
that work with hand-crafted features (like TF-IDF vectors), transformers learn complex patterns 
and contextual relationships directly from raw text.

Attention is a mechanism that helps the model determine which parts of the input sequence are most relevant when processing a particular element.

KEY TRANSFORMER CHARACTERISTICS:
• Self-Attention: Can focus on different parts of the input text simultaneously
• Contextual Understanding: Words get different representations based on surrounding context
• Pre-training: Trained on massive text corpora to learn general language patterns
• Transfer Learning: Can be fine-tuned for specific tasks like sentiment analysis
• Bidirectional: Models like BERT read text in both directions for better context

TRANSFORMER vs TRADITIONAL ML COMPARISON:
Traditional ML (Previous Cells):     | Transformer Models (This Cell):
• Manual feature engineering         | • Automatic feature learning
• Fixed word representations         | • Dynamic contextual embeddings  
• Bag-of-words assumptions          | • Sequential and positional awareness
• Fast training/inference           | • Slower but more accurate
• Interpretable features            | • Complex but powerful representations

WHY THIS CELL COMES AFTER TRADITIONAL ML:
1. PROGRESSIVE COMPLEXITY: We start with simpler, interpretable methods before advanced techniques
2. BASELINE ESTABLISHMENT: Traditional ML provides performance benchmarks to beat
3. COMPUTATIONAL EFFICIENCY: Traditional methods are faster, good for initial exploration
4. EDUCATIONAL VALUE: Understanding both approaches shows evolution of NLP techniques
5. PRACTICAL COMPARISON: Real projects need to evaluate speed vs accuracy trade-offs
"""

print("=== 1. DATA PREPROCESSING - HUGGINGFACE TRANSFORMERS ===")

# Use the unified transformer_models from the top cell
print("🎯 TRANSFORMER MODELS FOR EVALUATION:")
for name, model_id in transformer_models.items():
    print(f"   • {name}: {model_id}")

# Prepare sample data for transformer processing
sample_size = min(3000, len(df_processed))  # Manageable size for transformers
df_transformer_sample = df_processed.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"\n📊 USING {len(df_transformer_sample)} SAMPLES FOR TRANSFORMER PROCESSING")
print(f"   Train/Test Split: 80%/20%")
print(f"   Sentiment Distribution:")
print(df_transformer_sample['sentiment'].value_counts())

# 1.1 DATA CLEANING AND TOKENIZATION using HuggingFace Transformers
def huggingface_preprocessing(texts, labels, model_name, max_length=256):
    """
    Complete HuggingFace preprocessing pipeline
    
    1.1 Data Cleaning and Tokenization:
    - Clean text using HuggingFace tokenizer (handles special chars, punctuation)
    - Apply model-specific tokenization (WordPiece, BPE, etc.)
    - Add special tokens ([CLS], [SEP], [PAD])
    
    1.2 Data Encoding:
    - Convert tokens to numerical IDs using tokenizer vocabulary
    - Create attention masks for variable-length sequences
    - Handle padding and truncation
    """
    print(f"\n🔧 PREPROCESSING WITH {model_name.upper()}")
    
    # Load tokenizer for the specific model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"   Tokenizer: {tokenizer.__class__.__name__}")
    print(f"   Vocabulary size: {tokenizer.vocab_size:,}")
    
    # Convert sentiment labels to numerical format
    label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    numerical_labels = [label_mapping[label] for label in labels]
    
    # Split data before tokenization
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        texts, numerical_labels, test_size=0.2, random_state=42, stratify=numerical_labels
    )
    
    print(f"   Train samples: {len(X_train_text)}")
    print(f"   Test samples: {len(X_test_text)}")
    
    # 1.1 TOKENIZATION: Convert text to tokens with cleaning
    print(f"   🔄 Tokenizing and cleaning data...")
    
    # Show tokenization example BEFORE processing
    sample_text = X_train_text[0][:100] + "..." if len(X_train_text[0]) > 100 else X_train_text[0]
    sample_tokens = tokenizer.tokenize(sample_text)
    
    print(f"\n   📝 TOKENIZATION EXAMPLE:")
    print(f"      Original text: {sample_text}")
    print(f"      Tokens: {sample_tokens[:15]}...")
    print(f"      Special tokens: {tokenizer.special_tokens_map}")
    
    # 1.1 & 1.2 COMBINED: Tokenization + Encoding
    train_encodings = tokenizer(
        X_train_text,
        truncation=True,          # Clean: truncate long sequences
        padding=True,             # Clean: pad short sequences
        max_length=max_length,    # Limit sequence length
        return_tensors='pt',      # Return PyTorch tensors
        return_attention_mask=True, # Create attention masks
        add_special_tokens=True   # Add [CLS], [SEP] tokens
    )
    
    test_encodings = tokenizer(
        X_test_text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
        return_attention_mask=True,
        add_special_tokens=True
    )
    
    # 1.2 DATA ENCODING: Text → Numerical IDs (completed by tokenizer)
    print(f"   ✅ Text cleaned and tokenized using HuggingFace tokenizer")
    print(f"   ✅ Sequences encoded to numerical IDs from vocabulary")
    print(f"   📊 Input IDs shape: {train_encodings['input_ids'].shape}")
    print(f"   📊 Attention mask shape: {train_encodings['attention_mask'].shape}")
    
    # Show encoding example
    sample_ids = train_encodings['input_ids'][0][:20]
    decoded_sample = tokenizer.decode(sample_ids, skip_special_tokens=False)
    print(f"      Encoded IDs: {sample_ids.tolist()}")
    print(f"      Decoded back: {decoded_sample}")
    
    return {
        'tokenizer': tokenizer,
        'train_encodings': train_encodings,
        'test_encodings': test_encodings,
        'y_train': torch.tensor(y_train),
        'y_test': torch.tensor(y_test),
        'X_train_text': X_train_text,
        'X_test_text': X_test_text,
        'label_mapping': label_mapping
    }

# Preprocess data for all transformer models
transformer_data = {}
texts = df_transformer_sample[text_column].astype(str).tolist()
labels = df_transformer_sample['sentiment'].tolist()

print(f"\n🔄 PREPROCESSING DATA FOR ALL MODELS...")

for model_name, model_id in transformer_models.items():
    try:
        transformer_data[model_name] = huggingface_preprocessing(
            texts, labels, model_id, max_length=256
        )
        print(f"✅ {model_name} preprocessing completed")
    except Exception as e:
        print(f"❌ {model_name} preprocessing failed: {e}")

print(f"\n✅ DATA PREPROCESSING COMPLETED")
print(f"Successfully preprocessed data for {len(transformer_data)} models")
print(f"Ready for model building and evaluation!")


=== 1. DATA PREPROCESSING - HUGGINGFACE TRANSFORMERS ===
🎯 TRANSFORMER MODELS FOR EVALUATION:
   • BERT: bert-base-uncased
   • RoBERTa: roberta-base
   • DistilBERT: distilbert-base-uncased
   • ELECTRA: google/electra-base-discriminator

📊 USING 3000 SAMPLES FOR TRANSFORMER PROCESSING
   Train/Test Split: 80%/20%
   Sentiment Distribution:
sentiment
Positive    1832
Negative     789
Neutral      379
Name: count, dtype: int64

🔄 PREPROCESSING DATA FOR ALL MODELS...

🔧 PREPROCESSING WITH BERT-BASE-UNCASED
   Tokenizer: BertTokenizerFast
   Vocabulary size: 30,522
   Train samples: 2400
   Test samples: 600
   🔄 Tokenizing and cleaning data...

   📝 TOKENIZATION EXAMPLE:
      Original text: I bought 2 of these, 1 for each of my 2 youngest Grandaughters. Wanted something that would be made ...
      Tokens: ['i', 'bought', '2', 'of', 'these', ',', '1', 'for', 'each', 'of', 'my', '2', 'youngest', 'grand', '##au']...
      Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_

In [67]:
"""
2.1 MODEL SELECTION AND BASELINE PERFORMANCE

This cell explores transformer-based models and evaluates their baseline performance without fine-tuning.
We test multiple architectures to select the best pre-trained model for our sentiment analysis task.
"""

# Set device correctly for MacBook M4
if torch.backends.mps.is_available():
    device = torch.device("mps")
    device_id = 0
    print("🚀 Using Apple Metal Performance Shaders (GPU)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    device_id = 0
    print("🚀 Using CUDA (GPU)")
else:
    device = torch.device("cpu")
    device_id = -1
    print("🖥️ Using CPU")

# Import required libraries
import pandas as pd
import torch
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from IPython.display import display

print("=== 2.1 MODEL SELECTION AND BASELINE PERFORMANCE ===")

# Pre-trained sentiment models for baseline testing
baseline_sentiment_models = {
    'BERT': 'nlptown/bert-base-multilingual-uncased-sentiment',
    'RoBERTa': 'cardiffnlp/twitter-roberta-base-sentiment-latest', 
    'DistilBERT': 'distilbert-base-uncased-finetuned-sst-2-english',
    'ELECTRA': 'google/electra-small-discriminator'
}

print("🎯 MODEL SELECTION JUSTIFICATION:")
print("""
BERT (Bidirectional Encoder Representations from Transformers):
✅ Pioneering transformer architecture with bidirectional context
✅ Excellent baseline for most NLP tasks
✅ Multilingual variant handles diverse datasets
✅ Strong performance on sentiment classification
❌ Larger model size and slower inference
❌ Requires more computational resources

RoBERTa (Robustly Optimized BERT Approach):
✅ Improved training methodology over BERT (no NSP, longer training)
✅ Better performance on downstream tasks
✅ Twitter variant optimized for social media text
✅ More robust to hyperparameters
❌ Requires significant computational resources
❌ Larger vocabulary than BERT

DistilBERT (Distilled BERT):
✅ 60% smaller than BERT with 97% of performance
✅ 60% faster inference than BERT
✅ Good balance between speed and accuracy
✅ Easier deployment in production
❌ Slightly lower performance than full BERT
❌ May struggle with complex reasoning tasks

ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately):
✅ More sample-efficient than BERT (learns from all tokens)
✅ Replaced token detection vs masked language modeling
✅ Better performance with same compute budget
✅ Discriminator-generator architecture innovation
❌ Newer architecture, less established
❌ No pre-trained sentiment models available
""")

def evaluate_baseline_model(model_name, model_id):
    """Evaluate a baseline pre-trained model"""
    try:
        print(f"🔍 Evaluating {model_name}...")
        
        # Create sentiment analysis pipeline
        classifier = pipeline(
            "sentiment-analysis", 
            model=model_id, 
            tokenizer=model_id,
            device=device_id,
            return_all_scores=False,
            truncation=True,
            max_length=512,  # Fixed maximum length
            padding=True     # Enable padding
        )
        
        # Use smaller sample for baseline testing
        # Check if variables exist, if not create fallback
        try:
            sample_texts = df_transformer_sample[text_column].astype(str).tolist()[:500]  # First 500 samples
            sample_labels = df_transformer_sample['sentiment'].tolist()[:500]
        except NameError:
            # Fallback: use df if df_transformer_sample doesn't exist
            try:
                sample_texts = df[text_column].astype(str).tolist()[:500]
                sample_labels = df['sentiment'].tolist()[:500]
            except (NameError, KeyError):
                print(f"   ❌ Error: Required variables not found. Please ensure df_transformer_sample and text_column are defined.")
                return None
        
        
        print(f"   Processing {len(sample_texts)} samples...")
        
        # Process one by one to avoid batch size issues
        predictions = []
        for i, text in enumerate(sample_texts):
            try:
                # Truncate very long texts manually
                if len(text) > 1000:  # Truncate very long reviews
                    text = text[:1000]
                
                pred = classifier(text)
                predictions.append(pred[0] if isinstance(pred, list) else pred)
                
                # Progress indicator
                if (i + 1) % 50 == 0:
                    print(f"   Processed {i + 1}/{len(sample_texts)} samples...")
                    
            except Exception as e:
                print(f"   Warning: Sample {i+1} failed: {str(e)[:100]}...")
                # Add dummy prediction
                predictions.append({'label': 'NEUTRAL', 'score': 0.5})
        
        
        # Map predictions to our labels
        predicted_labels = []
        for pred in predictions:
            label = str(pred['label']).upper()
            if any(neg in label for neg in ['NEGATIVE', '1', '2', 'LABEL_0']):
                predicted_labels.append('Negative')
            elif any(neu in label for neu in ['NEUTRAL', '3', 'LABEL_1']):
                predicted_labels.append('Neutral')
            else:
                predicted_labels.append('Positive')
        
        # Calculate comprehensive metrics
        accuracy = accuracy_score(sample_labels, predicted_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(
            sample_labels, predicted_labels, average='weighted', zero_division=0
        )
        
        # Per-class metrics
        precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
            sample_labels, predicted_labels, average=None, 
            labels=['Negative', 'Neutral', 'Positive'], zero_division=0
        )
        
        results = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'precision_per_class': precision_per_class,
            'recall_per_class': recall_per_class,
            'f1_per_class': f1_per_class,
            'predictions': predicted_labels,
            'true_labels': sample_labels,
            'model_type': 'baseline'
        }
        
        print(f"   ✅ Baseline Results:")
        print(f"      Accuracy: {accuracy:.4f}")
        print(f"      Precision: {precision:.4f}")
        print(f"      Recall: {recall:.4f}")
        print(f"      F1-Score: {f1:.4f}")
        
        return results
        
    except Exception as e:
        print(f"   ❌ Error evaluating {model_name}: {e}")
        return None

# Evaluate baseline models
print(f"\n📊 BASELINE EVALUATION (Pre-trained models without fine-tuning)")

baseline_results = {}

for model_name, model_id in baseline_sentiment_models.items():
    result = evaluate_baseline_model(model_name, model_id)  # ✅ Call the function
    if result:
        baseline_results[model_name] = result
    else:
        print(f"   ⚠️  Skipping {model_name} due to evaluation error")


# Display baseline results summary
if 'baseline_results' not in globals() or baseline_results is None:
    baseline_results = {}
if baseline_results:
    print(f"\n📈 BASELINE RESULTS SUMMARY:")
    baseline_df = pd.DataFrame([
        {
            'Model': name,
            'Accuracy': results['accuracy'],
            'Precision': results['precision'],
            'Recall': results['recall'],
            'F1-Score': results['f1']
        }
        for name, results in baseline_results.items()
    ])
    display(baseline_df.round(4))
    if not baseline_df.empty:
        best_baseline = baseline_df.loc[baseline_df['Accuracy'].idxmax()]
        print(f"\n🏆 BEST BASELINE MODEL: {best_baseline['Model']}")
        print(f"   📊 Baseline Accuracy: {best_baseline['Accuracy']:.4f}")
        print(f"   📊 Baseline F1-Score: {best_baseline['F1-Score']:.4f}")
        print(f"   🎯 This is our benchmark to beat with fine-tuning!")
        best_results = baseline_results[best_baseline['Model']]
        print(f"\n   📋 Per-class Performance:")
        classes = ['Negative', 'Neutral', 'Positive']
        for i, class_name in enumerate(classes):
            if i < len(best_results['precision_per_class']):
                precision = best_results['precision_per_class'][i]
                recall = best_results['recall_per_class'][i]
                f1 = best_results['f1_per_class'][i]
                print(f"      {class_name}: P={precision:.3f}, R={recall:.3f}, F1={f1:.3f}")
else:
    print("No baseline results available.")
print(f"\n✅ MODEL SELECTION BASELINE COMPLETED")
print(f"Successfully evaluated {len(baseline_results)} baseline models")
print(f"Next step: Fine-tuning selected models for improved performance")

# Clean up GPU memory
if torch.cuda.is_available() or torch.backends.mps.is_available():
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    print("🧹 GPU memory cleared")


🚀 Using Apple Metal Performance Shaders (GPU)
=== 2.1 MODEL SELECTION AND BASELINE PERFORMANCE ===
🎯 MODEL SELECTION JUSTIFICATION:

BERT (Bidirectional Encoder Representations from Transformers):
✅ Pioneering transformer architecture with bidirectional context
✅ Excellent baseline for most NLP tasks
✅ Multilingual variant handles diverse datasets
✅ Strong performance on sentiment classification
❌ Larger model size and slower inference
❌ Requires more computational resources

RoBERTa (Robustly Optimized BERT Approach):
✅ Improved training methodology over BERT (no NSP, longer training)
✅ Better performance on downstream tasks
✅ Twitter variant optimized for social media text
✅ More robust to hyperparameters
❌ Requires significant computational resources
❌ Larger vocabulary than BERT

DistilBERT (Distilled BERT):
✅ 60% smaller than BERT with 97% of performance
✅ 60% faster inference than BERT
✅ Good balance between speed and accuracy
✅ Easier deployment in production
❌ Slightly lower pe

Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7380
      Precision: 0.7594
      Recall: 0.7380
      F1-Score: 0.7472
🔍 Evaluating RoBERTa...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7380
      Precision: 0.7594
      Recall: 0.7380
      F1-Score: 0.7472
🔍 Evaluating RoBERTa...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0
Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7300
      Precision: 0.7432
      Recall: 0.7300
      F1-Score: 0.7337
🔍 Evaluating DistilBERT...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7300
      Precision: 0.7432
      Recall: 0.7300
      F1-Score: 0.7337
🔍 Evaluating DistilBERT...


Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7800
      Precision: 0.7025
      Recall: 0.7800
      F1-Score: 0.7368
🔍 Evaluating ELECTRA...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7800
      Precision: 0.7025
      Recall: 0.7800
      F1-Score: 0.7368
🔍 Evaluating ELECTRA...


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.2720
      Precision: 0.0740
      Recall: 0.2720
      F1-Score: 0.1163

📈 BASELINE RESULTS SUMMARY:
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.2720
      Precision: 0.0740
      Recall: 0.2720
      F1-Score: 0.1163

📈 BASELINE RESULTS SUMMARY:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,BERT,0.738,0.7594,0.738,0.7472
1,RoBERTa,0.73,0.7432,0.73,0.7337
2,DistilBERT,0.78,0.7025,0.78,0.7368
3,ELECTRA,0.272,0.074,0.272,0.1163



🏆 BEST BASELINE MODEL: DistilBERT
   📊 Baseline Accuracy: 0.7800
   📊 Baseline F1-Score: 0.7368
   🎯 This is our benchmark to beat with fine-tuning!

   📋 Per-class Performance:
      Negative: P=0.674, R=0.897, F1=0.770
      Neutral: P=0.000, R=0.000, F1=0.000
      Positive: P=0.840, R=0.867, F1=0.854

✅ MODEL SELECTION BASELINE COMPLETED
Successfully evaluated 4 baseline models
Next step: Fine-tuning selected models for improved performance
🧹 GPU memory cleared


In [68]:
# Document baseline performance clearly
print("=== BASELINE PERFORMANCE (WITHOUT FINE-TUNING) ===")
print("This is the performance using pre-trained models directly on our data:")

if 'baseline_results' not in globals() or baseline_results is None:
    baseline_results = {}
if baseline_results:
    for model_name, results in baseline_results.items():
        print(f"\n{model_name} (Pre-trained, no fine-tuning):")
        print(f"   • Accuracy: {results['accuracy']:.4f} ({results['accuracy']:.1%})")
        print(f"   • F1-Score: {results['f1']:.4f}")
        print(f"   • This is our baseline to compare against fine-tuned models")
else:
    print("No baseline results to display.")

=== BASELINE PERFORMANCE (WITHOUT FINE-TUNING) ===
This is the performance using pre-trained models directly on our data:

BERT (Pre-trained, no fine-tuning):
   • Accuracy: 0.7380 (73.8%)
   • F1-Score: 0.7472
   • This is our baseline to compare against fine-tuned models

RoBERTa (Pre-trained, no fine-tuning):
   • Accuracy: 0.7300 (73.0%)
   • F1-Score: 0.7337
   • This is our baseline to compare against fine-tuned models

DistilBERT (Pre-trained, no fine-tuning):
   • Accuracy: 0.7800 (78.0%)
   • F1-Score: 0.7368
   • This is our baseline to compare against fine-tuned models

ELECTRA (Pre-trained, no fine-tuning):
   • Accuracy: 0.2720 (27.2%)
   • F1-Score: 0.1163
   • This is our baseline to compare against fine-tuned models


In [69]:
"""
3. MODEL EVALUATION

3.1 Evaluation Metrics - Comprehensive performance evaluation
3.2 Results - Detailed results presentation with confusion matrices

This cell provides complete evaluation of both baseline and fine-tuned transformer models,
comparing performance metrics and analyzing results across different sentiment classes.
"""

print("=== 3. MODEL EVALUATION ===")
print("=== 3.1 EVALUATION METRICS & 3.2 RESULTS ===")

# Combine all transformer results for comprehensive comparison
all_transformer_results = []

# Add baseline results (pre-trained models without fine-tuning)
for model_name, results in baseline_results.items():
    all_transformer_results.append({
        'Model': f"{model_name}",
        'Type': 'Baseline (Pre-trained)',
        'Accuracy': results['accuracy'],
        'Precision': results['precision'],
        'Recall': results['recall'],
        'F1-Score': results['f1'],
        'Details': results
    })
"""
# Add fine-tuned results
for model_name, results in fine_tuned_results.items():
    all_transformer_results.append({
        'Model': f"{model_name}",
        'Type': 'Fine-tuned',
        'Accuracy': results['accuracy'],
        'Precision': results['precision'],
        'Recall': results['recall'],
        'F1-Score': results['f1'],
        'Details': results
    })"""

if all_transformer_results:
    transformer_comparison_df = pd.DataFrame(all_transformer_results)
    
    print("📊 COMPREHENSIVE TRANSFORMER EVALUATION RESULTS:")
    display_df = transformer_comparison_df.drop('Details', axis=1)  # Remove details for clean display
    display(display_df.round(4))
    
    # Find best model overall
    best_model_idx = transformer_comparison_df['Accuracy'].idxmax()
    best_model = transformer_comparison_df.loc[best_model_idx]
    
    print(f"\n🏆 BEST PERFORMING TRANSFORMER MODEL:")
    print(f"   🥇 Model: {best_model['Model']} ({best_model['Type']})")
    print(f"   📊 Accuracy: {best_model['Accuracy']:.4f} ({best_model['Accuracy']:.1%})")
    print(f"   📊 F1-Score: {best_model['F1-Score']:.4f}")
    print(f"   📊 Precision: {best_model['Precision']:.4f}")
    print(f"   📊 Recall: {best_model['Recall']:.4f}")
    
    # Detailed evaluation for best model
    best_results = best_model['Details']
    
    print(f"\n📋 DETAILED RESULTS - {best_model['Model'].upper()} ({best_model['Type'].upper()}):")
    print(f"   🎯 Model achieved an accuracy of {best_results['accuracy']:.1%} on the validation dataset")
    
    # Per-class performance
    print(f"\n   📊 Per-class Performance:")
    classes = ['Negative', 'Neutral', 'Positive']
    
    if len(best_results['precision_per_class']) >= 3:
        for i, class_name in enumerate(classes):
            precision = best_results['precision_per_class'][i]
            recall = best_results['recall_per_class'][i] 
            f1 = best_results['f1_per_class'][i]
            print(f"      • Class {class_name}:")
            print(f"        - Precision: {precision:.1%} ({precision:.4f})")
            print(f"        - Recall: {recall:.1%} ({recall:.4f})")
            print(f"        - F1-score: {f1:.1%} ({f1:.4f})")
    
    # Confusion Matrix
    if 'confusion_matrix' in best_results:
        cm = best_results['confusion_matrix']
    else:
        # Calculate confusion matrix if not stored
        cm = confusion_matrix(
            best_results['true_labels'], 
            best_results['predictions'], 
            labels=classes
        )
    
    print(f"\n📊 CONFUSION MATRIX - {best_model['Model'].upper()}:")
    cm_df = pd.DataFrame(cm, index=classes, columns=classes)
    cm_df.index.name = 'True Label'
    cm_df.columns.name = 'Predicted Label'
    display(cm_df)
    
    # Classification Report
    print(f"\n📈 DETAILED CLASSIFICATION REPORT:")
    print(classification_report(
        best_results['true_labels'], 
        best_results['predictions'],
        target_names=classes
    ))
    
    # Training metrics for fine-tuned models
    if best_model['Type'] == 'Fine-tuned':
        print(f"\n🏋️ TRAINING METRICS:")
        print(f"   Training Loss: {best_results['training_loss']:.4f}")
        print(f"   Validation Loss: {best_results['eval_loss']:.4f}")
        
        # Loss analysis
        loss_ratio = best_results['eval_loss'] / best_results['training_loss']
        if loss_ratio < 1.2:
            print(f"   📊 Loss Ratio: {loss_ratio:.2f} (Good - No significant overfitting)")
        elif loss_ratio < 1.5:
            print(f"   📊 Loss Ratio: {loss_ratio:.2f} (Acceptable - Slight overfitting)")
        else:
            print(f"   📊 Loss Ratio: {loss_ratio:.2f} (Warning - Possible overfitting)")

# Performance comparison analysis
print(f"\n📈 PERFORMANCE COMPARISON ANALYSIS:")
"""
# Baseline vs Fine-tuned comparison
baseline_models = set(baseline_results.keys())
finetuned_models = set(fine_tuned_results.keys())
common_models = baseline_models.intersection(finetuned_models)

if common_models:
    print(f"\n   🔄 FINE-TUNING IMPACT ANALYSIS:")
    for model_name in common_models:
        baseline_acc = baseline_results[model_name]['accuracy']
        finetuned_acc = fine_tuned_results[model_name]['accuracy']
        improvement = finetuned_acc - baseline_acc
        improvement_pct = (improvement / baseline_acc) * 100
        
    print(f"      {model_name}:")
    print(f"      • Baseline Accuracy: {baseline_acc:.4f}")
    print(f"      • Fine-tuned Accuracy: {finetuned_acc:.4f}")
    print(f"      • Improvement: {improvement:+.4f} ({improvement_pct:+.1f}%)")
        
        if improvement > 0.05:
            print(f"      • 🚀 Significant improvement from fine-tuning!")
        elif improvement > 0.02:
            print(f"      • ✅ Moderate improvement from fine-tuning")
        elif improvement > 0:
            print(f"      • 📈 Small improvement from fine-tuning")
        else:
            print(f"      • ⚠️ Fine-tuning did not improve performance")
"""
# Model architecture comparison
print(f"\n   🏗️ MODEL ARCHITECTURE COMPARISON:")
arch_comparison = {}
for model_data in all_transformer_results:
    model_base = model_data['Model'].split()[0]  # Get base model name
    if model_base not in arch_comparison:
        arch_comparison[model_base] = []
    arch_comparison[model_base].append(model_data['Accuracy'])

for arch, accuracies in arch_comparison.items():
    avg_acc = np.mean(accuracies)
    max_acc = np.max(accuracies)
    print(f"      {arch}: Avg={avg_acc:.4f}, Max={max_acc:.4f}")

# Comprehensive visualization
print(f"\n📊 CREATING COMPREHENSIVE VISUALIZATIONS...")

try:
    # Create comprehensive evaluation dashboard
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            'Model Performance Comparison',
            'Baseline vs Fine-tuned', 
            'Confusion Matrix (Best Model)',
            'Precision-Recall by Class'
        ),
        specs=[
            [{"type": "bar"}, {"type": "bar"}],
            [{"type": "heatmap"}, {"type": "bar"}]
        ]
    )
    
    # Model performance comparison
    models = [f"{row['Model']} ({row['Type'][:8]})" for row in all_transformer_results]
    accuracies = [row['Accuracy'] for row in all_transformer_results]
    colors = ['lightblue' if 'Baseline' in row['Type'] else 'lightcoral' for row in all_transformer_results]
    
    fig.add_trace(
        go.Bar(x=models, y=accuracies, name='Accuracy', marker_color=colors),
        row=1, col=1
    )
    
    # Baseline vs Fine-tuned comparison
    if baseline_results and fine_tuned_results:
        comparison_data = []
        comparison_labels = []
        comparison_colors = []
        
        for model in baseline_results:
            comparison_data.append(baseline_results[model]['accuracy'])
            comparison_labels.append(f"{model}\nBaseline")
            comparison_colors.append('lightblue')
            
        for model in fine_tuned_results:
            comparison_data.append(fine_tuned_results[model]['accuracy'])
            comparison_labels.append(f"{model}\nFine-tuned")
            comparison_colors.append('lightcoral')
        
        fig.add_trace(
            go.Bar(x=comparison_labels, y=comparison_data, name='Comparison', 
                  marker_color=comparison_colors, showlegend=False),
            row=1, col=2
        )
    
    # Confusion matrix heatmap
    fig.add_trace(
        go.Heatmap(z=cm, x=classes, y=classes, colorscale='Blues', 
                  text=cm, texttemplate="%{text}", showscale=False),
        row=2, col=1
    )
    
    # Precision-Recall by class for best model
    if len(best_results['precision_per_class']) >= 3:
        metrics = ['Precision', 'Recall', 'F1-Score']
        values = [
            best_results['precision_per_class'],
            best_results['recall_per_class'],
            best_results['f1_per_class']
        ]
        
        for i, metric in enumerate(metrics):
            fig.add_trace(
                go.Bar(x=classes, y=values[i], name=metric, 
                      offsetgroup=i, opacity=0.8),
                row=2, col=2
            )
    
    fig.update_layout(
        height=800, 
        title_text=f"🚀 Transformer Models Evaluation Dashboard<br>Best Model: {best_model['Model']} ({best_model['Accuracy']:.1%} accuracy)",
        showlegend=False
    )
    
    fig.update_xaxes(tickangle=45, row=1, col=1)
    fig.update_xaxes(tickangle=45, row=1, col=2)
    
    fig.show()
    
except Exception as e:
    print(f"Visualization error: {e}")
    
    # Matplotlib fallback
    plt.figure(figsize=(15, 10))
    
    # Subplot 1: Model comparison
    plt.subplot(2, 2, 1)
    model_names = [f"{row['Model']}\n({row['Type'][:8]})" for row in all_transformer_results]
    accuracies = [row['Accuracy'] for row in all_transformer_results]
    colors = ['lightblue' if 'Baseline' in row['Type'] else 'lightcoral' for row in all_transformer_results]
    
    plt.bar(range(len(model_names)), accuracies, color=colors)
    plt.xlabel('Models')
    plt.ylabel('Accuracy')
    plt.title('Model Performance Comparison')
    plt.xticks(range(len(model_names)), model_names, rotation=45, ha='right')
    
    # Subplot 2: Confusion matrix
    plt.subplot(2, 2, 2)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=classes, yticklabels=classes)
    plt.title(f'Confusion Matrix - {best_model["Model"]}')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    
    # Subplot 3: Per-class metrics
    plt.subplot(2, 2, 3)
    if len(best_results['precision_per_class']) >= 3:
        x = np.arange(len(classes))
        width = 0.25
        
        plt.bar(x - width, best_results['precision_per_class'], width, label='Precision', alpha=0.8)
        plt.bar(x, best_results['recall_per_class'], width, label='Recall', alpha=0.8)
        plt.bar(x + width, best_results['f1_per_class'], width, label='F1-Score', alpha=0.8)
        
        plt.xlabel('Classes')
        plt.ylabel('Score')
        plt.title('Per-class Performance Metrics')
        plt.xticks(x, classes)
        plt.legend()
    
    # Subplot 4: Accuracy distribution
    plt.subplot(2, 2, 4)
    plt.hist(accuracies, bins=10, alpha=0.7, color='skyblue', edgecolor='black')
    plt.axvline(best_model['Accuracy'], color='red', linestyle='--', 
                label=f'Best: {best_model["Accuracy"]:.3f}')
    plt.xlabel('Accuracy')
    plt.ylabel('Frequency')
    plt.title('Accuracy Distribution')
    plt.legend()
    
    plt.tight_layout()
    plt.show()

=== 3. MODEL EVALUATION ===
=== 3.1 EVALUATION METRICS & 3.2 RESULTS ===
📊 COMPREHENSIVE TRANSFORMER EVALUATION RESULTS:


Unnamed: 0,Model,Type,Accuracy,Precision,Recall,F1-Score
0,BERT,Baseline (Pre-trained),0.738,0.7594,0.738,0.7472
1,RoBERTa,Baseline (Pre-trained),0.73,0.7432,0.73,0.7337
2,DistilBERT,Baseline (Pre-trained),0.78,0.7025,0.78,0.7368
3,ELECTRA,Baseline (Pre-trained),0.272,0.074,0.272,0.1163



🏆 BEST PERFORMING TRANSFORMER MODEL:
   🥇 Model: DistilBERT (Baseline (Pre-trained))
   📊 Accuracy: 0.7800 (78.0%)
   📊 F1-Score: 0.7368
   📊 Precision: 0.7025
   📊 Recall: 0.7800

📋 DETAILED RESULTS - DISTILBERT (BASELINE (PRE-TRAINED)):
   🎯 Model achieved an accuracy of 78.0% on the validation dataset

   📊 Per-class Performance:
      • Class Negative:
        - Precision: 67.4% (0.6740)
        - Recall: 89.7% (0.8971)
        - F1-score: 77.0% (0.7697)
      • Class Neutral:
        - Precision: 0.0% (0.0000)
        - Recall: 0.0% (0.0000)
        - F1-score: 0.0% (0.0000)
      • Class Positive:
        - Precision: 84.0% (0.8401)
        - Recall: 86.7% (0.8673)
        - F1-score: 85.4% (0.8535)

📊 CONFUSION MATRIX - DISTILBERT:


Predicted Label,Negative,Neutral,Positive
True Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,122,0,14
Neutral,18,0,37
Positive,41,0,268



📈 DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    Negative       0.67      0.90      0.77       136
     Neutral       0.00      0.00      0.00        55
    Positive       0.84      0.87      0.85       309

    accuracy                           0.78       500
   macro avg       0.50      0.59      0.54       500
weighted avg       0.70      0.78      0.74       500


📈 PERFORMANCE COMPARISON ANALYSIS:

   🏗️ MODEL ARCHITECTURE COMPARISON:
      BERT: Avg=0.7380, Max=0.7380
      RoBERTa: Avg=0.7300, Max=0.7300
      DistilBERT: Avg=0.7800, Max=0.7800
      ELECTRA: Avg=0.2720, Max=0.2720

📊 CREATING COMPREHENSIVE VISUALIZATIONS...


## STEP 6: Transformer Approach (HuggingFace)

Implementing modern transformer-based models for sentiment classification using HuggingFace transformers.

### 6.1 Pre-trained Model Selection and Baseline

Testing pre-trained transformer models without fine-tuning to establish baseline performance.

In [70]:
"""
1. DATA PREPROCESSING - HUGGINGFACE TRANSFORMERS

This cell implements the complete HuggingFace transformer preprocessing pipeline as required:
1.1 Data Cleaning and Tokenization - Using HuggingFace tokenizers
1.2 Data Encoding - Converting text to numerical IDs

WHAT ARE TRANSFORMER MODELS?
Transformer models are a revolutionary deep learning architecture introduced in 2017 that use 
self-attention mechanisms to process sequential data like text. Unlike traditional ML approaches 
that work with hand-crafted features (like TF-IDF vectors), transformers learn complex patterns 
and contextual relationships directly from raw text.

Attention is a mechanism that helps the model determine which parts of the input sequence are most relevant when processing a particular element.

KEY TRANSFORMER CHARACTERISTICS:
• Self-Attention: Can focus on different parts of the input text simultaneously
• Contextual Understanding: Words get different representations based on surrounding context
• Pre-training: Trained on massive text corpora to learn general language patterns
• Transfer Learning: Can be fine-tuned for specific tasks like sentiment analysis
• Bidirectional: Models like BERT read text in both directions for better context

TRANSFORMER vs TRADITIONAL ML COMPARISON:
Traditional ML (Previous Cells):     | Transformer Models (This Cell):
• Manual feature engineering         | • Automatic feature learning
• Fixed word representations         | • Dynamic contextual embeddings  
• Bag-of-words assumptions          | • Sequential and positional awareness
• Fast training/inference           | • Slower but more accurate
• Interpretable features            | • Complex but powerful representations

WHY THIS CELL COMES AFTER TRADITIONAL ML:
1. PROGRESSIVE COMPLEXITY: We start with simpler, interpretable methods before advanced techniques
2. BASELINE ESTABLISHMENT: Traditional ML provides performance benchmarks to beat
3. COMPUTATIONAL EFFICIENCY: Traditional methods are faster, good for initial exploration
4. EDUCATIONAL VALUE: Understanding both approaches shows evolution of NLP techniques
5. PRACTICAL COMPARISON: Real projects need to evaluate speed vs accuracy trade-offs
"""

print("=== 1. DATA PREPROCESSING - HUGGINGFACE TRANSFORMERS ===")

# 1.1 & 1.2 - Define transformer models for preprocessing and evaluation
transformer_models = {
    'BERT': 'bert-base-uncased',
    'RoBERTa': 'roberta-base', 
    'DistilBERT': 'distilbert-base-uncased',
    'ELECTRA': 'google/electra-base-discriminator'  # NEW: Adding ELECTRA model
}

print("🎯 TRANSFORMER MODELS FOR EVALUATION:")
for name, model_id in transformer_models.items():
    print(f"   • {name}: {model_id}")

print(f"\n💡 WHY ELECTRA WAS ADDED:")
print(f"   ✅ Efficient Pre-training: Uses replaced token detection instead of masked language modeling")
print(f"   ✅ Better Sample Efficiency: Learns from all input tokens, not just masked ones")
print(f"   ✅ Strong Performance: Often matches or exceeds BERT with less compute")
print(f"   ✅ Google Research: Advanced discriminator-generator architecture")
print(f"   ✅ Computational Efficiency: Faster training and inference than BERT")

# Prepare sample data for transformer processing
sample_size = min(3000, len(df_processed))  # Manageable size for transformers
df_transformer_sample = df_processed.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"\n📊 USING {len(df_transformer_sample)} SAMPLES FOR TRANSFORMER PROCESSING")
print(f"   Train/Test Split: 80%/20%")
print(f"   Sentiment Distribution:")
print(df_transformer_sample['sentiment'].value_counts())

# 1.1 DATA CLEANING AND TOKENIZATION using HuggingFace Transformers
def huggingface_preprocessing(texts, labels, model_name, max_length=256):
    """
    Complete HuggingFace preprocessing pipeline
    
    1.1 Data Cleaning and Tokenization:
    - Clean text using HuggingFace tokenizer (handles special chars, punctuation)
    - Apply model-specific tokenization (WordPiece, BPE, etc.)
    - Add special tokens ([CLS], [SEP], [PAD])
    
    1.2 Data Encoding:
    - Convert tokens to numerical IDs using tokenizer vocabulary
    - Create attention masks for variable-length sequences
    - Handle padding and truncation
    """
    print(f"\n🔧 PREPROCESSING WITH {model_name.upper()}")
    
    # Load tokenizer for the specific model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(f"   Tokenizer: {tokenizer.__class__.__name__}")
    print(f"   Vocabulary size: {tokenizer.vocab_size:,}")
    
    # Convert sentiment labels to numerical format
    label_mapping = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
    numerical_labels = [label_mapping[label] for label in labels]
    
    # Split data before tokenization
    X_train_text, X_test_text, y_train, y_test = train_test_split(
        texts, numerical_labels, test_size=0.2, random_state=42, stratify=numerical_labels
    )
    
    print(f"   Train samples: {len(X_train_text)}")
    print(f"   Test samples: {len(X_test_text)}")
    
    # 1.1 TOKENIZATION: Convert text to tokens with cleaning
    print(f"   🔄 Tokenizing and cleaning data...")
    
    # Show tokenization example BEFORE processing
    sample_text = X_train_text[0][:100] + "..." if len(X_train_text[0]) > 100 else X_train_text[0]
    sample_tokens = tokenizer.tokenize(sample_text)
    
    print(f"\n   📝 TOKENIZATION EXAMPLE:")
    print(f"      Original text: {sample_text}")
    print(f"      Tokens: {sample_tokens[:15]}...")
    print(f"      Special tokens: {tokenizer.special_tokens_map}")
    
    # 1.1 & 1.2 COMBINED: Tokenization + Encoding
    train_encodings = tokenizer(
        X_train_text,
        truncation=True,          # Clean: truncate long sequences
        padding=True,             # Clean: pad short sequences
        max_length=max_length,    # Limit sequence length
        return_tensors='pt',      # Return PyTorch tensors
        return_attention_mask=True, # Create attention masks
        add_special_tokens=True   # Add [CLS], [SEP] tokens
    )
    
    test_encodings = tokenizer(
        X_test_text,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
        return_attention_mask=True,
        add_special_tokens=True
    )
    
    # 1.2 DATA ENCODING: Text → Numerical IDs (completed by tokenizer)
    print(f"   ✅ Text cleaned and tokenized using HuggingFace tokenizer")
    print(f"   ✅ Sequences encoded to numerical IDs from vocabulary")
    print(f"   📊 Input IDs shape: {train_encodings['input_ids'].shape}")
    print(f"   📊 Attention mask shape: {train_encodings['attention_mask'].shape}")
    
    # Show encoding example
    sample_ids = train_encodings['input_ids'][0][:20]
    decoded_sample = tokenizer.decode(sample_ids, skip_special_tokens=False)
    print(f"      Encoded IDs: {sample_ids.tolist()}")
    print(f"      Decoded back: {decoded_sample}")
    
    return {
        'tokenizer': tokenizer,
        'train_encodings': train_encodings,
        'test_encodings': test_encodings,
        'y_train': torch.tensor(y_train),
        'y_test': torch.tensor(y_test),
        'X_train_text': X_train_text,
        'X_test_text': X_test_text,
        'label_mapping': label_mapping
    }

# Preprocess data for all transformer models
transformer_data = {}
texts = df_transformer_sample[text_column].astype(str).tolist()
labels = df_transformer_sample['sentiment'].tolist()

print(f"\n🔄 PREPROCESSING DATA FOR ALL MODELS...")

for model_name, model_id in transformer_models.items():
    try:
        transformer_data[model_name] = huggingface_preprocessing(
            texts, labels, model_id, max_length=256
        )
        print(f"✅ {model_name} preprocessing completed")
    except Exception as e:
        print(f"❌ {model_name} preprocessing failed: {e}")

print(f"\n✅ DATA PREPROCESSING COMPLETED")
print(f"Successfully preprocessed data for {len(transformer_data)} models")
print(f"Ready for model building and evaluation!")

=== 1. DATA PREPROCESSING - HUGGINGFACE TRANSFORMERS ===
🎯 TRANSFORMER MODELS FOR EVALUATION:
   • BERT: bert-base-uncased
   • RoBERTa: roberta-base
   • DistilBERT: distilbert-base-uncased
   • ELECTRA: google/electra-base-discriminator

💡 WHY ELECTRA WAS ADDED:
   ✅ Efficient Pre-training: Uses replaced token detection instead of masked language modeling
   ✅ Better Sample Efficiency: Learns from all input tokens, not just masked ones
   ✅ Strong Performance: Often matches or exceeds BERT with less compute
   ✅ Google Research: Advanced discriminator-generator architecture
   ✅ Computational Efficiency: Faster training and inference than BERT

📊 USING 3000 SAMPLES FOR TRANSFORMER PROCESSING
   Train/Test Split: 80%/20%
   Sentiment Distribution:
sentiment
Positive    1832
Negative     789
Neutral      379
Name: count, dtype: int64

📊 USING 3000 SAMPLES FOR TRANSFORMER PROCESSING
   Train/Test Split: 80%/20%
   Sentiment Distribution:
sentiment
Positive    1832
Negative     789
Neutr

In [71]:
"""
2.1 MODEL SELECTION AND BASELINE PERFORMANCE

This cell explores transformer-based models and evaluates their baseline performance without fine-tuning.
We test multiple architectures to select the best pre-trained model for our sentiment analysis task.
"""

# Set device correctly for MacBook M4
if torch.backends.mps.is_available():
    device = torch.device("mps")
    device_id = 0
    print("🚀 Using Apple Metal Performance Shaders (GPU)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    device_id = 0
    print("🚀 Using CUDA (GPU)")
else:
    device = torch.device("cpu")
    device_id = -1
    print("🖥️ Using CPU")

# Import required libraries
import pandas as pd
import torch
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from IPython.display import display

print("=== 2.1 MODEL SELECTION AND BASELINE PERFORMANCE ===")

# Pre-trained sentiment models for baseline testing
baseline_sentiment_models = {
    'BERT': 'nlptown/bert-base-multilingual-uncased-sentiment',
    'RoBERTa': 'cardiffnlp/twitter-roberta-base-sentiment-latest', 
    'DistilBERT': 'distilbert-base-uncased-finetuned-sst-2-english',
    # ELECTRA doesn't have a pre-trained sentiment variant, so we'll evaluate it after fine-tuning
}

print("🎯 MODEL SELECTION JUSTIFICATION:")
print("""
BERT (Bidirectional Encoder Representations from Transformers):
✅ Pioneering transformer architecture with bidirectional context
✅ Excellent baseline for most NLP tasks
✅ Multilingual variant handles diverse datasets
✅ Strong performance on sentiment classification
❌ Larger model size and slower inference
❌ Requires more computational resources

RoBERTa (Robustly Optimized BERT Approach):
✅ Improved training methodology over BERT (no NSP, longer training)
✅ Better performance on downstream tasks
✅ Twitter variant optimized for social media text
✅ More robust to hyperparameters
❌ Requires significant computational resources
❌ Larger vocabulary than BERT

DistilBERT (Distilled BERT):
✅ 60% smaller than BERT with 97% of performance
✅ 60% faster inference than BERT
✅ Good balance between speed and accuracy
✅ Easier deployment in production
❌ Slightly lower performance than full BERT
❌ May struggle with complex reasoning tasks

ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately):
✅ More sample-efficient than BERT (learns from all tokens)
✅ Replaced token detection vs masked language modeling
✅ Better performance with same compute budget
✅ Discriminator-generator architecture innovation
❌ Newer architecture, less established
❌ No pre-trained sentiment models available
""")

def evaluate_baseline_model(model_name, model_id):
    """Evaluate a baseline pre-trained model"""
    try:
        print(f"🔍 Evaluating {model_name}...")
        
        # Create sentiment analysis pipeline
        classifier = pipeline(
            "sentiment-analysis", 
            model=model_id, 
            tokenizer=model_id,
            device=device_id,
            return_all_scores=False,
            truncation=True,
            max_length=512,  # Fixed maximum length
            padding=True     # Enable padding
        )
        
        # Use smaller sample for baseline testing
        # Check if variables exist, if not create fallback
        try:
            sample_texts = df_transformer_sample[text_column].astype(str).tolist()[:500]  # First 500 samples
            sample_labels = df_transformer_sample['sentiment'].tolist()[:500]
        except NameError:
            # Fallback: use df if df_transformer_sample doesn't exist
            try:
                sample_texts = df[text_column].astype(str).tolist()[:500]
                sample_labels = df['sentiment'].tolist()[:500]
            except (NameError, KeyError):
                print(f"   ❌ Error: Required variables not found. Please ensure df_transformer_sample and text_column are defined.")
                return None
        
        
        print(f"   Processing {len(sample_texts)} samples...")
        
        # Process one by one to avoid batch size issues
        predictions = []
        for i, text in enumerate(sample_texts):
            try:
                # Truncate very long texts manually
                if len(text) > 1000:  # Truncate very long reviews
                    text = text[:1000]
                
                pred = classifier(text)
                predictions.append(pred[0] if isinstance(pred, list) else pred)
                
                # Progress indicator
                if (i + 1) % 50 == 0:
                    print(f"   Processed {i + 1}/{len(sample_texts)} samples...")
                    
            except Exception as e:
                print(f"   Warning: Sample {i+1} failed: {str(e)[:100]}...")
                # Add dummy prediction
                predictions.append({'label': 'NEUTRAL', 'score': 0.5})
        
        
        # Map predictions to our labels
        predicted_labels = []
        for pred in predictions:
            label = str(pred['label']).upper()
            if any(neg in label for neg in ['NEGATIVE', '1', '2', 'LABEL_0']):
                predicted_labels.append('Negative')
            elif any(neu in label for neu in ['NEUTRAL', '3', 'LABEL_1']):
                predicted_labels.append('Neutral')
            else:
                predicted_labels.append('Positive')
        
        # Calculate comprehensive metrics
        accuracy = accuracy_score(sample_labels, predicted_labels)
        precision, recall, f1, _ = precision_recall_fscore_support(
            sample_labels, predicted_labels, average='weighted', zero_division=0
        )
        
        # Per-class metrics
        precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
            sample_labels, predicted_labels, average=None, 
            labels=['Negative', 'Neutral', 'Positive'], zero_division=0
        )
        
        results = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'precision_per_class': precision_per_class,
            'recall_per_class': recall_per_class,
            'f1_per_class': f1_per_class,
            'predictions': predicted_labels,
            'true_labels': sample_labels,
            'model_type': 'baseline'
        }
        
        print(f"   ✅ Baseline Results:")
        print(f"      Accuracy: {accuracy:.4f}")
        print(f"      Precision: {precision:.4f}")
        print(f"      Recall: {recall:.4f}")
        print(f"      F1-Score: {f1:.4f}")
        
        return results
        
    except Exception as e:
        print(f"   ❌ Error evaluating {model_name}: {e}")
        return None

# Evaluate baseline models
print(f"\n📊 BASELINE EVALUATION (Pre-trained models without fine-tuning)")

baseline_results = {}

for model_name, model_id in baseline_sentiment_models.items():
    result = evaluate_baseline_model(model_name, model_id)  # ✅ Call the function
    if result:
        baseline_results[model_name] = result
    else:
        print(f"   ⚠️  Skipping {model_name} due to evaluation error")

# Display baseline results summary
if baseline_results:
    print(f"\n📈 BASELINE RESULTS SUMMARY:")
    baseline_df = pd.DataFrame([
        {
            'Model': name,
            'Accuracy': results['accuracy'],
            'Precision': results['precision'],
            'Recall': results['recall'],
            'F1-Score': results['f1']
        }
        for name, results in baseline_results.items()
    ])
    
    display(baseline_df.round(4))
    
    # Best baseline model
    if not baseline_df.empty:
        best_baseline = baseline_df.loc[baseline_df['Accuracy'].idxmax()]
        print(f"\n🏆 BEST BASELINE MODEL: {best_baseline['Model']}")
        print(f"   📊 Baseline Accuracy: {best_baseline['Accuracy']:.4f}")
        print(f"   📊 Baseline F1-Score: {best_baseline['F1-Score']:.4f}")
        print(f"   🎯 This is our benchmark to beat with fine-tuning!")
        
        # Detailed metrics for best baseline
        best_results = baseline_results[best_baseline['Model']]
        print(f"\n   📋 Per-class Performance:")
        classes = ['Negative', 'Neutral', 'Positive']
        for i, class_name in enumerate(classes):
            if i < len(best_results['precision_per_class']):
                precision = best_results['precision_per_class'][i]
                recall = best_results['recall_per_class'][i]
                f1 = best_results['f1_per_class'][i]
                print(f"      {class_name}: P={precision:.3f}, R={recall:.3f}, F1={f1:.3f}")

print(f"\n✅ MODEL SELECTION BASELINE COMPLETED")
print(f"Successfully evaluated {len(baseline_results)} baseline models")
print(f"Next step: Fine-tuning selected models for improved performance")


# Clean up GPU memory
if torch.cuda.is_available() or torch.backends.mps.is_available():
    torch.cuda.empty_cache() if torch.cuda.is_available() else None
    print("🧹 GPU memory cleared")


🚀 Using Apple Metal Performance Shaders (GPU)
=== 2.1 MODEL SELECTION AND BASELINE PERFORMANCE ===
🎯 MODEL SELECTION JUSTIFICATION:

BERT (Bidirectional Encoder Representations from Transformers):
✅ Pioneering transformer architecture with bidirectional context
✅ Excellent baseline for most NLP tasks
✅ Multilingual variant handles diverse datasets
✅ Strong performance on sentiment classification
❌ Larger model size and slower inference
❌ Requires more computational resources

RoBERTa (Robustly Optimized BERT Approach):
✅ Improved training methodology over BERT (no NSP, longer training)
✅ Better performance on downstream tasks
✅ Twitter variant optimized for social media text
✅ More robust to hyperparameters
❌ Requires significant computational resources
❌ Larger vocabulary than BERT

DistilBERT (Distilled BERT):
✅ 60% smaller than BERT with 97% of performance
✅ 60% faster inference than BERT
✅ Good balance between speed and accuracy
✅ Easier deployment in production
❌ Slightly lower pe

Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7380
      Precision: 0.7594
      Recall: 0.7380
      F1-Score: 0.7472
🔍 Evaluating RoBERTa...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7380
      Precision: 0.7594
      Recall: 0.7380
      F1-Score: 0.7472
🔍 Evaluating RoBERTa...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0
Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7300
      Precision: 0.7432
      Recall: 0.7300
      F1-Score: 0.7337
🔍 Evaluating DistilBERT...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7300
      Precision: 0.7432
      Recall: 0.7300
      F1-Score: 0.7337
🔍 Evaluating DistilBERT...


Device set to use mps:0


   Processing 500 samples...
   Processed 50/500 samples...
   Processed 50/500 samples...
   Processed 100/500 samples...
   Processed 100/500 samples...
   Processed 150/500 samples...
   Processed 150/500 samples...
   Processed 200/500 samples...
   Processed 200/500 samples...
   Processed 250/500 samples...
   Processed 250/500 samples...
   Processed 300/500 samples...
   Processed 300/500 samples...
   Processed 350/500 samples...
   Processed 350/500 samples...
   Processed 400/500 samples...
   Processed 400/500 samples...
   Processed 450/500 samples...
   Processed 450/500 samples...
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7800
      Precision: 0.7025
      Recall: 0.7800
      F1-Score: 0.7368

📈 BASELINE RESULTS SUMMARY:
   Processed 500/500 samples...
   ✅ Baseline Results:
      Accuracy: 0.7800
      Precision: 0.7025
      Recall: 0.7800
      F1-Score: 0.7368

📈 BASELINE RESULTS SUMMARY:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,BERT,0.738,0.7594,0.738,0.7472
1,RoBERTa,0.73,0.7432,0.73,0.7337
2,DistilBERT,0.78,0.7025,0.78,0.7368



🏆 BEST BASELINE MODEL: DistilBERT
   📊 Baseline Accuracy: 0.7800
   📊 Baseline F1-Score: 0.7368
   🎯 This is our benchmark to beat with fine-tuning!

   📋 Per-class Performance:
      Negative: P=0.674, R=0.897, F1=0.770
      Neutral: P=0.000, R=0.000, F1=0.000
      Positive: P=0.840, R=0.867, F1=0.854

✅ MODEL SELECTION BASELINE COMPLETED
Successfully evaluated 3 baseline models
Next step: Fine-tuning selected models for improved performance
🧹 GPU memory cleared


In [72]:
# Document baseline performance clearly
print("=== BASELINE PERFORMANCE (WITHOUT FINE-TUNING) ===")
print("This is the performance using pre-trained models directly on our data:")

for model_name, results in baseline_results.items():
    print(f"\n{model_name} (Pre-trained, no fine-tuning):")
    print(f"   • Accuracy: {results['accuracy']:.4f} ({results['accuracy']:.1%})")
    print(f"   • F1-Score: {results['f1']:.4f}")

=== BASELINE PERFORMANCE (WITHOUT FINE-TUNING) ===
This is the performance using pre-trained models directly on our data:

BERT (Pre-trained, no fine-tuning):
   • Accuracy: 0.7380 (73.8%)
   • F1-Score: 0.7472

RoBERTa (Pre-trained, no fine-tuning):
   • Accuracy: 0.7300 (73.0%)
   • F1-Score: 0.7337

DistilBERT (Pre-trained, no fine-tuning):
   • Accuracy: 0.7800 (78.0%)
   • F1-Score: 0.7368


In [73]:
"""
3. MODEL EVALUATION

3.1 Evaluation Metrics - Comprehensive performance evaluation
3.2 Results - Detailed results presentation with confusion matrices

This cell provides complete evaluation of both baseline and fine-tuned transformer models,
comparing performance metrics and analyzing results across different sentiment classes.
"""

print("=== 3. MODEL EVALUATION ===")
print("=== 3.1 EVALUATION METRICS & 3.2 RESULTS ===")

all_transformer_results = []
if 'baseline_results' not in globals() or baseline_results is None:
    baseline_results = {}
# Add baseline results (pre-trained models without fine-tuning)
for model_name, results in baseline_results.items():
    all_transformer_results.append({
        'Model': f"{model_name}",
        'Type': 'Baseline (Pre-trained)',
        'Accuracy': results['accuracy'],
        'Precision': results['precision'],
        'Recall': results['recall'],
        'F1-Score': results['f1'],
        'Details': results
    })
# Add fine-tuned results
for model_name, results in fine_tuned_results.items():
    all_transformer_results.append({
        'Model': f"{model_name}",
        'Type': 'Fine-tuned',
        'Accuracy': results['accuracy'],
        'Precision': results['precision'],
        'Recall': results['recall'],
        'F1-Score': results['f1'],
        'Details': results
    })
if all_transformer_results:
    transformer_comparison_df = pd.DataFrame(all_transformer_results)
    print("📊 COMPREHENSIVE TRANSFORMER EVALUATION RESULTS:")
    display_df = transformer_comparison_df.drop('Details', axis=1)
    display(display_df.round(4))
    best_model_idx = transformer_comparison_df['Accuracy'].idxmax()
    best_model = transformer_comparison_df.loc[best_model_idx]
    print(f"\n🏆 BEST PERFORMING TRANSFORMER MODEL:")
    print(f"   🥇 Model: {best_model['Model']} ({best_model['Type']})")
    print(f"   📊 Accuracy: {best_model['Accuracy']:.4f} ({best_model['Accuracy']:.1%})")
    print(f"   📊 F1-Score: {best_model['F1-Score']:.4f}")
    print(f"   📊 Precision: {best_model['Precision']:.4f}")
    print(f"   📊 Recall: {best_model['Recall']:.4f}")
    best_results = best_model['Details']
    print(f"\n📋 DETAILED RESULTS - {best_model['Model'].upper()} ({best_model['Type'].upper()}):")
    print(f"   🎯 Model achieved an accuracy of {best_results['accuracy']:.1%} on the validation dataset")
    print(f"\n   📊 Per-class Performance:")
    classes = ['Negative', 'Neutral', 'Positive']
    if 'precision_per_class' in best_results and len(best_results['precision_per_class']) >= 3:
        for i, class_name in enumerate(classes):
            precision = best_results['precision_per_class'][i]
            recall = best_results['recall_per_class'][i]
            f1 = best_results['f1_per_class'][i]
            print(f"      • Class {class_name}:")
            print(f"        - Precision: {precision:.1%} ({precision:.4f})")
            print(f"        - Recall: {recall:.1%} ({recall:.4f})")
            print(f"        - F1-score: {f1:.1%} ({f1:.4f})")
    if 'confusion_matrix' in best_results:
        cm = best_results['confusion_matrix']
    elif 'true_labels' in best_results and 'predictions' in best_results:
        cm = confusion_matrix(
            best_results['true_labels'],
            best_results['predictions'],
            labels=classes
        )
    else:
        cm = np.zeros((3,3))
    print(f"\n📊 CONFUSION MATRIX - {best_model['Model'].upper()}:")
    cm_df = pd.DataFrame(cm, index=classes, columns=classes)
    cm_df.index.name = 'True Label'
    cm_df.columns.name = 'Predicted Label'
    display(cm_df)
    print(f"\n📈 DETAILED CLASSIFICATION REPORT:")
    if 'true_labels' in best_results and 'predictions' in best_results:
        print(classification_report(
            best_results['true_labels'],
            best_results['predictions'],
            target_names=classes
        ))
    if best_model['Type'] == 'Fine-tuned':
        print(f"\n🏋️ TRAINING METRICS:")
        print(f"   Training Loss: {best_results['training_loss']:.4f}")
        print(f"   Validation Loss: {best_results['eval_loss']:.4f}")
        if 'training_loss' in best_results and best_results['training_loss']:
            loss_ratio = best_results['eval_loss'] / best_results['training_loss']
            if loss_ratio < 1.2:
                print(f"   📊 Loss Ratio: {loss_ratio:.2f} (Good - No significant overfitting)")
            elif loss_ratio < 1.5:
                print(f"   📊 Loss Ratio: {loss_ratio:.2f} (Acceptable - Slight overfitting)")
            else:
                print(f"   📊 Loss Ratio: {loss_ratio:.2f} (Warning - Possible overfitting)")
else:
    print("No transformer results to display.")

=== 3. MODEL EVALUATION ===
=== 3.1 EVALUATION METRICS & 3.2 RESULTS ===
📊 COMPREHENSIVE TRANSFORMER EVALUATION RESULTS:


Unnamed: 0,Model,Type,Accuracy,Precision,Recall,F1-Score
0,BERT,Baseline (Pre-trained),0.738,0.7594,0.738,0.7472
1,RoBERTa,Baseline (Pre-trained),0.73,0.7432,0.73,0.7337
2,DistilBERT,Baseline (Pre-trained),0.78,0.7025,0.78,0.7368



🏆 BEST PERFORMING TRANSFORMER MODEL:
   🥇 Model: DistilBERT (Baseline (Pre-trained))
   📊 Accuracy: 0.7800 (78.0%)
   📊 F1-Score: 0.7368
   📊 Precision: 0.7025
   📊 Recall: 0.7800

📋 DETAILED RESULTS - DISTILBERT (BASELINE (PRE-TRAINED)):
   🎯 Model achieved an accuracy of 78.0% on the validation dataset

   📊 Per-class Performance:
      • Class Negative:
        - Precision: 67.4% (0.6740)
        - Recall: 89.7% (0.8971)
        - F1-score: 77.0% (0.7697)
      • Class Neutral:
        - Precision: 0.0% (0.0000)
        - Recall: 0.0% (0.0000)
        - F1-score: 0.0% (0.0000)
      • Class Positive:
        - Precision: 84.0% (0.8401)
        - Recall: 86.7% (0.8673)
        - F1-score: 85.4% (0.8535)

📊 CONFUSION MATRIX - DISTILBERT:


Predicted Label,Negative,Neutral,Positive
True Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Negative,122,0,14
Neutral,18,0,37
Positive,41,0,268



📈 DETAILED CLASSIFICATION REPORT:
              precision    recall  f1-score   support

    Negative       0.67      0.90      0.77       136
     Neutral       0.00      0.00      0.00        55
    Positive       0.84      0.87      0.85       309

    accuracy                           0.78       500
   macro avg       0.50      0.59      0.54       500
weighted avg       0.70      0.78      0.74       500



In [74]:
# === Diagnostics: Why is Neutral missing? ===
from collections import Counter
print('Test set class distribution:', Counter(y_test))
 
# Pick two important models to inspect
try:
    xgb_pred = results['TF-IDF']['XGBoost']['y_pred']
    print('XGBoost predictions distribution:', Counter(xgb_pred))
except Exception as e:
    print('XGBoost predictions not available:', e)
 
try:
    rf_pred = results['TF-IDF']['Random Forest']['y_pred']
    print('Random Forest predictions distribution:', Counter(rf_pred))
except Exception as e:
    print('Random Forest predictions not available:', e)

Test set class distribution: Counter({'Positive': 6590, 'Negative': 2988, 'Neutral': 1345})
XGBoost predictions not available: 'TF-IDF'
Random Forest predictions not available: 'TF-IDF'


In [None]:
# === XGBoost and Random Forest: dealing with unbalanced in the best and worst models ===
from collections import Counter
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
import numpy as np
import pandas as pd
 
# Ensure class_weight_dict exists; if not, compute balanced and then boost Neutral
try:
    _ = class_weight_dict
except NameError:
    class_names = np.array(['Negative', 'Neutral', 'Positive'])
    cw_vals = compute_class_weight(class_weight='balanced', classes=class_names, y=y_train.values)
    class_weight_dict = {cls: w for cls, w in zip(class_names, cw_vals)}
 
# Boost Neutral by 1.15x
class_weight_dict['Neutral'] = class_weight_dict.get('Neutral', 1.0) * 1.15
print('Adjusted class weights (Neutral x1.15):', class_weight_dict)
 
# ---- RandomForest with boosted Neutral weight ----
rf_115 = RandomForestClassifier(
    random_state=42, n_estimators=400, max_depth=None, n_jobs=-1,
    class_weight=class_weight_dict
)
rf_115.fit(X_train_tfidf_bal, y_train_tfidf_bal)
rf_115_pred = rf_115.predict(X_test_tfidf)
classes = ['Negative', 'Neutral', 'Positive']
cm_rf_115 = confusion_matrix(y_test, rf_115_pred, labels=classes)
cm_rf_115_df = pd.DataFrame(cm_rf_115, index=classes, columns=classes)
print('\nConfusion Matrix - RandomForest (Neutral x1.15):')
display(cm_rf_115_df)
print('Predictions distribution (RF):', Counter(rf_115_pred))
 
# ---- XGBoost with boosted Neutral weight ----
# Build sample weights reflecting boosted Neutral
xgb_sw_tfidf_115 = np.array([class_weight_dict[y] for y in y_train_tfidf_bal])
le_tmp = LabelEncoder().fit(['Negative','Neutral','Positive'])
y_train_enc = le_tmp.transform(y_train_tfidf_bal)
xgb_115 = XGBClassifier(random_state=42, n_estimators=200, max_depth=6, learning_rate=0.1, eval_metric='mlogloss', verbosity=0)
xgb_115.fit(X_train_tfidf_bal, y_train_enc, sample_weight=xgb_sw_tfidf_115)
y_pred_enc = xgb_115.predict(X_test_tfidf)
xgb_115_pred = le_tmp.inverse_transform(y_pred_enc)
cm_xgb_115 = confusion_matrix(y_test, xgb_115_pred, labels=classes)
cm_xgb_115_df = pd.DataFrame(cm_xgb_115, index=classes, columns=classes)
print('\nConfusion Matrix - XGBoost (Neutral x1.15):')
display(cm_xgb_115_df)
print('Predictions distribution (XGB):', Counter(xgb_115_pred))

Adjusted class weights (Neutral x1.15): {np.str_('Negative'): np.float64(1.2184388856354054), np.str_('Neutral'): np.float64(5.37213369282261), np.str_('Positive'): np.float64(0.5524828344903456)}

Confusion Matrix - RandomForest (Neutral x1.15):

Confusion Matrix - RandomForest (Neutral x1.15):


Unnamed: 0,Negative,Neutral,Positive
Negative,2256,75,657
Neutral,14,634,697
Positive,442,436,5712


Predictions distribution (RF): Counter({'Positive': 7066, 'Negative': 2712, 'Neutral': 1145})

Confusion Matrix - XGBoost (Neutral x1.15):

Confusion Matrix - XGBoost (Neutral x1.15):


Unnamed: 0,Negative,Neutral,Positive
Negative,2279,453,256
Neutral,5,1313,27
Positive,772,3789,2029


Predictions distribution (XGB): Counter({np.str_('Neutral'): 5555, np.str_('Negative'): 3056, np.str_('Positive'): 2312})


In [None]:
# Diagnostic: try to build the sentiment pipeline for ELECTRA and show full exception
import traceback
from transformers import pipeline
try:
    model_id = baseline_sentiment_models.get('ELECTRA', 'google/electra-base-discriminator')
    print("Attempting to build pipeline for ELECTRA:", model_id)
    clf = pipeline("sentiment-analysis", model=model_id, tokenizer=model_id, device=(0 if torch.cuda.is_available() else -1))
    print("Pipeline created OK. Run a small sample:")
    print(clf("This product is fine, nothing special"))
except Exception as e:
    print("Pipeline creation or sample inference failed for ELECTRA. Full traceback below:")
    traceback.print_exc()

Attempting to build pipeline for ELECTRA: google/electra-base-discriminator


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu
Device set to use cpu


Pipeline created OK. Run a small sample:


Positive stayed the dominant class after your sampling strategy (you upsampled Neutral to match Negative but left Positive at its original, larger size) → model still biased toward Positive.
You trained the RandomForest on the ROS-balanced data but also passed a custom class_weight dict computed from the original y_train (or used both oversampling + class_weight) — that mismatch / double-correction can push decisions toward the majority class.
Neutral vs Positive are often semantically close (rating 4→Neutral vs 5→Positive); TF‑IDF features may not separate them well so RF prefers the stronger Positive signal.
Default decision rule (argmax on votes/probs) + uncalibrated probabilities → low-probability Neutral gets mapped to Positive.


Recommendation: run the diagnostic cell first to confirm which cause is dominant (Positive still majority? avg Positive prob on true-Neutral high?). Then either (A) stop double-correcting (remove class_weight when training on ROS-balanced data) or (B) recompute class_weight from balanced training and/or tune threshold.



In [1]:
# Diagnostic: why many Positive false-positives (run this cell)
from collections import Counter
import numpy as np
import pandas as pd
import inspect

print("Train original dist:", Counter(y_train))
print("Train after ROS (tfidf) dist:", Counter(y_train_tfidf_bal))
print("Test dist:", Counter(y_test))
print()

# show class_weight_dict if present
try:
    print("class_weight_dict:", class_weight_dict)
except Exception:
    print("class_weight_dict not defined")

# find an RF model to inspect
rf_candidates = ['rf_115', 'best_rf', 'trained_models_tfidf', 'trained_models_count']
rf_model = None
if 'rf_115' in globals():
    rf_model = globals()['rf_115']; print("Using rf_115")
elif 'best_rf' in globals():
    rf_model = globals()['best_rf']; print("Using best_rf")
elif 'trained_models_tfidf' in globals() and 'Random Forest' in trained_models_tfidf:
    rf_model = trained_models_tfidf['Random Forest']; print("Using trained_models_tfidf['Random Forest']")
elif 'trained_models_count' in globals() and 'Random Forest' in trained_models_count:
    rf_model = trained_models_count['Random Forest']; print("Using trained_models_count['Random Forest']")
else:
    # search globals for RandomForestClassifier instance
    for name, obj in globals().items():
        try:
            if isinstance(obj, (type(np))): pass
        except Exception:
            pass
    print("No RF candidate found automatically. Define rf_115 or best_rf or trained_models_tfidf['Random Forest'].")

if rf_model is not None:
    # predictions & confusion
    try:
        y_pred = rf_model.predict(X_test_tfidf)
        from sklearn.metrics import confusion_matrix
        classes = ['Negative','Neutral','Positive']
        cm = confusion_matrix(y_test, y_pred, labels=classes)
        print("\nConfusion matrix (RF):")
        display(pd.DataFrame(cm, index=classes, columns=classes))
        print("Prediction dist:", Counter(y_pred))
    except Exception as e:
        print("RF predict failed:", e)

    # probabilities
    if hasattr(rf_model, "predict_proba"):
        probs = rf_model.predict_proba(X_test_tfidf)
        print("\nRF.classes_:", list(rf_model.classes_))
        mean_probs = dict(zip(rf_model.classes_, np.round(probs.mean(axis=0),4)))
        print("Mean predicted probability per class (test):", mean_probs)

        if 'Neutral' in rf_model.classes_:
            idx_neu = list(rf_model.classes_).index('Neutral')
            # avg Neutral prob for true-Neutral rows
            y_test_arr = np.asarray(y_test)
            true_neu_idx = np.where(y_test_arr == 'Neutral')[0]
            print("True Neutral count (test):", len(true_neu_idx))
            if len(true_neu_idx)>0:
                avg_neu_prob_on_true = probs[true_neu_idx, idx_neu].mean()
                avg_pos_prob_on_true_neu = probs[true_neu_idx, list(rf_model.classes_).index('Positive')].mean()
                print(f"Avg RF Neutral prob on true-Neutral rows: {avg_neu_prob_on_true:.4f}")
                print(f"Avg RF Positive prob on true-Neutral rows: {avg_pos_prob_on_true_neu:.4f}")
                # show few worst true-Neutral rows by Neutral prob
                worst = np.argsort(probs[true_neu_idx, idx_neu])[:5]
                print("\nTop 5 true-Neutral rows with lowest Neutral prob (global_index, pred, neu_prob, pos_prob):")
                for r in worst:
                    gi = true_neu_idx[r]
                    print(gi, y_pred[gi], probs[gi, idx_neu].round(4), probs[gi, list(rf_model.classes_).index('Positive')].round(4))
    else:
        print("RF has no predict_proba")

    # feature importances (map to TF-IDF features if available)
    try:
        fi = rf_model.feature_importances_
        feat_names = tfidf_vectorizer.get_feature_names_out()
        top_idx = np.argsort(fi)[-20:][::-1]
        print("\nTop RF feature importances (top 20):")
        for ix in top_idx:
            print(f"{feat_names[ix]}: {fi[ix]:.5f}")
    except Exception as e:
        print("Feature importances unavailable or mapping failed:", e)

NameError: name 'y_train' is not defined