In [None]:
# Cell 1: Imports and setup
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make plots appear in the notebook.
# `%matplotlib inline` is an IPython line magic, so this cell only runs inside
# an IPython/Jupyter kernel, not as a plain Python script.
%matplotlib inline

# Simple confirmation that the setup cell ran to the end without raising.
print("Environment ready! 🚀")

In [None]:
# Cell 2: Load the dataset
# Requests the "raw_review_All_Beauty" configuration of the
# "McAuley-Lab/Amazon-Reviews-2023" dataset through `load_dataset`.
print("Loading Amazon Reviews dataset...")
# NOTE(review): trust_remote_code=True presumably allows the dataset
# repository's own loading script to run locally — confirm the source is
# trusted and that the installed `datasets` version still accepts this flag.
dataset = load_dataset("McAuley-Lab/Amazon-Reviews-2023", 
                      "raw_review_All_Beauty", 
                      trust_remote_code=True)

# Convert the 'full' split to a pandas DataFrame; later cells read `df`.
df = dataset['full'].to_pandas()
print(f"Loaded {len(df)} reviews!")
# Last expression of the cell, so the notebook renders it as the cell output.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# First, let's work with a manageable subset for development
# Start with 10k samples, scale up later
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Working with {len(df_sample)} samples for initial development")

# Create features (review text) and target (star rating)
X = df_sample['text'].copy()
y = df_sample['rating'].copy()

# Split the data: 70% train, 15% validation, 15% test.
# Step 1: hold out 15% of the sample as the test set, stratified on rating.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y
)

# Step 2: carve the validation set out of the remainder. Passing the test-set
# row count as an absolute `test_size` makes validation exactly as large as
# test, instead of relying on a rounded fraction (0.176 ~= 0.15 / 0.85) that
# leaves the two hold-out sets slightly different in size.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=len(X_test), random_state=42, stratify=y_temp
)

# Sanity check: the three splits must partition the sample exactly.
assert len(X_train) + len(X_val) + len(X_test) == len(df_sample)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Verify class distribution is maintained (percent of each rating per split)
print("\nClass distribution in splits:")
for split_name, split_y in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
    dist = split_y.value_counts(normalize=True).sort_index() * 100
    print(f"{split_name}: {dict(dist.round(1))}")

In [None]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def preprocess_text(text):
    """
    Normalize a raw review string before vectorization.

    Steps, in order: lowercase the text, strip URL-like tokens, then collapse
    every run of whitespace to a single space and trim the ends. Whitespace is
    normalized last so that the gaps left behind by removed URLs are cleaned
    up as well.

    Args:
        text: Raw review text. A missing value (anything ``pd.isna`` reports
            as missing, e.g. ``None`` or ``NaN``) is treated as an empty review.

    Returns:
        str: The cleaned text, or "" when the input is missing.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs. The `http\S+` alternative already matches https links, so
    # only the bare `www...` form needs a second alternative.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Collapse whitespace after URL removal so no double/trailing spaces remain
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run the same cleaning function over every split (train, then val, then test)
print("Preprocessing text...")
X_train_clean, X_val_clean, X_test_clean = (
    split.apply(preprocess_text) for split in (X_train, X_val, X_test)
)

# Show the first three training reviews before and after cleaning,
# truncated to 100 characters each
print("Sample preprocessed texts:")
for i in (0, 1, 2):
    print(f"\nOriginal: {X_train.iloc[i][:100]}...")
    print(f"Cleaned:  {X_train_clean.iloc[i][:100]}...")

In [None]:
# Set up TF-IDF vectorizer for baseline model.
# Turns the cleaned review text into sparse numeric features for the classifier.
vectorizer = TfidfVectorizer(
    max_features=5000,      # Keep at most 5k features
    stop_words='english',   # Drop common English stop words
    ngram_range=(1, 2),     # Use unigrams and bigrams
    min_df=2,               # Ignore terms appearing in < 2 documents
    max_df=0.95             # Ignore terms appearing in > 95% of documents
)

# Fit only on training data so the vocabulary and IDF weights never see
# validation/test text; those splits are only transformed with the fitted
# vectorizer.
print("Fitting TF-IDF vectorizer...")
X_train_vectors = vectorizer.fit_transform(X_train_clean)
X_val_vectors = vectorizer.transform(X_val_clean)
X_test_vectors = vectorizer.transform(X_test_clean)

print(f"Feature matrix shape: {X_train_vectors.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Look at some features (the first 20 entries of the fitted feature-name array)
feature_names = vectorizer.get_feature_names_out()
print(f"\nSample features: {feature_names[:20]}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Train a multi-class Random Forest on the TF-IDF features and report its
# performance on the validation split.
print("Training Random Forest for Multi-Class Classification (1-5 stars)")
print("=" * 70)

# Multi-class target (1-5 stars): independent copies of the split labels.
# y_test_multi is prepared here but not used in this cell.
y_train_multi = y_train.copy()
y_val_multi = y_val.copy()
y_test_multi = y_test.copy()

# Initialize Random Forest; depth and sample limits are set to curb overfitting
rf_multi = RandomForestClassifier(
    n_estimators=100,        # Number of trees
    max_depth=20,           # Cap tree depth (limits overfitting)
    min_samples_split=10,   # Minimum samples needed to split a node
    min_samples_leaf=5,     # Minimum samples required in each leaf
    class_weight='balanced', # Handle class imbalance
    random_state=42,         # Fixed seed for reproducible runs
    n_jobs=-1               # Use all CPU cores
)

# Train the model on the training vectors only
print("Training Random Forest...")
rf_multi.fit(X_train_vectors, y_train_multi)

# Make predictions on the validation split: hard labels plus the probability
# output, which the confidence plot in a later cell reads.
print("Making predictions...")
y_val_pred_rf = rf_multi.predict(X_val_vectors)
y_val_prob_rf = rf_multi.predict_proba(X_val_vectors)

# Evaluate performance (overall accuracy on validation)
accuracy_rf = accuracy_score(y_val_multi, y_val_pred_rf)
print(f"Random Forest Validation Accuracy: {accuracy_rf:.3f}")

# Per-class precision/recall/F1 breakdown
print("\nDetailed Classification Report:")
print(classification_report(y_val_multi, y_val_pred_rf))

In [None]:
# Validation diagnostics for the Random Forest: confusion matrix,
# prediction-confidence histogram, and per-class accuracy.

# Take the class list from the fitted model instead of hard-coding 1-5, so the
# matrix rows/columns and the tick labels always line up, even when a rating
# is missing from the validation split or is never predicted.
class_labels = rf_multi.classes_
# Show whole-number float ratings (1.0, 2.0, ...) as 1, 2, ... on the axes.
tick_labels = [
    int(label)
    if isinstance(label, (float, np.floating)) and float(label).is_integer()
    else label
    for label in class_labels
]

plt.figure(figsize=(15, 5))

# Random Forest Confusion Matrix (rows = actual, columns = predicted)
plt.subplot(1, 3, 1)
cm_rf = confusion_matrix(y_val_multi, y_val_pred_rf, labels=class_labels)
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels)
plt.title('Random Forest Confusion Matrix')
plt.ylabel('Actual Rating')
plt.xlabel('Predicted Rating')

# Prediction Confidence Distribution: highest class probability per review
plt.subplot(1, 3, 2)
confidence_scores_rf = np.max(y_val_prob_rf, axis=1)
plt.hist(confidence_scores_rf, bins=30, alpha=0.7, edgecolor='black', color='green')
plt.title('RF Prediction Confidence')
plt.xlabel('Confidence Score')
plt.ylabel('Count')

# Class-wise Performance
plt.subplot(1, 3, 3)
# Per-class accuracy = correctly predicted reviews of a class / all reviews
# that actually belong to it, i.e. the confusion-matrix diagonal divided by
# the row totals. A class with no validation samples is reported as 0.
row_totals = cm_rf.sum(axis=1)
class_accuracies = [
    correct / total if total > 0 else 0
    for correct, total in zip(cm_rf.diagonal(), row_totals)
]

bar_positions = range(len(class_labels))
plt.bar(bar_positions, class_accuracies,
        color=['red', 'orange', 'yellow', 'lightgreen', 'green'])
plt.xticks(bar_positions, tick_labels)
plt.title('Per-Class Accuracy')
plt.xlabel('Star Rating')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

plt.tight_layout()
plt.show()

print(f"Average prediction confidence: {confidence_scores_rf.mean():.3f}")

In [None]:
# Rank the TF-IDF features by the forest's importance scores
feature_names = vectorizer.get_feature_names_out()
importance_scores = rf_multi.feature_importances_

# One row per feature, highest importance first
feature_importance = (
    pd.DataFrame({'feature': feature_names, 'importance': importance_scores})
    .sort_values('importance', ascending=False)
)
top_features = feature_importance.head(20)

print("Top 20 Most Important Features:")
print(top_features)

# Horizontal bar chart of the top 20, most important at the top
plt.figure(figsize=(12, 8))
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 20 Most Important Features (Random Forest)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Check which hand-picked sentiment words made it into the top 20
print("\nInteresting patterns in top features:")
top_20_features = list(top_features['feature'])
print(
    "Words that might indicate positive sentiment:",
    [word for word in top_20_features
     if word in {'great', 'amazing', 'excellent', 'love', 'perfect', 'best'}],
)
print(
    "Words that might indicate negative sentiment:",
    [word for word in top_20_features
     if word in {'bad', 'terrible', 'awful', 'hate', 'worst', 'horrible'}],
)

In [None]:
def explain_rf_prediction(text, model, vectorizer, feature_names, top_n=10):
    """Print a Random Forest prediction for one review with supporting detail.

    The review is cleaned with ``preprocess_text``, vectorized, and scored.
    The function then prints the predicted rating, the probability for every
    class, and the review's features ranked by importance.

    Note that the "importance" shown is the model's global
    ``feature_importances_`` value for each feature that occurs in the
    review. It is not a per-prediction contribution.

    Args:
        text: Raw review text.
        model: Fitted classifier exposing ``predict``, ``predict_proba``,
            ``classes_`` and ``feature_importances_``.
        vectorizer: Fitted vectorizer whose ``transform`` output supports
            ``toarray()``.
        feature_names: Sequence of feature names indexed like the
            vectorizer's output columns.
        top_n: Maximum number of features to list (default 10).

    Returns:
        None. All results are printed.
    """
    # Preprocess and vectorize
    clean_text = preprocess_text(text)
    text_vector = vectorizer.transform([clean_text])

    # Get prediction and probabilities
    prediction = model.predict(text_vector)[0]
    probabilities = model.predict_proba(text_vector)[0]

    print(f"Review: {text[:100]}...")
    print(f"Predicted Rating: {prediction}")
    print("Probability distribution:")
    # Pair each probability with its own class label rather than inferring the
    # label from the position, and show whole-number ratings without a
    # trailing ".0" so every line is formatted the same way.
    for class_label, prob in zip(model.classes_, probabilities):
        try:
            stars = int(class_label) if float(class_label).is_integer() else class_label
        except (TypeError, ValueError):
            # Non-numeric class labels are printed unchanged
            stars = class_label
        print(f"  {stars} stars: {prob:.3f}")

    # Collect the features that actually occur in this review
    feature_vector = text_vector.toarray()[0]
    # Read the importances once up front instead of once per present feature
    importances = model.feature_importances_
    present_features = [
        (feature_names[idx], importances[idx], value)
        for idx, value in enumerate(feature_vector)
        if value > 0  # Feature is present
    ]

    # Sort by importance, highest first
    present_features.sort(key=lambda item: item[1], reverse=True)

    print("\nTop contributing features in this review:")
    for feature, importance, tfidf_value in present_features[:top_n]:
        print(f"  '{feature}': importance={importance:.4f}, tf-idf={tfidf_value:.3f}")

# Test on some examples: hand-written reviews (not taken from the dataset)
# that are passed to explain_rf_prediction one at a time below.
test_reviews = [
    "This product is absolutely amazing! I love it so much and would highly recommend it to everyone.",
    "Terrible quality, completely broke after one day. Waste of money and very disappointed.",
    "It's okay, nothing special but does the job. Average product overall.",
    "Great value for money, works as expected and arrived quickly.",
    "Horrible experience, customer service was rude and product was damaged."
]

# Print the model's explanation for each example under a numbered header
# (examples are numbered from 1).
print("Random Forest Prediction Analysis:")
print("=" * 50)
for i, review in enumerate(test_reviews):
    print(f"\n--- Example {i+1} ---")
    explain_rf_prediction(review, rf_multi, vectorizer, feature_names)

In [None]:
# Check if we can improve with different hyperparameters
print("Model Hyperparameters:")
print("=" * 30)
print(f"Number of trees: {rf_multi.n_estimators}")
print(f"Max depth: {rf_multi.max_depth}")
print(f"Min samples split: {rf_multi.min_samples_split}")
print(f"Min samples leaf: {rf_multi.min_samples_leaf}")

# Quick hyperparameter sensitivity test
print("\nQuick hyperparameter test (this might take a moment):")

# Vary only the number of trees. Every other setting is copied from the
# baseline model, so each row is directly comparable with rf_multi. Rebuilding
# the configuration by hand dropped min_samples_split / min_samples_leaf and
# therefore compared a different model family.
baseline_params = rf_multi.get_params()
tree_counts = [50, 100, 200]
for n_trees in tree_counts:
    rf_test = RandomForestClassifier(**{**baseline_params, 'n_estimators': n_trees})
    rf_test.fit(X_train_vectors, y_train_multi)
    # Scored on the validation split; the held-out test split stays untouched
    val_acc = rf_test.score(X_val_vectors, y_val_multi)
    print(f"  {n_trees} trees: {val_acc:.3f} accuracy")