In [None]:
# Cell 1: Imports and setup
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# IPython line magic: render matplotlib figures inline in the notebook output
%matplotlib inline

# Confirmation message printed once the cell has run to this point
print("Environment ready! 🚀")

In [None]:
# Cell 2: Load the dataset
print("Loading Amazon Reviews dataset...")

# NOTE(review): trust_remote_code=True presumably allows `datasets` to run the
# dataset repository's own loading script -- confirm the source is trusted.
dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
)

# Materialise the 'full' split as a pandas DataFrame for exploration
full_split = dataset['full']
df = full_split.to_pandas()
print(f"Loaded {len(df)} reviews!")

# Last expression of the cell: rich preview of the first rows
df.head()

In [None]:
# Cell 3: Quick exploration
print("Dataset shape:", df.shape)
print("\nRating distribution:")

# Number of reviews per rating value, displayed in ascending rating order
rating_counts = df['rating'].value_counts()
rating_counts.sort_index()

In [None]:
# Most common words (basic analysis)
from collections import Counter
import re

def clean_text_basic(text):
    """Lowercase a review and strip everything except letters and whitespace.

    Args:
        text: Raw review text; may be a missing value (None / NaN).

    Returns:
        The lowercased text with every character that is not an ASCII letter
        or whitespace removed, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""
    lowered = text.lower()
    # Whitespace is kept so the caller can still split the result into words.
    return re.sub(r'[^a-zA-Z\s]', '', lowered)

# Sample analysis on first 1000 reviews for speed
sample_df = df.head(1000).copy()
sample_df['clean_text'] = sample_df['text'].apply(clean_text_basic)

# Flatten every cleaned review into one list of whitespace-separated tokens
all_words = [
    word
    for cleaned_review in sample_df['clean_text']
    for word in cleaned_review.split()
]

# Tally token frequencies and show the 20 most frequent
word_counts = Counter(all_words)
print("Most common words:")
print(word_counts.most_common(20))

In [None]:
# Check for potential issues
print("Data Quality Checks:")
print("=" * 50)

review_text = df['text']
review_rating = df['rating']

# Character length of each review; stored on df as a new column
df['text_length'] = review_text.str.len()

# 1. Missing text or ratings
missing_text_count = review_text.isnull().sum()
missing_rating_count = review_rating.isnull().sum()
print(f"Reviews with missing text: {missing_text_count}")
print(f"Reviews with missing ratings: {missing_rating_count}")

# 2. Invalid ratings (anything outside the inclusive range 1..5)
valid_ratings = review_rating.between(1, 5)
print(f"Invalid ratings: {(~valid_ratings).sum()}")

# 3. Duplicate reviews (keep=False flags every member of a duplicated group)
duplicates = df.duplicated(subset=['text'], keep=False)
print(f"Potential duplicate reviews: {duplicates.sum()}")

# 4. Empty or very short reviews (length measured after stripping whitespace)
stripped_length = review_text.str.strip().str.len()
empty_reviews = stripped_length < 10
print(f"Very short reviews (< 10 chars): {empty_reviews.sum()}")

# 5. Extremely long reviews (might be spam)
very_long = df['text_length'] > 5000
print(f"Extremely long reviews (> 5000 chars): {very_long.sum()}")

In [None]:
# Look at examples by rating
def show_sample_reviews(frame, rating, n_samples=3, max_chars=200):
    """Print and return the first few review texts that have a given rating.

    Args:
        frame: DataFrame with 'rating' and 'text' columns.
        rating: Rating value to select (compared with ``==``).
        n_samples: Maximum number of reviews to show.
        max_chars: Maximum number of characters printed per review.

    Returns:
        The selected 'text' Series (at most ``n_samples`` rows).
    """
    samples = frame.loc[frame['rating'] == rating, 'text'].head(n_samples)
    for i, review in enumerate(samples, 1):
        if pd.isna(review):
            # Slicing a missing value would raise TypeError; show a placeholder.
            print(f"{i}. <missing text>")
            continue
        # Only add an ellipsis when the review was actually truncated.
        ellipsis = "..." if len(review) > max_chars else ""
        print(f"{i}. {review[:max_chars]}{ellipsis}")
    return samples


print("\nSample 1-star reviews:")
one_star = show_sample_reviews(df, 1.0)

print("\nSample 5-star reviews:")
five_star = show_sample_reviews(df, 5.0)

In [None]:
# ML problem framing and a look at the target distribution
print("ML Problem Definition:")
print("Input: Review text")
print("Output: Star rating (1-5)")
print("Task: Multi-class classification")

# Check the target distribution
print("\nTarget variable distribution:")
rating_dist = df['rating'].value_counts().sort_index()
print(rating_dist)

# Percentage of all rows per rating. Each value is formatted individually so
# the "%" sign sits next to the number instead of after the Series footer.
class_balance_pct = (rating_dist / len(df) * 100).round(1)
print("\nClass balance:")
print(class_balance_pct.map("{:.1f}%".format).to_string())

In [None]:
from sklearn.model_selection import train_test_split

# First, let's work with a manageable subset for development
# Start with 10k samples, scale up later
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Working with {len(df_sample)} samples for initial development")

# Create features and target
X = df_sample['text'].copy()
y = df_sample['rating'].copy()

# Split the data: 70% train, 15% validation, 15% test
holdout_fraction = 0.15
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=holdout_fraction, random_state=42, stratify=y
)

# An integer test_size is an absolute row count, so the validation set gets
# exactly as many rows as the test set (no hand-computed 0.176 approximation).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=len(X_test), random_state=42, stratify=y_temp
)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Verify class distribution is maintained
print("\nClass distribution in splits:")
for split_name, split_y in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
    dist = split_y.value_counts(normalize=True).sort_index() * 100
    # to_dict() yields plain Python floats, keeping the printout free of
    # NumPy scalar reprs.
    print(f"{split_name}: {dist.round(1).to_dict()}")

In [None]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def preprocess_text(text):
    """Normalise a raw review for vectorisation.

    Steps, in order: lowercase, remove URL-like tokens, collapse runs of
    whitespace into single spaces and trim the ends. URLs are removed *before*
    whitespace is normalised so that deleting one never leaves a double or
    trailing space behind.

    Args:
        text: Raw review text; may be a missing value (None / NaN).

    Returns:
        The cleaned string, or "" when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URL-like tokens. 'http\S+' already covers 'https...', so no
    # separate alternative is needed for it.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Collapse whitespace last so the gaps left by removed URLs are cleaned up
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Apply preprocessing
print("Preprocessing text...")
# Clean each split with the same function, in train / val / test order
X_train_clean, X_val_clean, X_test_clean = (
    split_text.apply(preprocess_text)
    for split_text in (X_train, X_val, X_test)
)

print("Sample preprocessed texts:")
# Show the first three training reviews before and after cleaning (100 chars)
for i in range(3):
    original_preview = X_train.iloc[i][:100]
    cleaned_preview = X_train_clean.iloc[i][:100]
    print(f"\nOriginal: {original_preview}...")
    print(f"Cleaned:  {cleaned_preview}...")

In [None]:
# Set up TF-IDF vectorizer for baseline model
tfidf_settings = {
    'max_features': 5000,      # Start with 5k features
    'stop_words': 'english',   # Remove common words
    'ngram_range': (1, 2),     # Unigrams and bigrams
    'min_df': 2,               # Ignore terms appearing in < 2 documents
    'max_df': 0.95,            # Ignore terms appearing in > 95% of documents
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Fit only on training data; validation and test are transformed with the
# vocabulary learned from the training split.
print("Fitting TF-IDF vectorizer...")
X_train_vectors = vectorizer.fit_transform(X_train_clean)
X_val_vectors, X_test_vectors = (
    vectorizer.transform(cleaned_split)
    for cleaned_split in (X_val_clean, X_test_clean)
)

print(f"Feature matrix shape: {X_train_vectors.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")

# Look at some features
feature_names = vectorizer.get_feature_names_out()
print(f"\nSample features: {feature_names[:20]}")

In [None]:
# For most algorithms, we can use ratings directly
# But let's also prepare a binary version for experimentation

# Multi-class target (1-5 stars): independent copies of each split's ratings
y_train_multi, y_val_multi, y_test_multi = (
    split_ratings.copy() for split_ratings in (y_train, y_val, y_test)
)

# Binary target (positive vs negative sentiment)
# 1-2 stars = negative (0), 4-5 stars = positive (1), ignore 3 stars
def convert_to_binary(rating):
    """Map a star rating to a binary sentiment label.

    Args:
        rating: Numeric star rating.

    Returns:
        1 for ratings >= 4 (positive), 0 for ratings <= 2 (negative), and
        None for anything in between (neutral; filtered out by the caller).
    """
    if rating >= 4:
        return 1  # Positive
    if rating <= 2:
        return 0  # Negative
    return None  # Neutral - we'll filter these out

# Create binary datasets
# Boolean NumPy masks: True where the rating maps to a binary label
# (convert_to_binary returns None for neutral ratings, which notna() rejects).
binary_mask_train = y_train.apply(convert_to_binary).notna().values
binary_mask_val = y_val.apply(convert_to_binary).notna().values
binary_mask_test = y_test.apply(convert_to_binary).notna().values

# Keep only the labelled rows, then convert their ratings to 0/1
y_train_binary = y_train[binary_mask_train].apply(convert_to_binary)
y_val_binary = y_val[binary_mask_val].apply(convert_to_binary)
y_test_binary = y_test[binary_mask_test].apply(convert_to_binary)

# Select the matching rows of each feature matrix with the same masks
X_train_vectors_binary = X_train_vectors[binary_mask_train]
X_val_vectors_binary = X_val_vectors[binary_mask_val]
X_test_vectors_binary = X_test_vectors[binary_mask_test]

print(f"Multi-class dataset: {len(y_train_multi)} train samples")
print(f"Binary dataset: {len(y_train_binary)} train samples")
print(f"Binary class distribution: {y_train_binary.value_counts().to_dict()}")

In [None]:
import pickle
import os

# Create a data directory
output_dir = 'data/processed'
os.makedirs(output_dir, exist_ok=True)

# Save the preprocessed data
data_to_save = {
    # Multi-class features and targets
    'X_train_vectors': X_train_vectors,
    'X_val_vectors': X_val_vectors,
    'X_test_vectors': X_test_vectors,
    'y_train_multi': y_train_multi,
    'y_val_multi': y_val_multi,
    'y_test_multi': y_test_multi,
    # Binary (positive vs negative) features and targets
    'X_train_vectors_binary': X_train_vectors_binary,
    'X_val_vectors_binary': X_val_vectors_binary,
    'X_test_vectors_binary': X_test_vectors_binary,
    'y_train_binary': y_train_binary,
    'y_val_binary': y_val_binary,
    'y_test_binary': y_test_binary,
    # Fitted vectorizer and the raw text of each split
    'vectorizer': vectorizer,
    'original_text_train': X_train,
    'original_text_val': X_val,
    'original_text_test': X_test
}

# Serialise everything into a single pickle file
output_path = 'data/processed/preprocessed_data.pkl'
with open(output_path, 'wb') as out_file:
    pickle.dump(data_to_save, out_file)

print("Preprocessed data saved to 'data/processed/preprocessed_data.pkl'")
print("\nReady for model training! 🎯")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Start with binary classification (easier to interpret)
print("Training Logistic Regression for Binary Classification (Positive vs Negative)")
print("=" * 60)

# Initialize the model
lr_binary = LogisticRegression(
    random_state=42,
    max_iter=1000,  # Increase iterations for convergence
    C=1.0           # Regularization strength (lower = more regularization)
)

# Train the model
print("Training...")
lr_binary.fit(X_train_vectors_binary, y_train_binary)

# Make predictions (hard labels plus per-class probabilities)
y_val_pred_binary = lr_binary.predict(X_val_vectors_binary)
y_val_prob_binary = lr_binary.predict_proba(X_val_vectors_binary)

# Evaluate performance
accuracy = accuracy_score(y_val_binary, y_val_pred_binary)
print(f"Validation Accuracy: {accuracy:.3f}")

print("\nDetailed Classification Report:")
# labels=[0, 1] pins the row order to the names below, so the report stays
# correct (and does not raise) even if one class is absent from validation.
print(classification_report(y_val_binary, y_val_pred_binary,
                            labels=[0, 1],
                            target_names=['Negative (1-2★)', 'Positive (4-5★)']))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Two side-by-side panels: confusion matrix and prediction confidence
fig, (ax_cm, ax_conf) = plt.subplots(1, 2, figsize=(12, 5))

# Confusion Matrix
# labels=[0, 1] fixes the row/column order so it always matches the
# Negative/Positive tick labels, even if a class is missing from the data.
sentiment_names = ['Negative', 'Positive']
cm = confusion_matrix(y_val_binary, y_val_pred_binary, labels=[0, 1])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=sentiment_names,
            yticklabels=sentiment_names,
            ax=ax_cm)
ax_cm.set_title('Confusion Matrix')
ax_cm.set_ylabel('Actual')
ax_cm.set_xlabel('Predicted')

# Prediction Confidence Distribution
# Get confidence scores (max probability across the two classes, per sample)
confidence_scores = np.max(y_val_prob_binary, axis=1)
ax_conf.hist(confidence_scores, bins=30, alpha=0.7, edgecolor='black')
ax_conf.set_title('Prediction Confidence Distribution')
ax_conf.set_xlabel('Confidence Score')
ax_conf.set_ylabel('Count')

fig.tight_layout()
plt.show()

print(f"Average prediction confidence: {confidence_scores.mean():.3f}")

In [None]:
# Get feature names and coefficients
feature_names = vectorizer.get_feature_names_out()
coefficients = lr_binary.coef_[0]

# Create a dataframe for easier analysis, ordered by absolute coefficient size
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coefficient': coefficients
}).sort_values('coefficient', key=abs, ascending=False)

# Select by *signed* coefficient: positive weights push towards label 1
# (positive sentiment), negative weights towards label 0. Taking head/tail of
# the |coefficient|-sorted frame would mix signs and surface near-zero
# features instead.
print("Top 15 features for POSITIVE sentiment:")
positive_features = feature_importance.nlargest(15, 'coefficient')
print(positive_features[['feature', 'coefficient']])

print("\nTop 15 features for NEGATIVE sentiment:")
negative_features = feature_importance.nsmallest(15, 'coefficient')
print(negative_features[['feature', 'coefficient']])

# Visualize feature importance: 10 strongest features per direction, ordered
# from most negative to most positive.
top_features = pd.concat(
    [negative_features.head(10), positive_features.head(10)]
).sort_values('coefficient')
bar_colors = ['red' if coef < 0 else 'green' for coef in top_features['coefficient']]

fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(range(len(top_features)), top_features['coefficient'], color=bar_colors)
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Coefficient Value')
ax.set_title('Top Features: Negative (Red) vs Positive (Green) Sentiment')
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
def predict_sentiment(text, model, vectorizer):
    """Classify a single review with a fitted binary model.

    Args:
        text: Raw review text.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A ``(sentiment, confidence)`` tuple: ``sentiment`` is "Positive" when
        the predicted label equals 1 and "Negative" otherwise; ``confidence``
        is the largest predicted class probability.
    """
    # Same cleaning as the training data, then vectorize as a one-item batch
    features = vectorizer.transform([preprocess_text(text)])

    predicted_label = model.predict(features)[0]
    class_probabilities = model.predict_proba(features)[0]

    if predicted_label == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"

    return sentiment, max(class_probabilities)

# Test some examples
test_reviews = [
    "This product is absolutely amazing! I love it so much!",
    "Terrible quality, completely broke after one day. Waste of money.",
    "It's okay, nothing special but does the job.",
    "Outstanding customer service and fast shipping. Highly recommend!",
    "Cheap plastic, looks nothing like the pictures. Very disappointed."
]

print("Testing Individual Predictions:")
print("=" * 50)

# Score every example first, then report each one with its first 50 characters
predictions = [
    predict_sentiment(sample, lr_binary, vectorizer) for sample in test_reviews
]
for review, (sentiment, confidence) in zip(test_reviews, predictions):
    print(f"Review: {review[:50]}...")
    print(f"Prediction: {sentiment} (confidence: {confidence:.3f})")
    print()

In [None]:
print("Training Logistic Regression for Multi-Class Classification (1-5 stars)")
print("=" * 70)
from sklearn.multiclass import OneVsRestClassifier

# Train multi-class model: One-vs-Rest wraps one class-weight-balanced binary
# logistic regression per rating value.
lr_multi = OneVsRestClassifier(LogisticRegression(
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
    C=1.0  # Regularization strength
))

lr_multi.fit(X_train_vectors, y_train_multi)

# Predict
y_val_pred_multi = lr_multi.predict(X_val_vectors)

# Evaluate
accuracy_multi = accuracy_score(y_val_multi, y_val_pred_multi)
print(f"Multi-class Validation Accuracy: {accuracy_multi:.3f}")

print("\nDetailed Classification Report:")
print(classification_report(y_val_multi, y_val_pred_multi))

# Confusion matrix for multi-class
# Rows/columns are pinned to the classes the model was trained on, and the
# tick labels are derived from the same array, so labels can never drift out
# of step with the matrix when a rating is missing from the validation data.
class_labels = lr_multi.classes_
tick_labels = [f"{label:g}" for label in class_labels]
cm_multi = confusion_matrix(y_val_multi, y_val_pred_multi, labels=class_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm_multi, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_title('Multi-Class Confusion Matrix')
ax.set_ylabel('Actual Rating')
ax.set_xlabel('Predicted Rating')
plt.show()