In [None]:
# Cell 1: Imports and setup
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Make plots appear in the notebook
%matplotlib inline

print("Environment ready! ðŸš€")

In [None]:
# Cell 2: Fetch the All_Beauty review config and materialize it as a DataFrame
print("Loading Amazon Reviews dataset...")

# NOTE(review): trust_remote_code=True presumably allows the dataset repo's own
# loading script to execute locally -- confirm that is acceptable here.
dataset = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_review_All_Beauty",
    trust_remote_code=True,
)

# Only the 'full' split is used; convert it to pandas for the rest of the notebook.
df = dataset['full'].to_pandas()
print(f"Loaded {len(df)} reviews!")

# Last expression of the cell: rich preview of the first rows
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# First, let's work with a manageable subset for development
# Start with 10k samples, scale up later
sample_size = min(10000, len(df))
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)

print(f"Working with {len(df_sample)} samples for initial development")

# Create features and target
X = df_sample['text'].copy()
y = df_sample['rating'].copy()

# Split the data: 70% train, 15% validation, 15% test.
# The split sizes are passed as absolute row counts derived from the stated
# fractions. A hand-rounded relative fraction for the second split
# (0.176 ≈ 0.15 / 0.85) makes the validation set slightly smaller than the
# test set; integer counts keep both hold-out sets the same size.
test_fraction = 0.15
val_fraction = 0.15
n_test = round(test_fraction * len(X))
n_val = round(val_fraction * len(X))

# Carve off the test set first, stratified on the rating
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=n_test, random_state=42, stratify=y
)

# Then carve the validation set out of the remainder, again stratified
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=n_val, random_state=42, stratify=y_temp
)

# Sanity check: the three splits must partition the sample exactly
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print(f"Train set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Verify class distribution is maintained
print("\nClass distribution in splits:")
for split_name, split_y in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
    # Percentage of each rating value within the split, ordered by rating
    dist = split_y.value_counts(normalize=True).sort_index() * 100
    print(f"{split_name}: {dict(dist.round(1))}")

In [None]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def preprocess_text(text):
    """
    Normalize a single review string for downstream vectorization.

    Steps, in order: lowercase, strip URLs, collapse whitespace. URLs are
    removed *before* whitespace is normalized so that deleting a URL never
    leaves a doubled or trailing space behind.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Anything ``pd.isna`` reports as missing
        (``None``, ``NaN``) is treated as an empty review.

    Returns
    -------
    str
        Lowercased text with URLs removed and every run of whitespace
        collapsed to a single space; ``""`` for missing input.
    """
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = text.lower()

    # Remove URLs. A token only counts as a URL when it starts, at a word
    # boundary, with "http://", "https://" or "www."; an unanchored `www\S+`
    # would also eat the tail of ordinary words such as "awwww". The match is
    # replaced by a space so neighbouring words can never be glued together.
    text = re.sub(r'\b(?:https?://|www\.)\S+', ' ', text)

    # Collapse whitespace last so gaps left by removed URLs are cleaned up too
    return re.sub(r'\s+', ' ', text).strip()

# Clean every split with the same preprocessing function
print("Preprocessing text...")
X_train_clean, X_val_clean, X_test_clean = (
    split_texts.apply(preprocess_text)
    for split_texts in (X_train, X_val, X_test)
)

# Multi-class target (1-5 stars): independent copies of the rating series
y_train_multi, y_val_multi, y_test_multi = (
    split_labels.copy()
    for split_labels in (y_train, y_val, y_test)
)

# Show the first three training texts before and after cleaning
print("Sample preprocessed texts:")
for row in range(3):
    raw_snippet = X_train.iloc[row][:100]
    print(f"\nOriginal: {raw_snippet}...")
    clean_snippet = X_train_clean.iloc[row][:100]
    print(f"Cleaned:  {clean_snippet}...")

In [None]:
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
import numpy as np

# Train Word2Vec on your data
def create_word2vec_features(texts, vector_size=100, window=5, min_count=2,
                             workers=4, seed=1):
    """
    Train a skip-gram Word2Vec model on `texts` and build a document embedder.

    Parameters
    ----------
    texts : iterable of str
        Already-cleaned documents; each one is tokenized with ``str.split()``.
    vector_size : int, default 100
        Dimensionality of the word vectors (and of the document vectors).
    window : int, default 5
        Context window size passed to Word2Vec.
    min_count : int, default 2
        Words seen fewer times than this are dropped from the vocabulary.
    workers : int, default 4
        Number of training threads passed to Word2Vec.
        NOTE(review): gensim documents that fully reproducible training needs
        a single worker (and a fixed PYTHONHASHSEED) -- pass ``workers=1``
        when run-to-run identical vectors matter.
    seed : int, default 1
        Seed passed to Word2Vec. The default is presumably gensim's own
        default, so existing callers see no change -- verify against the
        installed gensim version.

    Returns
    -------
    (w2v_model, text_to_vector)
        The trained model and a function mapping one text to the mean of the
        vectors of its in-vocabulary words.
    """
    # Tokenize texts for Word2Vec
    tokenized_texts = [text.split() for text in texts]

    # Train Word2Vec model
    w2v_model = Word2Vec(
        sentences=tokenized_texts,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=seed,
        sg=1,  # Skip-gram
    )

    keyed_vectors = w2v_model.wv

    def text_to_vector(text):
        """Convert text to the average of its in-vocabulary word vectors."""
        word_vectors = [
            keyed_vectors[word] for word in text.split() if word in keyed_vectors
        ]
        if word_vectors:
            return np.mean(word_vectors, axis=0)
        # No known words: fall back to a zero vector. Match the dtype of the
        # trained vectors so one empty document does not upcast the whole
        # stacked feature matrix to float64.
        return np.zeros(vector_size, dtype=keyed_vectors.vectors.dtype)

    return w2v_model, text_to_vector

# Fit the embedding model on the cleaned training texts
print("\nTraining Word2Vec...")
w2v_model, text_to_vector = create_word2vec_features(X_train_clean)

# Embed the train and validation texts, one averaged vector per document
X_train_w2v = np.array(list(map(text_to_vector, X_train_clean)))
X_val_w2v = np.array(list(map(text_to_vector, X_val_clean)))

print(f"Word2Vec Shape: {X_train_w2v.shape}")
print("Sample vector:", X_train_w2v[0][:10])

# Baseline classifier on the averaged embeddings, scored on the validation split
lr_w2v = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
)
lr_w2v.fit(X_train_w2v, y_train_multi)
accuracy_w2v = lr_w2v.score(X_val_w2v, y_val_multi)
print(f"Word2Vec Accuracy: {accuracy_w2v:.3f}")

# Qualitative check: nearest neighbours of a sentiment word, if it is in the vocabulary
print("\nWords similar to 'amazing':")
if 'amazing' in w2v_model.wv:
    similar_words = w2v_model.wv.most_similar('amazing', topn=5)
    for neighbour, score in similar_words:
        print(f"  {neighbour}: {score:.3f}")