# McDonald's Reviews - Model Training and Comparison

**Step 2**: Train classical ML models and neural network, compare performance on noisy reviews.


## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

### Import Neural Network Libraries

In [2]:
# For RNN
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2026-01-12 21:51:44.330105: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2. Load Data with Noise Analysis

In [3]:
print("="*80)
print("LOADING DATA WITH NOISE ANALYSIS")
print("="*80)

df = pd.read_csv('data_with_noise_analysis.csv')

print(f"Loaded: {len(df):,} reviews")
print(f"Noisy reviews: {df['has_noise'].sum():,} ({df['has_noise'].sum()/len(df)*100:.1f}%)")
print(f"Clean reviews: {(~df['has_noise']).sum():,} ({(~df['has_noise']).sum()/len(df)*100:.1f}%)")

LOADING DATA WITH NOISE ANALYSIS
Loaded: 33,396 reviews
Noisy reviews: 18,584 (55.6%)
Clean reviews: 14,812 (44.4%)


## 3. Minimal Preprocessing

**Important**: Preserving noise for robustness testing. Only removing URLs and normalizing whitespace.

In [4]:
print("\n" + "="*80)
print("MINIMAL PREPROCESSING (PRESERVING NOISE FOR ROBUSTNESS TEST)")
print("="*80)

def minimal_clean(text):
    """
    Minimal cleaning - preserve most noise!
    Only remove URLs and extreme cases, keep typos/slang/abbreviations
    """
    text = str(text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\s+', ' ', text).strip()
    text = text.lower()
    return text

df['review_clean'] = df['review'].apply(minimal_clean)

print("✓ Minimal preprocessing applied")
print("✓ Noise preserved: typos, slang, abbreviations kept intact")


MINIMAL PREPROCESSING (PRESERVING NOISE FOR ROBUSTNESS TEST)
✓ Minimal preprocessing applied
✓ Noise preserved: typos, slang, abbreviations kept intact


### Preview Cleaned Text

In [5]:
print("\nExamples:")
for i in range(3):
    print(f"\nOriginal: {df.iloc[i]['review'][:100]}")
    print(f"Cleaned:  {df.iloc[i]['review_clean'][:100]}")


Examples:

Original: Why does it look like someone spit on my food?
I had a normal transaction,  everyone was chill and p
Cleaned:  why does it look like someone spit on my food? i had a normal transaction, everyone was chill and po

Original: It'd McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a di
Cleaned:  it'd mcdonalds. it is what it is as far as the food and atmosphere go. the staff here does make a di

Original: Made a mobile order got to the speaker and checked it in.
Line was not moving so I had to leave othe
Cleaned:  made a mobile order got to the speaker and checked it in. line was not moving so i had to leave othe


## 4. Data Splitting (Stratified)

In [6]:
print("\n" + "="*80)
print("DATA SPLITTING (STRATIFIED)")
print("="*80)

X = df['review_clean']
y = df['sentiment']

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Second split: 20% val, 20% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print(f"Train set: {len(X_train):,} samples (60%)")
print(f"Val set:   {len(X_val):,} samples (20%)")
print(f"Test set:  {len(X_test):,} samples (20%)")


DATA SPLITTING (STRATIFIED)
Train set: 20,037 samples (60%)
Val set:   6,679 samples (20%)
Test set:  6,680 samples (20%)


### Verify Stratification

In [7]:
print("\nClass distribution:")
for split_name, split_y in [('Train', y_train), ('Val', y_val), ('Test', y_test)]:
    dist = split_y.value_counts(normalize=True) * 100
    print(f"{split_name:5s}: Neg={dist['negative']:.1f}% Neu={dist['neutral']:.1f}% Pos={dist['positive']:.1f}%")


Class distribution:
Train: Neg=37.5% Neu=14.4% Pos=48.1%
Val  : Neg=37.5% Neu=14.4% Pos=48.1%
Test : Neg=37.5% Neu=14.4% Pos=48.1%


## 5. TF-IDF Vectorization

In [8]:
print("\n" + "="*80)
print("TF-IDF VECTORIZATION")
print("="*80)

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

print(f"✓ TF-IDF features created")
print(f"  Vocabulary size: {len(tfidf.vocabulary_):,}")
print(f"  Feature matrix shape: {X_train_tfidf.shape}")
print(f"  Sparsity: {(1 - X_train_tfidf.nnz / (X_train_tfidf.shape[0] * X_train_tfidf.shape[1]))*100:.1f}%")


TF-IDF VECTORIZATION
✓ TF-IDF features created
  Vocabulary size: 5,000
  Feature matrix shape: (20037, 5000)
  Sparsity: 99.5%


## 6. Train Classical ML Models

### 6.1 Baseline Model

In [9]:
print("\n" + "="*80)
print("TRAINING CLASSICAL ML MODELS")
print("="*80)

# Baseline: Majority class
baseline_class = y_train.value_counts().idxmax()
baseline_pred = [baseline_class] * len(y_val)
baseline_acc = accuracy_score(y_val, baseline_pred)
print(f"\n1. Baseline (Majority Class): {baseline_acc:.4f}")


TRAINING CLASSICAL ML MODELS

1. Baseline (Majority Class): 0.4809


### 6.2 Logistic Regression

In [10]:
print("\n2. Training Logistic Regression...")
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_tfidf, y_train)
lr_pred = lr.predict(X_val_tfidf)
lr_acc = accuracy_score(y_val, lr_pred)
print(f"   Validation Accuracy: {lr_acc:.4f}")


2. Training Logistic Regression...
   Validation Accuracy: 0.8169


### 6.3 Support Vector Machine

In [11]:
# SVM
print("\n3. Training SVM (LinearSVC)...")
svm = LinearSVC(max_iter=10000, random_state=42)
svm.fit(X_train_tfidf, y_train)
svm_pred = svm.predict(X_val_tfidf)
svm_acc = accuracy_score(y_val, svm_pred)
print(f"   Validation Accuracy: {svm_acc:.4f}")


3. Training SVM (LinearSVC)...
   Validation Accuracy: 0.8155


### 6.4 Random Forest

In [12]:
# Random Forest
print("\n4. Training Random Forest...")
rf = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=42, n_jobs=-1)
rf.fit(X_train_tfidf, y_train)
rf_pred = rf.predict(X_val_tfidf)
rf_acc = accuracy_score(y_val, rf_pred)
print(f"   Validation Accuracy: {rf_acc:.4f}")


4. Training Random Forest...
   Validation Accuracy: 0.7813


## 7. Train Neural Network (RNN-LSTM)

### 7.1 Prepare Sequences

In [13]:
print("\n" + "="*80)
print("TRAINING NEURAL NETWORK (RNN-LSTM)")
print("="*80)

# Tokenization for RNN
print("Tokenizing text for RNN...")
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

maxlen = 100
X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

print(f"✓ Sequences created (max length: {maxlen})")


TRAINING NEURAL NETWORK (RNN-LSTM)
Tokenizing text for RNN...
✓ Sequences created (max length: 100)


### 7.2 Define RNN Architecture

In [14]:
# Define RNN model
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size=10000, embedding_dim=128, hidden_dim=64, output_dim=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, (hidden, _) = self.lstm(embedded)
        hidden = self.dropout(hidden[-1])
        output = self.fc(hidden)
        return output

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentRNN().to(device)

### 7.3 Prepare Data Loaders

In [15]:
# Prepare data loaders
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
y_train_enc = y_train.map(label_map).values
y_val_enc = y_val.map(label_map).values

train_data = torch.utils.data.TensorDataset(
    torch.LongTensor(X_train_pad),
    torch.LongTensor(y_train_enc)
)
val_data = torch.utils.data.TensorDataset(
    torch.LongTensor(X_val_pad),
    torch.LongTensor(y_val_enc)
)

train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_data, batch_size=64, shuffle=False)

### 7.4 Train RNN

In [16]:
# Train RNN
print("\nTraining RNN...")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"  Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


Training RNN...
  Epoch 1/5, Loss: 0.9968
  Epoch 2/5, Loss: 0.9848
  Epoch 3/5, Loss: 0.8777
  Epoch 4/5, Loss: 0.7785
  Epoch 5/5, Loss: 0.7479


NO early stopping
NO gradient clipping
Just 5 epochs of basic training
This RNN will fail on neutral class (F1 = 0.00) the 3rd notebook of mine, I will work with improved RNN model with early stopping and class weightining


### 7.5 Evaluate RNN

In [17]:
model.eval()
rnn_predictions = []

with torch.no_grad():
    for batch_x, _ in val_loader:
        batch_x = batch_x.to(device)
        outputs = model(batch_x)
        _, predicted = torch.max(outputs, 1)
        rnn_predictions.extend(predicted.cpu().numpy())

rnn_acc = accuracy_score(y_val_enc, rnn_predictions)
print(f"\n5. RNN (LSTM) Validation Accuracy: {rnn_acc:.4f}")


5. RNN (LSTM) Validation Accuracy: 0.6857


## 8. Model Comparison

In [18]:
print("\n" + "="*80)
print("MODEL COMPARISON (VALIDATION SET)")
print("="*80)

results = pd.DataFrame({
    'Model': ['Logistic Regression', 'SVM', 'Random Forest', 'RNN-LSTM', 'Baseline'],
    'Accuracy': [lr_acc, svm_acc, rf_acc, rnn_acc, baseline_acc]
}).sort_values('Accuracy', ascending=False)

print("\n" + results.to_string(index=False))


MODEL COMPARISON (VALIDATION SET)

              Model  Accuracy
Logistic Regression  0.816889
                SVM  0.815541
      Random Forest  0.781255
           RNN-LSTM  0.685731
           Baseline  0.480910


### Visualization

In [None]:
plt.figure(figsize=(12, 6))
colors = ['blue', 'green', 'orange', 'purple', 'red']
bars = plt.bar(results['Model'], results['Accuracy'], color=colors)
plt.ylabel('Validation Accuracy', fontsize=12)
plt.title('Model Performance Comparison on Noisy Reviews', fontsize=14, fontweight='bold')
plt.ylim([0, 1])
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

for bar, val in zip(bars, results['Accuracy']):
    plt.text(bar.get_x() + bar.get_width()/2, val + 0.02, 
             f'{val:.4f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('model_comparison_validation.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n Saved: model_comparison_validation.png")

## 9. Save Models and Prepared Data

In [None]:
print("\n" + "="*80)
print("SAVING MODELS AND PREPARED DATA")
print("="*80)

import joblib

# Save models
joblib.dump(lr, 'model_logistic_regression.pkl')
joblib.dump(svm, 'model_svm.pkl')
joblib.dump(rf, 'model_random_forest.pkl')
torch.save(model.state_dict(), 'model_rnn_lstm.pth')

# Save vectorizer and tokenizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(tokenizer, 'rnn_tokenizer.pkl')

In [None]:
# Save splits with noise annotations
test_df = pd.DataFrame({
    'review': X_test.values,
    'sentiment': y_test.values,
    'review_index': X_test.index
})
test_df = test_df.merge(df[['total_noise', 'noise_level', 'has_noise']], 
                        left_on='review_index', right_index=True)
test_df.to_csv('test_set_with_noise.csv', index=False)

print("✓ Saved all models")
print("✓ Saved vectorizer and tokenizer")
print("✓ Saved test set with noise annotations")

## Summary

In [None]:
print(f"\n{'='*80}")
print("STEP 2 COMPLETE!")
print(f"{'='*80}")
print(f"""
Models trained:
- Logistic Regression: {lr_acc:.4f}
- SVM (LinearSVC): {svm_acc:.4f} ← BEST
- Random Forest: {rf_acc:.4f}
- RNN-LSTM: {rnn_acc:.4f}

Ready for noise robustness analysis in Step 3!
""")