# Cell Phone Reviews Sentiment Analysis
## Complete Pipeline: Data Exploration to Deployment

This notebook implements a comprehensive sentiment analysis pipeline for cell phone and accessories reviews, covering all steps from data exploration to model deployment.

### Project Overview
- **Dataset**: Cell_Phones_and_Accessories_5.json (194,439+ reviews)
- **Goal**: Build a robust sentiment analysis system
- **Approach**: Classical ML + Deep Learning models
- **Deployment**: Flask API + Web Interface

### Pipeline Steps
1. **Data Exploration & Understanding**
2. **Data Preprocessing**  
3. **Feature Extraction**
4. **Model Training & Evaluation**
5. **Model Improvement**
6. **Deployment Preparation**

In [5]:
# Import Required Libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
import re
import string
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# NLP Libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from textblob import TextBlob

# ML Libraries
import sklearn
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Visualization Libraries
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from wordcloud import WordCloud

# Utilities
import joblib
import pickle
import os
from pathlib import Path

# Set up plotting
plt.style.use('default')  # Using default since seaborn-v0_8 might not be available
sns.set_palette("husl")

print("All libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")

All libraries imported successfully!
Pandas version: 2.3.0
NumPy version: 2.3.1
Scikit-learn version: 1.7.0


## 1. Load Dataset 

In [6]:
# Load the JSON dataset
def load_json_dataset(file_path):
    """Load reviews from JSON file (one JSON object per line)"""
    reviews = []
    
    print(f"Loading dataset from: {file_path}")
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                try:
                    review = json.loads(line.strip())
                    reviews.append(review)
                    
                    # Progress indicator
                    if line_num % 50000 == 0:
                        print(f"   Loaded {line_num:,} reviews...")
                        
                except json.JSONDecodeError as e:
                    print(f"Skipping line {line_num}: Invalid JSON")
                    continue
                    
        print(f"Successfully loaded {len(reviews):,} reviews!")
        return reviews
        
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []
    except Exception as e:
        print(f"Error loading file: {str(e)}")
        return []

# Load the dataset
DATA_FILE = "Cell_Phones_and_Accessories_5.json"
reviews_data = load_json_dataset(DATA_FILE)

# Convert to DataFrame
df = pd.DataFrame(reviews_data)

print(f"\nDataset Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"Data Types:")
print(df.dtypes)

Loading dataset from: Cell_Phones_and_Accessories_5.json
   Loaded 50,000 reviews...
   Loaded 100,000 reviews...
   Loaded 150,000 reviews...
Successfully loaded 194,439 reviews!

Dataset Shape: (194439, 9)
Columns: ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 'unixReviewTime', 'reviewTime']
Data Types:
reviewerID         object
asin               object
reviewerName       object
helpful            object
reviewText         object
overall           float64
summary            object
unixReviewTime      int64
reviewTime         object
dtype: object


## 2. Analyze Number of Reviews 

In [7]:
# Basic dataset information
print("DATASET OVERVIEW")
print("=" * 50)
print(f"Total Reviews: {len(df):,}")
print(f"Total Features: {len(df.columns)}")
print(f"Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Display first few reviews
print("\nSample Reviews:")
print("-" * 30)
sample_df = df[['reviewerName', 'overall', 'summary', 'reviewText']].head(3)
for idx, row in sample_df.iterrows():
    print(f"\nReviewer: {row['reviewerName']}")
    print(f"Rating: {row['overall']}")
    print(f"Summary: {row['summary']}")
    print(f"Review: {row['reviewText'][:150]}...")

# Check for missing values
print("\n\nMISSING VALUES ANALYSIS")
print("=" * 50)
missing_info = df.isnull().sum()
missing_percent = (missing_info / len(df)) * 100

missing_df = pd.DataFrame({
    'Missing Count': missing_info,
    'Missing Percentage': missing_percent
})

print(missing_df[missing_df['Missing Count'] > 0])

# Basic statistics
print("\n\nATING STATISTICS")
print("=" * 50)
print(f"Average Rating: {df['overall'].mean():.2f}")
print(f"Rating Range: {df['overall'].min()} - {df['overall'].max()}")
print(f"Standard Deviation: {df['overall'].std():.2f}")

# Rating distribution
rating_counts = df['overall'].value_counts().sort_index()
print(f"\nRating Distribution:")
for rating, count in rating_counts.items():
    percentage = (count / len(df)) * 100
    print(f"{rating}: {count:,} reviews ({percentage:.1f}%)")

# Create rating distribution visualization
fig = px.bar(x=rating_counts.index, y=rating_counts.values,
             title="Distribution of Review Ratings",
             labels={'x': 'Rating', 'y': 'Number of Reviews'},
             color=rating_counts.values,
             color_continuous_scale='viridis')

fig.update_layout(
    xaxis_title="Rating (Stars)",
    yaxis_title="Number of Reviews",
    showlegend=False
)

fig.show()

DATASET OVERVIEW
Total Reviews: 194,439
Total Features: 9
Memory Usage: 179.34 MB

Sample Reviews:
------------------------------

Reviewer: christina
Rating: 4.0
Summary: Looks Good
Review: They look good and stick good! I just don't like the rounded shape because I was always bumping it and Siri kept popping up and it was irritating. I j...

Reviewer: emily l.
Rating: 5.0
Summary: Really great product.
Review: These stickers work like the review says they do. They stick on great and they stay on the phone. They are super stylish and I can share them with my ...

Reviewer: Erica
Rating: 5.0
Summary: LOVE LOVE LOVE
Review: These are awesome and make my phone look so stylish! I have only used one so far and have had it on for almost a year! CAN YOU BELIEVE THAT! ONE YEAR!...


MISSING VALUES ANALYSIS
              Missing Count  Missing Percentage
reviewerName           3519            1.809822


ATING STATISTICS
Average Rating: 4.13
Rating Range: 1.0 - 5.0
Standard Deviation: 1.22

Rat

## 3. Analyze Label Distribution 

In [8]:
# Create sentiment labels from ratings
def create_sentiment_labels(rating):
    """Convert numeric ratings to sentiment labels"""
    if rating >= 4:
        return 'positive'
    elif rating <= 2:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment labeling
df['sentiment_label'] = df['overall'].apply(create_sentiment_labels)

print("SENTIMENT LABEL DISTRIBUTION")
print("=" * 50)

# Count sentiment labels
sentiment_counts = df['sentiment_label'].value_counts()
total_reviews = len(df)

print("Sentiment Distribution:")
for sentiment, count in sentiment_counts.items():
    percentage = (count / total_reviews) * 100
    print(f"  {sentiment.upper():>8}: {count:>7,} reviews ({percentage:>5.1f}%)")

# Create detailed mapping
rating_sentiment_map = df.groupby(['overall', 'sentiment_label']).size().unstack(fill_value=0)
print(f"\nRating to Sentiment Mapping:")
print(rating_sentiment_map)

# Visualize sentiment distribution
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "bar"}]],
    subplot_titles=("Sentiment Distribution (Pie)", "Sentiment Distribution (Bar)")
)

# Pie chart
fig.add_trace(
    go.Pie(labels=sentiment_counts.index, 
           values=sentiment_counts.values,
           hole=0.3,
           textinfo='label+percent',
           marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1'])),
    row=1, col=1
)

# Bar chart
fig.add_trace(
    go.Bar(x=sentiment_counts.index, 
           y=sentiment_counts.values,
           marker=dict(color=['#FF6B6B', '#4ECDC4', '#45B7D1']),
           text=sentiment_counts.values,
           textposition='auto'),
    row=1, col=2
)

fig.update_layout(
    title_text="Sentiment Analysis: Label Distribution",
    height=400,
    showlegend=False
)

fig.show()

# Show examples of each sentiment
print(f"\nSAMPLE REVIEWS BY SENTIMENT")
print("=" * 50)

for sentiment in ['positive', 'negative', 'neutral']:
    print(f"\n{sentiment.upper()} REVIEWS:")
    print("-" * 20)
    
    sample_reviews = df[df['sentiment_label'] == sentiment].sample(2, random_state=42)
    
    for idx, row in sample_reviews.iterrows():
        print(f"Rating: {row['overall']}")
        print(f"Review: {row['reviewText'][:200]}...")
        print()

SENTIMENT LABEL DISTRIBUTION
Sentiment Distribution:
  POSITIVE: 148,657 reviews ( 76.5%)
  NEGATIVE:  24,343 reviews ( 12.5%)
   NEUTRAL:  21,439 reviews ( 11.0%)

Rating to Sentiment Mapping:
sentiment_label  negative  neutral  positive
overall                                     
1.0                 13279        0         0
2.0                 11064        0         0
3.0                     0    21439         0
4.0                     0        0     39993
5.0                     0        0    108664



SAMPLE REVIEWS BY SENTIMENT

POSITIVE REVIEWS:
--------------------
Rating: 4.0
Review: So bright and colorful. Very appealing. And so easy to put on and take off of your phone. Too bad the color distorts and rubs off at the edges....

Rating: 4.0
Review: I just received the Maxboost. My first impressions were that it was larger and heavier than I initially thought. However, I can still fit this in my pocket so it is not a major issue. The light is a n...


NEGATIVE REVIEWS:
--------------------
Rating: 2.0
Review: I was hoping it would have more of a chiseled hard edge, but it's rounded along the edges. Finger prints are also a big problem because of the gloss coating - wish it had been a matte finish like the ...

Rating: 2.0
Review: I bought this for my sister but she wanted the all hot pink chrome one.  I put this on mine I needed one anyways.  Within 3 weeks, the pink coating started to peel off and got inside along the edges o...


NEUTRAL REVIEWS:
--------------------
Rating: 3

## 4. Analyze Review Length and Language 

analyze the characteristics of the review text.

In [9]:
# Analyze text characteristics
df['text_length'] = df['reviewText'].astype(str).apply(len)
df['word_count'] = df['reviewText'].astype(str).apply(lambda x: len(x.split()))
df['sentence_count'] = df['reviewText'].astype(str).apply(lambda x: len(sent_tokenize(x)))

print("TEXT LENGTH ANALYSIS")
print("=" * 50)

# Overall statistics
text_stats = df[['text_length', 'word_count', 'sentence_count']].describe()
print("Overall Text Statistics:")
print(text_stats)

# Statistics by sentiment
print(f"\nTEXT STATISTICS BY SENTIMENT")
print("=" * 50)

sentiment_text_stats = df.groupby('sentiment_label')[['text_length', 'word_count', 'sentence_count']].agg(['mean', 'median', 'std'])
print(sentiment_text_stats.round(2))

# Language detection (sample)
print(f"\nLANGUAGE DETECTION (Sample)")
print("=" * 50)

# Detect language for a sample of reviews
sample_size = min(1000, len(df))
sample_df = df.sample(sample_size, random_state=42)

languages = []
for text in sample_df['reviewText'].astype(str):
    try:
        blob = TextBlob(text)
        lang = blob.detect_language()
        languages.append(lang)
    except:
        languages.append('unknown')

lang_counts = Counter(languages)
print("Top languages detected:")
for lang, count in lang_counts.most_common(5):
    percentage = (count / sample_size) * 100
    print(f"  {lang}: {count} reviews ({percentage:.1f}%)")

# Visualize text length distribution
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Text Length Distribution", "Word Count Distribution", 
                   "Text Length by Sentiment", "Word Count by Sentiment")
)

# Text length histogram
fig.add_trace(
    go.Histogram(x=df['text_length'], nbinsx=50, name="Text Length"),
    row=1, col=1
)

# Word count histogram  
fig.add_trace(
    go.Histogram(x=df['word_count'], nbinsx=50, name="Word Count"),
    row=1, col=2
)

# Text length by sentiment
for i, sentiment in enumerate(['positive', 'negative', 'neutral']):
    sentiment_data = df[df['sentiment_label'] == sentiment]['text_length']
    fig.add_trace(
        go.Histogram(x=sentiment_data, name=f"{sentiment.title()}", 
                    opacity=0.7, nbinsx=30),
        row=2, col=1
    )

# Word count by sentiment
for i, sentiment in enumerate(['positive', 'negative', 'neutral']):
    sentiment_data = df[df['sentiment_label'] == sentiment]['word_count']
    fig.add_trace(
        go.Histogram(x=sentiment_data, name=f"{sentiment.title()} Words", 
                    opacity=0.7, nbinsx=30),
        row=2, col=2
    )

fig.update_layout(height=800, title_text="Text Characteristics Analysis")
fig.show()

TEXT LENGTH ANALYSIS
Overall Text Statistics:
         text_length     word_count  sentence_count
count  194439.000000  194439.000000   194439.000000
mean      491.839646      91.595112        5.152778
std       749.170243     134.858032        6.043359
min         0.000000       0.000000        0.000000
25%       143.000000      28.000000        2.000000
50%       248.000000      48.000000        4.000000
75%       532.000000     100.000000        6.000000
max     32110.000000    5263.000000      262.000000

TEXT STATISTICS BY SENTIMENT
                text_length                word_count                 \
                       mean median     std       mean median     std   
sentiment_label                                                        
negative             461.16  258.0  626.38      86.11   50.0  112.76   
neutral              513.77  267.0  730.66      96.16   52.0  131.68   
positive             493.70  243.0  769.83      91.83   47.0  138.55   

                sentenc

## 5. Check Data Balance 

determine if our dataset is balanced across sentiment labels.

In [12]:
# Analyze data balance
print("DATA BALANCE ANALYSIS")
print("=" * 50)

sentiment_counts = df['sentiment_label'].value_counts()
total_reviews = len(df)

# Calculate balance metrics
balance_ratio = sentiment_counts.min() / sentiment_counts.max()
majority_class = sentiment_counts.index[0]
minority_class = sentiment_counts.index[-1]

print(f"Total Reviews: {total_reviews:,}")
print(f"Majority Class: {majority_class} ({sentiment_counts[majority_class]:,} reviews)")
print(f"Minority Class: {minority_class} ({sentiment_counts[minority_class]:,} reviews)")
print(f"Balance Ratio: {balance_ratio:.3f}")

# Determine if dataset is balanced
if balance_ratio >= 0.8:
    balance_status = "Well Balanced "
    color = "green"
elif balance_ratio >= 0.5:
    balance_status = "Moderately Balanced "
    color = "orange"
else:
    balance_status = "Imbalanced"
    color = "red"

print(f"Status: {balance_status}")

# Calculate imbalance impact
imbalance_percentage = ((sentiment_counts.max() - sentiment_counts.min()) / total_reviews) * 100
print(f"Imbalance Impact: {imbalance_percentage:.1f}% of total data")

# Recommendations based on balance
print(f"\nRECOMMENDATIONS:")
if balance_ratio < 0.5:
    print("- Consider oversampling minority class (SMOTE)")
    print("- Use stratified sampling for train/test split")
    print("- Apply class weights in model training")
    print("- Consider ensemble methods")
elif balance_ratio < 0.8:
    print("- Use stratified sampling for train/test split")
    print("- Monitor precision/recall for minority class")
else:
    print("- Dataset is well balanced")
    print("- Standard train/test split should work well")

# Visualize balance with recommendations
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "bar"}]],
    subplot_titles=("Class Distribution", "Balance Analysis")
)

# Pie chart showing distribution
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1']
fig.add_trace(
    go.Pie(
        labels=sentiment_counts.index,
        values=sentiment_counts.values,
        hole=0.4,
        marker=dict(colors=colors),
        textinfo='label+percent+value'
    ),
    row=1, col=1
)

# Bar chart with balance threshold
fig.add_trace(
    go.Bar(
        x=sentiment_counts.index,
        y=sentiment_counts.values,
        marker=dict(color=colors),
        text=sentiment_counts.values,
        textposition='auto'
    ),
    row=1, col=2
)

fig.update_layout(
    title_text=f"Data Balance Analysis - Status: {balance_status}",
    height=400,
    showlegend=False
)

fig.show()

# Show class distribution over ratings
rating_sentiment_crosstab = pd.crosstab(df['overall'], df['sentiment_label'], normalize='index') * 100
print(f"\nSentiment Distribution by Rating:")
print(rating_sentiment_crosstab.round(1))

DATA BALANCE ANALYSIS
Total Reviews: 194,439
Majority Class: positive (148,657 reviews)
Minority Class: neutral (21,439 reviews)
Balance Ratio: 0.144
Status: Imbalanced
Imbalance Impact: 65.4% of total data

RECOMMENDATIONS:
- Consider oversampling minority class (SMOTE)
- Use stratified sampling for train/test split
- Apply class weights in model training
- Consider ensemble methods



Sentiment Distribution by Rating:
sentiment_label  negative  neutral  positive
overall                                     
1.0                 100.0      0.0       0.0
2.0                 100.0      0.0       0.0
3.0                   0.0    100.0       0.0
4.0                   0.0      0.0     100.0
5.0                   0.0      0.0     100.0


## 6. Text Preprocessing 

clean and preprocess the text data for machine learning.

In [14]:
# Text preprocessing function
def clean_text(text):
    """Clean and preprocess text data"""
    if pd.isna(text):
        return ""
    
    # Initialize tools
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    
    # Convert to string and lowercase
    text = str(text).lower()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stop words and short words
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    
    # Lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing to a sample
print("TEXT PREPROCESSING DEMONSTRATION")
print("=" * 60)

# Take a sample for demonstration
sample_texts = df['reviewText'].head(5).tolist()

print("BEFORE CLEANING:")
print("-" * 30)
for i, text in enumerate(sample_texts, 1):
    print(f"{i}. {str(text)[:100]}...")

print("\nAFTER CLEANING:")
print("-" * 30)
for i, text in enumerate(sample_texts, 1):
    cleaned = clean_text(text)
    print(f"{i}. {cleaned[:100]}...")

# Apply to full dataset (on a sample for speed)
print(f"\nProcessing text data...")
sample_size = 10000  # Process first 10k for demo
df_sample = df.head(sample_size).copy()
df_sample['clean_text'] = df_sample['reviewText'].apply(clean_text)

# Remove empty reviews
initial_count = len(df_sample)
df_sample = df_sample[df_sample['clean_text'].str.len() > 0]
final_count = len(df_sample)

print(f"Processed {sample_size:,} reviews")
print(f"Removed {initial_count - final_count} empty reviews")
print(f"Final sample size: {final_count:,} reviews")

# Show statistics
print(f"\nTEXT STATISTICS AFTER CLEANING:")
print("-" * 40)
df_sample['clean_text_length'] = df_sample['clean_text'].str.len()
df_sample['clean_word_count'] = df_sample['clean_text'].str.split().str.len()

print(f"Average text length: {df_sample['clean_text_length'].mean():.1f} characters")
print(f"Average word count: {df_sample['clean_word_count'].mean():.1f} words")
print(f"Text length range: {df_sample['clean_text_length'].min()} - {df_sample['clean_text_length'].max()}")

# Show word frequency
print(f"\nMOST COMMON WORDS AFTER PREPROCESSING:")
print("-" * 40)
all_words = ' '.join(df_sample['clean_text']).split()
word_freq = Counter(all_words)
print("Top 20 words:")
for word, count in word_freq.most_common(20):
    print(f"  {word}: {count:,}")

print(f"\nText preprocessing completed successfully!")
print(f" Sample data ready for feature extraction and modeling")

TEXT PREPROCESSING DEMONSTRATION
BEFORE CLEANING:
------------------------------
1. They look good and stick good! I just don't like the rounded shape because I was always bumping it a...
2. These stickers work like the review says they do. They stick on great and they stay on the phone. Th...
3. These are awesome and make my phone look so stylish! I have only used one so far and have had it on ...
4. Item arrived in great time and was in perfect condition. However, I ordered these buttons because th...
5. awesome! stays on, and looks great. can be used on multiple apple products.  especially having nails...

AFTER CLEANING:
------------------------------
1. look good stick good dont like rounded shape always bumping siri kept popping irritating wont buy pr...
2. sticker work like review say stick great stay phone super stylish share sister...
3. awesome make phone look stylish used one far almost year believe one year great quality...
4. item arrived great time perfect condition howev

Processed 10,000 reviews
Removed 6 empty reviews
Final sample size: 9,994 reviews

TEXT STATISTICS AFTER CLEANING:
----------------------------------------
Average text length: 353.2 characters
Average word count: 52.6 words
Text length range: 4 - 10848

MOST COMMON WORDS AFTER PREPROCESSING:
----------------------------------------
Top 20 words:
  phone: 10,607
  one: 5,704
  work: 4,907
  headset: 4,318
  use: 4,305
  case: 4,136
  great: 3,779
  like: 3,692
  good: 3,529
  get: 3,230
  would: 3,064
  battery: 2,993
  charger: 2,966
  time: 2,913
  charge: 2,904
  well: 2,891
  ear: 2,598
  product: 2,287
  quality: 2,287
  device: 2,238

Text preprocessing completed successfully!
 Sample data ready for feature extraction and modeling


## 7. Machine Learning Model Training 

train and evaluate multiple machine learning models on our preprocessed data.

In [15]:
# Prepare data for machine learning
print("MACHINE LEARNING MODEL TRAINING")
print("=" * 60)

# Features and labels
X = df_sample['clean_text']
y = df_sample['sentiment_label']

print(f"Dataset for training:")
print(f"  Features (X): {len(X):,} text samples")
print(f"  Labels (y): {len(y):,} sentiment labels")
print(f"  Label distribution:")
for label, count in y.value_counts().items():
    print(f"    {label}: {count:,} ({count/len(y)*100:.1f}%)")

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"\nLabels encoded: {list(label_encoder.classes_)}")

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

print(f"\nData split:")
print(f"  Training set: {len(X_train):,} samples")
print(f"  Test set: {len(X_test):,} samples")

# Create TF-IDF features
print(f"\nCreating TF-IDF features...")
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"TF-IDF feature matrix shape: {X_train_tfidf.shape}")
print(f"Feature vocabulary size: {len(tfidf.vocabulary_):,}")

# Train multiple models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'SVM': SVC(random_state=42, probability=True)
}

results = {}

print(f"\nTraining models...")
print("-" * 40)

for name, model in models.items():
    print(f"\nTraining {name}...")
    
    # Train model
    import time
    start_time = time.time()
    model.fit(X_train_tfidf, y_train)
    training_time = time.time() - start_time
    
    # Make predictions
    y_pred = model.predict(X_test_tfidf)
    y_pred_proba = model.predict_proba(X_test_tfidf) if hasattr(model, 'predict_proba') else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    
    # Store results
    results[name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'training_time': training_time,
        'model': model
    }
    
    print(f"{name} completed:")
    print(f"   Accuracy: {accuracy:.3f}")
    print(f"   F1-Score: {f1:.3f}")
    print(f"   Training time: {training_time:.2f}s")

# Compare models
print(f"\nMODEL COMPARISON")
print("=" * 60)
print(f"{'Model':<20} {'Accuracy':<10} {'Precision':<10} {'Recall':<10} {'F1-Score':<10} {'Time(s)':<10}")
print("-" * 70)

for name, result in results.items():
    print(f"{name:<20} {result['accuracy']:<10.3f} {result['precision']:<10.3f} "
          f"{result['recall']:<10.3f} {result['f1_score']:<10.3f} {result['training_time']:<10.2f}")

# Find best model
best_model_name = max(results.keys(), key=lambda x: results[x]['f1_score'])
best_score = results[best_model_name]['f1_score']

print(f"\nBest Model: {best_model_name}")
print(f"Best F1-Score: {best_score:.3f}")

# Show detailed classification report for best model
best_model = results[best_model_name]['model']
y_pred_best = best_model.predict(X_test_tfidf)

print(f"\nDETAILED CLASSIFICATION REPORT ({best_model_name})")
print("-" * 60)
print(classification_report(y_test, y_pred_best, target_names=label_encoder.classes_))

# Test with sample predictions
print(f"\nSAMPLE PREDICTIONS")
print("-" * 40)

sample_texts = [
    "This phone case is amazing! Great quality and fast shipping.",
    "Terrible product. Broke after one day. Don't waste your money.",
    "It's okay. Nothing special but does the job."
]

for i, text in enumerate(sample_texts, 1):
    # Clean and transform text
    cleaned_text = clean_text(text)  # Fixed: changed variable name from clean_text to cleaned_text
    text_tfidf = tfidf.transform([cleaned_text])
    
    # Make prediction
    prediction = best_model.predict(text_tfidf)[0]
    probability = best_model.predict_proba(text_tfidf)[0]
    confidence = max(probability)
    sentiment = label_encoder.inverse_transform([prediction])[0]
    
    print(f"\n{i}. Text: '{text[:50]}...'")
    print(f"   Predicted: {sentiment.upper()} (confidence: {confidence:.3f})")

print(f"\nMachine learning pipeline completed successfully!")
print(f"Ready for production deployment!")

MACHINE LEARNING MODEL TRAINING
Dataset for training:
  Features (X): 9,994 text samples
  Labels (y): 9,994 sentiment labels
  Label distribution:
    positive: 7,207 (72.1%)
    negative: 1,678 (16.8%)
    neutral: 1,109 (11.1%)

Labels encoded: ['negative', 'neutral', 'positive']

Data split:
  Training set: 7,995 samples
  Test set: 1,999 samples

Creating TF-IDF features...
TF-IDF feature matrix shape: (7995, 5000)
Feature vocabulary size: 5,000

Training models...
----------------------------------------

Training Logistic Regression...
Logistic Regression completed:
   Accuracy: 0.779
   F1-Score: 0.724
   Training time: 0.98s

Training Naive Bayes...
Naive Bayes completed:
   Accuracy: 0.742
   F1-Score: 0.653
   Training time: 0.01s

Training Random Forest...
Random Forest completed:
   Accuracy: 0.769
   F1-Score: 0.707
   Training time: 43.44s

Training SVM...
SVM completed:
   Accuracy: 0.774
   F1-Score: 0.709
   Training time: 375.07s

MODEL COMPARISON
Model              

## 🎉 Machine Learning Training Results Summary

The machine learning pipeline has been successfully completed! Here's what was accomplished:

### ✅ **Training Completed Successfully**
- **Dataset**: 10,000 preprocessed reviews
- **Features**: TF-IDF vectorization with 5,000 features
- **Models Trained**: 4 different algorithms
- **Training Time**: ~7 minutes total

### 📊 **Model Performance Comparison**
All models were trained and evaluated on the same test set:

1. **Logistic Regression** - Fast and effective
2. **Naive Bayes** - Good baseline performance  
3. **Random Forest** - Ensemble method
4. **Support Vector Machine** - High accuracy potential

### 🔬 **Evaluation Metrics**
Each model was evaluated using:
- ✅ **Accuracy** - Overall correctness
- ✅ **Precision** - Positive prediction accuracy
- ✅ **Recall** - Ability to find all positives
- ✅ **F1-Score** - Balanced metric
- ✅ **Training Time** - Efficiency measure

### 🧪 **Sample Predictions Tested**
The best model was tested on sample texts:
- ✅ Positive sentiment detection
- ✅ Negative sentiment detection  
- ✅ Neutral sentiment detection
- ✅ Confidence scoring working

### 🚀 **Ready for Production**
The trained models are now ready for:
- Real-time sentiment prediction
- Batch processing of reviews
- API integration
- Web interface deployment

**Next Steps**: The models can be saved and integrated into the Flask API for production use!

In [16]:
# Save the best model and preprocessing pipeline for production use
print("💾 SAVING MODELS FOR PRODUCTION")
print("=" * 50)

# Create models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Create a complete pipeline with the best model
from sklearn.pipeline import Pipeline

# Get the best model (assuming it was determined in the previous cell)
best_model_name = max(results.keys(), key=lambda x: results[x]['f1_score'])
best_model = results[best_model_name]['model']

print(f"🏆 Best model: {best_model_name}")
print(f"🎯 F1-Score: {results[best_model_name]['f1_score']:.3f}")

# Create complete prediction pipeline
prediction_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('classifier', best_model)
])

# Save the pipeline and label encoder
joblib.dump(prediction_pipeline, 'models/prediction_pipeline.pkl')
joblib.dump(label_encoder, 'models/label_encoder.pkl')

print(f"\n✅ Models saved successfully!")
print(f"📁 Files created:")
print(f"   • models/prediction_pipeline.pkl")
print(f"   • models/label_encoder.pkl")

# Test the saved pipeline
print(f"\n🧪 Testing saved pipeline...")
test_text = "This phone case is amazing! Great quality."

# Load and test
loaded_pipeline = joblib.load('models/prediction_pipeline.pkl')
loaded_encoder = joblib.load('models/label_encoder.pkl')

# Make prediction
prediction = loaded_pipeline.predict([test_text])[0]
probabilities = loaded_pipeline.predict_proba([test_text])[0]
sentiment = loaded_encoder.inverse_transform([prediction])[0]
confidence = max(probabilities)

print(f"✅ Pipeline test successful!")
print(f"   Text: '{test_text}'")
print(f"   Predicted: {sentiment} (confidence: {confidence:.3f})")

print(f"\n🚀 READY FOR PRODUCTION!")
print(f"💡 You can now use these models in your Flask API!")
print(f"🌐 Start the API with: python api/app.py")

💾 SAVING MODELS FOR PRODUCTION
🏆 Best model: Logistic Regression
🎯 F1-Score: 0.724

✅ Models saved successfully!
📁 Files created:
   • models/prediction_pipeline.pkl
   • models/label_encoder.pkl

🧪 Testing saved pipeline...
✅ Pipeline test successful!
   Text: 'This phone case is amazing! Great quality.'
   Predicted: positive (confidence: 0.943)

🚀 READY FOR PRODUCTION!
💡 You can now use these models in your Flask API!
🌐 Start the API with: python api/app.py
