In [1]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'matplotlib'

# Amazon Review Sentiment Analysis Model

This notebook builds a Keras neural network model for sentiment analysis on Amazon product reviews.

- **Objective:** Classify reviews as Positive (1) or Negative (0)
- **Dataset:** Amazon Product Reviews
- **Features:** Bag-of-words counts from the review headline and the review body text
- **Output:** Sentiment (binary classification)

In [None]:
# Load and Explore Dataset
# The path is relative to the notebook's working directory; it is named so it
# can be changed in one place.
DATA_PATH = 'Amazon-Product-Reviews - Amazon Product Review (1).csv'
df = pd.read_csv(DATA_PATH)

print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nSentiment Distribution:")
print(df['sentiment'].value_counts())

In [None]:
# Data Preprocessing
# Drop rows missing the review text or the label, keep only the first copy of
# each duplicated review body, and cast the label column to int.
# The cleaned frame is built in a single chain so that no column is assigned
# into a filtered slice of the previous frame (chained-assignment hazard).
df = (
    df
    .dropna(subset=['review_body', 'sentiment'])
    .drop_duplicates(subset=['review_body'], keep='first')
    .astype({'sentiment': int})
)

# astype(int) alone does not guarantee binary labels; the sigmoid output layer
# and the binary metrics used later need exactly {0, 1}, so fail early with a
# clear message if anything else is present.
unexpected_labels = sorted(set(df['sentiment'].unique()) - {0, 1})
if unexpected_labels:
    raise ValueError(
        f"sentiment must contain only 0 and 1; found unexpected values: {unexpected_labels}"
    )

print(f"Dataset size after cleaning: {len(df)}")
print(f"\nSentiment distribution after cleaning:")
print(df['sentiment'].value_counts())
print(f"\nPercentages:")
print(df['sentiment'].value_counts(normalize=True) * 100)

In [None]:
# Text Feature Extraction using CountVectorizer
# This converts text into numerical features
# (CountVectorizer is already imported in the first cell.)

# Create separate vectorizers for title and body (as in the project structure)
# NOTE(review): both vocabularies are fitted on the full dataset, i.e. before
# the train/test split, so wording from eventual test rows shapes the feature
# space. Consider fitting on the training portion only.

# Vectorizer for review body
cv_body = CountVectorizer(max_features=1000, stop_words='english')
X_body = cv_body.fit_transform(df['review_body']).toarray()

# Vectorizer for review headline
# Only review_body and sentiment are null-filtered during cleaning, so a
# missing headline would reach the vectorizer as NaN and raise. Treat a
# missing headline as an empty title instead.
cv_title = CountVectorizer(max_features=100, stop_words='english')
X_title = cv_title.fit_transform(df['review_headline'].fillna('')).toarray()

# Combine features (title columns first, then body columns; prediction code
# must stack them in the same order)
X = np.hstack([X_title, X_body])
y = df['sentiment'].values

print(f"Feature matrix shape: {X.shape}")
print(f"Title features: {X_title.shape[1]}")
print(f"Body features: {X_body.shape[1]}")
print(f"Total features: {X.shape[1]}")

# Save vectorizers for later use in the app
import joblib
joblib.dump(cv_title, 'cv1.pkl')
joblib.dump(cv_body, 'cv2.pkl')
print("\nVectorizers saved: cv1.pkl, cv2.pkl")

In [None]:
# Balance Dataset using SMOTE (Synthetic Minority Over-sampling Technique)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Target minority/majority ratios after each resampling stage.
# SMOTE_RATIO must stay below UNDER_RATIO: the under-sampler can only remove
# majority rows, so it needs the ratio it receives to be lower than its target.
SMOTE_RATIO = 0.5
UNDER_RATIO = 0.8

# NOTE(review): resampling is done here, before the train/test split in the
# next cell, so synthetic samples interpolated from eventual test rows can end
# up in the training data and inflate the reported metrics. Consider splitting
# first and resampling only the training portion.


def _print_class_balance(labels):
    """Print per-class counts and the positive/negative (1 vs 0) ratio."""
    # Labels are NumPy arrays here, which have no value_counts(); wrap them
    # in a Series first.
    counts = pd.Series(labels).value_counts()
    print(counts)
    print(f"Class ratio: {counts[1] / counts[0]:.2%}")


print("Balancing dataset...")
print(f"Before balancing - Sentiment distribution:")
_print_class_balance(y)

# Method 1: SMOTE + Undersampling
# SMOTE creates synthetic samples for the minority class up to SMOTE_RATIO,
# then the undersampler trims the majority class until the ratio is UNDER_RATIO.
# (SMOTE's default strategy balances the classes 1:1, which would leave the
# under-sampler nothing to remove and make it raise, hence the explicit ratio.)
class_counts = pd.Series(y).value_counts()
minority_n = int(class_counts.min())
majority_n = int(class_counts.max())

resampling_steps = []
# imbalanced-learn rejects a float strategy that would generate zero or fewer
# new samples, so only add SMOTE when it actually has rows to create.
if int(majority_n * SMOTE_RATIO - minority_n) > 0:
    resampling_steps.append(
        ('smote', SMOTE(sampling_strategy=SMOTE_RATIO, random_state=42, k_neighbors=5))
    )
# Likewise only under-sample when the data is more imbalanced than the target.
if minority_n / majority_n < UNDER_RATIO:
    resampling_steps.append(
        ('under', RandomUnderSampler(random_state=42, sampling_strategy=UNDER_RATIO))
    )

if resampling_steps:
    imb_pipeline = ImbPipeline(resampling_steps)
    X_balanced, y_balanced = imb_pipeline.fit_resample(X, y)
else:
    # Already at least as balanced as the target; nothing to do.
    X_balanced, y_balanced = X, y

print(f"\nAfter balancing - Sentiment distribution:")
_print_class_balance(y_balanced)
print(f"New dataset size: {len(X_balanced)} (was {len(X)})")

# Use balanced data for training
X = X_balanced
y = y_balanced

In [None]:
# Train-Test Split
# Hold out 20% of the rows for testing; stratifying on y keeps the class mix
# the same in both parts, and the fixed random_state makes the split repeatable.
_split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = _split_parts

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Report the label counts of each part under its own heading.
for _heading, _labels in (
    (f"\nTraining set sentiment distribution:", y_train),
    (f"\nTest set sentiment distribution:", y_test),
):
    print(_heading)
    print(pd.Series(_labels).value_counts())

In [None]:
# Build Keras Neural Network Model
# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout
# masks and batch shuffling are repeatable across Restart & Run All.
keras.utils.set_random_seed(42)

# The input width is declared with an explicit Input object rather than the
# deprecated `input_shape=` keyword on the first Dense layer.
model = Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
# Metrics are named explicitly so the keys in `history.history` stay the same
# when this cell is re-run (auto-generated names pick up numeric suffixes).
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy',
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall')]
)

# Display model architecture
print("Model Architecture:")
model.summary()

In [None]:
# Train the Model
# Early stopping watches the validation loss: training halts after two epochs
# without improvement and the best weights seen so far are restored.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

print("Training the model...")
# validation_split carves the validation rows out of the training data, so
# the held-out test set is not touched here.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=32,
    epochs=10,
    callbacks=[early_stopping],
    verbose=1,
)

print("\nTraining completed!")

In [None]:
# Plot Training History
# Two side-by-side panels (accuracy on the left, loss on the right), each
# showing the training curve against the validation curve per epoch.
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

_panels = [
    {
        'ax': ax_accuracy,
        'train_key': 'accuracy', 'val_key': 'val_accuracy',
        'train_label': 'Train Accuracy', 'val_label': 'Val Accuracy',
        'title': 'Model Accuracy', 'ylabel': 'Accuracy',
    },
    {
        'ax': ax_loss,
        'train_key': 'loss', 'val_key': 'val_loss',
        'train_label': 'Train Loss', 'val_label': 'Val Loss',
        'title': 'Model Loss', 'ylabel': 'Loss',
    },
]

for _panel in _panels:
    _ax = _panel['ax']
    _ax.plot(history.history[_panel['train_key']], label=_panel['train_label'])
    _ax.plot(history.history[_panel['val_key']], label=_panel['val_label'])
    _ax.set_title(_panel['title'])
    _ax.set_xlabel('Epoch')
    _ax.set_ylabel(_panel['ylabel'])
    _ax.legend()
    _ax.grid(True)

fig.tight_layout()
plt.show()

print("Training history visualization completed!")

In [None]:
# Evaluate Model on Test Set
print("Evaluating model on test set...")

# The sigmoid output is a probability per row; anything above 0.5 is counted
# as the positive class, and the (n, 1) column is flattened to a 1-D vector.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).reshape(-1)

# Calculate metrics (all computed against the held-out test labels)
accuracy, precision, recall, f1 = (
    _score_fn(y_test, y_pred)
    for _score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

_rule = '=' * 50
print("\n" + _rule)
print(f"{'Model Performance Metrics':<30}")
print(_rule)
for _prefix, _value in (
    ("Accuracy:  ", accuracy),
    ("Precision: ", precision),
    ("Recall:    ", recall),
    ("F1-Score:  ", f1),
):
    print(f"{_prefix}{_value:.4f}")
print(_rule)

# Confusion Matrix (rows are true labels, columns are predicted labels)
cm = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix:")
print(cm)

# Classification Report
print(f"\nDetailed Classification Report:")
_report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print(_report)

In [None]:
# Plot Confusion Matrix
# Annotated heatmap of the confusion matrix computed during evaluation; the
# same class names label both axes.
_class_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=_class_names,
    yticklabels=_class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()
plt.show()

In [None]:
# Save the Trained Model
model.save('model.keras')
# The check mark was previously stored as mis-decoded UTF-8 bytes and printed
# as garbage; it is written as an escape so it survives re-encoding.
print("\u2713 Model saved as 'model.keras'")
print(f"\nModel Summary:")
print(f"- Total parameters: {model.count_params():,}")
print(f"- Input shape: {model.input_shape}")
print(f"- Output shape: {model.output_shape}")

In [None]:
# Test the Model with Sample Reviews
def predict_sentiment(review_text, title_text=""):
    """
    Predict sentiment for a given review.

    Relies on the notebook-level `cv_title` and `cv_body` vectorizers and the
    trained `model`. Features are stacked title-first, then body, which is the
    same column order used to build the training matrix.

    Args:
        review_text: The review body text. None is treated as an empty string.
        title_text: The review title text. None is treated as an empty string.

    Returns:
        dict with:
            'sentiment': 'POSITIVE' if the model score is above 0.5, else 'NEGATIVE'
            'confidence': the score for the predicted class (score or 1 - score)
            'raw_score': the model's sigmoid output as a float
    """
    # The vectorizers cannot tokenize None, so normalise missing text first.
    if review_text is None:
        review_text = ""
    if title_text is None:
        title_text = ""

    # Transform using the fitted vectorizers
    title_features = cv_title.transform([title_text]).toarray()
    body_features = cv_body.transform([review_text]).toarray()
    features = np.hstack([title_features, body_features])

    # Predict: the model returns one row with one column for the single review
    prediction = model.predict(features, verbose=0)[0][0]
    is_positive = prediction > 0.5
    sentiment = 'POSITIVE' if is_positive else 'NEGATIVE'
    confidence = prediction if is_positive else 1 - prediction

    return {
        'sentiment': sentiment,
        'confidence': float(confidence),
        'raw_score': float(prediction)
    }

# Test with sample reviews
# Each entry is (review body, review title), matching predict_sentiment's
# argument order.
test_reviews = [
    ("This product is amazing!", "Great quality"),
    ("Terrible product, waste of money", "Worst purchase"),
    ("It's okay, nothing special", "Average"),
    ("Love it! Best product ever", "Five stars"),
]

_heavy_rule = "=" * 70
_light_rule = "-" * 70

print("Sample Predictions:")
print(_heavy_rule)
for _body, _headline in test_reviews:
    _outcome = predict_sentiment(_body, _headline)
    _lines = [
        f"Title: {_headline}",
        f"Review: {_body}",
        f"Prediction: {_outcome['sentiment']} ({_outcome['confidence']:.2%})",
        _light_rule,
    ]
    print("\n".join(_lines))