# Sentiment Analysis: iPhone 15 vs Galaxy S24 (NLP + ML)

**Goal:** Compare customer sentiment for iPhone 15 and Galaxy S24 using review text, and build multiple sentiment classifiers.

**Models:**
- Model A: TF-IDF (Unigram) + Logistic Regression
- Model B: TF-IDF (Unigram+Bigram) + Logistic Regression
- Model C: Word2Vec + SVM
- Model D: FastText + SVM

**Outputs:** EDA comparison charts, confusion matrices, model comparison, phone-wise evaluation, error analysis table, and demo-safe predictions.

## 1. Environment Setup
Install required libraries and download the dataset (Kaggle API).

In [None]:
!pip -q install kaggle

import os, json, shutil
# Place the uploaded API token where the Kaggle CLI looks for it.
os.makedirs("/root/.kaggle", exist_ok=True)
shutil.copy("kaggle.json", "/root/.kaggle/kaggle.json")
# Owner read/write only. The mode has to be an octal literal: decimal 600 is
# 0o1130, which sets the wrong permission bits (no read access for the owner).
os.chmod("/root/.kaggle/kaggle.json", 0o600)

# OPTION A (recommended dataset)
!kaggle datasets download -d mohankrishnathalla/mobile-reviews-sentiment-and-specification -p /content --unzip


In [None]:
!pip -q install nltk gensim scikit-learn seaborn matplotlib

## 2. Imports
Load all Python libraries used across the project.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re
import string

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

from gensim.models import Word2Vec


## 3. NLTK Resources
Download tokenizer and stopword resources needed for preprocessing.

In [None]:
# Tokenizer data; the preprocessing step tokenizes with nltk.word_tokenize.
nltk.download('punkt')
# English stopword list, loaded later through stopwords.words('english').
nltk.download('stopwords')
# NOTE(review): presumably the tokenizer data layout expected by newer NLTK
# releases — confirm against the installed NLTK version.
nltk.download('punkt_tab')


## 4. Load Dataset
Load the dataset and inspect its structure (rows, columns, data types).

In [None]:
import os
os.listdir('/content')


In [None]:
df = pd.read_csv('/content/Mobile Reviews Sentiment.csv')
df.head()


In [None]:
df.shape


In [None]:
df.columns


In [None]:
df.info()


## 5. Filter Target Phones (iPhone 15 & Galaxy S24)
Keep only reviews that belong to iPhone 15 and Galaxy S24 for a fair comparison.

In [None]:
# Discard the dataset's own 'sentiment' column; the labels used for training
# are derived from 'rating' in the next section. errors='ignore' keeps this
# cell safe to re-run once the column has already been removed.
df = df.drop(columns=['sentiment'], errors='ignore')


In [None]:
iphone_keywords = ['iphone 15']
samsung_keywords = ['galaxy s24', 's24']


def filter_models(text):
    """Classify a model name as 'iPhone 15', 'Galaxy S24' or 'Other'.

    The value is stringified and lower-cased, then checked for the keyword
    substrings above. iPhone keywords are tested first, so a name matching
    both groups is reported as 'iPhone 15'.
    """
    lowered = str(text).lower()
    keyword_groups = (
        ('iPhone 15', iphone_keywords),
        ('Galaxy S24', samsung_keywords),
    )
    for label, keywords in keyword_groups:
        for keyword in keywords:
            if keyword in lowered:
                return label
    return 'Other'

# Tag every row with its target phone, then keep only the two phones of interest.
df['brand_model'] = df['model'].apply(filter_models)
# .copy() makes the filtered frame independent of the original, so the columns
# added to it in later cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['brand_model'] != 'Other'].copy()

df['brand_model'].value_counts()


## 6. Create Sentiment Labels (from Rating)
Map rating to sentiment for transparent supervised learning: 1–2=Negative, 3=Neutral, 4–5=Positive.

In [None]:
def rating_to_sentiment(rating):
    """Convert a numeric rating into a sentiment label.

    Ratings <= 2 give 'Negative', a rating equal to 3 gives 'Neutral', and
    everything else gives 'Positive'. "Everything else" includes fractional
    ratings above 2 (e.g. 2.5) and NaN, so missing ratings should be dealt
    with before this function is applied.
    """
    if rating == 3:
        return 'Neutral'
    return 'Negative' if rating <= 2 else 'Positive'

df['sentiment_label'] = df['rating'].apply(rating_to_sentiment)

df['sentiment_label'].value_counts()


## 7. Text Preprocessing
Clean and normalize review text (lowercase, remove noise, tokenize, remove stopwords).

In [None]:
import re
import string
from nltk.corpus import stopwords
import nltk

stop_words = set(stopwords.words('english'))

# Negation words are part of NLTK's English stopword list and are also shorter
# than the 3-character minimum below. Dropping them turns "not good" into
# "good", which removes exactly the signal the bigram model is meant to use,
# so they are always kept.
NEGATION_WORDS = frozenset({'no', 'nor', 'not'})

# Built once at cell execution instead of on every clean_text call.
_URL_RE = re.compile(r"http\S+")
_DIGITS_RE = re.compile(r"\d+")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize one review into a space-separated string of tokens.

    Steps: stringify and lower-case, strip URLs, strip digits, delete ASCII
    punctuation, tokenize with NLTK, then drop stopwords and tokens of two
    characters or fewer. Negation words ('no', 'nor', 'not') are exempt from
    both filters.

    Args:
        text: Raw review text; any value is accepted and passed through str().

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    text = str(text).lower()
    text = _URL_RE.sub("", text)
    text = _DIGITS_RE.sub("", text)
    text = text.translate(_PUNCT_TABLE)
    tokens = nltk.word_tokenize(text)
    kept = [
        t for t in tokens
        if t in NEGATION_WORDS or (len(t) > 2 and t not in stop_words)
    ]
    return " ".join(kept)

df['clean_review'] = df['review_text'].apply(clean_text)

df[['review_text', 'clean_review']].head()


## 8. Exploratory Data Analysis (EDA) & Comparison
Visualize sentiment distribution overall and by phone model to support comparison.

In [None]:
# Overall sentiment class balance across both phones.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='sentiment_label', palette='viridis', ax=ax)
ax.set_title('Sentiment Distribution (iPhone 15 vs Galaxy S24)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Same sentiment counts, split into one bar per phone model.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='sentiment_label', hue='brand_model', palette='Set2', ax=ax)

ax.set_title('Sentiment Distribution by Phone Model (iPhone 15 vs Galaxy S24)')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Number of Reviews')
ax.legend(title='Phone Model')
plt.show()


In [None]:
# --- Sentiment proportions by phone model ---
# sent_counts holds raw review counts; sent_table holds row-wise percentages
# (each phone's row sums to 100).
sent_table = pd.crosstab(df['brand_model'], df['sentiment_label'], normalize='index') * 100
sent_counts = pd.crosstab(df['brand_model'], df['sentiment_label'])

display(sent_counts)
display(sent_table.round(2))

# --- Stacked bar chart (percent) ---
# reindex (rather than sent_table[[...]]) fixes the column order and turns a
# sentiment class with no reviews into a 0% column instead of a KeyError.
sentiment_order = ['Negative', 'Neutral', 'Positive']
sent_table.reindex(columns=sentiment_order, fill_value=0).plot(kind='bar', stacked=True, figsize=(8,5))
plt.ylabel('Percentage of Reviews (%)')
plt.title('Comparative Sentiment Composition: iPhone 15 vs Galaxy S24')
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


## 9. Train/Test Split
Split the dataset using stratification to preserve sentiment class proportions.

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review strings; targets are the rating-derived labels.
X, y = df['clean_review'], df['sentiment_label']

# 80/20 split with a fixed seed; stratify=y keeps the class proportions of y
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


## 10. Model A: TF-IDF (Unigram) + Logistic Regression
Baseline frequency-based model using unigram TF-IDF features.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Model A features: unigram-only TF-IDF capped at 5000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 1), max_features=5000)

# The vocabulary is learned from the training split; the test split is only transformed.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

(X_train_tfidf.shape, X_test_tfidf.shape)


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit Model A on the unigram TF-IDF features, then label the held-out reviews.
logreg = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = logreg.predict(X_test_tfidf)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Test-set accuracy followed by the per-class precision/recall/F1 table.
print("Accuracy (Model A - TF-IDF Unigram + Logistic Regression):", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr), sep="\n")


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for Model A, labelled with the classifier's own class order.
cm_lr = confusion_matrix(y_test, y_pred_lr)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_lr, annot=True, fmt='d', cmap='Blues',
    xticklabels=logreg.classes_, yticklabels=logreg.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
ax.set_title('Confusion Matrix – TF-IDF + Logistic Regression')
plt.show()


## 11. Model B: TF-IDF (Unigram + Bigram) + Logistic Regression
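Adds bigrams to capture short phrases such as negation (e.g., “not good”). Note that this only works if preprocessing keeps negation words like “not”; standard English stopword lists remove them.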

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Model B features: unigrams and bigrams, capped at 8000 terms.
tfidf_bigram = TfidfVectorizer(ngram_range=(1, 2), max_features=8000)

# Vocabulary comes from the training split; the test split is only transformed.
X_train_tfidf_bi = tfidf_bigram.fit_transform(X_train)
X_test_tfidf_bi = tfidf_bigram.transform(X_test)

logreg_bi = LogisticRegression(max_iter=2000).fit(X_train_tfidf_bi, y_train)
y_pred_lr_bi = logreg_bi.predict(X_test_tfidf_bi)

print("Accuracy (TF-IDF Uni+Bi + Logistic Regression):", accuracy_score(y_test, y_pred_lr_bi))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr_bi), sep="\n")


In [None]:
# Confusion matrix for Model B, labelled with the classifier's own class order.
cm_lr_bi = confusion_matrix(y_test, y_pred_lr_bi)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_lr_bi,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=logreg_bi.classes_,
    yticklabels=logreg_bi.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix – TF-IDF (Unigram+Bigram) + Logistic Regression")
plt.show()


## 12. Error Analysis (Anomaly Investigation)
Extract misclassified test samples to explain model weaknesses (neutral ambiguity, negation, mixed sentiment).

In [None]:
import pandas as pd
import numpy as np

# Helper: build a neat error table for ANY model output
def build_error_table(df, y_true, y_pred, model_name, n=12, per_type=3, max_chars=140):
    """Print an error summary and return a small table of misclassified reviews.

    Args:
        df: Frame holding 'brand_model', 'review_text' and 'clean_review',
            indexed so that ``y_true.index`` selects the evaluated rows.
        y_true: Series of actual labels; its index is used to look rows up in ``df``.
        y_pred: Predicted labels in the same order as ``y_true``.
        model_name: Label used in the printed header.
        n: Maximum number of rows in the returned table.
        per_type: Maximum number of sampled rows per error type.
        max_chars: Review texts longer than this are cut and suffixed with "…".

    Returns:
        DataFrame with columns brand_model, actual, predicted, error_type and
        review_text, ordered by error type. When ``n`` is smaller than the
        number of sampled rows, every error type contributes one row before
        any type contributes a second, so the table stays balanced. The frame
        is empty when there are no misclassifications.
    """
    columns = ['brand_model', 'actual', 'predicted', 'error_type', 'review_text']

    # Align predictions with the original df indices carried by y_true.
    idx = y_true.index
    temp = df.loc[idx, ['brand_model', 'review_text', 'clean_review']].copy()
    temp['actual'] = y_true
    temp['predicted'] = pd.Series(y_pred, index=idx, name="predicted")
    temp['is_error'] = temp['actual'] != temp['predicted']

    errors = temp[temp['is_error']].copy()
    errors['error_type'] = errors['actual'].astype(str) + " → " + errors['predicted'].astype(str)

    print(f"\n===== ERROR ANALYSIS: {model_name} =====")
    print("Total test samples:", len(temp))
    print("Total errors:", len(errors))
    print("\nTop error types:")
    print(errors['error_type'].value_counts().head(10))

    if errors.empty:
        return errors[columns].reset_index(drop=True)

    # Shuffle once with a fixed seed, then take the first `per_type` rows of
    # each error type: a reproducible random sample per type without calling
    # groupby.apply on the grouping column.
    shuffled = errors.sample(frac=1, random_state=42)
    picked = shuffled.groupby('error_type', sort=False).head(per_type).copy()

    # Rank rows within their error type and cut to n in rank order, so that
    # truncation removes the extra examples of each type, not whole types.
    picked['_rank'] = picked.groupby('error_type').cumcount()
    picked = picked.sort_values(['_rank', 'error_type']).head(n)

    samples = (
        picked.sort_values('error_type', kind='mergesort')[columns]
              .reset_index(drop=True)
              .copy()
    )

    # Shorten long text for readability in the report.
    lengths = samples['review_text'].str.len().fillna(0)
    too_long = lengths > max_chars
    samples.loc[too_long, 'review_text'] = (
        samples.loc[too_long, 'review_text'].str.slice(0, max_chars) + "…"
    )

    return samples

# --- Run for Model B (TF-IDF Uni+Bi + Logistic Regression) ---
errors_B = build_error_table(df, y_test, y_pred_lr_bi, "Model B: TF-IDF (Unigram+Bigram) + Logistic Regression", n=12)
errors_B


In [None]:
errors_B.to_csv("Table4_Misclassified_Examples_ModelB.csv", index=False)
print("Saved: Table4_Misclassified_Examples_ModelB.csv")


## 13. Model C: Word2Vec + SVM
Embedding-based model using averaged Word2Vec vectors + SVM classifier.

In [None]:
from gensim.models import Word2Vec

# Train the embedding on the training split only, mirroring how the TF-IDF
# vectorizers are fitted on X_train, so that no test-set text shapes the
# learned features and the model comparison stays like-for-like.
sentences = [text.split() for text in X_train]


In [None]:
# Word2Vec hyper-parameters: 100-dimensional vectors, context window of 5,
# words seen fewer than 2 times are ignored, 20 passes over the corpus.
w2v_params = dict(vector_size=100, window=5, min_count=2, workers=4, epochs=20)
w2v_model = Word2Vec(sentences=sentences, **w2v_params)


In [None]:
import numpy as np

def review_to_vector(review):
    """Represent a cleaned review as the mean of its Word2Vec word vectors.

    Words missing from the trained vocabulary are skipped. If no word of the
    review is known, a zero vector is returned; its length is read from the
    model rather than hard-coded, so it always matches the embedding size.

    Args:
        review: Cleaned review text with whitespace-separated tokens.

    Returns:
        1-D numpy array of length ``w2v_model.wv.vector_size``.
    """
    keyed_vectors = w2v_model.wv
    vectors = [keyed_vectors[w] for w in review.split() if w in keyed_vectors]
    if not vectors:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(vectors, axis=0)

X_w2v = np.array([review_to_vector(r) for r in df['clean_review']])


In [None]:
from sklearn.model_selection import train_test_split

# Same test_size / random_state / stratify arguments as the text split above.
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(
    X_w2v, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
from sklearn.svm import LinearSVC

# Linear SVM on the averaged Word2Vec vectors, then predictions for the test part.
svm = LinearSVC().fit(X_train_w2v, y_train_w2v)
y_pred_svm = svm.predict(X_test_w2v)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Test-set accuracy followed by the per-class precision/recall/F1 table.
print("Accuracy (Word2Vec + SVM):", accuracy_score(y_test_w2v, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test_w2v, y_pred_svm), sep="\n")


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for Model C, labelled with the classifier's own class order.
cm_svm = confusion_matrix(y_test_w2v, y_pred_svm)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_svm, annot=True, fmt='d', cmap='Greens',
    xticklabels=svm.classes_, yticklabels=svm.classes_,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('Actual Label')
ax.set_title('Confusion Matrix – Word2Vec + SVM')
plt.show()


## 14. Model D: FastText + SVM
Embedding model with subword information to handle rare/unseen words better.

In [None]:
from gensim.models import FastText
from sklearn.svm import LinearSVC

# Train FastText on the training split only. X_train comes from the stratified
# split in section 9, which uses the same test_size / random_state / stratify
# arguments as the split further down, so no test-set text shapes the
# embedding (consistent with the TF-IDF models, which are fitted on X_train).
sentences = [text.split() for text in X_train]

ft_model = FastText(
    vector_size=100,
    window=5,
    min_count=2,
    workers=4
)
ft_model.build_vocab(sentences)
ft_model.train(sentences, total_examples=len(sentences), epochs=20)

def review_to_vector_fasttext(review, vector_size=100):
    """Average the FastText vectors of the words in a cleaned review.

    Only words for which ``w in ft_model.wv`` holds contribute. When none do,
    a zero vector of length ``vector_size`` is returned instead.
    """
    known = [ft_model.wv[token] for token in review.split() if token in ft_model.wv]
    return np.mean(known, axis=0) if known else np.zeros(vector_size)

# One averaged FastText vector per review, stacked into a 2-D feature matrix.
X_ft = np.array(df['clean_review'].apply(review_to_vector_fasttext, args=(100,)).tolist())

# Same test_size / random_state / stratify arguments as the other splits.
X_train_ft, X_test_ft, y_train_ft, y_test_ft = train_test_split(
    X_ft,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

svm_ft = LinearSVC().fit(X_train_ft, y_train_ft)
y_pred_ft = svm_ft.predict(X_test_ft)

print("Accuracy (FastText + SVM):", accuracy_score(y_test_ft, y_pred_ft))
print("\nClassification Report:\n", classification_report(y_test_ft, y_pred_ft), sep="\n")


In [None]:
# Confusion matrix for Model D, labelled with the classifier's own class order.
cm_ft = confusion_matrix(y_test_ft, y_pred_ft)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm_ft,
    annot=True,
    fmt='d',
    cmap='Oranges',
    xticklabels=svm_ft.classes_,
    yticklabels=svm_ft.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix – FastText + SVM")
plt.show()


## 15. Model Comparison
Compare models using the weighted F1-score (per-class F1 averaged in proportion to each class's support), so the summary reflects the imbalanced class distribution.

In [None]:
from sklearn.metrics import f1_score

# Weighted F1 of each model on its own held-out split.
f1_A = f1_score(y_test, y_pred_lr, average='weighted')        # Model A
f1_B = f1_score(y_test, y_pred_lr_bi, average='weighted')     # Model B
f1_C = f1_score(y_test_w2v, y_pred_svm, average='weighted')   # Model C
f1_D = f1_score(y_test_ft, y_pred_ft, average='weighted')     # Model D

labels = ['A: TF-IDF Uni+LR', 'B: TF-IDF Uni+Bi+LR', 'C: Word2Vec+SVM', 'D: FastText+SVM']
scores = [f1_A, f1_B, f1_C, f1_D]

# One bar per model on a fixed 0–1 axis so the heights are directly comparable.
fig, ax = plt.subplots(figsize=(9, 4))
ax.bar(labels, scores)
ax.set_ylabel('Weighted F1-score')
ax.set_title('Model Comparison (Weighted F1)')
ax.set_ylim(0, 1)
plt.xticks(rotation=20, ha='right')
plt.show()


## 16. Phone-wise Evaluation (Comparative Proof)
Evaluate each model separately on iPhone 15 vs Galaxy S24 test reviews to strengthen the comparison.

In [None]:
from sklearn.metrics import classification_report, accuracy_score

def phone_wise_evaluation(df, y_true, y_pred, title, phones=('iPhone 15', 'Galaxy S24')):
    """Print accuracy and a classification report separately for each phone.

    Args:
        df: Frame with a 'brand_model' column, indexed so that
            ``y_true.index`` selects the evaluated rows.
        y_true: Series of actual labels carrying the original df indices.
        y_pred: Predicted labels in the same order as ``y_true``.
        title: Header line printed above the results.
        phones: 'brand_model' values to report on, in print order.

    Returns:
        None; results are printed.
    """
    print("\n" + "="*70)
    print(title)
    print("="*70)

    test_idx = y_true.index  # keeps original indices
    phone_series = df.loc[test_idx, 'brand_model']
    # Built once here instead of on every loop pass.
    y_pred_s = pd.Series(y_pred, index=test_idx)

    for phone in phones:
        mask = (phone_series == phone)
        print(f"\n--- {phone} ---")

        # Nothing to score for this phone; the metric calls below are not
        # meaningful on empty input, so report it and move on.
        if not mask.any():
            print("No test reviews for this phone.")
            continue

        y_t = y_true[mask]
        y_p = y_pred_s[mask]
        print("Accuracy:", round(accuracy_score(y_t, y_p), 4))
        print(classification_report(y_t, y_p))


In [None]:
phone_wise_evaluation(df, y_test, y_pred_lr, "Model A: TF-IDF (Unigram) + Logistic Regression (Phone-wise Results)")


In [None]:
phone_wise_evaluation(df, y_test, y_pred_lr_bi, "Model B: TF-IDF (Unigram+Bigram) + Logistic Regression (Phone-wise Results)")


In [None]:
phone_wise_evaluation(df, y_test_w2v, y_pred_svm, "Model C: Word2Vec + SVM (Phone-wise Results)")


In [None]:
phone_wise_evaluation(df, y_test_ft, y_pred_ft, "Model D: FastText + SVM (Phone-wise Results)")


## 17. Demonstration (Presentation-Safe)
Show correctly labelled outputs using real test-set examples, plus one challenging case to explain the model's limitations.

In [None]:
# -----------------------------
# DEMO-SAFE PREDICTION FUNCTIONS
# -----------------------------

def predict_model_B(text):
    """Predict sentiment with Model B (TF-IDF unigram+bigram + Logistic Regression).

    Returns the string "Invalid input (empty text)" when ``text`` is None or
    blank after stripping. Otherwise returns a ``(label, confidence)`` tuple,
    where confidence is the highest class probability as a float.
    """
    raw = str(text).strip() if text is not None else ""
    if not raw:
        return "Invalid input (empty text)"

    # Same cleaning as the training data, then the fitted bigram vectorizer.
    features = tfidf_bigram.transform([clean_text(raw)])
    label = logreg_bi.predict(features)[0]
    class_probabilities = logreg_bi.predict_proba(features)[0]
    return label, float(np.max(class_probabilities))

def _format_confidence(confidence):
    """Render a confidence score for printing; 'n/a' when the model gave none."""
    if confidence is None or pd.isna(confidence):
        return "n/a"
    return f"{confidence:.3f}"


def demo_from_testset(model_pred_fn, y_true, title, per_class=2):
    """Print correctly classified test reviews per class plus one misclassified one.

    Reads the module-level ``df`` to look up the phone and raw review text for
    the rows in ``y_true.index``.

    Args:
        model_pred_fn: Callable taking raw review text and returning either a
            ``(label, confidence)`` tuple or a bare label/message. For bare
            values the confidence is printed as "n/a".
        y_true: Series of actual labels carrying the original df indices.
        title: Header line printed above the demo.
        per_class: Number of correct examples to print for each sentiment class.

    Returns:
        None; everything is printed.
    """
    print("\n" + "="*70)
    print(title)
    print("="*70)

    test_df = df.loc[y_true.index, ['brand_model', 'review_text']].copy()
    test_df['actual'] = y_true
    test_df = test_df.drop_duplicates(subset=['review_text'])  # avoid repeated examples

    # Predict on all (de-duplicated) test rows.
    preds = []
    confs = []
    for review in test_df['review_text']:
        out = model_pred_fn(review)
        label, conf = out if isinstance(out, tuple) else (out, None)
        preds.append(label)
        confs.append(conf)
    test_df['predicted'] = preds
    test_df['confidence'] = confs
    test_df['correct'] = test_df['predicted'] == test_df['actual']

    # Show correct examples per class (safe demo).
    print("\n--- Correct examples (safe demo) ---")
    for cls in ['Positive', 'Neutral', 'Negative']:
        subset = test_df[(test_df['actual'] == cls) & (test_df['correct'])]
        for _, row in subset.head(per_class).iterrows():
            print(f"\nPhone: {row['brand_model']}")
            print(f"Review: {row['review_text']}")
            # Formatting goes through the helper: a missing confidence used to
            # crash the ':.3f' format spec.
            print(
                f"Actual: {row['actual']} | Predicted: {row['predicted']} | "
                f"Confidence: {_format_confidence(row['confidence'])}"
            )

    # Show 1 challenging misclassified example (for explanation).
    wrong = test_df[~test_df['correct']].head(1)
    if len(wrong) > 0:
        row = wrong.iloc[0]
        print("\n--- Challenging example (to explain limitations) ---")
        print(f"Phone: {row['brand_model']}")
        print(f"Review: {row['review_text']}")
        print(
            f"Actual: {row['actual']} | Predicted: {row['predicted']} | "
            f"Confidence: {_format_confidence(row['confidence'])}"
        )
        print("Explanation: Neutral and mixed-sentiment text can be ambiguous; models may confuse it with Positive/Negative.")
    else:
        print("\nNo misclassified example found in the first rows (good!).")

# Run demo using Model B (best for negation phrases)
demo_from_testset(predict_model_B, y_test, "DEMONSTRATION: Model B (TF-IDF Uni+Bi + Logistic Regression)", per_class=2)
