# 03 - Sentiment Analysis & Categorization

Two sentiment methods compared:
1. **TextBlob** - polarity-based baseline
2. **VADER** - social media/review-optimized

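Categorization via keyword rules (each review gets the category with the most keyword hits; reviews with no hits are labelled **Others - General**; labels are stored with a hyphen, e.g. `Others - Price`):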
- **Apps**: app performance, UI, crashes, bugs, updates
- **Feature**: specific features, functionality
- **Others → Price**: pricing, packages, cost
- **Others → Network**: signal, speed, coverage
- **Others → Content**: content, offers, promos

In [None]:
import os
import re
from datetime import datetime

import pandas as pd
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Input/output locations. INPUT_FILE is only a default: point it at a
# date-tagged file if preprocessing wrote one.
INPUT_FILE = "data/processed/reviews_cleaned.csv"
OUTPUT_DIR = "data/final"

# YYYYMMDD stamp appended to every output filename produced by this run
RUN_DATE = f"{datetime.now():%Y%m%d}"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# When the default input is missing, fall back to the date-tagged file whose
# name sorts last (the most recent one when tags are YYYYMMDD).
if not os.path.exists(INPUT_FILE):
    import glob
    proc_files = sorted(glob.glob("data/processed/reviews_cleaned_*.csv"))
    if proc_files:
        INPUT_FILE = proc_files[-1]
        print(f"Using latest processed file: {INPUT_FILE}")

# The "at" column is parsed as datetimes while reading
df = pd.read_csv(INPUT_FILE, parse_dates=["at"])
print(f"Loaded {len(df)} reviews from {INPUT_FILE}")
print(f"Output file tag: {RUN_DATE}")
df.head()

## Sentiment Analysis

In [None]:
# Method 1: TextBlob
def textblob_sentiment(text):
    """Score a review with TextBlob and bucket the score into a label.

    Missing (NaN/None) or whitespace-only text is reported as neutral with a
    polarity of 0.0. Otherwise the TextBlob polarity is labelled "positive"
    when above 0.1, "negative" when below -0.1, and "neutral" in between
    (both bounds count as neutral).

    Returns:
        A ``(polarity, label)`` tuple.
    """
    is_blank = pd.isna(text) or not text.strip()
    if is_blank:
        return 0.0, "neutral"

    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0.1:
        return polarity, "positive"
    if polarity < -0.1:
        return polarity, "negative"
    return polarity, "neutral"

print("Running TextBlob sentiment analysis...")
results = df["content_clean"].apply(textblob_sentiment)

# Every element of `results` is a (polarity, label) pair; unpack each
# position into its own column.
for position, column in enumerate(("tb_polarity", "tb_sentiment")):
    df[column] = results.map(lambda pair, i=position: pair[i])

print("TextBlob sentiment distribution:")
print(df["tb_sentiment"].value_counts())

In [None]:
# Method 2: VADER
# One analyzer instance is created here and reused by vader_sentiment for every review.
vader = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    """Score a review with VADER and bucket the compound score into a label.

    Missing (NaN/None) or whitespace-only text is reported as neutral with a
    compound score of 0.0. Otherwise the compound score is labelled
    "positive" at 0.05 or above, "negative" at -0.05 or below, and "neutral"
    strictly in between.

    Returns:
        A ``(compound, label)`` tuple.
    """
    is_blank = pd.isna(text) or not text.strip()
    if is_blank:
        return 0.0, "neutral"

    compound = vader.polarity_scores(text)["compound"]
    if compound >= 0.05:
        return compound, "positive"
    if compound <= -0.05:
        return compound, "negative"
    return compound, "neutral"

print("Running VADER sentiment analysis...")
results = df["content_clean"].apply(vader_sentiment)

# Every element of `results` is a (compound, label) pair; unpack each
# position into its own column.
for position, column in enumerate(("vader_compound", "vader_sentiment")):
    df[column] = results.map(lambda pair, i=position: pair[i])

print("VADER sentiment distribution:")
print(df["vader_sentiment"].value_counts())

In [None]:
# How often do the two methods assign the same label?
same_label = df["tb_sentiment"] == df["vader_sentiment"]
agreement = same_label.sum()
total = len(df)
print(f"\nMethod Agreement: {agreement}/{total} ({agreement/total*100:.1f}%)")

# Confusion-style table of label pairs, with row/column totals
print("\nCross-tabulation (TextBlob vs VADER):")
ct = pd.crosstab(
    index=df["tb_sentiment"],
    columns=df["vader_sentiment"],
    margins=True,
)
print(ct)

# Star rating serves as a rough ground truth: positive labels should carry
# higher mean ratings than negative ones.
for method, column in (("VADER", "vader_sentiment"), ("TextBlob", "tb_sentiment")):
    print(f"\nAverage star rating by {method} sentiment:")
    print(df.groupby(column)["score"].mean().round(2))

## Review Categorization

In [None]:
# Keyword-based categorization
CATEGORY_KEYWORDS = {
    "Apps": [
        "app", "crash", "bug", "update", "install", "uninstall", "download",
        "ui", "interface", "design", "layout", "loading", "slow app", "lag",
        "freeze", "error", "force close", "not working", "won't open",
        "notification", "login", "logout", "otp", "version", "permission",
    ],
    "Feature": [
        "feature", "function", "option", "setting", "menu", "button",
        "check balance", "check quota", "purchase", "buy package", "redeem",
        "transfer", "pay", "payment", "top up", "topup", "recharge",
        "history", "profile", "account", "dashboard", "widget",
    ],
    "Others - Price": [
        "price", "expensive", "cheap", "cost", "money", "charge", "fee",
        "tariff", "rate", "affordable", "overcharge", "billing", "credit",
        "pulsa", "quota", "package", "plan", "subscription",
    ],
    "Others - Network": [
        "signal", "network", "coverage", "speed", "internet", "connection",
        "4g", "5g", "3g", "lte", "wifi", "data", "mbps", "slow internet",
        "no signal", "disconnect", "unstable", "buffering",
    ],
    "Others - Content": [
        "promo", "promotion", "offer", "deal", "discount", "reward",
        "point", "loyalty", "gift", "voucher", "coupon", "cashback",
        "content", "entertainment", "game", "music", "video",
    ],
}

# Label used when the text is empty or no keyword matches.
_FALLBACK_CATEGORY = "Others - General"


def _compile_keyword_patterns(category_keywords):
    """Compile one word-start regex per keyword, grouped by category."""
    return {
        category: [re.compile(r"\b" + re.escape(keyword)) for keyword in keywords]
        for category, keywords in category_keywords.items()
    }


# Built once from CATEGORY_KEYWORDS; re-run this cell after editing the
# keyword lists so the compiled patterns stay in sync.
_CATEGORY_PATTERNS = _compile_keyword_patterns(CATEGORY_KEYWORDS)


def categorize_review(text):
    """Assign a review to the category whose keywords it mentions most.

    The text is lower-cased and each keyword is searched for at the start of
    a word, so inflected forms still count ("apps", "crashes", "updated")
    while hits buried inside unrelated words do not ("app" in "happy",
    "ui" in "quite", "rate" in "separate"). A category's score is the number
    of its distinct keywords found. Words that merely begin with a keyword
    (e.g. "apple" for "app") still count.

    Args:
        text: Review text. NaN/None and whitespace-only values are treated
            as empty; other non-string values are converted with ``str``.

    Returns:
        The highest-scoring category name. Ties go to the category listed
        first in ``CATEGORY_KEYWORDS``. Returns "Others - General" when the
        text is empty or no keyword matches.
    """
    if pd.isna(text):
        return _FALLBACK_CATEGORY

    text_lower = str(text).lower()
    if not text_lower.strip():
        return _FALLBACK_CATEGORY

    scores = {
        category: sum(1 for pattern in patterns if pattern.search(text_lower))
        for category, patterns in _CATEGORY_PATTERNS.items()
    }

    # max() keeps the first key among equals, giving dict-order tie-breaking.
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else _FALLBACK_CATEGORY

print("Categorizing reviews...")
# One category label per review, derived from the cleaned text
df["category"] = df["content_clean"].map(categorize_review)

print("\nCategory distribution:")
category_counts = df["category"].value_counts()
print(category_counts)

In [None]:
# For each app, list every category with its review count and share
print("Category distribution per app:")
print("=" * 60)
for app in df["app_name"].unique():
    print(f"\n{app}:")
    app_df = df.loc[df["app_name"] == app]
    n_reviews = len(app_df)
    for cat, count in app_df["category"].value_counts().items():
        print(f"  {cat}: {count} ({count / n_reviews * 100:.1f}%)")

In [None]:
# Share of each VADER label within every category (each row sums to 100)
print("\nVADER Sentiment by Category:")
print("=" * 60)
sentiment_by_cat = pd.crosstab(
    index=df["category"],
    columns=df["vader_sentiment"],
    normalize="index",
).mul(100)
print(sentiment_by_cat.round(1))

In [None]:
# Save final results
# The full DataFrame (original columns plus the sentiment and category columns
# added above) is written under OUTPUT_DIR, tagged with RUN_DATE.
output_file = f"{OUTPUT_DIR}/reviews_analyzed_{RUN_DATE}.csv"
df.to_csv(output_file, index=False)  # index=False: do not write the row index as a column
print(f"Saved {len(df)} analyzed reviews to {output_file}")

# Also save as Excel for easier sharing
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed
# for .xlsx output — confirm it is available in this environment.
excel_file = f"{OUTPUT_DIR}/reviews_analyzed_{RUN_DATE}.xlsx"
df.to_excel(excel_file, index=False)
print(f"Saved to {excel_file}")

In [None]:
# Final Summary: totals, per-app stats, then overall label and category shares
banner = "=" * 60
total_reviews = len(df)

print("\n" + banner)
print("FINAL ANALYSIS SUMMARY")
print(banner)
print(f"Total reviews analyzed: {total_reviews}")

print("\nPer app:")
for app in df["app_name"].unique():
    app_df = df.loc[df["app_name"] == app]
    avg_score = app_df["score"].mean()
    # Mean of a boolean mask is the fraction of True values
    vader_pos = app_df["vader_sentiment"].eq("positive").mean() * 100
    print(f"  {app}: {len(app_df)} reviews, avg rating {avg_score:.2f}, {vader_pos:.1f}% positive (VADER)")

# Both overall breakdowns share the same "label: count (pct%)" layout
overall_sections = (
    ("\nOverall VADER Sentiment:", "vader_sentiment"),
    ("\nOverall Categories:", "category"),
)
for heading, column in overall_sections:
    print(heading)
    for label, count in df[column].value_counts().items():
        print(f"  {label}: {count} ({count / total_reviews * 100:.1f}%)")

In [None]:
# Sample reviews to spot-check
print("\nSample reviews for spot-checking:")
print("="*60)
# Cap the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist (without replacement), which would break small datasets.
# The fixed random_state keeps the sample reproducible between runs.
sample = df.sample(min(10, len(df)), random_state=42)
for _, row in sample.iterrows():
    print(f"\n[{row['app_name']}] Score: {row['score']} | VADER: {row['vader_sentiment']} | TB: {row['tb_sentiment']} | Cat: {row['category']}")
    # Show at most the first 120 characters, marking truncation with "..."
    text = str(row["content_clean"])
    snippet = f"{text[:120]}..." if len(text) > 120 else text
    print(f'  "{snippet}"')