# 📝 Text Classification: Sentiment Analysis

NLP pipeline for sentiment analysis on product reviews.

**Level**: Intermediate  
**Time Required**: ~35 minutes

In [None]:
# Put the directory two levels up at the front of the import path.
# NOTE(review): presumably this is the repository root that contains
# data_science_master_system — confirm against the notebook's location.
import sys
sys.path.insert(0, '../../')

# Project-local helpers used below for loading the CSV, plotting the
# confusion matrix, and computing classification metrics.
from data_science_master_system import DataLoader, Plotter, calculate_metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import warnings
# Blanket filter: every warning category is silenced for the rest of the
# session, including deprecation and convergence warnings.
warnings.filterwarnings('ignore')

print("‚úÖ Ready!")

In [None]:
# Read the product-review CSV through the project's DataLoader
reviews_path = '../data/csv/product_reviews.csv'
loader = DataLoader()
df = loader.read(reviews_path)

# Report the frame's shape, then how many rows carry each sentiment label
print(f"Dataset: {df.shape}")
print("\nSentiment distribution:")
sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)

# Last expression of the cell, so the notebook renders the first rows
df.head()

## 1. Text Preprocessing

In [None]:
import re
import string

def clean_text(text):
    """Normalize a raw review string before vectorization.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace, drop digit runs, then collapse whitespace runs
    to single spaces and trim both ends.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            how pandas represents an empty cell) are treated as empty text;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation (underscores count as \w and stay)
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return ' '.join(text.split())  # Collapse whitespace runs and trim the ends

# Apply cleaning; the cleaned copy is stored next to the original column
df['text_clean'] = df['text'].apply(clean_text)

# Show up to three before/after pairs. head(3) simply yields fewer rows on a
# very small dataset, where positional .iloc[i] lookups would raise IndexError.
print("Sample cleaned text:")
for original, cleaned in zip(df['text'].head(3), df['text_clean'].head(3)):
    print(f"  Original: {original}")
    print(f"  Cleaned:  {cleaned}")
    print()

## 2. Feature Extraction: TF-IDF

In [None]:
# Features are the cleaned reviews; the target is the sentiment label
X, y = df['text_clean'], df['sentiment']

# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the test split
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    stop_words='english',
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

n_tfidf_features = X_train_tfidf.shape[1]
print(f"TF-IDF features: {n_tfidf_features}")

In [None]:
# Top TF-IDF features per class: average each feature's weight over the
# training rows of one sentiment and list the five largest averages.
feature_names = tfidf.get_feature_names_out()

for sentiment in ['positive', 'negative', 'neutral']:
    # Use a plain NumPy boolean mask: the sparse matrix is indexed by row
    # position, so the pandas index on the comparison result must not be used.
    idx = (y_train == sentiment).to_numpy()
    if not idx.any():
        # A mean over zero rows has no meaningful result; report the gap.
        print(f"\nTop words for {sentiment}: (no training samples)")
        continue
    mean_tfidf = np.asarray(X_train_tfidf[idx].mean(axis=0)).flatten()
    top_idx = mean_tfidf.argsort()[-5:][::-1]  # five largest, in descending order
    top_words = [feature_names[i] for i in top_idx]
    print(f"\nTop words for {sentiment}: {', '.join(top_words)}")

## 3. Model Training

In [None]:
# Train multiple classifiers on the same TF-IDF features and compare them
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded like the train/test split so the comparison table is reproducible;
    # an unseeded forest can score differently on every run.
    'Random Forest': RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
}

results = []
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # calculate_metrics is a project helper; only its 'accuracy' and 'f1'
    # entries are read here.
    metrics = calculate_metrics(y_test, y_pred, 'classification')
    results.append({
        'Model': name,
        'Accuracy': metrics['accuracy'],
        'F1': metrics['f1']
    })

# Best F1 first
results_df = pd.DataFrame(results).sort_values('F1', ascending=False)
print("\nüìä Model Comparison:")
display(results_df)

## 4. Evaluate Best Model

In [None]:
# Evaluate Logistic Regression. The model is selected by name here, not read
# from the top row of results_df.
best_model = models['Logistic Regression']
y_pred = best_model.predict(X_test_tfidf)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

# Pass the label order explicitly so the matrix rows and columns are guaranteed
# to line up with the tick labels given to the plotter, even when a class is
# missing from the test split or from the predictions.
labels = ['negative', 'neutral', 'positive']
cm = confusion_matrix(y_test, y_pred, labels=labels)

plotter = Plotter()
fig = plotter.confusion_matrix(cm, labels=labels, normalize=True, title='Sentiment Classification')
plt.show()

In [None]:
# Text summary of the test-set predictions, one row per class
print("\nüìã Classification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

## 5. Predict New Reviews

In [None]:
def predict_sentiment(text):
    """Classify one piece of raw text with the fitted vectorizer and model.

    The text is cleaned with ``clean_text``, vectorized with the module-level
    ``tfidf`` and scored by the module-level ``best_model``.

    Returns:
        dict holding the untouched input under ``'text'``, the predicted
        label under ``'sentiment'`` and the largest class probability under
        ``'confidence'``.
    """
    vector = tfidf.transform([clean_text(text)])
    label = best_model.predict(vector)[0]
    class_probs = best_model.predict_proba(vector)[0]

    result = {'text': text}
    result['sentiment'] = label
    result['confidence'] = max(class_probs)
    return result

# Test with new reviews
test_reviews = [
    "This product is absolutely amazing! Best purchase ever!",
    "Terrible quality. Complete waste of money.",
    "It's okay. Nothing special but does the job."
]

# Built once, outside the loop. A label without an entry gets no emoji
# instead of raising KeyError.
sentiment_emoji = {'positive': 'üòä', 'negative': 'üòû', 'neutral': 'üòê'}
max_preview_chars = 50

print("\nüîÆ Predictions:")
for review in test_reviews:
    result = predict_sentiment(review)
    emoji = sentiment_emoji.get(result['sentiment'], '')
    # Append the ellipsis only when the review was actually cut off.
    if len(review) > max_preview_chars:
        preview = review[:max_preview_chars] + '...'
    else:
        preview = review
    print(f"  {emoji} {result['sentiment'].upper()} ({result['confidence']:.0%}): \"{preview}\"")

## 🎯 Key Takeaways

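1. TF-IDF turns raw review text into numeric features a classifier can use.
2. N-grams (here unigrams and bigrams) capture short phrases that single words miss.
3. Logistic Regression is a strong, fast baseline for text classification.
4. Cleaning text (lowercasing, stripping punctuation and digits) reduces noise and can improve results.
5. Confidence scores from `predict_proba` show how certain the model is about each prediction.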