# Sentiment Analysis — EDA & Baseline

This notebook explores the reviews dataset and trains a baseline TF‑IDF + Logistic Regression model.

In [None]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import nltk, re
from nltk.corpus import stopwords
# Probe for the NLTK English stopword list. The returned list is thrown away
# (bound to `_`); the call is made to check that the corpus can be loaded.
try:
    _ = stopwords.words('english')
except LookupError:
    # Lookup failed: download the 'stopwords' corpus, then retry the load once.
    nltk.download('stopwords')
    _ = stopwords.words('english')

# Path to the reviews CSV (relative path; the file must already exist).
DATA_PATH = '../data/reviews.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Show the first five rows as the cell output.
df.head(n=5)

In [None]:
# Dataset overview: (rows, columns), then the number of rows per label.
print(df.shape)
print(df['label'].value_counts())
# Summary statistics of the character length of each raw text (cell output).
df['text'].str.len().describe()

In [None]:
# Simple cleaner
def clean_text(s):
    """Normalize raw review text before vectorization.

    Lowercases the input, replaces every character other than ``a-z``,
    ``0-9``, apostrophes and whitespace with a space, then collapses runs
    of whitespace into a single space and trims both ends.

    Missing values (``None`` or a float ``NaN``, which is what pandas uses
    for empty CSV cells) are mapped to the empty string. Passing them
    through ``str()`` would produce the literal tokens ``"none"`` / ``"nan"``,
    which would then show up as spurious vocabulary entries.

    Args:
        s: Raw text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned string (possibly empty).
    """
    # `s != s` is true only for NaN; checked on floats so other types are untouched.
    if s is None or (isinstance(s, float) and s != s):
        return ""
    s = str(s).lower()
    s = re.sub(r"[^a-z0-9'\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Store the cleaned version of every review next to the raw text.
df['clean'] = df['text'].map(clean_text, na_action=None)

In [None]:
# WordClouds for quick intuition: one figure per class.
# Classes whose cleaned texts contain no words are skipped.
for cls in df['label'].unique():
    # Use at most the first 5000 cleaned reviews of this class.
    texts = " ".join(df.loc[df['label'] == cls, 'clean'].head(5000))
    # Joining empty strings yields only spaces, which is truthy but contains
    # no words; WordCloud.generate() raises ValueError on such input.
    if not texts.strip():
        continue
    wc = WordCloud(width=900, height=500).generate(texts)
    plt.figure()
    plt.imshow(wc)
    plt.axis('off')
    plt.title(f'WordCloud — {cls}')
    plt.show()

In [None]:
# Baseline model: stratified split -> TF-IDF features -> Logistic Regression.
# Hold out 20% of the rows, stratifying on the label column.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# The vocabulary is fitted on the training split only and reused for the test split.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)
Xtr = vec.fit_transform(X_train)
Xte = vec.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=300).fit(Xtr, y_train)
pred = clf.predict(Xte)
print(classification_report(y_true=y_test, y_pred=pred))

In [None]:
# Plot the confusion matrix of the held-out predictions against the true labels.
disp = ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=pred)
plt.title(label='Confusion Matrix')
plt.show()

In [None]:
# Top weighted features for each class: the 15 n-grams with the largest
# coefficients, printed in descending order of weight.
import numpy as np
feature_names = np.array(vec.get_feature_names_out())
if hasattr(clf, 'coef_'):
    coefs = clf.coef_
    classes = clf.classes_
    # With exactly two classes the model stores a single coefficient row that
    # scores classes[1]; indexing coefs[1] would raise IndexError. Build one
    # row per class instead: the negated weights describe classes[0].
    if coefs.shape[0] == 1 and len(classes) == 2:
        coefs = np.vstack([-coefs[0], coefs[0]])
    for cls, weights in zip(classes, coefs):
        top_pos_idx = np.argsort(weights)[-15:][::-1]
        print(f"Top features for class '{cls}':")
        print(feature_names[top_pos_idx])
