In [9]:
# ------------------------------------------------------------
# Compare COUNT unigrams vs INDICATOR (binary) unigrams
# with: stemming, stopword removal, lowercasing, punctuation removal,
# min frequency threshold = 5, unigrams only, + number-of-words feature,
# RandomForest (defaults), using cross-validation.
# ------------------------------------------------------------

import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

# -----------------------------
# 1) Load data
# -----------------------------
# Expected format (UCI "sentiment labelled sentences"):
# each line: "review text<TAB>label"
# label: 0 or 1

DATA_PATH = "amazon_cells_labelled.txt"   # change if needed

texts = []
labels = []
with open(DATA_PATH, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) != 2:
            continue
        text, lab = parts
        texts.append(text)
        labels.append(int(lab))

df = pd.DataFrame({"text": texts, "y": labels})
print(df.head(), "\nClass balance:\n", df["y"].value_counts(normalize=True))

# -----------------------------
# 2) Preprocessing helpers
# -----------------------------
# We'll implement:
# - lowercasing
# - remove punctuation
# - tokenize
# - remove stopwords
# - stemming
#
# We'll build a custom analyzer for CountVectorizer.

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# If you haven't downloaded these before, uncomment:
# nltk.download("stopwords")

STOPWORDS = set(stopwords.words("english"))
STEMMER = PorterStemmer()

TOKEN_RE = re.compile(r"[A-Za-z]+")

def stem_analyzer(doc: str):
    doc = doc.lower()
    # keep only word-like tokens (removes punctuation/numbers)
    tokens = TOKEN_RE.findall(doc)
    # stopword removal + stemming
    return [STEMMER.stem(t) for t in tokens if t not in STOPWORDS]

# -----------------------------
# 3) Number-of-words feature transformer
# -----------------------------
class WordCountTransformer(BaseEstimator, TransformerMixin):
    """Returns a (n_samples, 1) array = number of tokens in the processed text."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        counts = []
        for doc in X:
            tokens = stem_analyzer(doc)
            counts.append(len(tokens))
        return np.array(counts).reshape(-1, 1)

# -----------------------------
# 4) Build two feature pipelines
# -----------------------------
def make_model(binary: bool):
    """
    binary=False => count unigrams
    binary=True  => indicator unigrams
    """
    text_vectorizer = CountVectorizer(
        analyzer=stem_analyzer,
        ngram_range=(1, 1),  # unigrams only
        min_df=1,            # threshold = 5
        binary=binary
    )

    # ColumnTransformer to combine:
    # - sparse unigram matrix from df["text"]
    # - dense word-count feature from df["text"]
    features = ColumnTransformer(
        transformers=[
            ("unigrams", text_vectorizer, "text"),
            ("n_words", WordCountTransformer(), "text")
        ],
        remainder="drop",
        sparse_threshold=0.3
    )

    clf = RandomForestClassifier(random_state=42)  # defaults otherwise

    return Pipeline([
        ("features", features),
        ("rf", clf)
    ])

count_model = make_model(binary=False)
indicator_model = make_model(binary=True)

# -----------------------------
# 5) Evaluate with cross-validation
# -----------------------------
X = df[["text"]]
y = df["y"].values

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

count_scores = cross_val_score(count_model, X, y, cv=cv, scoring="accuracy")
ind_scores   = cross_val_score(indicator_model, X, y, cv=cv, scoring="accuracy")

print("\n=== RESULTS (5-fold CV Accuracy) ===")
print(f"COUNT unigrams     : mean={count_scores.mean():.3f}, std={count_scores.std():.3f}, scores={np.round(count_scores,3)}")
print(f"INDICATOR unigrams : mean={ind_scores.mean():.3f}, std={ind_scores.std():.3f}, scores={np.round(ind_scores,3)}")

# Optional: fit on full data if you want a final model
# count_model.fit(X, y)
# indicator_model.fit(X, y)


                                                text  y
0  So there is no way for me to plug it in here i...  0
1                        Good case, Excellent value.  1
2                             Great for the jawbone.  1
3  Tied to charger for conversations lasting more...  0
4                                  The mic is great.  1 
Class balance:
 y
0    0.5
1    0.5
Name: proportion, dtype: float64

=== RESULTS (5-fold CV Accuracy) ===
COUNT unigrams     : mean=0.787, std=0.020, scores=[0.79  0.75  0.8   0.81  0.785]
INDICATOR unigrams : mean=0.789, std=0.022, scores=[0.785 0.755 0.81  0.815 0.78 ]
