<a href="https://colab.research.google.com/github/dornercr/INFO371/blob/main/INFO371_Week7_Sentiment_Classification_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# 📦 Step 1: Import required libraries
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from tqdm import tqdm

# 🧠 Step 2: Load spaCy model for tokenization + embeddings
# The leading "!" is a notebook (IPython/Colab) shell escape, so this line is
# only valid inside a notebook cell. It installs the en_core_web_md package
# through the spaCy command-line downloader every time the cell runs.
# NOTE: the downloader's own output advises restarting the runtime if the
# freshly installed package cannot be loaded.
!python -m spacy download en_core_web_md
# `nlp` is the single shared pipeline; spacy_tokenizer() and get_embedding()
# both call it.
nlp = spacy.load("en_core_web_md")  # This includes 300-dim word vectors

# 🧹 Custom spaCy tokenizer used for BoW and TF-IDF
def spacy_tokenizer(text):
    """Split *text* into cleaned lemmas using the shared spaCy pipeline.

    Stop words, punctuation tokens, and tokens whose lemma is blank after
    stripping whitespace are dropped. Each surviving lemma is lower-cased
    and stripped before being returned.

    Args:
        text: Raw message string to tokenize.

    Returns:
        A list of lemma strings, in document order.
    """
    lemmas = []
    for token in nlp(text):
        # Skip tokens that carry no sentiment signal of their own.
        if token.is_stop or token.is_punct:
            continue
        # Whitespace-only lemmas would become empty vocabulary entries.
        if not token.lemma_.strip():
            continue
        lemmas.append(token.lemma_.lower().strip())
    return lemmas

# 🧪 Step 3: Define a simple toy sentiment dataset
# Each message is declared next to its label (1 = Positive, 0 = Negative) so
# the two can never drift out of alignment when the list is edited.
_labeled_messages = [
    ("I absolutely loved this product!", 1),
    ("This is the worst thing I've ever used.", 0),
    ("Fantastic experience overall.", 1),
    ("Terrible customer service.", 0),
    ("Will definitely buy again.", 1),
    ("Horrible. Just horrible.", 0),
    ("Five stars, no complaints!", 1),
    ("Disappointed and frustrated.", 0),
    ("Everything went smoothly and I'm happy.", 1),
    ("It broke on day one.", 0),
]

# Split the pairs back into the two parallel lists used by the rest of the cell.
toy_messages, labels = (list(column) for column in zip(*_labeled_messages))

# Two-column frame: the raw text and its binary sentiment label.
df = pd.DataFrame({"message": toy_messages, "label": labels})

# 🧮 Step 4A: Bag-of-Words Vectorization
# A custom `tokenizer` replaces scikit-learn's regex tokenization entirely, so
# the default `token_pattern` is dead configuration. Setting it to None states
# that explicitly and silences the "token_pattern will not be used" warning
# that fit_transform would otherwise emit. The extracted features are unchanged.
bow_vectorizer = CountVectorizer(
    tokenizer=spacy_tokenizer, token_pattern=None, ngram_range=(1, 1)
)
X_bow = bow_vectorizer.fit_transform(df["message"])

# 🧮 Step 4B: TF-IDF Vectorization
# Same tokenizer and unigram range as the BoW vectorizer, so the two feature
# sets differ only in how term counts are weighted.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=spacy_tokenizer, token_pattern=None, ngram_range=(1, 1)
)
X_tfidf = tfidf_vectorizer.fit_transform(df["message"])

# 🧠 Step 4C: Word Embeddings (Mean of word vectors)
def get_embedding(text):
    """Return one dense vector representing the whole of *text*.

    The text is run through the shared spaCy pipeline and the document-level
    vector is returned (the average of the document's word vectors).

    Args:
        text: Raw message string to embed.

    Returns:
        The ``vector`` attribute of the parsed spaCy document.
    """
    parsed = nlp(text)
    return parsed.vector

# Embed every message (with a progress bar) and stack the per-message vectors
# into a single array, one row per message.
X_embed = np.array(list(map(get_embedding, tqdm(df["message"]))))

# 🧪 Step 5: Model Training and Evaluation Function
def evaluate_knn(X, y, k=3, *, test_size=0.3, cv=3, random_state=42):
    """Train a k-nearest-neighbours classifier and report its scores.

    The data is split once into stratified train/test partitions. The
    classifier is cross-validated on the training partition, then refit on
    the whole training partition and scored on the held-out test partition.

    Args:
        X: Feature matrix (sparse or dense), one row per sample.
        y: Labels aligned with the rows of ``X``. Precision, recall and F1
            use scikit-learn's default binary averaging, so the labels are
            expected to be binary with 1 as the positive class.
        k: Number of neighbours used by the classifier.
        test_size: Fraction of samples held out for the test partition.
        cv: Number of cross-validation folds run on the training partition.
        random_state: Seed for the train/test split, for reproducibility.

    Returns:
        A dict with the keys ``"CV Accuracy"``, ``"Test Accuracy"``,
        ``"Precision"``, ``"Recall"`` and ``"F1 Score"``, each rounded to
        two decimal places.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    clf = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score clones the estimator, so `clf` is still unfitted here.
    cv_score = cross_val_score(clf, X_train, y_train, cv=cv).mean()
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)

    # With a tiny test partition the model can predict no positives at all,
    # leaving precision (and F1) undefined. zero_division=0 reports the same
    # 0.0 as the default but without raising UndefinedMetricWarning.
    return {
        "CV Accuracy": round(cv_score, 2),
        "Test Accuracy": round(accuracy_score(y_test, preds), 2),
        "Precision": round(precision_score(y_test, preds, zero_division=0), 2),
        "Recall": round(recall_score(y_test, preds, zero_division=0), 2),
        "F1 Score": round(f1_score(y_test, preds, zero_division=0), 2),
    }

# 📊 Step 6: Run evaluations for each feature set
# Map each display name to its feature matrix; insertion order fixes both the
# evaluation order and the order of the printed report.
feature_sets = {
    "BoW": X_bow,
    "TF-IDF": X_tfidf,
    "Word Embeddings": X_embed,
}
results = {
    name: evaluate_knn(features, df["label"])
    for name, features in feature_sets.items()
}

# 🖨️ Show results
# One header line per feature set, followed by one "metric: value" line each.
for model_type in results:
    print(f"\n🔎 {model_type} Results:")
    metrics = results[model_type]
    for metric in metrics:
        value = metrics[metric]
        print(f"{metric}: {value}")


Collecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


100%|██████████| 10/10 [00:00<00:00, 108.12it/s]



🔎 BoW Results:
CV Accuracy: 0.56
Test Accuracy: 0.33
Precision: 0.33
Recall: 1.0
F1 Score: 0.5

🔎 TF-IDF Results:
CV Accuracy: 0.56
Test Accuracy: 0.33
Precision: 0.33
Recall: 1.0
F1 Score: 0.5

🔎 Word Embeddings Results:
CV Accuracy: 0.56
Test Accuracy: 0.67
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
