In [3]:
#########################################################
######## Exercise 1: Enhanced Text Preprocessing ########
#########################################################

In [4]:
#--------------------------------------------------------
# Task 1: Compare NLTK v.s. spaCy preprocessing
#--------------------------------------------------------
import nltk
import spacy
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Create small dataset
data = {
    'review_text': [
        "This movie was absolutely fantastic, truly a masterpiece!",
        "It was okay, not great, not terrible, just meh.",
        "Terrible acting and a boring plot. A complete waste of time.",
        "A surprisingly good film with compelling characters.",
        "I hated every minute. The worst movie I've seen all year."
    ],
    'sentiment': [1, 0, 0, 1, 0]
}
df = pd.DataFrame(data)

# NLTK preprocessing function
def nltk_preprocess(text):
    tokens = word_tokenize(text.lower())  # Tokenize and lowercase
    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w.isalnum() and w not in stop_words]
    return filtered

# spaCy preprocessing function
def spacy_preprocess(text):
    doc = nlp(text)
    filtered = [token.lower_ for token in doc if token.is_alpha and not token.is_stop]
    return filtered

# Apply both functions
df['nltk_tokens'] = df['review_text'].apply(nltk_preprocess)
df['spacy_tokens'] = df['review_text'].apply(spacy_preprocess)

print("Preprocessing Output Comparison:")
print(df[['review_text', 'nltk_tokens', 'spacy_tokens']])

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preprocessing Output Comparison:
                                         review_text  \
0  This movie was absolutely fantastic, truly a m...   
1    It was okay, not great, not terrible, just meh.   
2  Terrible acting and a boring plot. A complete ...   
3  A surprisingly good film with compelling chara...   
4  I hated every minute. The worst movie I've see...   

                                         nltk_tokens  \
0  [movie, absolutely, fantastic, truly, masterpi...   
1                       [okay, great, terrible, meh]   
2  [terrible, acting, boring, plot, complete, was...   
3  [surprisingly, good, film, compelling, charact...   
4   [hated, every, minute, worst, movie, seen, year]   

                                        spacy_tokens  
0  [movie, absolutely, fantastic, truly, masterpi...  
1                       [okay, great, terrible, meh]  
2  [terrible, acting, boring, plot, complete, was...  
3  [surprisingly, good, film, compelling, charact...  
4          [hated,

In [5]:
#--------------------------------------------------------
# Task 2: Analyze preprocessing steps 
#--------------------------------------------------------
def analyze_steps(text):
    print(f"Original: {text}")
    # Basic tokenization
    tokens = word_tokenize(text)
    print(f"NLTK Tokens: {tokens}")
    # Case normalization
    tokens_lower = [t.lower() for t in tokens]
    print(f"Lowercase: {tokens_lower}")
    # Stopword removal + punctuation filtering
    filtered = [t for t in tokens_lower if t.isalnum() and t not in stopwords.words('english')]
    print(f"Filtered: {filtered}")
    print(f"Token Count: {len(filtered)}")
    print('-' * 40)

print("\nStep-by-step NLTK preprocessing analysis:\n")
for review in df['review_text']:
    analyze_steps(review)

# Token count comparison
df['nltk_token_count'] = df['nltk_tokens'].apply(len)
df['spacy_token_count'] = df['spacy_tokens'].apply(len)

print("\nToken Counts:")
print(df[['review_text', 'nltk_token_count', 'spacy_token_count']])


Step-by-step NLTK preprocessing analysis:

Original: This movie was absolutely fantastic, truly a masterpiece!
NLTK Tokens: ['This', 'movie', 'was', 'absolutely', 'fantastic', ',', 'truly', 'a', 'masterpiece', '!']
Lowercase: ['this', 'movie', 'was', 'absolutely', 'fantastic', ',', 'truly', 'a', 'masterpiece', '!']
Filtered: ['movie', 'absolutely', 'fantastic', 'truly', 'masterpiece']
Token Count: 5
----------------------------------------
Original: It was okay, not great, not terrible, just meh.
NLTK Tokens: ['It', 'was', 'okay', ',', 'not', 'great', ',', 'not', 'terrible', ',', 'just', 'meh', '.']
Lowercase: ['it', 'was', 'okay', ',', 'not', 'great', ',', 'not', 'terrible', ',', 'just', 'meh', '.']
Filtered: ['okay', 'great', 'terrible', 'meh']
Token Count: 4
----------------------------------------
Original: Terrible acting and a boring plot. A complete waste of time.
NLTK Tokens: ['Terrible', 'acting', 'and', 'a', 'boring', 'plot', '.', 'A', 'complete', 'waste', 'of', 'time', '.']

In [6]:
#--------------------------------------------------------
# Task 3: Document and Analyze Differences
#--------------------------------------------------------

print("\nAnalyzing differences between NLTK and spaCy preprocessing:")

for i, row in df.iterrows():
    nltk_set = set(row['nltk_tokens'])
    spacy_set = set(row['spacy_tokens'])
    
    print(f"\nReview {i+1}: {row['review_text']}")
    print(f"Tokens only in NLTK: {nltk_set - spacy_set}")
    print(f"Tokens only in spaCy: {spacy_set - nltk_set}")
    print(f"Common tokens: {nltk_set & spacy_set}")


Analyzing differences between NLTK and spaCy preprocessing:

Review 1: This movie was absolutely fantastic, truly a masterpiece!
Tokens only in NLTK: set()
Tokens only in spaCy: set()
Common tokens: {'movie', 'fantastic', 'masterpiece', 'absolutely', 'truly'}

Review 2: It was okay, not great, not terrible, just meh.
Tokens only in NLTK: set()
Tokens only in spaCy: set()
Common tokens: {'okay', 'meh', 'terrible', 'great'}

Review 3: Terrible acting and a boring plot. A complete waste of time.
Tokens only in NLTK: set()
Tokens only in spaCy: set()
Common tokens: {'plot', 'waste', 'boring', 'acting', 'terrible', 'time', 'complete'}

Review 4: A surprisingly good film with compelling characters.
Tokens only in NLTK: set()
Tokens only in spaCy: set()
Common tokens: {'good', 'film', 'characters', 'compelling', 'surprisingly'}

Review 5: I hated every minute. The worst movie I've seen all year.
Tokens only in NLTK: {'every'}
Tokens only in spaCy: set()
Common tokens: {'hated', 'year', 'seen

In [7]:
#########################################################
######### Exercise 2: Basic Feature Extraction ##########
#########################################################

In [8]:
#--------------------------------------------------------
# Task 1: Compare CountVectorizer and TdidfVectorizer 
#--------------------------------------------------------
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Use the preprocessed NLTK text for fair comparison
df['text_clean'] = df['nltk_tokens'].apply(lambda tokens: ' '.join(tokens))

# Initialize vectorizers
count_vec = CountVectorizer()
tfidf_vec = TfidfVectorizer()

# Fit and transform the text
X_count = count_vec.fit_transform(df['text_clean'])
X_tfidf = tfidf_vec.fit_transform(df['text_clean'])

# Extract features
count_features = count_vec.get_feature_names_out()
tfidf_features = tfidf_vec.get_feature_names_out()

print("🔹 CountVectorizer Features:")
print(count_features)

print("\n🔹 TF-IDF Features:")
print(tfidf_features)

print(f"\nCountVectorizer shape: {X_count.shape}")
print(f"TF-IDF shape: {X_tfidf.shape}")

🔹 CountVectorizer Features:
['absolutely' 'acting' 'boring' 'characters' 'compelling' 'complete'
 'every' 'fantastic' 'film' 'good' 'great' 'hated' 'masterpiece' 'meh'
 'minute' 'movie' 'okay' 'plot' 'seen' 'surprisingly' 'terrible' 'time'
 'truly' 'waste' 'worst' 'year']

🔹 TF-IDF Features:
['absolutely' 'acting' 'boring' 'characters' 'compelling' 'complete'
 'every' 'fantastic' 'film' 'good' 'great' 'hated' 'masterpiece' 'meh'
 'minute' 'movie' 'okay' 'plot' 'seen' 'surprisingly' 'terrible' 'time'
 'truly' 'waste' 'worst' 'year']

CountVectorizer shape: (5, 26)
TF-IDF shape: (5, 26)


In [9]:
#--------------------------------------------------------
# Task 2: Compare CountVectorizer and TdidfVectorizer 
#--------------------------------------------------------

# Try different n-gram ranges
ngram_configs = [(1,1), (1,2), (2,2)]

for ngram_range in ngram_configs:
    print(f"\n🔸 N-gram range: {ngram_range}")
    
    vec = CountVectorizer(ngram_range=ngram_range)
    X_ng = vec.fit_transform(df['text_clean'])
    features = vec.get_feature_names_out()
    
    print(f"Feature count: {len(features)}")
    print(f"Sample features: {features[:10]}")


🔸 N-gram range: (1, 1)
Feature count: 26
Sample features: ['absolutely' 'acting' 'boring' 'characters' 'compelling' 'complete'
 'every' 'fantastic' 'film' 'good']

🔸 N-gram range: (1, 2)
Feature count: 49
Sample features: ['absolutely' 'absolutely fantastic' 'acting' 'acting boring' 'boring'
 'boring plot' 'characters' 'compelling' 'compelling characters'
 'complete']

🔸 N-gram range: (2, 2)
Feature count: 23
Sample features: ['absolutely fantastic' 'acting boring' 'boring plot'
 'compelling characters' 'complete waste' 'every minute' 'fantastic truly'
 'film compelling' 'good film' 'great terrible']


In [10]:
#--------------------------------------------------------
# Task 3: Compare CountVectorizer and TdidfVectorizer 
#--------------------------------------------------------
import numpy as np

def top_tfidf_terms(vectorizer, tfidf_matrix, top_n=5):
    feature_names = vectorizer.get_feature_names_out()
    for i in range(tfidf_matrix.shape[0]):
        print(f"\n📝 Review {i+1}:")
        row = tfidf_matrix[i].toarray().flatten()
        top_indices = row.argsort()[::-1][:top_n]
        for idx in top_indices:
            print(f"  {feature_names[idx]}: {row[idx]:.4f}")

top_tfidf_terms(tfidf_vec, X_tfidf)



📝 Review 1:
  truly: 0.4637
  masterpiece: 0.4637
  fantastic: 0.4637
  absolutely: 0.4637
  movie: 0.3741

📝 Review 2:
  okay: 0.5234
  meh: 0.5234
  great: 0.5234
  terrible: 0.4222
  worst: 0.0000

📝 Review 3:
  time: 0.3878
  waste: 0.3878
  plot: 0.3878
  complete: 0.3878
  acting: 0.3878

📝 Review 4:
  surprisingly: 0.4472
  good: 0.4472
  compelling: 0.4472
  characters: 0.4472
  film: 0.4472

📝 Review 5:
  year: 0.3878
  worst: 0.3878
  seen: 0.3878
  every: 0.3878
  hated: 0.3878


In [11]:
#########################################################
########### Exercise 3: Text Classification #############
#########################################################

In [12]:
#--------------------------------------------------------
# Step 1: Implement and Compare Classifiers 
#--------------------------------------------------------
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Use TF-IDF vectors from NLTK-preprocessed text
tfidf_vec = TfidfVectorizer()
X = tfidf_vec.fit_transform(df['text_clean'])
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define classifiers
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
}

# Train and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n📊 Classification Report: {name}")
    print(classification_report(y_test, y_pred))



📊 Classification Report: Naive Bayes


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1


📊 Classification Report: Logistic Regression
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [None]:
#--------------------------------------------------------
# Step 2: Study Preprocessing Impact
#--------------------------------------------------------

# Define reusable evaluation function
def evaluate_model(preprocessed_texts, labels, vectorizer=TfidfVectorizer(), title=""):
    X = vectorizer.fit_transform(preprocessed_texts)
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

    print(f"Results using: {title}")
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\n{name}")
        print(classification_report(y_test, y_pred))

# Compare no preprocessing vs NLTK vs spaCy:
# 1. No preprocessing
evaluate_model(df['review_text'], df['sentiment'], title="No Preprocessing")

# 2. NLTK preprocessed
evaluate_model(df['text_clean'], df['sentiment'], title="NLTK Preprocessing")

# 3. spaCy preprocessed
df['spacy_clean'] = df['spacy_tokens'].apply(lambda tokens: ' '.join(tokens))
evaluate_model(df['spacy_clean'], df['sentiment'], title="spaCy Preprocessing")



Results using: No Preprocessing

Naive Bayes
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


Logistic Regression
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0

Results using: NLTK Preprocessing

Naive Bayes
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
#--------------------------------------------------------
# Step 3: Feature Selection Analysis
#--------------------------------------------------------

from sklearn.feature_selection import SelectKBest, chi2

def feature_selection_analysis(X, y, vectorizer, top_ks=[5, 10, 15]):
    for k in top_ks:
        print(f"\n🔍 Top {k} Features (using chi2):")
        selector = SelectKBest(score_func=chi2, k=k)
        selector.fit(X, y)
        mask = selector.get_support()
        selected_features = vectorizer.get_feature_names_out()[mask]
        print(selected_features)

# Run Feature Selection
# Use spaCy-preprocessed data here as an example
vectorizer = TfidfVectorizer()
X_full = vectorizer.fit_transform(df['spacy_clean'])
y_full = df['sentiment']

feature_selection_analysis(X_full, y_full, vectorizer)



🔍 Top 5 Features (using chi2):
['absolutely' 'fantastic' 'masterpiece' 'surprisingly' 'truly']

🔍 Top 10 Features (using chi2):
['absolutely' 'characters' 'compelling' 'fantastic' 'film' 'good'
 'masterpiece' 'surprisingly' 'terrible' 'truly']

🔍 Top 15 Features (using chi2):
['absolutely' 'characters' 'compelling' 'fantastic' 'film' 'good' 'great'
 'masterpiece' 'meh' 'okay' 'surprisingly' 'terrible' 'truly' 'worst'
 'year']
