In [None]:
# Cell 1
# Download required dependencies and import needed libraries

# --- Install Commands ---
%pip install torch skorch
%pip install torchvision
%pip install kaggle
%pip install kagglehub
%pip install kagglehub[pandas-datasets]

# --- Standard Libraries ---
import os
import re
import time
import random
import numpy as np
import pandas as pd
from collections import Counter

# --- Visualizing Data ---
import seaborn as sns
import matplotlib.pyplot as plt

# --- Scikit-learn Tools ---
from sklearn.model_selection import train_test_split, cross_val_score

# --- Deep Learning Tools ---
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torchvision import datasets, models, transforms
from skorch import NeuralNetClassifier
from skorch.helper import predefined_split

# --- Kagglehub for Dataset ---
import kagglehub
    

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [82]:
# Cell 2
# --- Functions ---

# Downloads CSV from kagglehub, optionally applying feature vector, optionally applying input name
def get_df_from_kaggle_api(kaggle_filepath: str, csv_filename: str = None, feature_vector: list[str] = None) -> pd.DataFrame:
    dataset_path = kagglehub.dataset_download(kaggle_filepath)
    csv_files = [f for f in os.listdir(dataset_path) if f.endswith('.csv')]
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {dataset_path}")
    
    csv_path = os.path.join(dataset_path, csv_files[0])
    if csv_filename is not None:
        updated_name_path = os.path.join(dataset_path, csv_filename)
        os.rename(csv_path, updated_name_path)
        csv_path = updated_name_path
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")
    print("Path to dataset file:", csv_path)

    df = pd.read_csv(csv_path).dropna()
    if feature_vector is not None:
        df = df[feature_vector]
    return df

# Simple math for getting a consistent output
def get_scaling_factor_for_result(size: int, desired_size=10_000) -> float:
    alpha = 1.00
    if desired_size < size:
        alpha = desired_size / size
    return alpha

# Math for scaling lists to desired size
def scale_list(list_to_scale: list, scaling_factor: float):
    if scaling_factor < 1.0:
        sample_size = int(len(list_to_scale) * scaling_factor)
        list_to_scale = random.sample(list_to_scale, sample_size)
    return list_to_scale

# Helper function to tokenize text
def tokenize(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove punctuation
    return text.lower().split()

# Helper function to build vocabulary from training data
def build_vocab(texts, min_freq=2):
    counter = Counter()
    for text in texts:
        tokens = tokenize(text)
        counter.update(tokens)
    vocab = {"<PAD>": 0, "<UNK>": 1}
    for word, freq in counter.items():
        if freq >= min_freq:
            vocab[word] = len(vocab)
    return vocab

# Helper function to convert text to sequence of integers
def text_to_sequence(text, vocab):
    tokens = tokenize(text)
    return [vocab.get(token, vocab["<UNK>"]) for token in tokens]

# Helper function to pad or truncate sequences to a fixed maximum length
def pad_sequence_to_length(seq, max_len):
    if len(seq) > max_len:
        return seq[:max_len]
    else:
        padding = torch.zeros(max_len - len(seq), dtype=torch.long)
        return torch.cat([seq, padding])
    
# Retrieves the best possible model given parameters
def get_best_model(models, X_train, y_train):
    results = {}
    for name, model in models.items():
        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        rmse_scores = np.sqrt(-scores)
        results[name] = rmse_scores.mean()
        print(f"{name}: RMSE = {rmse_scores.mean():.2f}")
    return min(results, key=results.get)

In [83]:
# Cell 3
# Download CSV and apply filters as desired. Store df into positive and negative lists.

positive_sentiment = []
negative_sentiment = []
applied_state = 2

# Amazon processing
amazon_data_path = "arhamrumi/amazon-product-reviews"
amazon_csv_custom_name = "amazon_review_data.csv"
amazon_features_vector = ["Score", "Text"]
amazon_csv_df = get_df_from_kaggle_api(amazon_data_path, amazon_csv_custom_name, amazon_features_vector)
del amazon_features_vector, amazon_csv_custom_name, amazon_data_path

# Filtering for Mostly Positive Reviews
amazon_data_sample_size = get_scaling_factor_for_result(amazon_csv_df.shape[0])
amazon_rating_threshhold = 4
positive_sentiment.extend(
    amazon_csv_df.loc[amazon_csv_df["Score"] >= amazon_rating_threshhold, "Text"]
    .sample(frac=amazon_data_sample_size, random_state=applied_state)
    .tolist()
)
# Filtering for Mostly Negative Reviews
negative_sentiment.extend(
    amazon_csv_df.loc[amazon_csv_df["Score"] < amazon_rating_threshhold, "Text"]
    .sample(frac=amazon_data_sample_size, random_state=applied_state)
    .tolist()
)
del amazon_csv_df, amazon_data_sample_size, amazon_rating_threshhold

# booking processing
booking_data_path = "jiashenliu/515k-hotel-reviews-data-in-europe"
booking_csv_custom_name = "booking_review_data.csv"
booking_features_vector = ["Positive_Review", "Negative_Review"]
booking_text_df = get_df_from_kaggle_api(booking_data_path, booking_csv_custom_name, booking_features_vector)
del booking_data_path, booking_csv_custom_name, booking_features_vector

booking_data_sample_size = get_scaling_factor_for_result(booking_text_df.shape[0])
positive_sentiment.extend(
    booking_text_df["Positive_Review"]
    .sample(frac=booking_data_sample_size, random_state=applied_state)
    .tolist()
)
negative_sentiment.extend(
    booking_text_df["Negative_Review"]
    .sample(frac=booking_data_sample_size, random_state=applied_state)
    .tolist()
)
del booking_data_sample_size, booking_text_df

# Twitter processing
twitter_negative_data_path = "yashdogra/toxic-tweets"
twitter_csv_custom_name = "twitter_negative_tweets.csv"
twitter_features_vector = ["tweet"]
twitter_text_df = get_df_from_kaggle_api(twitter_negative_data_path, twitter_csv_custom_name, twitter_features_vector)
del twitter_negative_data_path, twitter_csv_custom_name, twitter_features_vector

twitter_sample_size = get_scaling_factor_for_result(twitter_text_df.shape[0])
negative_sentiment.extend(
    twitter_text_df.iloc[:, 0]
    .sample(frac=twitter_sample_size, random_state=applied_state)
    .tolist()
)
del twitter_text_df, twitter_sample_size

print(f"\nSize of positive_sentiment: {len(positive_sentiment)} Size of negative_sentiment: {len(negative_sentiment)}")


Path to dataset file: C:\Users\dusti\.cache\kagglehub\datasets\arhamrumi\amazon-product-reviews\versions\1\amazon_review_data.csv
Path to dataset file: C:\Users\dusti\.cache\kagglehub\datasets\jiashenliu\515k-hotel-reviews-data-in-europe\versions\1\booking_review_data.csv
Path to dataset file: C:\Users\dusti\.cache\kagglehub\datasets\yashdogra\toxic-tweets\versions\1\twitter_negative_tweets.csv

Size of positive_sentiment: 17807 Size of negative_sentiment: 22193


In [84]:
# Cell 4
# Creating vectors to be used by model
p_len = len(positive_sentiment)
n_len = len(negative_sentiment)
print(f"Pre-sample Positive: {p_len}")
print(f"Pre-sample Negative: {n_len}")

positive_scaling_factor = get_scaling_factor_for_result(p_len)
negative_scaling_factor = get_scaling_factor_for_result(n_len)
del p_len, n_len

# Scale size of the lists
positive_sentiment = scale_list(positive_sentiment, positive_scaling_factor)
negative_sentiment = scale_list(negative_sentiment, negative_scaling_factor)
del positive_scaling_factor, negative_scaling_factor

# Balance the dataset
min_samples = min(len(positive_sentiment), len(negative_sentiment))
positive_sentiment = positive_sentiment[:min_samples]
negative_sentiment = negative_sentiment[:min_samples]
del min_samples

# Lowercase the text
positive_sentiment = [text.lower() for text in positive_sentiment]
negative_sentiment = [text.lower() for text in negative_sentiment]

print(f"Post-sample Positive: {len(positive_sentiment)}")
print(f"Post-sample Negative: {len(negative_sentiment)}")

# Combine into final model vectors
final_texts = positive_sentiment + negative_sentiment
final_labels = [1] * len(positive_sentiment) + [0] * len(negative_sentiment)
del positive_sentiment, negative_sentiment


Pre-sample Positive: 17807
Pre-sample Negative: 22193
Post-sample Positive: 10000
Post-sample Negative: 10000


In [None]:
# Cell 5
# Shuffle and split data

print(f"Total Samples Available: {len(final_texts)}")

# Split the full dataset into training and testing
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    final_texts,
    final_labels,
    test_size=0.2,
    random_state=applied_state,
    stratify=final_labels
)
del final_texts, final_labels

print(f"Training Samples: {len(X_train_raw)}")
print(f"Testing Samples: {len(X_test_raw)}")


Total Samples Available: 20000
Training Samples: 16000
Testing Samples: 4000


In [86]:
# Cell 6
# Build vocabulary and prepare sequences

# Set maximum sequence length (you can adjust this later)
max_seq_length = 100

# Build vocabulary based on training set
vocab = build_vocab(X_train_raw, min_freq=2)

print(f"Vocabulary size: {len(vocab)} words")

# Convert text reviews to sequences of integer IDs
X_train_seq = [torch.tensor(text_to_sequence(text, vocab)) for text in X_train_raw]
X_test_seq = [torch.tensor(text_to_sequence(text, vocab)) for text in X_test_raw]

# Pad or truncate sequences to max_seq_length
X_train_seq = [pad_sequence_to_length(seq, max_seq_length) for seq in X_train_seq]
X_test_seq = [pad_sequence_to_length(seq, max_seq_length) for seq in X_test_seq]

# Stack into tensors
X_train_tensor = torch.stack(X_train_seq)
X_test_tensor = torch.stack(X_test_seq)

# Convert labels to tensor format
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Summary
print(f"Training tensor shape: {X_train_tensor.shape}")
print(f"Testing tensor shape: {X_test_tensor.shape}")

Vocabulary size: 11614 words
Training tensor shape: torch.Size([16000, 100])
Testing tensor shape: torch.Size([4000, 100])


In [87]:
# Cell 7
# Define a slightly better CNN-based model

# Define the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class CNNTextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, output_dim, kernel_sizes=[3,4,5], num_filters=100, dropout=0.5):
        super(CNNTextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters, kernel_size=k) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes)*num_filters, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)          # (batch_size, seq_length, embed_dim)
        embedded = embedded.permute(0, 2, 1)   # (batch_size, embed_dim, seq_length)
        conved = [torch.relu(conv(embedded)) for conv in self.convs]
        pooled = [torch.max(c, dim=2)[0] for c in conved]
        cat = torch.cat(pooled, dim=1)
        cat = self.dropout(cat)
        return self.fc(cat)

# New Net with CNN
net = NeuralNetClassifier(
    module=CNNTextClassifier,
    module__vocab_size=len(vocab),
    module__embed_dim=100,
    module__output_dim=2,
    criterion=nn.CrossEntropyLoss,
    max_epochs=10,
    lr=0.001,
    batch_size=64,
    optimizer=optim.Adam,
    iterator_train__shuffle=True,
    device=device
)

print(f"Upgraded CNN model ready! Using device: {device}")

Upgraded CNN model ready! Using device: cpu


In [None]:
# Cell 8
# Train the model

# Start timer
start_time = time.time()

# Fit the model
net.fit(X_train_tensor, y_train_tensor)

# End timer
end_time = time.time()
training_time = end_time - start_time

# Summary
print(f"Training completed in {training_time:.2f} seconds.")

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [None]:
# Cell 9
# Evaluate the model on the test set

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

# Predict test set
y_pred = net.predict(X_test_tensor)

# Accuracy
test_accuracy = accuracy_score(y_test_tensor, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test_tensor, y_pred, target_names=["Negative", "Positive"]))

# Confusion Matrix
cm = confusion_matrix(y_test_tensor, y_pred)

plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Negative", "Positive"], yticklabels=["Negative", "Positive"])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Cell 10
# Save the trained model and vocabulary for future use

import pickle
import torch

# Save the model to a .pt file
model_filename = "sentiment_model.pt"
torch.save(net.module_.state_dict(), model_filename)

# Save the vocabulary to a .pkl file
vocab_filename = "vocab.pkl"
with open(vocab_filename, "wb") as f:
    pickle.dump(vocab, f)

print(f"Model saved to {model_filename}")
print(f"Vocabulary saved to {vocab_filename}")

In [None]:
# Cell 11
# Load the model and vocab, and define a function to classify new reviews

# Load vocab
with open("vocab.pkl", "rb") as f:
    vocab = pickle.load(f)

# Rebuild the CNN model architecture
model = CNNTextClassifier(
    vocab_size=len(vocab),
    embed_dim=100,
    output_dim=2
)
model.load_state_dict(torch.load("sentiment_model.pt", map_location=device))
model.to(device)
model.eval()

# Prediction helper
def demo_predict(review_text):
    # Clean and tokenize
    tokens = tokenize(review_text)
    sequence = [vocab.get(token, vocab["<UNK>"]) for token in tokens]
    sequence = pad_sequence_to_length(torch.tensor(sequence), max_seq_length)
    sequence = sequence.unsqueeze(0).to(device)  # add batch dimension

    with torch.no_grad():
        output = model(sequence)
        prediction = torch.argmax(output, dim=1).item()

    sentiment = "Positive" if prediction == 1 else "Negative"
    print(f"Review: \"{review_text}\"\n→ Predicted Sentiment: {sentiment}")

In [None]:
# Cell 12
# Classify 10 sample reviews (SCOPE-style)

scope_reviews = [
    "This class was a total mess — lectures never made any sense.",
    "Great class! I learned a lot and the professor was very helpful.",
    "I felt like I was teaching myself most of the time.",
    "The group projects were painful but I did enjoy the content.",
    "Clear grading, fair assignments, and useful material. Would recommend.",
    "SCOPE is life-changing... in the worst possible way.",
    "Labs were fun but lectures were a bit slow.",
    "Excellent pacing, feedback was always prompt.",
    "Hard to follow, exams didn't reflect what we learned.",
    "Everything was organized and the TA support was awesome."
]

print("🔎 SCOPE Review Predictions:\n")
for review in scope_reviews:
    demo_predict(review)
    print("-" * 80)