In [None]:
#pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Task 2

**Claudio Gonzalez Gonzalez**

Environment Setup:
- Required Python version: 3.12.2
- Dependencies are pinned in `requirements.txt`, available in the project folder

## **Cell 0-1: Load and preprocess data**

In [12]:
# - All model classes in the following cells depend on the data loading and preprocessing defined here.
# - The preprocessor should be built first and its cleaned output reused by every model.
import pandas as pd
import re

class DataHandler:
    """
    Loads the training and test tweet datasets from disk.

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the two dataset files. A file whose name ends in one of
        ``PARQUET_SUFFIXES`` is read with ``pd.read_parquet``; any other file
        is read with ``pd.read_csv``. The format is decided per file, so the
        two datasets may be stored in different formats.
    """
    # File-name endings (compared case-insensitively) that select the parquet reader.
    PARQUET_SUFFIXES = (".parquet", ".parquet.gzip")

    def __init__(self, train_path, test_path):
        self.train_path = train_path
        self.test_path = test_path
        # Populated by load_data(); None until then.
        self.df_train = None
        self.df_test = None

    @classmethod
    def _read_table(cls, path):
        """Read a single dataset file, choosing the reader from its file name."""
        # str() so that pathlib.Path objects are accepted as well as plain strings.
        if str(path).lower().endswith(cls.PARQUET_SUFFIXES):
            return pd.read_parquet(path)
        return pd.read_csv(path)

    def load_data(self):
        """
        Load training and test datasets (.csv or .parquet).

        Returns
        -------
        tuple of (DataFrame, DataFrame)
            ``(df_train, df_test)``, each with a fresh 0..n-1 index. The same
            frames are also stored on ``self.df_train`` / ``self.df_test``.
        """
        # Ensure consistent indexing regardless of what index the file carried.
        self.df_train = self._read_table(self.train_path).reset_index(drop=True)
        self.df_test = self._read_table(self.test_path).reset_index(drop=True)
        print(f"Train shape: {self.df_train.shape}, Test shape: {self.df_test.shape}")
        return self.df_train, self.df_test


class TweetPreprocessor:
    """
    Preprocess tweets by removing noise and normalizing text.

    Cleaning lowercases the text and then deletes, in this order: HTML tags,
    URLs, literal ``\\uXXXX`` escape sequences, non-ASCII characters and every
    character that is not a lowercase letter or whitespace. Finally runs of
    whitespace are collapsed to a single space.
    """
    # Order in which the compiled patterns are applied by clean().
    CLEANING_ORDER = ("html", "urls", "unicode", "non_ascii", "punct")

    def __init__(self):
        self.patterns = {
            "html": re.compile(r"<[^>]+>"),
            "urls": re.compile(r"http\S+|www\S+|https\S+"),
            "unicode": re.compile(r"\\u[\dA-Fa-f]{4}"),
            "non_ascii": re.compile(r"[^\x00-\x7F]+"),
            "punct": re.compile(r"[^a-z\s]")
        }

    def clean(self, text: str) -> str:
        """
        Clean a single tweet.

        Parameters
        ----------
        text : str
            Raw tweet text. Missing values (``None``, NaN, ``pd.NA``) are
            treated as an empty tweet; other non-string values are converted
            with ``str()`` before cleaning.

        Returns
        -------
        str
            The cleaned text (possibly empty).
        """
        if not isinstance(text, str):
            # Missing content must not crash the whole DataFrame.apply().
            if pd.api.types.is_scalar(text) and pd.isna(text):
                return ""
            text = str(text)

        text = text.lower()
        for name in self.CLEANING_ORDER:
            # Matches are deleted outright (not replaced by a space).
            text = self.patterns[name].sub("", text)
        return " ".join(text.split())

    def apply(self, df):
        """
        Apply cleaning to a DataFrame with 'content' column.

        Adds (or overwrites) the 'clean_text' column on ``df`` itself and
        returns that same DataFrame.
        """
        df["clean_text"] = df["content"].apply(self.clean)
        return df

## **Cell 2: Dictionary-based sentiment analysis (VADER)**

In [13]:
# - Initializes the VADER analyzer
# - Predicts sentiment for a DataFrame
# - Stores predictions for evaluation

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

class DictionarySentimentAnalyzer:
    """
    Applies a lexicon-based sentiment analysis (VADER).
    """
    def __init__(self):
        # Only hit the network when the lexicon is not already installed
        # locally, so repeated runs stay quiet and work offline.
        try:
            nltk.data.find("sentiment/vader_lexicon.zip")
        except LookupError:
            nltk.download("vader_lexicon")
        self.analyzer = SentimentIntensityAnalyzer()

    def _label(self, text):
        """Return 1 when VADER's compound score for ``text`` is >= 0, else 0."""
        score = self.analyzer.polarity_scores(text)["compound"]
        # A compound score of exactly 0 (neutral / no lexicon hits) counts as 1.
        return int(score >= 0)

    def predict(self, df):
        """
        Apply sentiment analysis to a DataFrame.

        Reads the 'clean_text' column, writes the binary labels into a new
        'dict_pred' column on ``df`` itself and returns that same DataFrame.
        """
        df["dict_pred"] = df["clean_text"].apply(self._label)
        return df

## **Cell 3: TF-IDF + Logistic Regression classifier**

In [14]:
# - Initializes TF-IDF and Logistic Regression
# - Trains on the cleaned training data
# - Predicts on test data
# - Adds the predictions to the test DataFrame as a 'tfidf_pred' column for later evaluation


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

class TfidfLogisticClassifier:
    """
    Sentiment classifier that feeds TF-IDF features into Logistic Regression.

    ``max_features`` is forwarded to the TF-IDF vectorizer and ``max_iter``
    to the Logistic Regression model.
    """
    def __init__(self, max_features=5000, max_iter=200):
        self.vectorizer, self.model = (
            TfidfVectorizer(max_features=max_features),
            LogisticRegression(max_iter=max_iter),
        )

    def train(self, df_train):
        """
        Fit the vectorizer on 'clean_text' and the model on the integer-cast
        'sentiment' column of ``df_train``.
        """
        features = self.vectorizer.fit_transform(df_train["clean_text"])
        self.model.fit(features, df_train["sentiment"].astype(int))

    def predict(self, df_test):
        """
        Vectorize 'clean_text' with the fitted vectorizer, predict, and store
        the result in a 'tfidf_pred' column on ``df_test``, which is returned.
        """
        df_test["tfidf_pred"] = self.model.predict(
            self.vectorizer.transform(df_test["clean_text"])
        )
        return df_test

## **Cell 4: RNN Classifier with Own Embeddings**

In [15]:
# - Tokenize text and pad sequences
# - Build an LSTM-based RNN model with learned embeddings
# - Handle train/validation split and early stopping
# - Train and predict; evaluation is handled separately by the EvaluationManager (Cell 8)


import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

class RNNClassifier:
    """
    RNN-based sentiment classifier with custom embeddings.

    Parameters
    ----------
    vocab_size : int
        Passed to the tokenizer as ``num_words`` and used as the embedding
        layer's ``input_dim``.
    embedding_dim : int
        Size of the learned embedding vectors.
    lstm_units : int
        Number of units in the LSTM layer.
    max_len : int
        Every sequence is padded/truncated to this many tokens.
    """
    def __init__(self, vocab_size=10000, embedding_dim=64, lstm_units=64, max_len=50):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        # Built by build_model(), which train() calls; None until then.
        self.model = None

    def preprocess(self, texts):
        """
        Tokenize and pad sequences for input to RNN.

        Sequences are padded and truncated at the end ('post') to ``max_len``.
        """
        seqs = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(seqs, maxlen=self.max_len, padding='post', truncating='post')

    def build_model(self):
        """
        Build and compile the RNN model (Embedding -> LSTM -> sigmoid Dense).

        The result is stored on ``self.model``.
        """
        model = Sequential([
            # `input_length` is intentionally omitted: Keras 3 deprecates it and
            # the sequence length is inferred from the data passed to fit().
            Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim),
            LSTM(self.lstm_units),
            Dense(1, activation="sigmoid")
        ])
        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
        self.model = model

    def train(self, df_train, val_size=0.2, epochs=10, batch_size=32):
        """
        Fit tokenizer, prepare data, train the model with early stopping.

        Parameters
        ----------
        df_train : DataFrame
            Must contain 'clean_text' and 'sentiment' columns.
        val_size : float
            Fraction of ``df_train`` held out for validation.
        epochs, batch_size : int
            Forwarded to ``model.fit``.
        """
        texts = df_train["clean_text"]
        # Plain NumPy labels keep the split/fit independent of the pandas index.
        labels = df_train["sentiment"].astype(int).to_numpy()

        # Fit tokenizer and transform data
        self.tokenizer.fit_on_texts(texts)
        X = self.preprocess(texts)

        # Split into training/validation sets
        X_train, X_val, y_train, y_val = train_test_split(
            X, labels, test_size=val_size, random_state=42
        )

        # Build and train model; stop once val_loss has not improved for 2
        # epochs and roll back to the best weights seen.
        self.build_model()
        early_stop = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
        self.model.fit(X_train, y_train, validation_data=(X_val, y_val),
                       epochs=epochs, batch_size=batch_size, callbacks=[early_stop], verbose=1)

    def predict(self, df_test):
        """
        Tokenize and predict sentiment for the test set.

        Adds an 'rnn_pred' column (1 where the predicted probability is > 0.5,
        else 0) to ``df_test`` and returns that same DataFrame.

        Raises
        ------
        RuntimeError
            If called before ``train()``.
        """
        if self.model is None:
            raise RuntimeError("RNNClassifier.predict() called before train().")
        X_test = self.preprocess(df_test["clean_text"])
        probabilities = np.asarray(self.model.predict(X_test))
        # The model emits shape (n, 1); flatten so the column is a plain 1-D vector.
        df_test["rnn_pred"] = (probabilities.reshape(-1) > 0.5).astype(int)
        return df_test

## **Cell 5: RNN Classifier with pretrained GloVe embeddings**

In [16]:
# - Tokenize and pad text sequences (shared logic)
# - Load pretrained GloVe embeddings
# - Build an RNN with the embedding layer initialized from GloVe
# - Train with validation split and early stopping
# - Predict sentiments on the test set

import os
import zipfile
import urllib.request
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

class RNNGloveClassifier:
    """
    RNN-based sentiment classifier using pretrained GloVe embeddings.

    Parameters
    ----------
    vocab_size : int
        Passed to the tokenizer as ``num_words``; also the number of rows of
        the embedding matrix.
    embedding_dim : int
        Dimensionality of the GloVe vectors; must match the vectors stored in
        the GloVe file that is loaded.
    lstm_units : int
        Number of units in the LSTM layer.
    max_len : int
        Every sequence is padded/truncated to this many tokens.
    """
    def __init__(self, vocab_size=10000, embedding_dim=100, lstm_units=64, max_len=50):
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.max_len = max_len
        self.tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
        # Both are populated during train(); None until then.
        self.model = None
        self.embedding_matrix = None

    def preprocess(self, texts):
        """
        Tokenize and pad sequences for RNN input.

        Sequences are padded and truncated at the end ('post') to ``max_len``.
        """
        seqs = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(seqs, maxlen=self.max_len, padding='post', truncating='post')

    def load_glove_embeddings(self, glove_path="glove.6B.100d.txt"):
        """
        Download (if needed) and load GloVe embeddings.

        Builds ``self.embedding_matrix`` of shape ``(vocab_size, embedding_dim)``:
        row ``i`` holds the GloVe vector of the tokenizer word with index ``i``
        when that word occurs in the file; all other rows stay zero. The
        tokenizer must already be fitted.

        Parameters
        ----------
        glove_path : str or path-like
            Text file with one ``word v1 v2 ...`` entry per line. When it does
            not exist, the glove.6B archive is downloaded to the working
            directory and the member named like ``glove_path``'s file name is
            extracted next to ``glove_path``.

        Raises
        ------
        ValueError
            If a needed vector does not have ``embedding_dim`` components.
        """
        if not os.path.exists(glove_path):
            print("Downloading GloVe embeddings...")
            url = "http://nlp.stanford.edu/data/glove.6B.zip"
            urllib.request.urlretrieve(url, "glove.6B.zip")
            # Extract the member that matches the requested file (not a
            # hard-coded one) into the directory where it is expected.
            target_dir = os.path.dirname(glove_path) or "."
            with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
                zip_ref.extract(os.path.basename(glove_path), path=target_dir)

        # Only words that can actually be emitted by the tokenizer need a row.
        wanted_rows = {
            word: index
            for word, index in self.tokenizer.word_index.items()
            if index < self.vocab_size
        }

        embedding_matrix = np.zeros((self.vocab_size, self.embedding_dim))
        with open(glove_path, encoding='utf8') as f:
            for line in f:
                values = line.split()
                if not values:
                    continue  # tolerate blank lines
                row = wanted_rows.get(values[0])
                if row is None:
                    # Skip parsing vectors that would never be used; avoids
                    # holding the whole GloVe vocabulary in memory.
                    continue
                coeffs = np.asarray(values[1:], dtype='float32')
                if coeffs.shape[0] != self.embedding_dim:
                    raise ValueError(
                        f"GloVe vector for {values[0]!r} has {coeffs.shape[0]} dimensions, "
                        f"expected embedding_dim={self.embedding_dim}"
                    )
                embedding_matrix[row] = coeffs
        self.embedding_matrix = embedding_matrix

    def build_model(self):
        """
        Build and compile the RNN model with frozen pretrained embeddings.

        Raises
        ------
        RuntimeError
            If ``load_glove_embeddings()`` has not been called yet.
        """
        if self.embedding_matrix is None:
            raise RuntimeError("load_glove_embeddings() must be called before build_model().")
        model = Sequential([
            # `input_length` is intentionally omitted: Keras 3 deprecates it and
            # the sequence length is inferred from the data passed to fit().
            Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim,
                      weights=[self.embedding_matrix], trainable=False),
            LSTM(self.lstm_units),
            Dropout(0.3),
            Dense(1, activation="sigmoid")
        ])
        model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
        self.model = model

    def train(self, df_train, val_size=0.2, epochs=10, batch_size=32, glove_path=None):
        """
        Fit tokenizer, build embedding matrix, and train model.

        Parameters
        ----------
        df_train : DataFrame
            Must contain 'clean_text' and 'sentiment' columns.
        val_size : float
            Fraction of ``df_train`` held out for validation.
        epochs, batch_size : int
            Forwarded to ``model.fit``.
        glove_path : str or path-like, optional
            GloVe file to load. Defaults to ``glove.6B.<embedding_dim>d.txt``
            so that the file matches the configured embedding size.
        """
        texts = df_train["clean_text"]
        # Plain NumPy labels keep the split/fit independent of the pandas index.
        labels = df_train["sentiment"].astype(int).to_numpy()

        # Fit tokenizer
        self.tokenizer.fit_on_texts(texts)
        X = self.preprocess(texts)

        # Load GloVe embeddings
        if glove_path is None:
            glove_path = f"glove.6B.{self.embedding_dim}d.txt"
        self.load_glove_embeddings(glove_path)

        # Split train/validation
        X_train, X_val, y_train, y_val = train_test_split(
            X, labels, test_size=val_size, random_state=42
        )

        # Build and train model; stop once val_loss has not improved for 2
        # epochs and roll back to the best weights seen.
        self.build_model()
        early_stop = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
        self.model.fit(X_train, y_train, validation_data=(X_val, y_val),
                       epochs=epochs, batch_size=batch_size, callbacks=[early_stop], verbose=1)

    def predict(self, df_test):
        """
        Predict sentiments on test set.

        Adds an 'rnn_glove_pred' column (1 where the predicted probability is
        > 0.5, else 0) to ``df_test`` and returns that same DataFrame.

        Raises
        ------
        RuntimeError
            If called before ``train()``.
        """
        if self.model is None:
            raise RuntimeError("RNNGloveClassifier.predict() called before train().")
        X_test = self.preprocess(df_test["clean_text"])
        probabilities = np.asarray(self.model.predict(X_test))
        # The model emits shape (n, 1); flatten so the column is a plain 1-D vector.
        df_test["rnn_glove_pred"] = (probabilities.reshape(-1) > 0.5).astype(int)
        return df_test

## **Cell 6: Huggingface Transformer Pipeline (pre-trained sentiment model)**

In [17]:
# - Load a pre-trained Huggingface model for sentiment analysis (e.g., DistilBERT)
# - Use the pipeline() API to generate predictions
# - Map results to binary sentiment labels
# - Attach predictions to the test DataFrame


from transformers import pipeline

class TransformerPipelineSentiment:
    """
    Zero-shot use of a pre-trained Huggingface sentiment model: the pipeline
    is applied as-is, with no fine-tuning on the tweet data.
    """
    def __init__(self, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
        self.model_name = model_name
        self.classifier = pipeline("sentiment-analysis", model=self.model_name)

    def predict(self, df_test, batch_size=32):
        """
        Run the pipeline over the 'clean_text' column of ``df_test``.

        Each result whose label equals "POSITIVE" (case-insensitively) becomes
        1, anything else 0. The labels are written to a 'transformer_pred'
        column on ``df_test``, which is then returned.
        """
        outputs = self.classifier(
            df_test["clean_text"].tolist(),
            truncation=True,
            padding=True,
            batch_size=batch_size,
        )
        df_test["transformer_pred"] = [
            int(item["label"].upper() == "POSITIVE") for item in outputs
        ]
        return df_test

## **Cell 7: Fine-tune DistilBERT Transformer (subclass of the pipeline class, fine-tuned with the Huggingface Trainer API)**

In [18]:
# - Fine-tune a DistilBERT transformer for binary sentiment classification.
# - Tokenize train and test datasets for Huggingface Trainer API.
# - Train for 1 epoch with evaluation metrics (Accuracy, Precision, Recall, F1).
# - Store a trained Trainer object for predictions.
# - Add a finetuned_pred column to the test DataFrame for results aggregation.



from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

class DistilBERTFineTuner(TransformerPipelineSentiment):
    """
    Inherits from TransformerPipelineSentiment.
    Adds fine-tuning capability using Huggingface Trainer API.

    Parameters
    ----------
    model_name : str
        Checkpoint used for both the tokenizer and the classification model.
    num_labels : int
        Number of output classes of the classification head.
    """
    def __init__(self, model_name="distilbert-base-uncased", num_labels=2):
        # Deliberately NOT calling super().__init__(): the parent constructor
        # builds an inference pipeline, which for this base checkpoint means
        # loading a second copy of the model with an untrained classification
        # head that is never used (predict() is overridden below).
        self.model_name = model_name
        self.classifier = None
        self.tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
        self.model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        # Created by train(); None until then.
        self.trainer = None

    def _compute_metrics(self, pred):
        """Compute accuracy, precision, recall and F1 from a Trainer prediction object."""
        labels = pred.label_ids
        preds = np.argmax(pred.predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    def _build_dataset(self, df, require_labels=False):
        """
        Tokenize 'clean_text' and wrap the encodings in a ``Dataset``.

        A 'labels' field (integer-cast 'sentiment') is included whenever the
        column exists; with ``require_labels=True`` a missing column raises
        ``KeyError`` instead of being skipped.
        """
        encodings = self.tokenizer(df["clean_text"].tolist(), truncation=True, padding=True)
        columns = {
            "input_ids": encodings["input_ids"],
            "attention_mask": encodings["attention_mask"],
        }
        if require_labels or "sentiment" in df.columns:
            columns["labels"] = df["sentiment"].astype(int).tolist()
        return Dataset.from_dict(columns)

    def train(self, df_train, df_test=None, epochs=1):
        """
        Fine-tune the transformer on training data.

        Parameters
        ----------
        df_train : DataFrame
            Must contain 'clean_text' and 'sentiment' columns.
        df_test : DataFrame, optional
            Registered with the Trainer as its evaluation dataset.
            NOTE(review): no evaluation strategy is configured below, so the
            Trainer presumably never evaluates on it during training - confirm
            against the installed transformers version.
        epochs : int
            Number of training epochs.
        """
        train_dataset = self._build_dataset(df_train, require_labels=True)
        eval_dataset = self._build_dataset(df_test) if df_test is not None else None

        training_args = TrainingArguments(
            output_dir="./results",
            num_train_epochs=epochs,
            per_device_train_batch_size=16,
            per_device_eval_batch_size=64,
            save_strategy="no",
            logging_dir="./logs",
            logging_steps=50,
            seed=42
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=self._compute_metrics
        )

        self.trainer.train()

    def predict(self, df_test):
        """
        Use the fine-tuned model for predictions.

        Adds a 'finetuned_pred' column (arg-max class per row) to ``df_test``
        and returns that same DataFrame. A 'sentiment' column is not required;
        when present it is passed along as labels.

        Raises
        ------
        RuntimeError
            If called before ``train()``.
        """
        if self.trainer is None:
            raise RuntimeError("DistilBERTFineTuner.predict() called before train().")
        test_dataset = self._build_dataset(df_test)
        logits = self.trainer.predict(test_dataset).predictions
        df_test["finetuned_pred"] = np.argmax(logits, axis=1)
        return df_test

## **Cell 8: Evaluate all models**

In [19]:
# - Accept predictions from multiple models
# - Compute binary classification metrics (Accuracy, Precision, Recall, F1)
# - Store results for each model
# - Generate a summary DataFrame (for Cell 9)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

class EvaluationManager:
    """
    Collects predictions from multiple models and computes evaluation metrics.

    ``self.results`` maps each model name to a dict holding the metrics listed
    in ``METRIC_NAMES``.
    """
    # Column order of the (possibly empty) summary table.
    METRIC_NAMES = ("accuracy", "precision", "recall", "f1")

    def __init__(self):
        self.results = {}

    def add_results(self, model_name, y_true, y_pred):
        """
        Adds evaluation metrics for a given model.

        Computes accuracy, precision, recall and F1 for ``y_pred`` against
        ``y_true`` and stores them under ``model_name``; an existing entry
        with the same name is overwritten.
        """
        metrics = {
            "accuracy": accuracy_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred)
        }
        self.results[model_name] = metrics

    def get_summary(self, sort_by="f1"):
        """
        Returns a sorted DataFrame of all model results.

        One row per model, one column per metric, values rounded to 4 decimals
        and rows ordered by ``sort_by`` from best to worst. When no results
        have been added yet, an empty DataFrame with the metric columns is
        returned instead of raising.
        """
        if not self.results:
            return pd.DataFrame(columns=list(self.METRIC_NAMES))
        # DataFrame(dict-of-dicts) puts models in columns; transpose to get
        # one row per model.
        df_results = pd.DataFrame(self.results).T.round(4)
        return df_results.sort_values(sort_by, ascending=False)

## **Main Cell: Execute Entire Sentiment Analysis Pipeline**

In [20]:
# - Calls all classes and methods from previous cells  
# - Loads data, preprocesses text  
# - Runs all models (Dictionary, TF-IDF, RNNs, Transformers)  
# - Fine-tunes **DistilBERT** using Huggingface Trainer  
# - Evaluates all approaches and generates a summary table

# === Cell 0 & 1: read both datasets, then add the cleaned-text column ===
data_handler = DataHandler("btc_tweets_train.parquet.gzip", "btc_tweets_test.parquet.gzip")
df_train, df_test = data_handler.load_data()

preprocessor = TweetPreprocessor()
df_train = preprocessor.apply(df_train)
df_test = preprocessor.apply(df_test)

# === Cell 2: lexicon baseline (VADER) - needs no training ===
dict_analyzer = DictionarySentimentAnalyzer()
df_test = dict_analyzer.predict(df_test)

# === Cell 3: TF-IDF features + Logistic Regression ===
tfidf_clf = TfidfLogisticClassifier()
tfidf_clf.train(df_train)
df_test = tfidf_clf.predict(df_test)

# === Cell 4: LSTM with embeddings learned from scratch ===
rnn_clf = RNNClassifier()
rnn_clf.train(df_train)
df_test = rnn_clf.predict(df_test)

# === Cell 5: LSTM on top of frozen GloVe embeddings ===
rnn_glove_clf = RNNGloveClassifier()
rnn_glove_clf.train(df_train)
df_test = rnn_glove_clf.predict(df_test)

# === Cell 6: off-the-shelf Huggingface sentiment pipeline ===
transformer_pipe = TransformerPipelineSentiment()
df_test = transformer_pipe.predict(df_test)

# === Cell 7: DistilBERT fine-tuned on the training tweets ===
distilbert_finetuner = DistilBERTFineTuner()
distilbert_finetuner.train(df_train, df_test)
df_test = distilbert_finetuner.predict(df_test)

# === Cell 8: score every prediction column against the true labels ===
# Display name of each model -> column of df_test that holds its predictions.
prediction_columns = {
    "VADER Lexicon": "dict_pred",
    "TF-IDF + Logistic": "tfidf_pred",
    "RNN Custom Emb": "rnn_pred",
    "RNN GloVe": "rnn_glove_pred",
    "Huggingface Pipeline": "transformer_pred",
    "DistilBERT Fine-Tuned": "finetuned_pred",
}

eval_mgr = EvaluationManager()
for model_label, pred_col in prediction_columns.items():
    eval_mgr.add_results(model_label, df_test["sentiment"], df_test[pred_col])

# === Final summary table ===
final_results = eval_mgr.get_summary()
print("Final Sentiment Model Performance Summary (ordered by F1 score):")
display(final_results)


Train shape: (1500, 5), Test shape: (500, 5)
Epoch 1/10


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/claudio/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
E0000 00:00:1754084618.746520 9028013 meta_optimizer.cc:967] PluggableGraphOptimizer failed: INVALID_ARGUMENT: Failed to deserialize the `graph_buf`.


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.7956 - loss: 0.5589 - val_accuracy: 0.8367 - val_loss: 0.4500
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7923 - loss: 0.5005 - val_accuracy: 0.8367 - val_loss: 0.4484
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8054 - loss: 0.4877 - val_accuracy: 0.8367 - val_loss: 0.4321
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.8554 - loss: 0.3965 - val_accuracy: 0.8367 - val_loss: 0.4806
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9043 - loss: 0.3147 - val_accuracy: 0.8400 - val_loss: 0.5444
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
Epoch 1/10




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.7777 - loss: 0.6319 - val_accuracy: 0.8267 - val_loss: 0.4609
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.7920 - loss: 0.4943 - val_accuracy: 0.8200 - val_loss: 0.4519
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8078 - loss: 0.4776 - val_accuracy: 0.8200 - val_loss: 0.4281
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8161 - loss: 0.4249 - val_accuracy: 0.8233 - val_loss: 0.3981
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8246 - loss: 0.3912 - val_accuracy: 0.8167 - val_loss: 0.3987
Epoch 6/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8385 - loss: 0.3968 - val_accuracy: 0.8100 - val_loss: 0.3982
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

Device set to use mps:0
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use mps:0
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
50,0.4702




Final Sentiment Model Performance Summary (ordered by F1 score):


Unnamed: 0,accuracy,precision,recall,f1
DistilBERT Fine-Tuned,0.866,0.877,0.9703,0.9213
RNN GloVe,0.828,0.8472,0.9604,0.9002
RNN Custom Emb,0.818,0.838,0.9604,0.895
TF-IDF + Logistic,0.81,0.8109,0.9975,0.8946
VADER Lexicon,0.824,0.8744,0.9134,0.8935
Huggingface Pipeline,0.452,0.9514,0.3391,0.5
