# npr MC2: Sentiment Analysis

In [None]:
import os

# Seed shared by the random number generators seeded further down.
SEED = 1337

# Development mode: `DEV_SET_FRAC` is the fraction of each split that is sampled,
# and `OVERWRITE_SETS_WITH_DEV` decides whether those samples replace the full splits.
DEV_SET_FRAC = 1e-3
OVERWRITE_SETS_WITH_DEV = True

# Weights & Biases destination used when the training run reports to wandb.
os.environ.update(
    WANDB_ENTITY="lang-based-yappers",
    WANDB_PROJECT="amazon_sentiment_analysis",
)

In [None]:
import random

import numpy as np
import torch

# cuBLAS only provides deterministic kernels when this workspace setting is present
# (CUDA >= 10.2). Without it, enabling deterministic algorithms makes CUDA matrix
# multiplications raise a RuntimeError. It must be set before CUDA is initialised;
# `setdefault` keeps any value the user exported themselves.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

# Fail loudly whenever an operation without a deterministic implementation is used.
torch.use_deterministic_algorithms(True)

# Pick the best available accelerator: CUDA GPU first, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('Device:', device)

def set_seed(seed=None):
    """Seed every random number generator used in this notebook.

    :param seed: Seed value to apply. When omitted, the notebook-wide ``SEED``
        constant is used, so the zero-argument call keeps its previous meaning.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # manual_seed_all seeds every GPU, which makes a separate
        # torch.cuda.manual_seed call for the current device redundant.
        torch.cuda.manual_seed_all(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all random number generators before any sampling or model initialisation below.
set_seed()

# Loading the Dataset

In [None]:
from datasets import load_dataset

# Load the "amazon_polarity" dataset via the Hugging Face `datasets` library.
dataset = load_dataset("amazon_polarity")
# Display the loaded object (the 'train' and 'test' splits are used below).
dataset

In [None]:
# Keep each split twice: as a Hugging Face dataset and as a pandas DataFrame.
train_ds = dataset['train']
test_ds = dataset['test']
train_df = train_ds.to_pandas()
test_df = test_ds.to_pandas()

In [None]:
# List the distinct label values present in the training split.
train_df['label'].unique()

In [None]:
# Summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Summary of the test frame (columns, dtypes, non-null counts).
test_df.info()

In [None]:
# Draw reproducible random development samples from both splits, using the
# notebook-wide seed instead of a repeated literal.
train_df_dev = train_df.sample(frac=DEV_SET_FRAC, random_state=SEED)
test_df_dev = test_df.sample(frac=DEV_SET_FRAC, random_state=SEED)

if OVERWRITE_SETS_WITH_DEV:
    train_df = train_df_dev
    test_df = test_df_dev

    # Select exactly the sampled rows so the datasets and DataFrames stay in sync.
    # Selecting `range(len(...))` would take the first N rows instead, which is a
    # different set of reviews than the random sample. The frames come straight
    # from `to_pandas()`, so their index labels are row positions in the datasets.
    # NOTE(review): this cell overwrites its own inputs, so it should run once per
    # kernel session, directly after the splits were loaded.
    train_ds = train_ds.select(train_df_dev.index.tolist())
    test_ds = test_ds.select(test_df_dev.index.tolist())

print(f"Training data shape: {train_df_dev.shape}, Testing data shape: {test_df_dev.shape}")

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many training reviews carry each label.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='label', data=train_df, ax=ax)
ax.set_title('Label Distribution in Training Data')
ax.set_xlabel('Label')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Character count of every review, stored next to the text.
train_df['review_length'] = train_df['content'].map(len)

# Histogram (with density estimate) of the review lengths.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(train_df['review_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Lengths in Training Data')
ax.set_xlabel('Review Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def _top_word_counts(texts, top_n):
    """Return the `top_n` most frequent non-stop-words in `texts` as (word, count) pairs."""
    if len(texts) == 0:
        # CountVectorizer cannot build a vocabulary from nothing; report "no words".
        return []

    vectorizer = CountVectorizer(stop_words='english')
    # Column sums of the document-term matrix = total occurrences per word.
    counts = np.asarray(vectorizer.fit_transform(texts).sum(axis=0)).ravel()
    vocabulary = vectorizer.get_feature_names_out()

    # Stable descending sort: ties keep vocabulary order, like a stable reversed sort.
    top_positions = np.argsort(-counts, kind='stable')[:top_n]
    return [(str(vocabulary[pos]), int(counts[pos])) for pos in top_positions]


def _plot_word_counts(word_counts, title):
    """Draw one bar chart for (word, count) pairs; print a notice when there are none."""
    if not word_counts:
        print(f"{title}: no reviews available, skipping plot.")
        return

    words, freq = zip(*word_counts)
    plt.figure(figsize=(10, 5))
    plt.bar(words, freq)
    plt.title(title)
    plt.xticks(rotation=90)
    plt.show()


def plot_most_common_words(df, top_n=20):
    """Plot the most frequent words of positive and of negative reviews.

    English stop words are ignored. One bar chart is shown per sentiment class;
    a class without any reviews is skipped with a printed notice instead of
    raising an error.

    :param df: DataFrame with a `label` column (1 = positive, 0 = negative)
        and a `content` column holding the review text.
    :param top_n: Number of words shown per chart.
    """
    charts = (
        (1, 'Most common words in positive reviews'),
        (0, 'Most common words in negative reviews'),
    )
    for label, title in charts:
        reviews = df.loc[df['label'] == label, 'content']
        _plot_word_counts(_top_word_counts(reviews, top_n), title)
    
# Uses the dev sample rather than `train_df` — presumably to keep the word counting fast.
plot_most_common_words(train_df_dev)

In [None]:
from wordcloud import WordCloud, STOPWORDS

def generate_word_cloud(text, title):
    """Render a word cloud for `text` and show it with `title`.

    :param text: One string containing all the text to visualise.
    :param title: Title drawn above the word cloud.
    """
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=set(STOPWORDS),
                          min_font_size=10).generate(text)

    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    # Set the title before tight_layout: the layout only reserves space for
    # decorations that already exist, and with pad=0 a later title has no room.
    plt.title(title)
    plt.tight_layout(pad=0)
    plt.show()

# Join all positive dev-sample reviews into one document and draw its word cloud.
pos_reviews_text = " ".join(train_df_dev.loc[train_df_dev['label'] == 1, 'content'])
generate_word_cloud(pos_reviews_text, "Word Cloud for Positive Reviews")

In [None]:
# Join all negative dev-sample reviews into one document and draw its word cloud.
neg_reviews_text = " ".join(train_df_dev.loc[train_df_dev['label'] == 0, 'content'])
generate_word_cloud(neg_reviews_text, "Word Cloud for Negative Reviews")

# Fine-tuning

## Pretrained Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# The notebook only uses the labels 0 (negative) and 1 (positive), so the
# classification head gets two outputs.
# TODO(review): an earlier note said the data has 3 labels (positive, neutral,
# negative); confirm and raise NUM_LABELS if a neutral class is ever added.
NUM_LABELS = 2

# Use the local Hugging Face cache. `force_download=True` fetched the files again
# on every run, which is slow and breaks offline execution.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# NOTE(review): this checkpoint is a sentence-embedding model, so the classification
# head is presumably freshly initialised and only becomes useful after fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

In [None]:
# Count the parameters that receive gradient updates. `numel()` gives the element
# count of each tensor directly as a Python int.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model has {params} trainable parameters.")

## Creation of Stratified Subsets

In [None]:
from datasets import Dataset
from sklearn.model_selection import train_test_split
import pandas as pd

def create_cumulative_stratified_subsets(df, percentages, label_column, as_datasets=True):
    """
    Creates cumulative, stratified subsets of a dataset based on increasing percentages,
    ensuring each subset is a superset of the previous ones.

    :param df: pandas DataFrame to be split. Rows are tracked by index label, so the
        index is assumed to be unique (TODO confirm for frames with a custom index).
    :param percentages: Non-empty, increasing list of percentages for subset sizes;
        the last entry must be 100.
    :param label_column: Name of the column for stratification.
    :param as_datasets: If True (default), every subset is converted to a Hugging Face
        ``Dataset``; if False, the subsets are returned as pandas DataFrames.
    :return: Dictionary with percentages as keys and the subsets as values. A percentage
        that would not add any new row is skipped and has no key in the result.
    :raises ValueError: If ``df`` is not a DataFrame, or ``percentages`` is empty or
        does not end with 100.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("The input must be a pandas DataFrame.")

    # An empty list used to fail with an IndexError on `percentages[-1]`.
    if len(percentages) == 0 or percentages[-1] != 100:
        raise ValueError("The last percentage must be 100 to ensure full dataset coverage.")

    subsets = {}
    cumulative_subset_indices = set()

    for percentage in percentages:
        # Multiply before dividing: `percentage / 100` alone is often not exactly
        # representable (int(100 * 0.57) == 56), which silently dropped a row.
        target_subset_size = int(len(df) * percentage / 100)
        additional_rows_needed = target_subset_size - len(cumulative_subset_indices)

        if additional_rows_needed <= 0:
            continue  # Skip if no new rows need to be added (defensive, should not happen)

        remaining_data = df.loc[~df.index.isin(cumulative_subset_indices)]
        if additional_rows_needed >= len(remaining_data):
            new_subset = remaining_data
        else:
            # An absolute row count avoids the float round-trip of a fractional
            # `test_size`, which can come out one row larger than requested.
            _, new_subset = train_test_split(
                remaining_data,
                test_size=additional_rows_needed,
                stratify=remaining_data[label_column],
                random_state=SEED,
            )

        cumulative_subset_indices.update(new_subset.index)
        subsets[percentage] = df.loc[sorted(cumulative_subset_indices)]

    if as_datasets:
        # Build a new mapping instead of reassigning entries while iterating.
        return {percentage: Dataset.from_pandas(subset) for percentage, subset in subsets.items()}

    return subsets

# Build the nested 10 %, 20 %, ..., 100 % training subsets and report their sizes.
subset_percentages = list(range(10, 101, 10))
train_ds_subsets = create_cumulative_stratified_subsets(
    train_df, subset_percentages, 'label', as_datasets=True
)
for percentage, subset in train_ds_subsets.items():
    print(f"Subset size for {percentage}%: {len(subset)}, type: {type(subset)}")

## Training

In [None]:
from transformers import TrainingArguments

training_epochs = 3
training_batch_size = 16
# Log about once per epoch. The floor division yields 0 when there are fewer rows
# than one batch, and a step-based logging interval of 0 is rejected.
logging_steps = max(1, len(train_df) // training_batch_size)

# Do not write checkpoints into a directory named exactly like the hub model id.
# Once that directory exists, `from_pretrained(MODEL_NAME)` resolves to the local
# folder instead of the hub and fails on the next fresh run.
checkpoint_dir = os.path.join("checkpoints", MODEL_NAME.replace("/", "__"))

# TODO: Use hyperparams for fine-tuning stated on https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(output_dir=checkpoint_dir,
                                  num_train_epochs=training_epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=training_batch_size,
                                  per_device_eval_batch_size=training_batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  report_to="wandb",
                                  run_name="amazon_sentiment_analysis",
                                  optim="adamw_torch",
                                  # The Trainer re-seeds from this value, so tie it
                                  # to the notebook seed instead of the default.
                                  seed=SEED,
                                  )

In [None]:
def tokenize(batch, max_length=512):
    """Tokenize the `content` entries of a batch with the notebook's tokenizer.

    :param batch: Mapping with a `content` entry holding the review texts.
    :param max_length: Maximum number of tokens per review; longer reviews are truncated.
    :return: The tokenizer output for the batch (padding enabled).
    """
    encode_options = dict(padding=True, truncation=True, max_length=max_length)
    return tokenizer(batch['content'], **encode_options)

def tokenize_dataset(dataset, max_size=100, process_batch_size=100, batched=True):
    """Tokenize every row of a Hugging Face dataset via `Dataset.map`.

    :param dataset: Hugging Face ``Dataset`` to tokenize.
    :param max_size: Value passed to `tokenize` as its `max_length`.
    :param process_batch_size: `batch_size` passed to `Dataset.map`.
    :param batched: `batched` flag passed to `Dataset.map`.
    :return: The result of `dataset.map` with the tokenizer output added.
    :raises ValueError: If `dataset` is not a ``Dataset`` instance.
    """
    if isinstance(dataset, Dataset):
        def _tokenize_batch(batch):
            return tokenize(batch, max_size)

        return dataset.map(_tokenize_batch, batched=batched, batch_size=process_batch_size)

    raise ValueError("The dataset must be a huggingface Dataset object.")

# Tokenize both splits with the default settings of `tokenize_dataset`.
train_ds_tokenized, test_ds_tokenized = (
    tokenize_dataset(split) for split in (train_ds, test_ds)
)

## Metrics

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    :param pred: Object exposing `label_ids` (true labels) and `predictions`
        (scores whose argmax over the last axis is taken as the predicted class).
    :return: Dictionary with the keys ``accuracy`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from transformers import Trainer, TrainerCallback

class EpochResultsCallback(TrainerCallback):
    """A custom callback to capture the metrics of every evaluation run.

    Each evaluation is stored as one record; `results_df` exposes all records as a
    DataFrame with the columns 'Epoch', 'Validation Loss', 'Accuracy' and 'F1'.
    """

    _COLUMNS = ['Epoch', 'Validation Loss', 'Accuracy', 'F1']

    def __init__(self):
        # Plain list of dicts. The DataFrame is built on demand instead of being
        # re-created with pd.concat on every evaluation.
        self._records = []

    @property
    def results_df(self):
        """All recorded evaluations, one row per evaluation in the order they ran."""
        return pd.DataFrame(self._records, columns=self._COLUMNS)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record the metrics passed to this evaluation hook.

        Missing metrics (or `metrics=None`) are stored as None rather than raising.
        """
        metrics = metrics or {}
        self._records.append({
            'Epoch': state.epoch,
            'Validation Loss': metrics.get('eval_loss'),
            'Accuracy': metrics.get('eval_accuracy'),
            'F1': metrics.get('eval_f1'),
        })


def fine_tune_model(model, training_args, train_dataset, eval_dataset, tokenizer):
    """Fine-tune `model` with a Hugging Face Trainer and return the recorded metrics.

    The model is evaluated once before training starts and then trained; every
    evaluation is captured by an `EpochResultsCallback`.

    :param model: Model to fine-tune.
    :param training_args: `TrainingArguments` for the Trainer.
    :param train_dataset: Tokenized training dataset.
    :param eval_dataset: Tokenized evaluation dataset.
    :param tokenizer: Tokenizer handed to the Trainer.
    :return: The callback's `results_df` with the collected evaluation metrics.
    """
    results_recorder = EpochResultsCallback()
    trainer_options = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[results_recorder],
    )
    trainer = Trainer(**trainer_options)

    # Baseline evaluation before any training step.
    trainer.evaluate()

    # Release cached GPU memory before training starts.
    torch.cuda.empty_cache()
    trainer.train()

    return results_recorder.results_df

# Fine-tune on the tokenized training split and evaluate on the tokenized test split.
eval_df = fine_tune_model(model, training_args, train_ds_tokenized, test_ds_tokenized, tokenizer)

In [None]:
# Show the metrics table returned by `fine_tune_model`.
eval_df

In [None]:
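# ## TODO (WIP, currently broken): extract the embeddings and predictions from the model and add them to the dataset so we can inspect them in Phoenix.
# NOTE(review): likely causes of the breakage — `out.hidden_states` is only populated when the model is called with `output_hidden_states=True`,
# and `train_ds` / `test_ds` are the untokenized splits, so they presumably lack `input_ids` / `attention_mask` (the tokenized versions are `train_ds_tokenized` / `test_ds_tokenized`). Confirm before re-enabling.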
# def postprocess(batch):
#     inputs = {k:v.to(device) for k,v in batch.items() if k in tokenizer.model_input_names}
#     with torch.no_grad():
#         out = model(**inputs)                         # Extract prediction labels
#         pred_label = out.logits.argmax(dim=1)           # Extract embedding vectors
#         hidden_states = torch.stack(out.hidden_states)  # (layer_#, batch_size, seq_length/or/num_tokens, hidden_size)
#         embeddings = hidden_states[-1][:,0,:]           # Select last layer, then CLS token vector
#     return {"text_vector": embeddings.cpu().numpy(), "pred_label": pred_label.cpu().numpy()}
# 
# batch_size = 100
# 
# 
# train_ds.set_format("torch", columns=["input_ids", "attention_mask"])
# train_ds = train_ds.map(postprocess, batched=True, batch_size=batch_size)
# 
# test_ds.set_format("torch", columns=["input_ids", "attention_mask"])
# test_ds = test_ds.map(postprocess, batched=True, batch_size=batch_size)

In [None]:
# def postprocess(batch, model, tokenizer, device):
#     inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
#     
#     with torch.no_grad():
#         out = model(**inputs)
#         pred_label = out.logits.argmax(dim=1)
#         hidden_states = torch.stack(out.hidden_states)
#         embeddings = hidden_states[-1][:, 0, :]
#     
#     return {"text_vector": embeddings.cpu().numpy(), "pred_label": pred_label.cpu().numpy()}
# 
# def apply_postprocess_to_dataset(dataset, model, tokenizer, device, batch_size):
#     dataset.set_format("torch", columns=["input_ids", "attention_mask"])
#     return dataset.map(lambda batch: postprocess(batch, model, tokenizer, device),
#                        batched=True, batch_size=batch_size)
# 
# batch_size = 100
# train_ds = apply_postprocess_to_dataset(train_ds, model, tokenizer, device, batch_size)
# test_ds = apply_postprocess_to_dataset(test_ds, model, tokenizer, device, batch_size)