### Install some libraries


In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate

Collecting datasets
  Downloading datasets-3.5.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.1-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

In [2]:
## This code here just makes it so you don't need an API
## key for Weights and Biases. Just run it, and you're good.
import pandas as pd
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"
import datasets
from datasets import Dataset, DatasetDict

## Part 2: Loading and processing the data
Loading the kaggle dataset from our GitHub, then modifying it and turning it into a pandas dataframe

In [3]:
! curl -O https://raw.githubusercontent.com/gaylorav/NLPFinal/main/bg_descriptions_v2.csv
! ls

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3230k  100 3230k    0     0  6667k      0 --:--:-- --:--:-- --:--:-- 6660k
bg_descriptions_v2.csv	sample_data


In [4]:
#read the csv into a pandas df, then modify sentiment to work with BERT
descriptions_df = pd.read_csv("bg_descriptions_v2.csv", on_bad_lines="skip")
#print(len(descriptions_df[descriptions_df["average"]<5.5]))
#print(len(descriptions_df[descriptions_df["sentiment"]==2]))
#turning the dataframe into a huggingface dataset with the correct column names so that it works with the code below
descriptions_df=descriptions_df[["description","sentiment"]]
descriptions_df = descriptions_df.rename(columns={'description': 'text', 'sentiment': 'label'})
descriptions_ds = Dataset.from_pandas(descriptions_df)
print(descriptions_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 2600
})


80/20 train/test split

In [5]:
#train_ds = descriptions_ds.shuffle(seed=42).select([i for i in list(range(12000))])
#test_ds = descriptions_ds.shuffle(seed=42).select([i for i in list(range(3000))])
split=descriptions_ds.train_test_split(test_size=0.2)
train_ds = split["train"]
test_ds = split["test"]

Check

In [6]:
print("Text:", train_ds[0]["text"])
print("Label:", train_ds[0]["label"])


Text: A sheepdog's life is never easy... Especially when the shepherd is half asleep... The first player to bring 5 sheep in his sheepfold win the game. At his turn, each player flip one card and depending on the picture on the card, every players must act and grab a counter or not. The quickest player can obtain a sheep while those who make mistakes can lose one... 
Label: 0


### Tokenization

Tokenize so that it works with the `distilbert-base-uncased` model.

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer helper function

In [8]:
import torch

def preprocess_function(examples):
    # Check if text is empty
    for i, text in enumerate(examples["text"]):
        if not text:
            print(f"Warning: Empty text at index {i}")
            examples["text"][i] = " "
    # Tokenize
    tokenized_output = tokenizer(examples["text"], padding=True, truncation=True)
    return tokenized_output


Tokenizing train and test sets

In [9]:
# (removing batched=True)
tokenized_train = train_ds.map(preprocess_function)
tokenized_test = test_ds.map(preprocess_function)



Map:   0%|          | 0/2080 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Checking the tokenized data

In [10]:
print(tokenized_train)
print("Text:", tokenized_train[0]["text"])
print("Label:", tokenized_train[0]["label"])
print(type(tokenized_train["label"]))
print("Input IDs:", tokenized_train[0]["input_ids"])
print("Attention Mask:", tokenized_train[0]["attention_mask"])

# Convert token IDs back to tokens
tokens = tokenizer.convert_ids_to_tokens(tokenized_train[0]["input_ids"])
print("Tokenized text:", tokens)

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2080
})
Text: A sheepdog's life is never easy... Especially when the shepherd is half asleep... The first player to bring 5 sheep in his sheepfold win the game. At his turn, each player flip one card and depending on the picture on the card, every players must act and grab a counter or not. The quickest player can obtain a sheep while those who make mistakes can lose one... 
Label: 0
<class 'list'>
Input IDs: [101, 1037, 8351, 16168, 1005, 1055, 2166, 2003, 2196, 3733, 1012, 1012, 1012, 2926, 2043, 1996, 11133, 2003, 2431, 6680, 1012, 1012, 1012, 1996, 2034, 2447, 2000, 3288, 1019, 8351, 1999, 2010, 8351, 10371, 2663, 1996, 2208, 1012, 2012, 2010, 2735, 1010, 2169, 2447, 11238, 2028, 4003, 1998, 5834, 2006, 1996, 3861, 2006, 1996, 4003, 1010, 2296, 2867, 2442, 2552, 1998, 6723, 1037, 4675, 2030, 2025, 1012, 1996, 4248, 4355, 2447, 2064, 6855, 1037, 8351, 2096, 2216, 2040, 2191, 12051, 2064, 4558, 20

Getting the data collator so we can have PyTorch tensors for training with DistilBERT

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Part 3: Setting up the training (a.k.a. the fine-tuning)
Downloading the model

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the function to compute our metrics: accuracy, and f1, as well as precision, recall, and f1 for each label

In [13]:
import numpy as np
from evaluate import load
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    load_accuracy = load("accuracy")
    load_f1 = load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]

    # Calculate precision, recall, and F1-score for each class
    precision, recall, f1_score, support = precision_recall_fscore_support(
        labels, predictions, average=None, labels=[0, 1]
    )

    # Create a dictionary to store the results
    metrics = {
        "accuracy": accuracy,
        "f1": f1,
    }

    # Add precision and recall for each class to the dictionary
    for i, label in enumerate([0, 1]):
        metrics[f"precision_class_{label}"] = precision[i]
        metrics[f"recall_class_{label}"] = recall[i]
       # metrics[f"f1_score_class_{label}"] = f1_score[i] # You already have weighted f1

    return metrics

Kept the google drive stuff to decrease the risk of breaking anything

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
#training details
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/nlp_proj",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"
)
#getting PyTorch tensors and inputting arguments for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


## Part 4: Training

Takes about 50 minutes on the full dataset (15000 reviews)

In [16]:
trainer.train()

Step,Training Loss
500,0.2448


TrainOutput(global_step=650, training_loss=0.2067971185537485, metrics={'train_runtime': 524.8836, 'train_samples_per_second': 19.814, 'train_steps_per_second': 1.238, 'total_flos': 1331591268533568.0, 'train_loss': 0.2067971185537485, 'epoch': 5.0})

## Part 5: Evaluation


In [17]:
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'eval_loss': 0.31470441818237305,
 'eval_accuracy': 0.9038461538461539,
 'eval_f1': 0.9038461538461539,
 'eval_precision_class_0': 0.9108527131782945,
 'eval_recall_class_0': 0.8969465648854962,
 'eval_precision_class_1': 0.8969465648854962,
 'eval_recall_class_1': 0.9108527131782945,
 'eval_runtime': 9.7147,
 'eval_samples_per_second': 53.527,
 'eval_steps_per_second': 3.397,
 'epoch': 5.0}

##Results

class_0: negative

class_1: positive


```
{'eval_loss': 0.31470441818237305,
 'eval_accuracy': 0.9038461538461539,
 'eval_f1': 0.9038461538461539,
 'eval_precision_class_0': 0.9108527131782945,
 'eval_recall_class_0': 0.8969465648854962,
 'eval_precision_class_1': 0.8969465648854962,
 'eval_recall_class_1': 0.9108527131782945,
 'eval_runtime': 9.7147,
 'eval_samples_per_second': 53.527,
 'eval_steps_per_second': 3.397,
 'epoch': 5.0}
```

