### Install some libraries


In [9]:
!pip install datasets
!pip install transformers
!pip install evaluate



In [10]:
## Imports for the data-loading cells, plus the one setting (WANDB_DISABLED)
## that makes it so you don't need an API key for Weights and Biases.
import pandas as pd
import numpy as np
import os
# Must be set before the Trainer is created further down in the notebook.
os.environ["WANDB_DISABLED"] = "true"
import datasets
from datasets import Dataset, DatasetDict

## Part 2: Loading and processing the data
Loading the Kaggle dataset from our GitHub, reading it into a pandas DataFrame, modifying it, and then converting it into a Hugging Face `Dataset`

In [11]:
! curl -O https://raw.githubusercontent.com/gaylorav/NLPFinal/main/bg_descriptions.csv
! ls

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 18.1M  100 18.1M    0     0   123M      0 --:--:-- --:--:-- --:--:--  124M
bg_descriptions.csv  sample_data


In [12]:
# Read the CSV (malformed lines are skipped), shift sentiment up by one so it
# works as a label for BERT, and keep only the two columns the later cells
# need, renamed to the 'text' / 'label' names they expect.
descriptions_df = (
    pd.read_csv("bg_descriptions.csv", on_bad_lines="skip")
    .assign(sentiment=lambda frame: frame["sentiment"] + 1)
    .loc[:, ["description", "sentiment"]]
    .rename(columns={"description": "text", "sentiment": "label"})
)

# Wrap the frame in a Hugging Face Dataset and show its summary.
descriptions_ds = Dataset.from_pandas(descriptions_df)
print(descriptions_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 15589
})


80/20 train/test split

In [13]:
# 80/20 train/test split. The seed is fixed so that the same rows land in
# each split on every run; without it the split (and therefore every metric
# reported below) changes each time the notebook is executed.
split = descriptions_ds.train_test_split(test_size=0.2, seed=42)
train_ds = split["train"]
test_ds = split["test"]

Check the first training example

In [14]:
# Look at the first training example to confirm the text/label columns.
first_example = train_ds[0]
print("Text:", first_example["text"])
print("Label:", first_example["label"])


Text: Trouble is brewing high above the city of angels! The mysterious Rocketeer  who can blast through the skies with an ingenious jet-pack  must stop a sinister plot. Hollywood actor Neville Sinclair is scheming to steal the rocket's blueprints as this breakthrough of modern engineering could revolutionize the future of flight. But in Sinclair's hands, it could also fuel the dark future of warfare. In the two-player game The Rocketeer: Fate of the Future, you play as either the heroes or the villains. On your turn, pick one of your characters and take an action with them, optionally playing cards from your hand that match that character's symbol. Card actions are: Move, Tussle (combat), Gain Grit, Raise Rocket Token (heroes), Recruit a Soldier (villain). Players alternate turns until all six characters (three on each side) have taken actions and are exhausted, then players gain rewards based on control of the locations and prepare for a new round. The end of the game happens when the

### Tokenization

Tokenize so that it works with the `distilbert-base-uncased` model.

In [15]:
from transformers import AutoTokenizer

# Tokenizer that belongs to the DistilBERT checkpoint named below.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer helper function

In [16]:
import torch

def preprocess_function(examples):
    """Tokenize the ``text`` field of one example or of a batch of examples.

    Works for ``Dataset.map`` with and without ``batched=True``. Missing or
    empty text is replaced by a single space (with a warning) before
    tokenizing, and the incoming ``examples`` object is never modified.

    No padding is applied here: ``DataCollatorWithPadding`` pads each training
    batch, so padding at this stage would be redundant.

    Args:
        examples: Mapping whose ``"text"`` entry is a single string (or
            ``None``) in unbatched mode, or a list/tuple of them in batched
            mode.

    Returns:
        The output of ``tokenizer`` for the cleaned text, with truncation
        enabled.
    """
    raw_text = examples["text"]

    if isinstance(raw_text, (list, tuple)):
        # Batched call: clean every entry into a new list.
        cleaned_text = []
        for i, text in enumerate(raw_text):
            if not text:
                print(f"Warning: Empty text at index {i}")
                text = " "
            cleaned_text.append(text)
    elif not raw_text:
        # Unbatched call with None or "". Iterating over the value (as the
        # batched branch does) would walk characters or fail on None.
        print("Warning: Empty text")
        cleaned_text = " "
    else:
        cleaned_text = raw_text

    return tokenizer(cleaned_text, truncation=True)


Tokenizing train and test sets

In [17]:
# Tokenize both splits one example at a time (map is called without batched=True).
tokenized_train, tokenized_test = (
    dataset.map(preprocess_function) for dataset in (train_ds, test_ds)
)



Map:   0%|          | 0/12471 [00:00<?, ? examples/s]

Map:   0%|          | 0/3118 [00:00<?, ? examples/s]

Checking the tokenized data

In [18]:
# Dataset summary first, then the raw and tokenized fields of example 0.
print(tokenized_train)
first_tokenized = tokenized_train[0]
print("Text:", first_tokenized["text"])
print("Label:", first_tokenized["label"])
print(type(tokenized_train["label"]))
print("Input IDs:", first_tokenized["input_ids"])
print("Attention Mask:", first_tokenized["attention_mask"])

# Map the token IDs back to their token strings.
tokens = tokenizer.convert_ids_to_tokens(first_tokenized["input_ids"])
print("Tokenized text:", tokens)

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 12471
})
Text: Trouble is brewing high above the city of angels! The mysterious Rocketeer  who can blast through the skies with an ingenious jet-pack  must stop a sinister plot. Hollywood actor Neville Sinclair is scheming to steal the rocket's blueprints as this breakthrough of modern engineering could revolutionize the future of flight. But in Sinclair's hands, it could also fuel the dark future of warfare. In the two-player game The Rocketeer: Fate of the Future, you play as either the heroes or the villains. On your turn, pick one of your characters and take an action with them, optionally playing cards from your hand that match that character's symbol. Card actions are: Move, Tussle (combat), Gain Grit, Raise Rocket Token (heroes), Recruit a Soldier (villain). Players alternate turns until all six characters (three on each side) have taken actions and are exhausted, then players gain rewards ba

Getting the data collator so we can have PyTorch tensors for training with DistilBERT

In [19]:
from transformers import DataCollatorWithPadding

# Collator built around the tokenizer; it is passed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer)

## Part 3: Setting up the training (a.k.a. the fine-tuning)
Downloading the model

In [20]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# DistilBERT checkpoint with a sequence-classification head for three labels.
model_checkpoint = "distilbert-base-uncased"
label_count = 3
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=label_count,
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the function to compute our metrics: accuracy and weighted F1, as well as precision and recall for each label

In [21]:
import numpy as np
from evaluate import load
from sklearn.metrics import precision_recall_fscore_support

from functools import lru_cache


@lru_cache(maxsize=None)
def _load_metric(name):
    """Return the `evaluate` metric called ``name``, loading it only once."""
    return load(name)


def compute_metrics(eval_pred):
    """Compute overall and per-class metrics from model logits.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with ``"accuracy"``, weighted ``"f1"``, and
        ``"precision_class_<k>"`` / ``"recall_class_<k>"`` for k in 0, 1, 2.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # The metric objects come from a cached loader, so repeated evaluations
    # do not reload them on every call.
    accuracy = _load_metric("accuracy").compute(
        predictions=predictions, references=labels
    )["accuracy"]
    f1 = _load_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )["f1"]

    # Per-class precision and recall; the per-class F1 and support values
    # returned alongside them are not reported.
    class_ids = [0, 1, 2]
    precision, recall, _f1_per_class, _support = precision_recall_fscore_support(
        labels, predictions, average=None, labels=class_ids
    )

    metrics = {
        "accuracy": accuracy,
        "f1": f1,
    }
    for i, label in enumerate(class_ids):
        metrics[f"precision_class_{label}"] = precision[i]
        metrics[f"recall_class_{label}"] = recall[i]

    return metrics

Kept the Google Drive setup to decrease the risk of breaking anything

In [22]:
from google.colab import drive

# Mount Google Drive inside the Colab runtime.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


In [23]:
# Hyperparameters and output location for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/nlp_proj",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="none",
)

# Everything the Trainer needs: the model, the tokenized splits, the
# tokenizer and collator, and the metric function.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)


## Part 4: Training

Takes about 50 minutes on the full dataset (15,589 descriptions, of which 12,471 are used for training)

In [24]:
# Run the fine-tuning loop with the trainer configured above; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss
500,0.8626
1000,0.7855
1500,0.7458
2000,0.6307
2500,0.5959
3000,0.4846
3500,0.4197


TrainOutput(global_step=3900, training_loss=0.6203822757036258, metrics={'train_runtime': 3099.3848, 'train_samples_per_second': 20.119, 'train_steps_per_second': 1.258, 'total_flos': 7847077598878428.0, 'train_loss': 0.6203822757036258, 'epoch': 5.0})

## Part 5: Evaluation


In [25]:
# Evaluate on the trainer's eval dataset; the returned dict (shown as the
# cell result) carries the compute_metrics values under an "eval_" prefix.
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'eval_loss': 1.0438246726989746,
 'eval_accuracy': 0.6173829377806286,
 'eval_f1': 0.6198353699962124,
 'eval_precision_class_0': 0.5786407766990291,
 'eval_recall_class_0': 0.463452566096423,
 'eval_precision_class_1': 0.3410757946210269,
 'eval_recall_class_1': 0.39914163090128757,
 'eval_precision_class_2': 0.7551820728291316,
 'eval_recall_class_2': 0.759009009009009,
 'eval_runtime': 48.6566,
 'eval_samples_per_second': 64.082,
 'eval_steps_per_second': 4.008,
 'epoch': 5.0}

## Results

class_0: negative

class_1: neutral

class_2: positive


```
{'eval_loss': 1.0438246726989746,
 'eval_accuracy': 0.6173829377806286,
 'eval_f1': 0.6198353699962124,
 'eval_precision_class_0': 0.5786407766990291,
 'eval_recall_class_0': 0.463452566096423,
 'eval_precision_class_1': 0.3410757946210269,
 'eval_recall_class_1': 0.39914163090128757,
 'eval_precision_class_2': 0.7551820728291316,
 'eval_recall_class_2': 0.759009009009009,
 'eval_runtime': 48.6566,
 'eval_samples_per_second': 64.082,
 'eval_steps_per_second': 4.008,
 'epoch': 5.0}
```

