### Install some libraries


In [1]:
!pip install datasets
!pip install transformers
!pip install evaluate

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [2]:
## This code here just makes it so you don't need an API
## key for Weights and Biases. Just run it, and you're good.
import pandas as pd
import numpy as np
import os
os.environ["WANDB_DISABLED"] = "true"
import datasets
from datasets import Dataset, DatasetDict

## Part 2: Loading and processing the data
Loading the kaggle dataset from our GitHub, then modifying it and turning it into a pandas dataframe

In [3]:
! curl -O https://raw.githubusercontent.com/gaylorav/NLPFinal/main/bg_descriptions.csv
! ls

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 32.1M  100 32.1M    0     0  19.6M      0  0:00:01  0:00:01 --:--:-- 19.6M
bg_descriptions.csv  sample_data


In [4]:
#cleaning the original csv and getting everything into the right format and data type
descriptions_df = pd.read_csv("bg_descriptions.csv", on_bad_lines="skip")
descriptions_df['id'] = descriptions_df['id'].astype(int)
descriptions_df['usersrated'] = descriptions_df['usersrated'].astype(int)
descriptions_df['average'] = descriptions_df['average'].astype(float)
descriptions_df = descriptions_df[descriptions_df["usersrated"] > 100]

#setting labels, 0 is negative, 1 is neutral, 2 is positive
conditions = [
    descriptions_df['average'] < 6,
    (descriptions_df['average'] >= 6) & (descriptions_df['average'] <= 6.5),
    descriptions_df['average'] > 6.5
]

choices = [0, 1, 2]

descriptions_df['sentiment'] = np.select(conditions, choices)

#turning the dataframe into a huggingface dataset with the correct column names so that it works with the code below
descriptions_df=descriptions_df[["description","sentiment"]]
descriptions_df = descriptions_df.rename(columns={'description': 'text', 'sentiment': 'label'})
descriptions_ds = Dataset.from_pandas(descriptions_df)
descriptions_ds = descriptions_ds.remove_columns(["__index_level_0__"])
print(descriptions_ds)

15589
                                         description  sentiment
0  In CATAN (formerly The Settlers of Catan), pla...          2
1  Carcassonne is a tile placement game in which ...          2
2  In Pandemic, several virulent diseases have br...          2
3  You are the leader of one of the 7 great citie...          2
4  In the 2400s, mankind begins to terraform the ...          2
Dataset({
    features: ['text', 'label'],
    num_rows: 15589
})


80/20 train/test split

In [5]:
#train_ds = descriptions_ds.shuffle(seed=42).select([i for i in list(range(12000))])
#test_ds = descriptions_ds.shuffle(seed=42).select([i for i in list(range(3000))])
split=descriptions_ds.train_test_split(test_size=0.2)
train_ds = split["train"]
test_ds = split["test"]

Check

In [6]:
print("Text:", train_ds[0]["text"])
print("Label:", train_ds[0]["label"])


Text: The Walking Dead Card Game, based on Wolfgang Kramer's 6 nimmt!, features the same basic gameplay as that card game while adding six character cards and two modes of play: Survival and Hero.&#10;&#10;To play the game, you shuffle the 104 number cards, lay out four cards face-up to start the four rows, then deal a number of cards to each player. Each turn, players simultaneously choose and reveal a card from their hand, then add the cards to the rows, with cards being placed in ascending order based on their number; each card is placed in the row that ends with the highest number that's below the card's number. When the sixth card is placed in a row, the owner of that card claims the other five cards and the sixth card becomes the first card in a new row.&#10;&#10;In addition to a number from 1 to 104, each card has a zombie point value. In Survival mode (for 3-10 players), players want to collect as few points as possible, while in Hero mode (for 2-6 players) you want to collect 

### Tokenization

Tokenize so that it works with the `distilbert-base-uncased` model.

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer helper function

In [8]:
import torch

def preprocess_function(examples):
    # Check if text is empty
    for i, text in enumerate(examples["text"]):
        if not text:
            print(f"Warning: Empty text at index {i}")
            examples["text"][i] = " "
    # Tokenize
    tokenized_output = tokenizer(examples["text"], padding=True, truncation=True)
    return tokenized_output


Tokenizing train and test sets

In [9]:
# (removing batched=True)
tokenized_train = train_ds.map(preprocess_function)
tokenized_test = test_ds.map(preprocess_function)



Map:   0%|          | 0/12471 [00:00<?, ? examples/s]

Map:   0%|          | 0/3118 [00:00<?, ? examples/s]

Checking the tokenized data

In [10]:
print(tokenized_train)
print("Text:", tokenized_train[0]["text"])
print("Label:", tokenized_train[0]["label"])
print(type(tokenized_train["label"]))
print("Input IDs:", tokenized_train[0]["input_ids"])
print("Attention Mask:", tokenized_train[0]["attention_mask"])

# Convert token IDs back to tokens
tokens = tokenizer.convert_ids_to_tokens(tokenized_train[0]["input_ids"])
print("Tokenized text:", tokens)

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 12471
})
Text: The Walking Dead Card Game, based on Wolfgang Kramer's 6 nimmt!, features the same basic gameplay as that card game while adding six character cards and two modes of play: Survival and Hero.&#10;&#10;To play the game, you shuffle the 104 number cards, lay out four cards face-up to start the four rows, then deal a number of cards to each player. Each turn, players simultaneously choose and reveal a card from their hand, then add the cards to the rows, with cards being placed in ascending order based on their number; each card is placed in the row that ends with the highest number that's below the card's number. When the sixth card is placed in a row, the owner of that card claims the other five cards and the sixth card becomes the first card in a new row.&#10;&#10;In addition to a number from 1 to 104, each card has a zombie point value. In Survival mode (for 3-10 players), players wan

Getting the data collator so we can have PyTorch tensors for training with DistilBERT

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Part 3: Setting up the training (a.k.a. the fine-tuning)
Downloading the model

In [12]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Defining the function to compute our metrics: accuracy, and f1, as well as precision, recall, and f1 for each label

In [14]:
import numpy as np
from evaluate import load
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(eval_pred):
    load_accuracy = load("accuracy")
    load_f1 = load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = load_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels, average="weighted")["f1"]

    # Calculate precision, recall, and F1-score for each class
    precision, recall, f1_score, support = precision_recall_fscore_support(
        labels, predictions, average=None, labels=[0, 1, 2]
    )

    # Create a dictionary to store the results
    metrics = {
        "accuracy": accuracy,
        "f1": f1,
    }

    # Add precision and recall for each class to the dictionary
    for i, label in enumerate([0, 1, 2]):
        metrics[f"precision_class_{label}"] = precision[i]
        metrics[f"recall_class_{label}"] = recall[i]
       # metrics[f"f1_score_class_{label}"] = f1_score[i] # You already have weighted f1

    return metrics

Kept the google drive stuff to decrease the risk of breaking anything

In [15]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [16]:
#training details
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/nlp_proj",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"
)
#getting PyTorch tensors and inputting arguments for training and evaluation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


## Part 4: Training

Takes about 50 minutes on the full dataset (15000 reviews)

In [17]:
trainer.train()

Step,Training Loss
500,0.865
1000,0.7912
1500,0.7393
2000,0.6346
2500,0.5758
3000,0.4804
3500,0.4082


TrainOutput(global_step=3900, training_loss=0.6150329707219051, metrics={'train_runtime': 3114.4655, 'train_samples_per_second': 20.021, 'train_steps_per_second': 1.252, 'total_flos': 8135438913087480.0, 'train_loss': 0.6150329707219051, 'epoch': 5.0})

## Part 5: Evaluation


In [18]:
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'eval_loss': 1.037530779838562,
 'eval_accuracy': 0.6282873636946761,
 'eval_f1': 0.6321339320817945,
 'eval_precision_class_0': 0.5766283524904214,
 'eval_recall_class_0': 0.456752655538695,
 'eval_precision_class_1': 0.3216520650813517,
 'eval_recall_class_1': 0.39176829268292684,
 'eval_precision_class_2': 0.7796327212020033,
 'eval_recall_class_2': 0.7770382695507487,
 'eval_runtime': 46.347,
 'eval_samples_per_second': 67.275,
 'eval_steps_per_second': 4.207,
 'epoch': 5.0}