In [1]:
import numpy as np
import pandas as pd

import torch
from sklearn.model_selection import train_test_split

from transformers import BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

## Read Data

In [2]:
train = pd.read_csv("hw4_train.csv")
train, valid = train_test_split(train, test_size=0.2, shuffle=False)

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


In [4]:
valid.head()

Unnamed: 0,id,comment_text,toxic
127656,aac649d868cbe885,March 2007 (UTC)\nIs Image:Marist high school ...,0
127657,aac68e69d96d625a,Notice to all\n\nI changed my username in acco...,0
127658,aac73bf42ef22ff9,"""\nWP articles are not genealogical entries or...",0
127659,aac894fa28474fdf,REDIRECT Talk:John Rogers (footballer),0
127660,aaca8c54dc5222af,NFL Draft\nAre you batch copy-and-pasting the ...,0


In [5]:
test= pd.read_csv("hw4_test.csv")
test.head()

Unnamed: 0,id,comment_text,toxic
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0
1,000247e83dcc1211,:Dear god this site is horrible.,0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0


## BERT Trainer

For Problem 1 and 3

In [6]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
    

X_train = list(train["comment_text"])
y_train = list(train["toxic"])
X_valid = list(valid["comment_text"])
y_valid = list(valid["toxic"])
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=64)
X_valid_tokenized = tokenizer(X_valid, padding=True, truncation=True, max_length=64)
train_dataset = Dataset(X_train_tokenized, y_train)
valid_dataset = Dataset(X_valid_tokenized, y_valid)

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [9]:
from sklearn.metrics import mean_squared_error, f1_score
def compute_metrics(p):    
    pred, labels = p
    pred = np.argmax(pred, axis=1)
    f1 = f1_score(y_true=labels, y_pred=pred)
    f1_macro = f1_score(y_true=labels, y_pred=pred, average="macro")
    return {"f1": f1, "f1_macro": f1_macro} 

# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 127656
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 47871


Step,Training Loss,Validation Loss,F1,F1 Macro
500,0.1898,0.196311,0.748995,0.861553
1000,0.1836,0.198148,0.642234,0.807276
1500,0.1795,0.180519,0.726165,0.851226
2000,0.1711,0.15987,0.76329,0.868464
2500,0.1646,0.17852,0.713194,0.844483
3000,0.2441,0.155726,0.771305,0.874738
3500,0.2091,0.226451,0.710343,0.842764
4000,0.1918,0.195745,0.754402,0.865601
4500,0.2049,0.22275,0.653444,0.812984


***** Running Evaluation *****
  Num examples = 31915
  Batch size = 8
Saving model checkpoint to output\checkpoint-500
Configuration saved in output\checkpoint-500\config.json
Model weights saved in output\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 8
Saving model checkpoint to output\checkpoint-1000
Configuration saved in output\checkpoint-1000\config.json
Model weights saved in output\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 8
Saving model checkpoint to output\checkpoint-1500
Configuration saved in output\checkpoint-1500\config.json
Model weights saved in output\checkpoint-1500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 31915
  Batch size = 8
Saving model checkpoint to output\checkpoint-2000
Configuration saved in output\checkpoint-2000\config.json
Model weights saved in output\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****


TrainOutput(global_step=4500, training_loss=0.19316699727376302, metrics={'train_runtime': 1133.0318, 'train_samples_per_second': 338.003, 'train_steps_per_second': 42.25, 'total_flos': 1183999749120000.0, 'train_loss': 0.19316699727376302, 'epoch': 0.28})

In [10]:
# predict on the test dataset
X_test = list(valid["comment_text"])
y_test = list(valid["toxic"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=64)
test_dataset = Dataset(X_valid_tokenized, y_valid)

In [11]:
raw_pred, _, _ = trainer.predict(test_dataset)
pred = np.argmax(raw_pred, axis=1)
f1_score(y_true=y_test, y_pred=pred, average="macro")

***** Running Prediction *****
  Num examples = 31915
  Batch size = 8


0.8747375608994603