In [51]:
# Mount Google Drive into the Colab filesystem at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [52]:
cd /content/drive/MyDrive/hate_speech/models

/content/drive/MyDrive/hate_speech/models


In [53]:
! pip install transformers==4.25.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# BERT for "offensive.language"

In [55]:
# Name of the label column to classify; also used to name the prediction column and the output file.
label_to_class="offensive.language"

In [54]:
import torch.nn as nn
from transformers import DistilBertForSequenceClassification,BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
     

In [56]:
# Load the pretrained uncased BERT vocabulary and a 2-label sequence classifier built on it.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(PRETRAINED_NAME, num_labels=2)

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-uncased/snapshots/0a6aa9128b6194f4f3c4db429b6cb4891cdb421b/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_positio

In [57]:
# Load the training split and keep only the label column and the tweet text.
csv = pd.read_csv('../data/ourdata/full_train.csv', header=0)
# Select by label so a missing or misspelled column raises KeyError immediately;
# pd.DataFrame(csv, columns=[...]) would instead create an all-NaN column, and the
# dropna below would then silently discard every row.
csv_new = csv[[label_to_class, "tweet_hashed"]]
# Drop all rows that have any NaN values in the selected columns.
csv_new_clean = csv_new.dropna(axis=0, how="any")

In [58]:
# Length (in token ids, as returned by tokenizer.encode) of the longest tweet;
# it is used as the padding/truncation length for every split.
X = list(csv_new_clean["tweet_hashed"])
# default=0 keeps the original result for an empty dataset instead of raising.
longest_tweet = max((len(tokenizer.encode(tweet)) for tweet in X), default=0)
# Never ask for more tokens than the model accepts: a single over-long tweet would
# otherwise make the later truncation=True / max_length=... calls a no-op and feed
# the model sequences longer than its position embeddings.
max_length = min(longest_tweet, tokenizer.model_max_length)

In [59]:
# Labels in the same row order as X.
y = csv_new_clean[label_to_class].tolist()

In [60]:
# Hold out 20% of the cleaned tweets as a development set; the fixed random_state
# makes the split repeatable.
X_train, X_dev, y_train, y_dev = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [61]:
# Encode both splits with identical settings: padding on, truncation at max_length.
encode_options = {"padding": True, "truncation": True, "max_length": max_length}
X_train_tokenized = tokenizer(X_train, **encode_options)
X_dev_tokenized = tokenizer(X_dev, **encode_options)


In [62]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, optionally paired with labels.

    Args:
        encodings: Mapping from field name (must include "input_ids") to a
            per-example sequence of values, e.g. the object returned by a
            tokenizer call on a list of texts.
        labels: Optional per-example labels in any indexable container (list,
            numpy array, tensor). ``None`` or an empty container means the
            dataset is unlabeled.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, adding "labels" when present."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly: `if self.labels:` raises ValueError for
        # numpy arrays / tensors, whose truth value is ambiguous. An empty
        # container is still treated as "no labels", as before.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" field."""
        return len(self.encodings["input_ids"])


In [63]:
train_dataset = Dataset(X_train_tokenized, y_train)
dev_dataset = Dataset(X_dev_tokenized, y_dev)

In [64]:
# Define Trainer parameters
def compute_metrics(p):
    """Score one evaluation pass for the Trainer.

    Args:
        p: A pair ``(predictions, labels)``; predictions hold one score per
            class along axis 1 and are reduced to class ids with argmax.

    Returns:
        Dict with "accuracy" plus support-weighted "precision", "recall" and "f1".
    """
    scores_per_class, references = p
    predicted = np.argmax(scores_per_class, axis=1)

    results = {"accuracy": accuracy_score(y_true=references, y_pred=predicted)}
    weighted_metrics = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, metric_fn in weighted_metrics:
        results[metric_name] = metric_fn(y_true=references, y_pred=predicted, average="weighted")
    return results

In [65]:
# Define Trainer

# Evaluate on the dev set every 500 optimizer steps. No metric_for_best_model is given,
# so with load_best_model_at_end=True the Trainer ranks checkpoints by validation loss
# (its default) and restores the best one when training ends.
args = TrainingArguments(
    output_dir="output",
    seed=0,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
)

# Stop early once five consecutive evaluations fail to beat the best one so far.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [66]:
# Fine-tune the pretrained model; the returned TrainOutput is displayed as the cell result.
train_output = trainer.train()
train_output

***** Running training *****
  Num examples = 53223
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 19959
  Number of trainable parameters = 109483778


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
500,0.604,0.613519,0.745378,0.759086,0.745378,0.736754
1000,0.5535,0.696246,0.573801,0.733958,0.573801,0.516235
1500,0.5156,0.467818,0.803172,0.809095,0.803172,0.800062
2000,0.5043,0.4761,0.79979,0.799488,0.79979,0.799349
2500,0.4873,0.498677,0.81204,0.8121,0.81204,0.812068
3000,0.4732,0.452309,0.809109,0.812308,0.809109,0.80953
3500,0.4707,0.465473,0.816774,0.81683,0.816774,0.816096
4000,0.4761,0.480282,0.79979,0.799834,0.79979,0.798922
4500,0.4603,0.459546,0.818954,0.818816,0.818954,0.818466
5000,0.4745,0.464104,0.822261,0.826688,0.822261,0.820069


***** Running Evaluation *****
  Num examples = 13306
  Batch size = 8
Saving model checkpoint to output/checkpoint-500
Configuration saved in output/checkpoint-500/config.json
Model weights saved in output/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 13306
  Batch size = 8
Saving model checkpoint to output/checkpoint-1000
Configuration saved in output/checkpoint-1000/config.json
Model weights saved in output/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 13306
  Batch size = 8
Saving model checkpoint to output/checkpoint-1500
Configuration saved in output/checkpoint-1500/config.json
Model weights saved in output/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 13306
  Batch size = 8
Saving model checkpoint to output/checkpoint-2000
Configuration saved in output/checkpoint-2000/config.json
Model weights saved in output/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****


TrainOutput(global_step=5500, training_loss=0.5051745688698509, metrics={'train_runtime': 2721.0768, 'train_samples_per_second': 58.679, 'train_steps_per_second': 7.335, 'total_flos': 3549943692240000.0, 'train_loss': 0.5051745688698509, 'epoch': 0.83})

In [75]:
# Load the held-out test split and tokenize it with the same settings as the training data.
test_csv = pd.read_csv('../data/ourdata/full_test.csv', header=0)

# Keep the identifying columns next to the label and the text so predictions can be
# exported alongside them. Selecting by label raises KeyError on a missing column
# instead of silently creating an all-NaN column that the dropna below would wipe out.
test_columns = ["id", "version", "batch.tweet", label_to_class, "tweet.id", "tweet_hashed"]
test_csv_new = test_csv[test_columns]
# Drop all rows that have any NaN values in the selected columns.
test_csv_new_clean = test_csv_new.dropna(axis=0, how="any")

X_test = tokenizer(list(test_csv_new_clean["tweet_hashed"]), padding=True, truncation=True, max_length=max_length)
# Labels are cast to plain ints so they can be compared with the argmax class ids.
y_test = [int(label) for label in list(test_csv_new_clean[label_to_class])]

test_dataset = Dataset(X_test, y_test)

In [68]:
# Load trained model
# load_best_model_at_end=True makes the Trainer record which checkpoint scored best on
# the dev set, so reuse that instead of a step number copied by hand from one run's log.
# The recorded checkpoint stays as a fallback for sessions where training was not re-run
# (no `trainer` in the kernel) or no best checkpoint was registered.
FALLBACK_CHECKPOINT = "output/checkpoint-3000"
try:
    model_path = trainer.state.best_model_checkpoint or FALLBACK_CHECKPOINT
except NameError:
    model_path = FALLBACK_CHECKPOINT
trained_model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

loading configuration file output/checkpoint-3000/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file output/checkpoint-3000/pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceC

In [69]:
# Define tester
# A second Trainer that wraps the reloaded checkpoint; below it is only used for prediction.
# NOTE: this rebinds `args`, replacing the training arguments defined earlier.
args = TrainingArguments(
    output_dir="tester",
    dataloader_pin_memory=False
)
tester = Trainer(model=trained_model, args=args)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [70]:
# Run the reloaded model over the test set and keep only the raw model outputs.
prediction_output = tester.predict(test_dataset)
raw_pred = prediction_output.predictions

***** Running Prediction *****
  Num examples = 22282
  Batch size = 8


In [71]:
# Turn the per-class outputs into a predicted class id per example.
y_pred = raw_pred.argmax(axis=1)

In [72]:
# Per-class precision / recall / F1 on the test set, with readable class names
# (assumes the two label values sort as negative first, positive second).
class_names = [f"non-{label_to_class}", label_to_class]
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)

                        precision    recall  f1-score   support

non-offensive.language       0.71      0.84      0.77      9832
    offensive.language       0.85      0.73      0.79     12450

              accuracy                           0.78     22282
             macro avg       0.78      0.78      0.78     22282
          weighted avg       0.79      0.78      0.78     22282



In [76]:
# Attach the predictions as a new last column. assign() returns a new frame, so
# re-running this cell overwrites the column instead of failing the way insert() does
# when the column already exists, and it no longer depends on the frame having
# exactly six columns.
pred_column = label_to_class + "_preds_bert_full"
test_csv_new_clean = test_csv_new_clean.assign(**{pred_column: y_pred})

In [77]:
# Persist the test rows together with the predictions (the DataFrame index is written too).
preds_path = f"../data/preds/{label_to_class}_preds_bert_full.csv"
test_csv_new_clean.to_csv(preds_path)