# BERT

In [5]:
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

from collections import Counter
     

In [None]:
# Run configuration. Both values are concatenated into the sampled-CSV path, the
# saved max-length file name and the Trainer output directory further down.
label_to_class= "offensive.language"  # name of the label column to classify; the evaluation cell expects models for "hate.speech" and "offensive.language"
version = "E" # data version to train on; the evaluation cell iterates over "A" to "E"

In [6]:
# Seed every RNG *before* the model is built. The sequence-classification head
# is not part of the bert-base-uncased checkpoint and is therefore initialised
# randomly at load time; the `seed` given to TrainingArguments is only applied
# once the Trainer is constructed, which is too late to make that
# initialisation reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Define pretrained tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Device used as `map_location` when the stored max-length values are loaded back.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
# Code for running the raw data (entirely commented out; the sampled-data cell below is the one in use)

# # no drops
# # csv = pd.read_csv('../data/version'+version+'_train.csv',header=0)

# # # create a new csv df
# # csv_new = pd.DataFrame(csv, columns=[label_to_class, "tweet_hashed"])
# # # drop all rows that have any NaN values
# # csv_new_clean = csv_new.dropna(axis=0,how="any")

# # keeps at most 3 rows per tweet (surplus rows are dropped at random)
# csv = pd.read_csv('../data/version'+version+'_train.csv',header=0)
# # create a new csv df
# csv_new = pd.DataFrame(csv, columns=[label_to_class, "tweet_hashed"])
# # drop all rows that have any NaN values
# csv_new_clean = csv_new.dropna(axis=0,how="any")
# # save tweets into a list
# tweet=list(csv_new_clean['tweet_hashed'])
# # count tweet freqs
# tweet_count=Counter(tweet)
# for t, c in tweet_count.items():
#     if c > 3:
#         # get the index for a specific tweet into a list
#         index = csv_new_clean[csv_new_clean.tweet_hashed == t].index.tolist()
#         # randomly choose index to drop
#         index_to_drop = random.sample(index, c-3)
#         csv_new_clean = csv_new_clean.drop(index_to_drop, axis=0)

In [None]:
# Load the sampled training data for the configured version and label.
sampled_train_path = f"../data_sampled/version{version}{label_to_class}_train_sampled.csv"
csv = pd.read_csv(sampled_train_path, header=0)

In [8]:
# Calculate the max length (in tokens, special tokens included) of the tweets.
X = list(csv["tweet_hashed"])
# `default=0` reproduces the previous result (0) when the frame has no rows.
longest_tweet = max((len(tokenizer.encode(x)) for x in X), default=0)
# `encode` is called without truncation, so a single very long tweet could push
# the value past what the model accepts. Cap it at the tokenizer's declared
# limit so the padded/truncated inputs below always fit the model.
max_length = min(longest_tweet, tokenizer.model_max_length)

# Stored so the evaluation cell can tokenize the test data with the same length.
torch.save(max_length, "bert.max_length."+label_to_class+version)

y = list(csv[label_to_class])
# Fixed 80/20 train/dev split.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=max_length)
X_dev_tokenized = tokenizer(X_dev, padding=True, truncation=True, max_length=max_length)

In [9]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    Args:
        encodings: mapping from feature name (must include ``"input_ids"``)
            to a per-example sequence of values.
        labels: optional per-example labels (list, NumPy array, tensor, ...).
            ``None`` or an empty sequence yields items without a ``"labels"``
            entry, which is what prediction-only use needs.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def _has_labels(self):
        """Return True when labels were supplied and are non-empty."""
        # An explicit None/len check instead of `if self.labels:` because the
        # truth value of a multi-element NumPy array or tensor is ambiguous
        # and raises.
        return self.labels is not None and len(self.labels) > 0

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self._has_labels():
            item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` feature."""
        return len(self.encodings["input_ids"])

In [10]:
# Pair each tokenized split with its labels.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
dev_dataset = Dataset(encodings=X_dev_tokenized, labels=y_dev)

In [11]:
# Define Trainer parameters
def compute_metrics(p):
    """Score one evaluation pass.

    Args:
        p: a ``(predictions, labels)`` pair; the predicted class is the argmax
           of ``predictions`` along axis 1, and ``labels`` are used as given.

    Returns:
        dict with ``accuracy`` and weighted-average ``precision``, ``recall``
        and ``f1``.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Arguments shared by the three weighted-average metrics.
    weighted = {"y_true": labels, "y_pred": predicted, "average": "weighted"}

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=predicted),
        "precision": precision_score(**weighted),
        "recall": recall_score(**weighted),
        "f1": f1_score(**weighted),
    }

In [12]:
# Define Trainer

# Early stopping with a patience of 5 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=5)

args = TrainingArguments(
    output_dir=label_to_class + version,
    seed=42,
    # training budget
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluate on the dev set every 500 steps
    evaluation_strategy="steps",
    eval_steps=500,
    # select the best checkpoint by the "f1" value reported by compute_metrics
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    callbacks=[early_stopping],
)

In [None]:
# Train pre-trained model
trainer.train()

# Persist the fine-tuned model under the name the evaluation cell loads
# ("bert.model." + label + version). Without this step nothing in the notebook
# creates that directory and a fresh Restart & Run All fails at evaluation time.
# The training arguments set load_best_model_at_end=True, so the weights saved
# here should be those of the best checkpoint rather than the last step.
trainer.save_model("bert.model."+label_to_class+version)

***** Running training *****
  Num examples = 5397
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10125
  Number of trainable parameters = 109483778


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 1350
  Batch size = 8


# Test the trained models on each test split

In [None]:
# Evaluate every trained (label, version) model on every test split and write
# one prediction CSV per split. For each model two columns are appended to the
# split's frame: "<label>_preds_<version>" (argmax class) and
# "<label>_preds_<version>_scores" (softmax probability of class 1).

def softmax(x):
    """Softmax over the last axis of ``x``.

    The per-row maximum is subtracted before exponentiating so large logits
    cannot overflow; the result is mathematically unchanged. Accepts a single
    1-D logit vector or a 2-D ``(n_examples, n_classes)`` array.
    """
    x = np.asarray(x)
    exp_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=-1, keepdims=True)


TEST_VERSIONS = ["A", "B", "C", "D", "E"]
TEST_LABELS = ["hate.speech", "offensive.language"]
# Original columns kept in the output, in this order; prediction columns follow.
TEST_BASE_COLUMNS = ["id", "version", "batch.tweet", "tweet.id", "tweet_hashed", "hate.speech", "offensive.language"]

# Read every test split once up front so each model only has to be loaded once
# (10 loads instead of 50).
test_frames = {}
for test_data_version in TEST_VERSIONS:
    # init test file
    test_csv = pd.read_csv('../data/version'+test_data_version+'_test_s.csv', header=0)
    # create a new csv df with all the original columns
    test_frames[test_data_version] = pd.DataFrame(test_csv, columns=TEST_BASE_COLUMNS)

# The prediction settings do not depend on the model, so build them once.
# Separate names (`tester_args`, `test_max_length`) avoid clobbering the `args`
# and `max_length` variables of the training cells.
tester_args = TrainingArguments(
    output_dir="tester",
    dataloader_pin_memory=False,
)

for label_to_test in TEST_LABELS:
    for version_to_test in TEST_VERSIONS:
        model_tag = label_to_test + version_to_test

        # Sequence length recorded when this model was trained.
        test_max_length = torch.load("bert.max_length."+model_tag, map_location=DEVICE)

        # Load trained model
        trained_model = BertForSequenceClassification.from_pretrained("bert.model."+model_tag, num_labels=2)
        tester = Trainer(model=trained_model, args=tester_args)

        pred_column = label_to_test+"_preds_"+version_to_test

        for test_data_version in TEST_VERSIONS:
            test_frame = test_frames[test_data_version]

            # preprocess the test data
            X_test = tokenizer(list(test_frame["tweet_hashed"]), padding=True, truncation=True, max_length=test_max_length)
            test_dataset = Dataset(X_test)

            # Make prediction (`.predictions` instead of tuple unpacking into
            # `_`, which would overwrite IPython's last-output variable)
            raw_pred = tester.predict(test_dataset).predictions

            # Preprocess raw predictions
            y_pred = np.argmax(raw_pred, axis=1)
            y_pred_scores = softmax(raw_pred)[:, 1]

            # Labels and versions are visited in a fixed order, so appending at
            # the end reproduces the column layout the hard-coded positions
            # (7, 9, ..., 25) used to give. `insert` still raises if the column
            # already exists, e.g. when the cell is re-run on the same frames.
            test_frame.insert(len(test_frame.columns), pred_column, y_pred)
            test_frame.insert(len(test_frame.columns), pred_column+"_scores", y_pred_scores)

for test_data_version, test_frame in test_frames.items():
    test_frame.to_csv("../preds_median/bert_test"+test_data_version+"_median.csv")