# BERT

In [80]:
!pip install transformers==4.28.1

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting transformers==4.28.1
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[K     |████████████████████████████████| 7.0 MB 5.7 MB/s eta 0:00:01
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.26.1
    Uninstalling transformers-4.26.1:
      Successfully uninstalled transformers-4.26.1
Successfully installed transformers-4.28.1


In [82]:
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW
import pandas as pd
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
import time
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
from collections import Counter

In [116]:
def softmax(x):
    exp_x = np.exp(x)
    return exp_x / np.sum(exp_x)
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx]).long()
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [119]:
l = ["Epoch"]
for i in range(1,21):
    l.append(str(i))
train_log = pd.DataFrame(columns=l)

test_csv = dict()
test_csv_new = dict()
for test_data_version in ["A","B","C","D","E"]:
    # init test file
    test_csv[test_data_version] = pd.read_csv('../data/version'+test_data_version+'_test_s.csv', header=0)
    # create a new csv df with all the original columns
    test_csv_new[test_data_version] = pd.DataFrame(test_csv[test_data_version], columns=["id","version","batch.tweet","tweet.id", "tweet_hashed", "hate.speech", "offensive.language"])

In [120]:
for rate in range(100,101): # in case of few shot training
    for label_to_class in ["hate.speech", "offensive.language"]:
        for version in ["A","B","C", "D","E"]:
            model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)
            tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
            DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            EPOCHS = 20
            BATCH_SIZE = 16
            model=model.to(DEVICE)
            optimizer = AdamW(model.parameters(),lr = 5e-5, eps = 1e-12)
            train_csv_full = pd.read_csv("../data_sampled/train_dev_split/version"+version+label_to_class+'_train_sampled_train.csv',header=0)
            if rate < 100:
                split_i=round(len(train_csv_full)*rate*0.01)
                train = train_csv_full[:split_i]
                for i in range(6):
                    if train["tweet_hashed"][split_i-1]==train_csv_full["tweet_hashed"][split_i+i]:
                        train.loc[len(train)]=train_csv_full.loc[split_i+i]
            else:
                train = train_csv_full
            dev = pd.read_csv("../data_sampled/train_dev_split/version"+version+label_to_class+'_train_sampled_dev.csv',header=0)
            X_train = list(train["tweet_hashed"])
            y_train = list(train[label_to_class])
            X_dev=list(dev["tweet_hashed"])
            y_dev = list(dev[label_to_class])
            # calculate max length of the tweets
            max_length = 0
            for x in X_train:
                ids = tokenizer.encode(x)
                max_length = max(len(ids),max_length)
            torch.save(max_length, "bert.max_length."+label_to_class+version)
            X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=max_length)
            X_dev_tokenized = tokenizer(X_dev, padding=True, truncation=True, max_length=max_length)

            train_dataset = Dataset(X_train_tokenized, y_train)
            dev_dataset = Dataset(X_dev_tokenized, y_dev)

            train_loader = DataLoader(train_dataset,
                                        batch_size=BATCH_SIZE,
                                        shuffle=True)

            dev_loader = DataLoader(dev_dataset,
                                    batch_size=BATCH_SIZE)

            total_steps = len(train_loader) * EPOCHS
            scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_steps)
            
            #train_acc_list = [str(rate)+"_"+label_to_class+version+"_train"]
            #dev_acc_list = [str(rate)+"_"+label_to_class+version+"_dev"]
            train_f1_list = [str(rate)+"_"+label_to_class+version+"_train"]
            dev_f1_list = [str(rate)+"_"+label_to_class+version+"_dev"]
            train_loss_list = [str(rate)+"_"+label_to_class+version+"_train_loss"]
            dev_loss_list = [str(rate)+"_"+label_to_class+version+"_dev_loss"]
            
            #best_dev_acc = 0
            best_dev_f1 = 0
            best_epoch = 0
            print(f'Start training rate {rate}%, {label_to_class}, {version}...')
            start_time = time.time()
            for e in range(EPOCHS):
                print('training {} epoch...'.format(e+1))
                
                ### train
                model.train(True)
                train_loss = 0
                train_true_list=[]
                train_preds_list=[]
                train_preds_score_list=[]
                for batch in tqdm(train_loader):
                    input = batch['input_ids'].to(DEVICE)
                    mask = batch['attention_mask'].to(DEVICE)
                    token_type_ids=batch["token_type_ids"].to(DEVICE)
                    label = batch['labels'].to(DEVICE)
                    model.zero_grad()
                    loss, logits = model(input_ids=input.to(DEVICE), 
                        attention_mask=mask.to(DEVICE), 
                        token_type_ids=token_type_ids,
                        labels=label.to(DEVICE),return_dict=False)
                    predict_label = torch.argmax(logits, dim=1)
                    train_preds_list.extend(predict_label.tolist())
                    train_true_list.extend(label.tolist())
                    batch_preds_scores = []
                    for i in logits.tolist():
                        batch_preds_scores.append(softmax(i)[1])
                    train_preds_score_list.extend(batch_preds_scores)

                    train_loss += loss.item()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()
                #train_acc = accuracy_score(train_true_list,train_preds_list)
                train_f1 = f1_score(train_true_list,train_preds_list, average="weighted")
                train_loss = train_loss / len(train_loader)
                #train_acc_list.append(train_acc)
                train_f1_list.append(train_f1)
                train_loss_list.append(train_loss)
                
                ### dev
                model.train(False)
                dev_loss = 0
                dev_true_list=[]
                dev_preds_list=[]
                dev_preds_score_list=[]
                with torch.no_grad():
                    for batch in tqdm(dev_loader):
                        input = batch['input_ids'].to(DEVICE)
                        mask = batch['attention_mask'].to(DEVICE)
                        token_type_ids=batch["token_type_ids"].to(DEVICE)
                        label = batch['labels'].to(DEVICE)
                        loss, logits = model(input_ids=input, attention_mask=mask, token_type_ids=token_type_ids,labels=label,return_dict=False)
                        predict_label = torch.argmax(logits, dim=1)
                        dev_loss += loss.item()
                        dev_preds_list.extend(predict_label.tolist())
                        dev_true_list.extend(label.tolist())
                        batch_preds_scores = []
                        for i in logits.tolist():
                            batch_preds_scores.append(softmax(i)[1])
                        train_preds_score_list.extend(batch_preds_scores)
                dev_loss = dev_loss / len(dev_loader)
                #dev_acc = accuracy_score(dev_true_list,dev_preds_list)
                dev_f1 = f1_score(dev_true_list,dev_preds_list, average="weighted")
                dev_f1_list.append(dev_f1)
                #dev_acc_list.append(dev_acc)
                dev_loss_list.append(dev_loss)
                #print(f'Epoch {e + 1}: train loss: {train_loss:.4f}, train acc: {train_acc:.4f}')
                #print(f'Epoch {e + 1}: dev loss: {dev_loss:.4f}, dev acc: {dev_acc:.4f}')
                print(f'Epoch {e + 1}: train loss: {train_loss:.4f}, train f1: {train_f1:.4f}')
                print(f'Epoch {e + 1}: dev loss: {dev_loss:.4f}, dev f1: {dev_f1:.4f}')
                sec = time.time()-start_time
                #if dev_acc > best_dev_acc:
                if dev_f1 > best_dev_f1:
                    #best_dev_acc = dev_acc
                    best_dev_f1 = dev_f1
                    best_epoch = e + 1
                    print(f'*** Epoch {e + 1}: dev metric higher than best dev metric, model saved!')
                    torch.save(model, "bert.model."+label_to_class+version)
            #print(f'Training finished! Best epoch is {best_epoch}, best dev acc is {best_dev_acc:.4f}, {sec} seconds used.')
            print(f'Training finished! Best epoch is {best_epoch}, best dev f1 is {best_dev_f1:.4f}, {sec} seconds used.')
            #train_log.loc[len(train_log)]=train_acc_list
            #train_log.loc[len(train_log)]=dev_acc_list
            train_log.loc[len(train_log)]=train_f1_list
            train_log.loc[len(train_log)]=dev_f1_list
            train_log.loc[len(train_log)]=train_loss_list
            train_log.loc[len(train_log)]=dev_loss_list

    ### test
    for test_data_version in ["A","B","C","D","E"]:

        X_test_tokenized = tokenizer(list(test_csv_new[test_data_version]["tweet_hashed"]), padding=True, truncation=True, max_length=max_length)

        # choose trained model version to test
        for label_to_test in ["hate.speech", "offensive.language"]:
            for version_to_test in ["A","B","C","D","E"]:
                # preprocess the test data
                max_length = torch.load("bert.max_length."+label_to_test+version_to_test, map_location=DEVICE)
                y_test = list(test_csv_new[test_data_version][label_to_test])

                test_dataset = Dataset(X_test_tokenized, y_test)
                test_loader = DataLoader(
                                    test_dataset,
                                    batch_size=BATCH_SIZE,
                                    )
                # Load trained model
                trained_model = torch.load("bert.model."+label_to_test+version_to_test, map_location=DEVICE)
                trained_model.eval()
                preds_list = []
                preds_scores = []
                with torch.no_grad():
                    for batch in tqdm(test_loader):
                        input = batch['input_ids'].to(DEVICE)
                        mask = batch['attention_mask'].to(DEVICE)
                        token_type_ids=batch["token_type_ids"].to(DEVICE)
                        label = batch['labels'].to(DEVICE)
                        loss, logits = trained_model(input_ids=input, attention_mask=mask, token_type_ids=token_type_ids,labels=label,return_dict=False)
                        predict_label = torch.argmax(logits, dim=1)
                        preds_list.extend(predict_label.tolist())
                        y_pred_scores = []
                        for i in logits.tolist():
                            y_pred_scores.append(softmax(i)[1])
                        preds_scores.extend(y_pred_scores)
                preds_list = [int(i) for i in preds_list]
                test_csv_new[test_data_version][str(rate)+"_"+label_to_test+"_preds_"+version_to_test] = preds_list
                test_csv_new[test_data_version][str(rate)+"_"+label_to_test+"_preds_"+version_to_test+"_scores"]=preds_scores


IndentationError: unexpected indent (1471463378.py, line 151)

In [None]:
#train_log.to_csv("bert_train_accs_median_1to100.csv")
train_log.to_csv("bert_train_f1_median_1to100.csv")
for test_data_version in ["A","B","C","D","E"]:
    test_csv_new[test_data_version].to_csv("bert_test"+test_data_version+"_median_1to100.csv")