In [None]:
# Mount Google Drive at /content/drive (Colab-only import) so that later cells
# can read data from, and write models/results to, paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted Drive; later cells build their data/, models/ and
# results/ paths by prefixing them with this string (note the trailing slash).
# NOTE(review): the name `dir` shadows Python's built-in dir(); it is left as is
# because the cells below reference it by this name.
dir = '/content/drive/MyDrive/VU-thesis-2023/'

In [None]:
%%capture
# %%capture (which has to stay the first line of the cell) hides pip's output.
# NOTE(review): the transformers version is not pinned, so results may depend on
# whichever release is current when the notebook is run.

!pip install transformers

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import BertModel
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
import random
from tqdm import tqdm, trange
from sklearn.metrics import classification_report
from datetime import date
import os
import shutil
from scipy.special import softmax
import re

In [None]:
# code from https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# code from https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00


In [None]:

# Module-level tokenizer for 'bert-base-uncased'. Dataset1 uses it to encode the
# texts and test() uses it to turn input ids back into tokens for the output file.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class Dataset1(Dataset):
    """Map-style dataset pairing tokenized texts with encoded labels.

    Every text of ``df['text']`` is tokenized on its own with the module-level
    ``tokenizer`` (padded/truncated to 40 tokens, returned as PyTorch tensors),
    and every value of ``df['labels']`` is translated through ``label_map``.
    """

    def __init__(self, df, label_map):
        # Translate the raw label values into class ids.
        self.labels = []
        for raw_label in df['labels']:
            self.labels.append(label_map[raw_label])

        # Tokenize one text at a time; each encoding keeps its own leading
        # dimension of size 1 because return_tensors="pt" is used per text.
        self.texts = []
        for sentence in df['text']:
            encoding = tokenizer(
                sentence,
                padding='max_length',
                max_length=40,
                truncation=True,
                return_tensors="pt",
            )
            self.texts.append(encoding)

    def classes(self):
        """Return the list of encoded labels."""
        return self.labels

    def __len__(self):
        """Number of examples, taken from the label list."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the encoded label(s) stored at ``idx``."""
        return self.labels[idx]

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(tokenized_text, label)`` pair at ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)

In [None]:

class BertSingletaskClassifier(BertPreTrainedModel):
    """BERT encoder followed by dropout and a single linear layer.

    The linear layer maps the encoder's pooled output (second element of the
    encoder result) to ``config.num_labels`` logits. When labels are passed to
    ``forward`` a loss is computed as well; which loss depends on
    ``config.problem_type`` (inferred on first use when it is ``None``).
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.num_labels = config.num_labels

        self.bert = BertModel(config)

        # A head-specific dropout rate wins; otherwise reuse the hidden dropout rate.
        dropout_prob = config.classifier_dropout
        if dropout_prob is None:
            dropout_prob = config.hidden_dropout_prob
        self.dropout = nn.Dropout(dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        # Let the base class initialise weights and run its final processing.
        self.post_init()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode the inputs, classify the pooled output and optionally compute a loss.

        Args:
            labels: optional targets. With ``problem_type`` unset, the loss is
                chosen as follows: ``num_labels == 1`` -> regression (MSE);
                ``num_labels > 1`` with long/int labels -> single-label
                classification (cross-entropy); anything else -> multi-label
                classification (BCE with logits).
            return_dict: when ``None`` the value of ``config.use_return_dict`` is used.
            All remaining arguments are forwarded unchanged to the BERT encoder.

        Returns:
            A ``SequenceClassifierOutput`` when ``return_dict`` is truthy,
            otherwise a tuple ``(loss?, logits, *encoder_outputs[2:])`` in which
            the loss is present only when it was computed.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_out = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder result is the pooled output.
        logits = self.classifier(self.dropout(encoder_out[1]))

        loss = None
        if labels is not None:
            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    inferred = "regression"
                elif self.num_labels > 1 and labels.dtype in (torch.long, torch.int):
                    inferred = "single_label_classification"
                else:
                    inferred = "multi_label_classification"
                self.config.problem_type = inferred

            task = self.config.problem_type
            if task == "regression":
                criterion = MSELoss()
                if self.num_labels == 1:
                    loss = criterion(logits.squeeze(), labels.squeeze())
                else:
                    loss = criterion(logits, labels)
            elif task == "single_label_classification":
                criterion = CrossEntropyLoss()
                loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))
            elif task == "multi_label_classification":
                criterion = BCEWithLogitsLoss()
                loss = criterion(logits, labels)

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_out.hidden_states,
                attentions=encoder_out.attentions,
            )

        # Tuple form: logits first, then whatever the encoder returned after the pooled output.
        tuple_out = (logits,) + encoder_out[2:]
        if loss is None:
            return tuple_out
        return (loss,) + tuple_out



In [None]:

def train(train_df, num_labels, labels_dict, batchsize, num_epochs, learning_rate, out_dir):
  """Fine-tune a BertSingletaskClassifier initialised from 'bert-base-uncased'.

  Args:
    train_df: DataFrame with a 'text' and a 'labels' column (both read by Dataset1).
    num_labels: number of output classes of the classification head.
    labels_dict: maps the values found in train_df['labels'] to class ids.
    batchsize: number of examples per optimisation step.
    num_epochs: number of passes over the training data.
    learning_rate: learning rate handed to Adam.
    out_dir: directory that receives a copy of the saved weights; it is created
      (including missing parents) when it does not exist yet.

  Returns:
    The fine-tuned model, placed on the GPU when one is available and on the
    CPU otherwise.
  """
  train_data_hate = Dataset1(train_df, labels_dict)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  model = BertSingletaskClassifier.from_pretrained('bert-base-uncased', num_labels=num_labels)
  # Use the detected device instead of a hard-coded 'cuda' so CPU-only runs work.
  model.to(device)
  optimizer = Adam(model.parameters(), lr=learning_rate)

  # One loader for all epochs: iterating it yields every (full) batch exactly once
  # per epoch, in a new random order each time. drop_last=True keeps the step
  # count at len(dataset) // batchsize per epoch.
  dataloader = DataLoader(train_data_hate, batch_size=batchsize, shuffle=True, drop_last=True)
  print('There are', len(dataloader), 'training batches per epoch.')

  for _ in trange(num_epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    for texts, labels in tqdm(dataloader, desc="Iteration"):
      # Each example was tokenized separately with a leading dimension of 1,
      # so the collated tensors carry an extra axis at position 1; drop it.
      input_ids = texts['input_ids'].squeeze(1).to(device)
      token_type_ids = texts['token_type_ids'].squeeze(1).to(device)
      attention_mask = texts['attention_mask'].squeeze(1).to(device)
      labels = labels.to(device)

      optimizer.zero_grad()
      train_output = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
      loss = train_output.loss
      loss.backward()
      optimizer.step()

      tr_loss += loss.item()
      nb_tr_steps += 1

    if nb_tr_steps:
      print('\nmean loss:', tr_loss/nb_tr_steps)
    else:
      # Fewer examples than one batch: nothing was trained this epoch.
      print('\nmean loss: n/a (no full training batch available)')

  # Save the weights in the working directory and copy them to out_dir.
  torch.save(model.state_dict(), 'ST-bert-base-uncased.pt')
  os.makedirs(out_dir, exist_ok=True)
  shutil.copy('ST-bert-base-uncased.pt', out_dir)

  return model

In [None]:
def test(model, eval_df, labels_dict, batchsize, output_file, report_file=None):
  """Evaluate `model` on `eval_df`, write per-example predictions and print metrics.

  Args:
    model: classifier whose output exposes `.loss` and `.logits`.
    eval_df: DataFrame with a 'text' and a 'labels' column (both read by Dataset1).
    labels_dict: maps the values found in eval_df['labels'] to class ids.
    batchsize: number of examples per evaluation batch.
    output_file: path of the tab-separated file that receives one line per
      example (tokens of the model input, predicted class id, gold class id).
    report_file: optional path; when given, the classification report is also
      written there as a tab-separated table.

  Side effects:
    Moves the model to the GPU when one is available (CPU otherwise), switches
    it to eval mode, creates missing parent directories of the output files,
    and prints accuracy, loss and the classification report.
  """
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")
  # The inputs are sent to `device` below, so the model has to live there too.
  model.to(device)

  eval_data = Dataset1(eval_df, labels_dict)
  eval_dataloader = DataLoader(eval_data, batch_size=batchsize, shuffle=False)

  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  all_labels = []
  all_outputs = []

  # Create the target folders up front so a missing directory cannot waste a run.
  for path in (output_file, report_file):
    if path is not None:
      parent = os.path.dirname(path)
      if parent:
        os.makedirs(parent, exist_ok=True)

  model.eval()
  with open(output_file, "w", encoding="utf-8") as f:
    f.write("Text\tPrediction\tGold\n")
    for texts, labels in eval_dataloader:
      # Drop the per-example axis of size 1 that Dataset1's tokenization leaves behind.
      input_ids = texts['input_ids'].squeeze(1).to(device)
      token_type_ids = texts['token_type_ids'].squeeze(1).to(device)
      attention_mask = texts['attention_mask'].squeeze(1).to(device)
      labels = labels.to(device)

      with torch.no_grad():
        eval_output = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)

      logits = eval_output.logits.detach().cpu().numpy()
      labels = labels.to('cpu').numpy()
      input_ids = input_ids.to('cpu')
      outputs = np.argmax(logits, axis=1)  # predicted class id per example

      # One line per example: input tokens, prediction, gold label.
      for token_ids, pred, gold in zip(input_ids, outputs, labels):
        f.write(' '.join(tokenizer.convert_ids_to_tokens(token_ids))+"\t"+str(pred)+"\t"+str(gold)+"\n")

      # The reported loss is the mean of the per-batch losses.
      eval_loss += eval_output.loss.mean().item()
      eval_accuracy += np.sum(outputs == labels)

      nb_eval_examples += input_ids.size(0)
      nb_eval_steps += 1

      all_outputs.extend(outputs)
      all_labels.extend(labels)

  if nb_eval_examples == 0:
    # Nothing was evaluated; averaging would divide by zero.
    print('No evaluation examples; skipping metrics.')
    return

  eval_loss = eval_loss / nb_eval_steps
  eval_accuracy = eval_accuracy / nb_eval_examples
  print('eval accuracy:', eval_accuracy)
  print('eval loss:', eval_loss, '\n')

  report = classification_report(all_labels, all_outputs, output_dict=True)
  print(report)
  if report_file is not None:
    df_report = pd.DataFrame(report).transpose()
    df_report.to_csv(report_file, sep='\t')

### AbuseEval

In [None]:
print('----Loading train data----')
train_hate1 = pd.read_csv(f'{dir}data/abuseeval/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/abuseeval/dev.tsv', sep='\t')
# The train and dev splits are merged into a single training set.
train_hate = pd.concat([train_hate1, train_hate2])
print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/abuseeval/test.tsv', sep='\t')

# Identity mapping: Dataset1 looks every label up in this dict, so the TSV
# labels are expected to be the integers 0, 1 and 2 already.
labels_dict = {0:0, 1:1, 2:2}
num_labels = 3

# Five runs: batch size 16, 3 epochs, learning rate 2e-5; evaluation batch size 8.
for run in [1, 2, 3, 4, 5]:
  # Seed every RNG with the run number so that each run can be repeated while
  # the five runs still differ from one another.
  random.seed(run)
  np.random.seed(run)
  torch.manual_seed(run)
  finetuned_model = train(train_hate, num_labels, labels_dict, 16, 3, 2e-5, dir+'models/baseline_abuse_3ep_16_2e-5_RUN'+str(run))
  test(finetuned_model, eval_hate, labels_dict, 8, dir+'results/baseline_abuse_RUN'+str(run)+'.tsv' )

### Implicit Hate Corpus

In [None]:
print('----Loading train data----')
train_hate = pd.read_csv(f'{dir}data/implicithate/train1.tsv', sep='\t')

print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/implicithate/test1.tsv', sep='\t')

# String labels of the corpus -> class ids used by the model.
labels_dict = {'not_hate':0, 'explicit_hate':1, 'implicit_hate':2}
num_labels = 3

# Five runs: batch size 16, 3 epochs, learning rate 2e-5; evaluation batch size 8.
for run in [1, 2, 3, 4, 5]:
  # Seed every RNG with the run number so that each run can be repeated while
  # the five runs still differ from one another.
  random.seed(run)
  np.random.seed(run)
  torch.manual_seed(run)
  finetuned_model = train(train_hate, num_labels, labels_dict, 16, 3, 2e-5, dir+'models/baseline_imp3class_3ep_16_2e-5_RUN'+str(run))
  test(finetuned_model, eval_hate, labels_dict, 8, dir+'results/baseline_imp3class_RUN'+str(run)+'.tsv', dir+'results/baseline_imp3class_report'+str(run)+'.tsv')

### TRAC

In [None]:
print('----Loading train data----')
train_hate1 = pd.read_csv(f'{dir}data/trac/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/trac/dev.tsv', sep='\t')
# The train and dev splits are merged into a single training set.
train_hate = pd.concat([train_hate1, train_hate2])
print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/trac/fb_test.tsv', sep='\t')

# String labels of the corpus -> class ids used by the model.
# NOTE(review): the composite section further down encodes OAG as 1 and CAG as 2,
# i.e. the reverse of this mapping for those two classes - confirm that the
# difference is intended before comparing per-class results across sections.
labels_dict = {'NAG':0, 'CAG':1, 'OAG':2}
num_labels = 3

# Five runs: batch size 16, 3 epochs, learning rate 2e-5; evaluation batch size 8.
for run in [1, 2, 3, 4, 5]:
  # Seed every RNG with the run number so that each run can be repeated while
  # the five runs still differ from one another.
  random.seed(run)
  np.random.seed(run)
  torch.manual_seed(run)
  finetuned_model = train(train_hate, num_labels, labels_dict, 16, 3, 2e-5, dir+'models/baseline_trac_3ep_16_2e-5_RUN'+str(run))
  test(finetuned_model, eval_hate, labels_dict, 8, dir+'results/baseline_trac_RUN'+str(run)+'.tsv' )

### Composite (AbuseEval, Implicit Hate Corpus and TRAC)

#### Combine data

In [None]:
# Shared integer encoding of the composite data set. In TRAC, OAG is mapped to 1
# and CAG to 2, so they share their ids with 'explicit_hate' and 'implicit_hate'.
# NOTE(review): this differs from the mapping of the standalone TRAC section
# (CAG:1, OAG:2) - presumably on purpose; confirm.
TRAC_LABEL_IDS = {'NAG':0, 'OAG':1, 'CAG':2}
IMPLICIT_HATE_LABEL_IDS = {'not_hate':0, 'explicit_hate':1, 'implicit_hate':2}


def _encode_labels(frame, mapping):
  """Return a copy of `frame` whose 'labels' column is translated through `mapping`.

  Only the 'labels' column is touched (a frame-wide replace could also rewrite
  cells of other columns), and labels missing from `mapping` raise a ValueError
  instead of being carried along unnoticed.
  """
  unknown = sorted(set(frame['labels'].astype(str)) - set(map(str, mapping)))
  if unknown:
    raise ValueError(f'Labels without an id in the mapping: {unknown}')
  encoded = frame['labels'].map(mapping).astype(int)
  # to_numpy() sidesteps index alignment; concatenated frames repeat index labels.
  return frame.assign(labels=encoded.to_numpy())


print('----Loading train data----')
#abuseeval (labels are used as they are stored in the files)
train_hate11 = pd.read_csv(f'{dir}data/abuseeval/train.tsv', sep='\t')
train_hate12 = pd.read_csv(f'{dir}data/abuseeval/dev.tsv', sep='\t')
train_hate1 = pd.concat([train_hate11, train_hate12])

#trac
train_hate21 = pd.read_csv(f'{dir}data/trac/train.tsv', sep='\t')
train_hate22 = pd.read_csv(f'{dir}data/trac/dev.tsv', sep='\t')
train_hate2 = pd.concat([train_hate21, train_hate22])
train_hate2 = _encode_labels(train_hate2, TRAC_LABEL_IDS).drop(columns=['id'])

#implicit hate
train_hate3 = pd.read_csv(f'{dir}data/implicithate/train1.tsv', sep='\t')
train_hate3 = _encode_labels(train_hate3, IMPLICIT_HATE_LABEL_IDS)

#combine
train_hate = pd.concat([train_hate1, train_hate2, train_hate3])

print('----Loading test data----')
#abuseeval
eval_hate1 = pd.read_csv(f'{dir}data/abuseeval/test.tsv', sep='\t')
#trac
eval_hate2 = pd.read_csv(f'{dir}data/trac/fb_test.tsv', sep='\t')
# NOTE(review): the column dropped here is spelled 'Id', while the TRAC training
# files use 'id' - presumably the test file really differs; confirm.
eval_hate2 = _encode_labels(eval_hate2, TRAC_LABEL_IDS).drop(columns=['Id'])
#implicit hate
eval_hate3 = pd.read_csv(f'{dir}data/implicithate/test1.tsv', sep='\t')
eval_hate3 = _encode_labels(eval_hate3, IMPLICIT_HATE_LABEL_IDS)

# The three test sets are deliberately left separate.

In [None]:
# Sanity-check the combined training set against its expected size and label
# distribution; the messages report the value actually found.
expected_total = 45424
expected_label_counts = {0: 27408, 1: 6313, 2: 11703}

assert len(train_hate) == expected_total, f"expected {expected_total} training rows, found {len(train_hate)}"
for label_id, expected_count in expected_label_counts.items():
  actual_count = (train_hate['labels'] == label_id).sum()
  assert actual_count == expected_count, f"label {label_id}: expected {expected_count} rows, found {actual_count}"

#### train/test

In [None]:
num_labels = 3
# Identity mapping: the combined training frame and the three test frames are
# expected to hold the integer labels 0, 1 and 2 at this point.
labels_dict = {0:0, 1:1, 2:2}

# Five runs: batch size 16, 3 epochs, learning rate 2e-5; evaluation batch size 8.
# Each fine-tuned model is evaluated on the three test sets separately.
for run in [1, 2, 3, 4, 5]:
  # Seed every RNG with the run number so that each run can be repeated while
  # the five runs still differ from one another.
  random.seed(run)
  np.random.seed(run)
  torch.manual_seed(run)
  finetuned_model = train(train_hate, num_labels, labels_dict, 16, 3, 2e-5, dir+'models/baseline_all_3ep_16_2e-5_RUN'+str(run))
  test(finetuned_model, eval_hate1, labels_dict, 8, dir+'results/baseline_all_abuse_RUN'+str(run)+'.tsv', dir+'results/baseline_all_abuse_report'+str(run)+'.tsv')
  test(finetuned_model, eval_hate2, labels_dict, 8, dir+'results/baseline_all_trac_RUN'+str(run)+'.tsv', dir+'results/baseline_all_trac_report'+str(run)+'.tsv')
  test(finetuned_model, eval_hate3, labels_dict, 8, dir+'results/baseline_all_imp_RUN'+str(run)+'.tsv', dir+'results/baseline_all_imp_report'+str(run)+'.tsv')