In [None]:
# Mount Google Drive so the data and output folders used below are reachable
# under /content/drive (Colab-only step).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folder on the mounted Drive; every data, model and result path in this
# notebook is built by prefixing this string.
# NOTE(review): the name shadows Python's built-in dir() for the whole notebook.
dir = '/content/drive/MyDrive/VU-thesis-2023/'

In [None]:
%%capture

!pip install transformers

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import BertModel, BertConfig
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
import random
from tqdm import tqdm, trange
from sklearn.metrics import classification_report
import os
import shutil

## Define functions and classes

In [None]:
# code from https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
# code from https://towardsdatascience.com/how-to-use-datasets-and-dataloader-in-pytorch-for-custom-text-data-270eed7f7c00


In [None]:

# Shared bert-base-uncased tokenizer; Dataset1 reads this module-level name
# when it encodes the texts.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class Dataset1(Dataset):
    """Pre-tokenised examples of a single task.

    Every item is a ``(encoding, label, taskname)`` triple: the encoding comes
    from the module-level ``tokenizer`` (padded/truncated to 40 tokens,
    returned as PyTorch tensors), the label is the class index looked up in
    ``label_map`` and ``taskname`` is the tag given at construction time.

    Args:
        df: Table-like object exposing a ``'labels'`` and a ``'text'`` column.
        taskname: Tag attached to every example of this dataset.
        label_map: Mapping from raw label value to class index.
    """

    def __init__(self, df, taskname, label_map):
        # Labels are encoded first, so an unknown label fails before any
        # tokenisation work is done.
        encoded_labels = []
        for raw_label in df['labels']:
            encoded_labels.append(label_map[raw_label])
        self.labels = encoded_labels

        encode_options = dict(
            padding='max_length',
            max_length=40,
            truncation=True,
            return_tensors="pt",
        )
        encoded_texts = []
        for sentence in df['text']:
            encoded_texts.append(tokenizer(sentence, **encode_options))
        self.texts = encoded_texts

        self.taskname = taskname

    def classes(self):
        """Return the list of encoded labels."""
        return self.labels

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the encoded label(s) stored at ``idx``."""
        return self.labels[idx]

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) stored at ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoding, label, taskname)`` for position ``idx``."""
        encoding = self.get_batch_texts(idx)
        target = self.get_batch_labels(idx)
        return encoding, target, self.taskname

In [None]:

class BertMultitaskClassifier(BertPreTrainedModel):
    """BERT encoder shared by two sequence-classification heads.

    Both heads read the dropout-regularised pooled output of the same
    ``BertModel``. The ``taskname`` passed to :meth:`forward` selects which
    head produces the logits: 1 -> ``classifier1``, 2 -> ``classifier2``.

    Args:
        config: BERT configuration forwarded to ``BertPreTrainedModel``.
        labels_map: Sequence whose items 0 and 1 are the number of classes of
            task 1 and task 2 respectively.
    """

    def __init__(self, config, labels_map):
        super().__init__(config)
        self.num_labels1 = labels_map[0]
        self.num_labels2 = labels_map[1]

        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        # Attribute names are kept as-is so previously saved state dicts still load.
        self.classifier1 = nn.Linear(config.hidden_size, self.num_labels1)
        self.classifier2 = nn.Linear(config.hidden_size, self.num_labels2)

        # Initialize weights and apply final processing
        self.post_init()

    def _select_head(self, taskname):
        """Return ``(classifier, num_labels)`` for the requested task.

        ``taskname`` may be a one-element tensor (as the training loop passes
        it), a plain int, or the strings ``'1'`` / ``'2'``.

        Raises:
            ValueError: if ``taskname`` is missing or is neither task 1 nor 2.
        """
        if taskname is None:
            raise ValueError("taskname is required: pass 1 or 2 to select a classification head")
        task_id = taskname.item() if hasattr(taskname, "item") else taskname
        if isinstance(task_id, str):
            task_id = int(task_id)
        if task_id == 1:
            return self.classifier1, self.num_labels1
        if task_id == 2:
            return self.classifier2, self.num_labels2
        raise ValueError("unknown taskname %r: expected 1 or 2" % (task_id,))

    def forward(
        self,
        input_ids=None,
        attention_mask = None,
        token_type_ids = None,
        position_ids = None,
        head_mask = None,
        inputs_embeds = None,
        labels = None,
        output_attentions = None,
        output_hidden_states = None,
        return_dict = None,
        taskname=None
    ):
        r"""
        Run the shared encoder and the classification head chosen by `taskname`.

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Class indices for the selected task, in `[0, ..., n - 1]` where `n` is the number of
            classes of that task's head. When given, a cross-entropy loss is computed.
        taskname (one-element tensor, `int` or `str`):
            `1` selects `classifier1`, `2` selects `classifier2`. Required.

        Returns a `SequenceClassifierOutput` (loss, logits, hidden_states, attentions) when
        `return_dict` is true, otherwise a tuple `(loss?, logits, *extra encoder outputs)`.
        Raises `ValueError` when `taskname` is missing or not one of the two tasks.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Resolve the head first so an invalid task fails before the encoder runs.
        classifier, num_labels = self._select_head(taskname)

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Item 1 of the encoder output is used as the pooled sentence representation.
        pooled_output = self.dropout(outputs[1])
        logits = classifier(pooled_output)

        loss = None
        if labels is not None:
            # Both heads are single-label classifiers. The config field is still
            # written here so code that inspects it sees the same value as before.
            self.config.problem_type = 'single_label_classification'
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [None]:

def train(train_df1, train_df2,
          labels_dict, num_labels, batchsize, num_epochs, learning_rate,
          eval_df=None, eval_task=None, eval_result_file=None, model_save_dir=None):
  """Fine-tune a two-headed BERT classifier on two tasks with interleaved batches.

  Each epoch draws batches from the two task dataloaders in a freshly shuffled
  order; every batch contains examples of one task only and updates the shared
  encoder plus that task's head.

  Args:
    train_df1: Training frame of task 1 (needs 'text' and 'labels' columns).
    train_df2: Training frame of task 2 (same columns).
    labels_dict: ``{1: label_map_task1, 2: label_map_task2}``; each map turns a
      raw label into a class index.
    num_labels: Number of classes per task, passed to the model as ``labels_map``.
    batchsize: Batch size for training and for the optional evaluation.
    num_epochs: Number of passes over the batch schedule.
    learning_rate: Adam learning rate.
    eval_df: Optional frame evaluated after every epoch.
    eval_task: ``'1'`` or ``'2'``; which task ``eval_df`` belongs to.
    eval_result_file: Optional file name (relative to the notebook-level ``dir``
      prefix) to which one line ``lr<TAB>batchsize<TAB>epoch<TAB>macro-F1`` is
      appended per epoch. Without it the score is printed instead.
    model_save_dir: Optional folder that receives a copy of the saved weights.

  Returns:
    The fine-tuned ``BertMultitaskClassifier``.

  Raises:
    ValueError: if ``eval_df`` is given but ``eval_task`` is not '1' or '2'.
  """
  print('----Preparing data----')
  labels_dict1 = labels_dict[1]
  labels_dict2 = labels_dict[2]

  train_data1 = Dataset1(train_df1, '1', labels_dict1)
  train_data2 = Dataset1(train_df2, '2', labels_dict2)

  # Task schedule: one entry per *full* batch of each task, reshuffled every epoch.
  a = [1] * (len(train_data1) // batchsize) + [2] * (len(train_data2) // batchsize)

  print("len(a)=",len(a), 'so there are', len(a), 'training batches per epoch.')
  random.shuffle(a)
  print('There are', a.count(1), 'batches for task one.')
  print('There are', a.count(2), 'batches for task two.')

  # The evaluation set does not change between epochs, so tokenise it once and
  # reject a bad task id before any training time is spent.
  eval_dataloader = None
  if eval_df is not None:
    if eval_task not in ('1', '2'):
      raise ValueError("eval_task must be '1' or '2' when eval_df is given, got %r" % (eval_task,))
    eval_data = Dataset1(eval_df, eval_task, labels_dict[int(eval_task)])
    eval_dataloader = DataLoader(eval_data, batch_size=batchsize, shuffle=False)

  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  model = BertMultitaskClassifier.from_pretrained("bert-base-uncased", labels_map=num_labels)
  model.to(device)
  optimizer = Adam(model.parameters(), lr=learning_rate)

  # Task ids are handed to the model as one-element tensors living on the
  # device actually in use (previously hard-coded to 'cuda').
  task_ids = {'1': torch.tensor([1], device=device),
              '2': torch.tensor([2], device=device)}

  def _inputs_to_device(texts):
    # Every example was tokenised on its own with return_tensors="pt", so the
    # collated tensors carry an extra axis at position 1; drop it here.
    return {key: texts[key].squeeze(1).to(device)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')}

  dataloader1 = DataLoader(train_data1, batch_size=batchsize, shuffle=True)
  dataloader2 = DataLoader(train_data2, batch_size=batchsize, shuffle=True)

  for epoch in trange(1, num_epochs + 1, desc="Epoch"):
      print('----Training----')
      # One iterator per task and epoch: each batch is served once per epoch.
      # Re-creating the iterator at every step would reshuffle each time and
      # sample batches with replacement. The schedule holds at most as many
      # entries per task as the loader has batches, so next() cannot run dry.
      batch_iters = {1: iter(dataloader1), 2: iter(dataloader2)}

      random.shuffle(a)
      print("\na[:20]=",a[:20])
      model.train()
      tr_loss = 0
      nb_tr_steps = 0
      for number in tqdm(a, desc="Iteration"):
          texts, labels, tasknames = next(batch_iters[number])
          inputs = _inputs_to_device(texts)
          labels = labels.to(device)

          # All examples of a batch share one task, so the first tag is enough.
          task_name = task_ids[tasknames[0]]

          optimizer.zero_grad()
          output = model(**inputs, labels=labels, taskname=task_name)
          loss = output.loss
          loss.backward()
          optimizer.step()

          nb_tr_steps += 1
          tr_loss += loss.item()

      print('\nmean loss:', tr_loss/nb_tr_steps if nb_tr_steps else float('nan'))

      if eval_dataloader is not None:
        print('----Evaluating----')
        model.eval()

        all_labels = []
        all_outputs = []

        for texts, labels, tasknames in eval_dataloader:
          inputs = _inputs_to_device(texts)
          task_name = task_ids[tasknames[0]]

          with torch.no_grad():
            output = model(**inputs, taskname=task_name)

          logits = output.logits.detach().cpu().numpy()
          # Collect inside the batch loop so the score covers the whole
          # evaluation set rather than only its last batch.
          all_outputs.extend(np.argmax(logits, axis=1))
          all_labels.extend(labels.numpy())

        report = classification_report(all_labels, all_outputs, output_dict=True)
        macroavg = report['macro avg']['f1-score']
        if eval_result_file is not None:
          with open(dir+eval_result_file, 'a') as out:
            out.write(str(learning_rate)+'\t'+str(batchsize)+'\t'+str(epoch)+'\t'+str(macroavg)+'\n')
        else:
          print('macro avg f1:', macroavg)

  print('----Done training!----')
  # save model
  if model_save_dir:
    print('----Saving model----')
    torch.save(model.state_dict(), 'MT-bert-base-uncased.pt')
    # makedirs also creates missing parent folders and tolerates an existing one.
    os.makedirs(model_save_dir, exist_ok=True)
    shutil.copy('MT-bert-base-uncased.pt', model_save_dir)
  return model

In [None]:
def predict(model, eval_df, batchsize, task, labels_dict, use_gold_labels=False):
  """Run one classification head of ``model`` over ``eval_df``.

  Args:
    model: Fine-tuned multi-task classifier; expected to already live on the
      device chosen here (CUDA when available, otherwise CPU).
    eval_df: Frame with 'text' and 'labels' columns. The 'labels' column is
      encoded through ``labels_dict`` even when the gold labels are not used.
    batchsize: Evaluation batch size.
    task: ``'1'`` or ``'2'``; selects the head used for prediction.
    labels_dict: Mapping from raw label to class index for ``eval_df``.
    use_gold_labels: When true, loss and accuracy are computed and printed and
      the gold labels are returned as well.

  Returns:
    ``(all_labels, all_outputs)`` when ``use_gold_labels`` is true, otherwise
    only ``all_outputs`` (predicted class indices in dataset order).
  """
  use_cuda = torch.cuda.is_available()
  device = torch.device("cuda" if use_cuda else "cpu")

  print('----Preparing data----')
  eval_data = Dataset1(eval_df, task, labels_dict)
  eval_dataloader = DataLoader(eval_data, batch_size=batchsize, shuffle=False)

  # Task ids are passed to the model as one-element tensors on the device in
  # use (previously hard-coded to 'cuda', which failed on CPU-only machines).
  task_ids = {'1': torch.tensor([1], device=device),
              '2': torch.tensor([2], device=device)}

  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  all_labels = []
  all_outputs = []

  model.eval()
  print('----Predicting----')
  for texts, labels, tasknames in eval_dataloader:
    # Collated encodings carry an extra axis at position 1 (each text was
    # tokenised alone with return_tensors="pt"); squeeze it away.
    input_ids = texts['input_ids'].squeeze(1).to(device)
    token_type_ids = texts['token_type_ids'].squeeze(1).to(device)
    attention_mask = texts['attention_mask'].squeeze(1).to(device)

    # All examples of a batch share one task, so the first tag is enough.
    task_name = task_ids[tasknames[0]]

    with torch.no_grad():
      output = model(input_ids,
                     token_type_ids=token_type_ids,
                     attention_mask=attention_mask,
                     labels=labels.to(device) if use_gold_labels else None,
                     taskname=task_name)

    logits = output.logits.detach().cpu().numpy()
    outputs = np.argmax(logits, axis=1) #outputs = predictions
    all_outputs.extend(outputs)

    if use_gold_labels:
      # Accuracy and loss are only meaningful (and only computed) with gold labels.
      gold = labels.numpy()
      eval_loss += output.loss.mean().item()
      eval_accuracy += np.sum(outputs == gold)
      all_labels.extend(gold)

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1

  print('----Done!----')
  if use_gold_labels:
    # Skip the averages for an empty evaluation set instead of dividing by zero.
    if nb_eval_steps:
      eval_loss = eval_loss / nb_eval_steps
      print('eval loss:', eval_loss, '\n')
      eval_accuracy = eval_accuracy / nb_eval_examples
      print('eval accuracy:', eval_accuracy, '\n')
    return all_labels, all_outputs

  return all_outputs


In [None]:
def report(gold_labels, outputs, report_file=None):
  """Print the classification report and optionally store it as a TSV file.

  Args:
    gold_labels: Reference class labels.
    outputs: Predicted class labels, aligned with ``gold_labels``.
    report_file: Optional path; when given, the report is written there as a
      tab-separated table with one row per report entry.
  """
  metrics = classification_report(gold_labels, outputs, output_dict=True)
  print(metrics)
  if report_file is None:
    return
  # Transposed so that every class / average becomes a row.
  table = pd.DataFrame(metrics).transpose()
  table.to_csv(report_file, sep='\t')


In [None]:
#train
def train_and_predict(df1, df2,
                      labels_dict, num_labels, batchsize, epochs, lr,
                      model_save_dir,
                      all_preds_file,
                      report_file,
                      reverse_dict, run,
                      eval_df=None, eval_batchsize=8):
  """Train one multi-task model, evaluate it on the main task and save the results.

  Args:
    df1: Training frame of the main task (task 1).
    df2: Training frame of the auxiliary task (task 2).
    labels_dict: ``{1: ..., 2: ...}`` raw-label -> class-index maps.
    num_labels: Number of classes per task.
    batchsize: Training batch size.
    epochs: Number of training epochs.
    lr: Learning rate.
    model_save_dir: Path prefix of the model folder; ``run`` is appended.
    all_preds_file: Path prefix of the per-example prediction file;
      ``str(run) + '.tsv'`` is appended.
    report_file: Path prefix of the classification report; ``str(run) + '.tsv'``
      is appended.
    reverse_dict: ``{1: ..., 2: ...}`` class-index -> label-name maps used when
      writing the prediction file.
    run: Run identifier used in all output names.
    eval_df: Main-task evaluation frame. When omitted, the notebook-level
      ``eval_hate`` frame is used, as before.
    eval_batchsize: Batch size used for prediction (default 8, as before).
  """
  if eval_df is None:
    # Backward-compatible fallback on the frame loaded in the experiment cell.
    eval_df = eval_hate

  # Create the result folders up front so a missing folder is not discovered
  # only after the whole training run has finished.
  for out_prefix in (all_preds_file, report_file):
    parent = os.path.dirname(out_prefix)
    if parent:
      os.makedirs(parent, exist_ok=True)

  finetuned_model = train(df1, df2,
                        labels_dict, num_labels, batchsize, epochs, lr,
                        model_save_dir=model_save_dir+str(run))

  # Main-task head, scored against the gold labels.
  golds, preds = predict(finetuned_model, eval_df, eval_batchsize, '1', labels_dict[1], use_gold_labels=True)
  # Auxiliary head on the same texts. labels_dict[1] is passed because the
  # 'labels' column of the evaluation frame holds main-task labels, which the
  # dataset still has to encode even though they are not used here.
  auxpreds = predict(finetuned_model, eval_df, eval_batchsize, '2', labels_dict[1])

  #save classification reports
  report(golds, preds, report_file=report_file+str(run)+'.tsv')

  #save all predictions on gold datasets
  with open(all_preds_file+str(run)+'.tsv',"w") as f:
      f.write("Gold\tmaintask\tauxtask\n")
      for gold, pred, auxpred in zip(golds, preds, auxpreds):
        f.write(reverse_dict[1][gold]+"\t"+reverse_dict[1][pred]+"\t"+reverse_dict[2][auxpred]+"\n")

### AbuseEval + sentiment

In [None]:
# Main task: AbuseEval. The train and dev files are concatenated into a single
# training frame (original row indices are kept by pd.concat).
print('----Loading train data----')
train_hate1 = pd.read_csv(f'{dir}data/abuseeval/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/abuseeval/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Auxiliary task: sentiment, read from the test2016.tsv file.
train_sent = pd.read_csv(f'{dir}data/sentiment/test2016.tsv', sep='\t')

# eval_hate is read as a notebook global inside train_and_predict.
print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/abuseeval/test.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = AbuseEval, 2 = sentiment).
labels_dict = {
    1: {code: code for code in range(3)},
    2: {name: index for index, name in enumerate(['neutral', 'positive', 'negative'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_abuse', 'explicit_abuse', 'implicit_abuse'])),
    2: dict(enumerate(['neutral', 'positive', 'negative'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of AbuseEval + sentiment: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_sent,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/abuseeval+sent_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/abuseeval+sent_abuse_RUN',
        report_file=dir + 'results/abuseeval+sent_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### AbuseEval + emotion

In [None]:
# Main task: AbuseEval (train + dev concatenated); auxiliary task: emotion (tec.tsv).
print('----Loading train data----')
train_hate1 = pd.read_csv(f'{dir}data/abuseeval/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/abuseeval/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])
train_emo = pd.read_csv(f'{dir}data/emotion/tec.tsv', sep='\t')
# eval_hate is read as a notebook global inside train_and_predict.
print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/abuseeval/test.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = AbuseEval, 2 = emotion).
labels_dict = {
    1: {code: code for code in range(3)},
    2: {name: index for index, name in enumerate(
        ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_abuse', 'explicit_abuse', 'implicit_abuse'])),
    2: dict(enumerate(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of AbuseEval + emotion: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_emo,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/abuseeval+emo_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/abuseeval+emo_RUN',
        report_file=dir + 'results/abuseeval+emo_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### AbuseEval + sarcasm

In [None]:
# Main task: AbuseEval (train + dev concatenated).
print('----Loading train data----')
train_hate1 = pd.read_csv(f'{dir}data/abuseeval/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/abuseeval/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Auxiliary task: sarcasm, Twitter and Reddit training files concatenated.
train_sarc1 = pd.read_csv(f'{dir}data/sarcasm/twitter-train-nocontext.tsv', sep='\t')
train_sarc2 = pd.read_csv(f'{dir}data/sarcasm/reddit-train-nocontext.tsv', sep='\t')
train_sarcasm = pd.concat([train_sarc1, train_sarc2])

# eval_hate is read as a notebook global inside train_and_predict.
print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/abuseeval/test.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = AbuseEval, 2 = sarcasm).
labels_dict = {
    1: {code: code for code in range(3)},
    2: {name: index for index, name in enumerate(['NOT_SARCASM', 'SARCASM'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_abuse', 'explicit_abuse', 'implicit_abuse'])),
    2: dict(enumerate(['not_sarcasm', 'sarcasm'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of AbuseEval + sarcasm: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_sarcasm,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/abuseeval+sarc_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/abuseeval+sarc_RUN',
        report_file=dir + 'results/abuseeval+sarc_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### AbuseEval + irony

In [None]:
# Main task: AbuseEval (train + dev concatenated).
print('----Loading train data----')
train_hate1 = pd.read_csv(f'{dir}data/abuseeval/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/abuseeval/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Auxiliary task: irony.
train_irony = pd.read_csv(f'{dir}data/irony/train.tsv', sep='\t')

# eval_hate is read as a notebook global inside train_and_predict.
print('----Loading test data----')
eval_hate = pd.read_csv(f'{dir}data/abuseeval/test.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = AbuseEval, 2 = irony).
labels_dict = {
    1: {code: code for code in range(3)},
    2: {code: code for code in range(2)},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_abuse', 'explicit_abuse', 'implicit_abuse'])),
    2: dict(enumerate(['not_irony', 'irony'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of AbuseEval + irony: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_irony,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/abuseeval+iro_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/abuseeval+iro_RUN',
        report_file=dir + 'results/abuseeval+iro_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### TRAC + sentiment

In [None]:
# Main task: TRAC. The train and dev files are concatenated into one training frame.
train_hate1 = pd.read_csv(f'{dir}data/trac/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/trac/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Evaluation frame (fb_test.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/trac/fb_test.tsv', sep='\t')

# Auxiliary task: sentiment, read from the test2016.tsv file.
train_sent = pd.read_csv(f'{dir}data/sentiment/test2016.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = TRAC, 2 = sentiment).
labels_dict = {
    1: {name: index for index, name in enumerate(['NAG', 'OAG', 'CAG'])},
    2: {name: index for index, name in enumerate(['neutral', 'positive', 'negative'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_aggression', 'overt_aggression', 'covert_aggression'])),
    2: dict(enumerate(['neutral', 'positive', 'negative'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of TRAC + sentiment: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_sent,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/trac+sent_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/trac+sent_RUN',
        report_file=dir + 'results/trac+sent_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### TRAC + emotion

In [None]:
# Main task: TRAC. The train and dev files are concatenated into one training frame.
train_hate1 = pd.read_csv(f'{dir}data/trac/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/trac/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Evaluation frame (fb_test.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/trac/fb_test.tsv', sep='\t')

# Auxiliary task: emotion (tec.tsv).
train_emo = pd.read_csv(f'{dir}data/emotion/tec.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = TRAC, 2 = emotion).
labels_dict = {
    1: {name: index for index, name in enumerate(['NAG', 'OAG', 'CAG'])},
    2: {name: index for index, name in enumerate(
        ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_aggression', 'overt_aggression', 'covert_aggression'])),
    2: dict(enumerate(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of TRAC + emotion: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_emo,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/trac+emo_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/trac+emo_RUN',
        report_file=dir + 'results/trac+emo_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### TRAC + sarcasm

In [None]:
# Main task: TRAC. The train and dev files are concatenated into one training frame.
train_hate1 = pd.read_csv(f'{dir}data/trac/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/trac/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Evaluation frame (fb_test.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/trac/fb_test.tsv', sep='\t')

# Auxiliary task: sarcasm, Twitter and Reddit training files concatenated.
train_sarc1 = pd.read_csv(f'{dir}data/sarcasm/twitter-train-nocontext.tsv', sep='\t')
train_sarc2 = pd.read_csv(f'{dir}data/sarcasm/reddit-train-nocontext.tsv', sep='\t')
train_sarcasm = pd.concat([train_sarc1, train_sarc2])

In [None]:
# Raw label -> class index, per task (1 = TRAC, 2 = sarcasm).
labels_dict = {
    1: {name: index for index, name in enumerate(['NAG', 'OAG', 'CAG'])},
    2: {name: index for index, name in enumerate(['NOT_SARCASM', 'SARCASM'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_aggression', 'overt_aggression', 'covert_aggression'])),
    2: dict(enumerate(['not_sarcasm', 'sarcasm'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of TRAC + sarcasm: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_sarcasm,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/trac+sarc_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/trac+sarc_RUN',
        report_file=dir + 'results/trac+sarc_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### TRAC + irony

In [None]:
# Main task: TRAC. The train and dev files are concatenated into one training frame.
train_hate1 = pd.read_csv(f'{dir}data/trac/train.tsv', sep='\t')
train_hate2 = pd.read_csv(f'{dir}data/trac/dev.tsv', sep='\t')
train_hate = pd.concat([train_hate1, train_hate2])

# Auxiliary task: irony.
train_irony = pd.read_csv(f'{dir}data/irony/train.tsv', sep='\t')

# Evaluation frame (fb_test.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/trac/fb_test.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = TRAC, 2 = irony).
labels_dict = {
    1: {name: index for index, name in enumerate(['NAG', 'OAG', 'CAG'])},
    2: {code: code for code in range(2)},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_aggression', 'overt_aggression', 'covert_aggression'])),
    2: dict(enumerate(['not_irony', 'irony'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of TRAC + irony: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_irony,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/trac+iro_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/trac+iro_RUN',
        report_file=dir + 'results/trac+iro_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### IHC + sentiment

In [None]:
# Main task: implicit hate corpus (train1.tsv).
train_hate = pd.read_csv(f'{dir}data/implicithate/train1.tsv', sep='\t')

# Evaluation frame (test1.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/implicithate/test1.tsv', sep='\t')

# Auxiliary task: sentiment, read from the test2016.tsv file.
train_sent = pd.read_csv(f'{dir}data/sentiment/test2016.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = implicit hate corpus, 2 = sentiment).
labels_dict = {
    1: {name: index for index, name in enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])},
    2: {name: index for index, name in enumerate(['neutral', 'positive', 'negative'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])),
    2: dict(enumerate(['neutral', 'positive', 'negative'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of IHC + sentiment: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_sent,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/ihc+sent_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/ihc+sent_RUN',
        report_file=dir + 'results/ihc+sent_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### IHC + emotion

In [None]:
# Main task: implicit hate corpus (train1.tsv); auxiliary task: emotion (tec.tsv).
train_hate = pd.read_csv(f'{dir}data/implicithate/train1.tsv', sep='\t')
train_emo = pd.read_csv(f'{dir}data/emotion/tec.tsv', sep='\t')

# Evaluation frame (test1.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/implicithate/test1.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = implicit hate corpus, 2 = emotion).
labels_dict = {
    1: {name: index for index, name in enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])},
    2: {name: index for index, name in enumerate(
        ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])),
    2: dict(enumerate(['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of IHC + emotion: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_emo,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/ihc+emo_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/ihc+emo_RUN',
        report_file=dir + 'results/ihc+emo_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### IHC + sarcasm

In [None]:
# Main task: implicit hate corpus (train1.tsv).
train_hate = pd.read_csv(f'{dir}data/implicithate/train1.tsv', sep='\t')

# Evaluation frame (test1.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/implicithate/test1.tsv', sep='\t')

# Auxiliary task: sarcasm, Twitter and Reddit training files concatenated.
train_sarc1 = pd.read_csv(f'{dir}data/sarcasm/twitter-train-nocontext.tsv', sep='\t')
train_sarc2 = pd.read_csv(f'{dir}data/sarcasm/reddit-train-nocontext.tsv', sep='\t')
train_sarcasm = pd.concat([train_sarc1, train_sarc2])

In [None]:
# Raw label -> class index, per task (1 = implicit hate corpus, 2 = sarcasm).
labels_dict = {
    1: {name: index for index, name in enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])},
    2: {name: index for index, name in enumerate(['NOT_SARCASM', 'SARCASM'])},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])),
    2: dict(enumerate(['not_sarcasm', 'sarcasm'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of IHC + sarcasm: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_sarcasm,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/ihc+sarc_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/ihc+sarc_RUN',
        report_file=dir + 'results/ihc+sarc_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )

### IHC + irony

In [None]:
# Main task: implicit hate corpus (train1.tsv).
train_hate = pd.read_csv(f'{dir}data/implicithate/train1.tsv', sep='\t')

# Evaluation frame (test1.tsv); read as a notebook global inside train_and_predict.
eval_hate = pd.read_csv(f'{dir}data/implicithate/test1.tsv', sep='\t')

# Auxiliary task: irony.
train_irony = pd.read_csv(f'{dir}data/irony/train.tsv', sep='\t')

In [None]:
# Raw label -> class index, per task (1 = implicit hate corpus, 2 = irony).
labels_dict = {
    1: {name: index for index, name in enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])},
    2: {code: code for code in range(2)},
}

# Class index -> label name, used when predictions are written to file.
reverse_labels_dict = {
    1: dict(enumerate(['not_hate', 'explicit_hate', 'implicit_hate'])),
    2: dict(enumerate(['not_irony', 'irony'])),
}

# Number of classes of the task-1 and task-2 heads.
num_labels = [len(labels_dict[1]), len(labels_dict[2])]

In [None]:
# Five runs of IHC + irony: batch size 16, 3 epochs, learning rate 2e-5.
# train_and_predict appends the run number to each output path prefix.
for run in range(1, 6):
    train_and_predict(
        df1=train_hate,
        df2=train_irony,
        labels_dict=labels_dict,
        num_labels=num_labels,
        batchsize=16,
        epochs=3,
        lr=2e-5,
        model_save_dir=dir + 'models/ihc+iro_3ep_16_2e-5_RUN',
        all_preds_file=dir + 'results/ihc+iro_RUN',
        report_file=dir + 'results/ihc+iro_report',
        reverse_dict=reverse_labels_dict,
        run=run,
    )