In [None]:
%%capture
!pip install transformers
!pip install wandb
# !pip install lightning

# MTL Base Model approach
- Fine-tune the multilingual DistilBERT model
- Two separate classification layers
  - One output neuron each for the hard label and the soft label
- Adam optimizer with cosine LR decay
- Shuffle all datasets while training
- Should be used as shared base for DataAsTask models

#### Heads
- Single-neuron output for each of the two layers
  - BCE loss on the hard label and on the class-1 share of the soft label
  - Example: 2 of 6 annotators labeled 1:
    - Hard label: 0, Soft_label_1: 0.33
  - $Loss = (BCELoss(HardLabel) + BCELoss(SoftLabel1) * 2 )/2$
- Only a single linear layer per head &rarr; the focus is on fine-tuning the transformer

#### Possible Improvements
- Adding the Romanian dataset improved results for this pre-trained model
  - It was not used for the submission run
- Use another model than DistilBert
- Different heads worked fine as well
  - Just for Soft Labels: 2 heads BCE and KL Div on Soft labels
- Weight the loss to compensate for unbalanced datasets
  - Did not improve this model (per-batch weights: see the commented-out code below)
- Improved fine tuning of "typical parameters": LR, Optimizer, ...






In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import wandb
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33msheuschk[0m ([33mcapture_disagreement[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
from drive.MyDrive.cicl_data.helpers import read_data
# from drive.MyDrive.cicl_data.code import CustomLabelDataset

In [None]:
import json
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
# from datasets import Dataset
import torch.nn.functional as Fun
from torch.utils.data import Dataset, random_split, DataLoader
import torch.nn as nn

from tqdm.notebook import tqdm


In [None]:
# Fix torch's RNG seed and pick the GPU when one is available.
seed = 14
torch.manual_seed(seed)

device = "cpu"
if torch.cuda.is_available():
  device = "cuda"

In [None]:
# Load every training dataset and stack them into one frame
# (the per-dataset index is kept, as before).
data_dict = read_data()
train_frames = list(data_dict.values())
df_all = pd.concat(train_frames)

In [None]:
def read_romanian(usage="train"):
  """Load one split of the Romanian dataset into a DataFrame.

  Args:
    usage: Split to load, 'train' or 'dev'. Selects the file
      '/content/drive/MyDrive/cicl_data/romanian/rom_<usage>.json'.

  Returns:
    DataFrame with one row per top-level JSON entry, `hard_label` cast to
    int, a constant `data_set` column ("rom") and a `soft_list` column that
    holds the values of each `soft_label` dict as a list.
  """
  current_file = '/content/drive/MyDrive/cicl_data/romanian/rom_'+ usage +'.json'
  # Context manager so the file handle is closed even if parsing fails.
  with open(current_file, 'r', encoding='UTF-8') as json_file:
    data = json.load(json_file)

  def extract_soft_labels(row):
    # `row` is one soft_label dict; keep only its values, in dict order.
    return list(row.values())

  def transform_data(data, name):
    # Top-level JSON keys become the row index after the transpose.
    df = pd.DataFrame(data).transpose()
    df = df.astype({"hard_label": int}, errors='raise')
    df['data_set'] = name
    df["soft_list"] = df["soft_label"].apply(extract_soft_labels)
    return df

  df = transform_data(data, "rom")
  return df

In [None]:
# rom_data = read_romanian()
# rom_data["split"] = "train"

# rom_data_small = rom_data.sample(5000, random_state=42)
# df_all = pd.concat([df_all, rom_data_small])

# All data
# df_all = pd.concat([df_all, rom_data])

In [None]:
def extract_soft_labels(row):
  """Return the entry stored at index/key 1 of `row`."""
  entry_one = row[1]
  return entry_one

In [None]:
# Keep element 1 of every soft-label list as a scalar training target
# (presumably the class-1 share — TODO confirm the ordering of `soft_list`).
df_all["sl_1s"] = df_all["soft_list"].apply(extract_soft_labels)

### Pretrained model

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
# Maybe load from wandb in future
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

In [None]:
# from transformers import BertModel
# base_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
# base_model = BertModel.from_pretrained("bert-base-multilingual-cased")
# class_model = AutoModelForSequenceClassification.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English", num_labels=2)

# output of model: https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [None]:
class MTLModel(nn.Module):
  """Shared transformer encoder with two single-neuron heads.

  One linear head predicts the hard label and one the soft label. Both read
  the first-token hidden state of the encoder and are sigmoid-activated.
  """

  def __init__(self, base_model):
    """Wrap `base_model` and create the two heads.

    Args:
      base_model: Encoder called as
        `base_model(input_ids=..., attention_mask=...)`. Element 0 of its
        output is sliced with `[:, 0]` and fed to `nn.Linear(768, 1)` layers.
    """
    super().__init__()
    self.bert = base_model

    # Registered on the module but not applied in forward().
    self.dropout = nn.Dropout(0.2)
    self.act = nn.Tanh()

    # One output neuron per task: soft-label head first, then hard-label head.
    self.lin_s1 = nn.Linear(768, 1)
    self.lin_h1 = nn.Linear(768, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    """Linear heads on top of the first-token state (cf. https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#bertforsequenceclassification).

    Returns:
      Tuple `(hard_prediction, soft_prediction)`: two flattened float64
      tensors holding the sigmoid outputs of the respective head.
    """
    encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    first_token_state = encoder_out[0][:, 0]

    soft_prob = self.sigmoid(self.lin_s1(first_token_state)).flatten()
    hard_prob = self.sigmoid(self.lin_h1(first_token_state)).flatten()

    return hard_prob.double(), soft_prob.double()

In [None]:
# Pre-trained multilingual encoder wrapped by the two-head MTL model.
base_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
model = MTLModel(base_model)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Prepare Data

In [None]:
# Token length every text is truncated/padded to in CustomLabelDataset.tokenize_func.
MaxLen = 240

In [None]:
class CustomLabelDataset(Dataset):
    """Torch dataset yielding tokenized text together with its hard and soft labels.

    Tokenization uses the module-level `tokenizer` and `MaxLen`.
    """

    def __init__(self, df_all):
        """Tokenize all texts up front and store the label columns by position.

        Args:
            df_all: DataFrame with the columns "text", "soft_list",
                "hard_label" and "sl_1s". Its index may be arbitrary (for
                example duplicated after `pd.concat`); rows are addressed by
                position only.
        """
        self.text = list(map(self.tokenize_func, df_all["text"]))
        # Drop the incoming index so item i is always row i, consistent with
        # `self.text` and `self.hard_labels_1h`, which are positional already.
        self.soft_labels = df_all["soft_list"].reset_index(drop=True)
        self.hard_labels = df_all["hard_label"].reset_index(drop=True)
        self.hard_labels_1h = Fun.one_hot(torch.tensor(df_all['hard_label'].values))
        self.soft_labels_1s = df_all["sl_1s"].reset_index(drop=True) # 0.33 of soft labels like {"1": 0.33, "0": 0.67}

    def __len__(self):
        """Number of samples."""
        return len(self.text)

    def tokenize_func(self, text):
        """Tokenize one text, truncated/padded to `MaxLen` tokens."""
        return tokenizer(text, truncation=True, max_length=MaxLen, padding="max_length", add_special_tokens=True)

    def __getitem__(self, idx):
        """Return the sample at position `idx`.

        Returns:
            Tuple `(inputs, hard_one_hot, soft_list, hard_label, soft_label_1)`
            where `inputs` is a dict with "attention_mask" and "input_ids"
            tensors and the remaining entries are tensors.
        """
        encoded = {"attention_mask": torch.tensor(self.text[idx]["attention_mask"]),
                   "input_ids": torch.tensor(self.text[idx]["input_ids"])}
        # `.iloc` makes the positional lookup explicit; `series[idx]` would be
        # a label lookup and can hit the wrong row (or raise) on other indexes.
        return (encoded,
                self.hard_labels_1h[idx],
                torch.tensor(self.soft_labels.iloc[idx]),
                torch.tensor(self.hard_labels.iloc[idx]),
                torch.tensor(self.soft_labels_1s.iloc[idx]))


In [None]:
# Training dataset and a loader that reshuffles it every epoch
batch_size = 64
dataset = CustomLabelDataset(df_all)
train_size = len(dataset)

train_dataloader = DataLoader(dataset, shuffle=True, batch_size=batch_size)


In [None]:
# Evaluation data: stack all dev splits and derive the same scalar soft-label
# column as for training.
data_dict_dev = read_data("dev")
dev_frames = list(data_dict_dev.values())
df_dev = pd.concat(dev_frames)
df_dev["sl_1s"] = df_dev["soft_list"].apply(extract_soft_labels)

dev_batch_size = 64
dev_dataset = CustomLabelDataset(df_dev)
dev_size = len(dev_dataset)

# No shuffling for evaluation.
dev_dataloader = DataLoader(dev_dataset, batch_size=dev_batch_size)

In [None]:
tokenizer.decode(dataset[0][0]["input_ids"])

'[CLS] النسويه يعني نصير رجل قولتك وبعدين اذا الوحده تبا تسرح وتمرح لازم تكون رجال [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]

## Optimization

In [None]:
from torch.optim import Adam, AdamW
from transformers import get_cosine_schedule_with_warmup

In [None]:
# Cosine LR schedule: linear warm-up over the first 10% of all optimizer steps,
# then cosine decay to zero over the remaining 90%.
num_epochs = 7
total_steps = num_epochs * len(train_dataloader)
warmup_steps = int(0.1 * total_steps)
training_steps = total_steps - warmup_steps  # number of decay steps after warm-up

LR = 5e-05

optimizer = AdamW(model.parameters(), lr = LR)
# `num_training_steps` is the TOTAL number of steps, warm-up included. Passing
# only the decay share would end the cosine early and let the learning rate
# rise again during the last steps of training.
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps)


In [None]:
# Plain BCELoss (not the logits variant): MTLModel.forward already applies a
# sigmoid to both heads, so the loss receives probabilities.
# bce_loss = nn.BCEWithLogitsLoss()
bce_loss = nn.BCELoss()
# kl_loss = nn.KLDivLoss(reduction="batchmean")
# ce_loss = nn.CrossEntropyLoss()

In [None]:
""" # Creates weights for BCE Loss (unused here)
def create_batch_weigts_bce(labels):
  n_batch_size = len(labels)
  _, counts = torch.unique(labels, return_counts=True)
  weight0 = torch.full((n_batch_size,), n_batch_size/ counts[0])
  weight1 = torch.full((n_batch_size,), n_batch_size/ counts[1])
  w = torch.where(labels==1, weight1, labels)
  w = torch.where(w==0, weight0, w)
  return w / 2"""

## Training

In [None]:
# Start the W&B run and record the hyper-parameters used by this notebook.
run = wandb.init(
    project="MTL_DBert",
    config={
        "epochs": num_epochs,
        "batch_size": batch_size,
        "device": device,
        "Seed": seed,
        "token_max_len": MaxLen,
        "LR": LR
        },
    save_code = True,
    # Every tag needs its own comma: adjacent string literals are silently
    # concatenated, which previously merged two pairs of tags into one each.
    tags = ["distilbert", "cosine_schedule", "MAxLen240", "MTL", "BCE", "Soft", "2H1N", "all_rom"]
    )

In [None]:
# from drive.MyDrive.cicl_data.helpers import ce_eval_func
def ce_eval_func(model, eval_dataloader, eval_size, epsilon=1e-12, device="cuda"):
  """Mean cross-entropy between soft labels and the model's soft-head output.

  Args:
    model: Callable as `model(input_ids, attention_mask=...)`, returning a
      pair whose second element is a 1-D tensor of class-1 probabilities.
      It is switched to eval mode.
    eval_dataloader: Iterable of batches; `batch[0]` is a dict with
      "input_ids" and "attention_mask", `batch[2]` the two-column soft labels.
    eval_size: Divisor for the summed cross-entropy.
    epsilon: Probabilities are clipped to [epsilon, 1 - epsilon].
    device: Device the batch tensors are moved to.

  Returns:
    Summed cross-entropy over all batches divided by `eval_size`.
  """
  model.eval()
  total_ce = 0

  for batch in tqdm(eval_dataloader, 0):
    features, targets = batch[0], batch[2]
    input_ids = features["input_ids"].to(device, dtype=torch.long)
    attention_mask = features["attention_mask"].to(device, dtype=torch.long)
    soft_labels = targets.to(device)

    with torch.no_grad():
      _, p_one = model(input_ids, attention_mask=attention_mask)

    # Expand the single class-1 probability into a (1 - p, p) pair per sample.
    p_one = p_one.reshape(len(p_one), 1)
    two_class = torch.cat((1 - p_one, p_one), dim=-1)

    clipped = torch.clip(two_class, epsilon, 1. - epsilon)
    total_ce += -torch.sum(soft_labels * torch.log(clipped + 1e-9))

  return total_ce / eval_size



In [None]:
# Baseline: dev-set cross-entropy of the model before the training loop runs,
# logged for comparison with the per-epoch values.
ce_before = ce_eval_func(model, dev_dataloader, dev_size, device=device)
wandb.log({"eval/ce_before_training": ce_before})
print(f"CE before training: {ce_before}")

In [None]:
# Train
# Early-stopping state: the CE of the previous epoch, the best CE so far, and
# a flag set once the CE has risen compared with the previous epoch.
last_ce = 10
smallest_ce = 10
eval_counter = False
log_n_batches = 20  # report the running loss every N batches

for e in range(num_epochs):
  model.train()
  loss_batches = 0
  epoch_loss = 0
  epoch_len = len(train_dataloader)

  for i, batch in enumerate(train_dataloader):
    input_ids = batch[0]["input_ids"].to(device, dtype=torch.long)
    attention_mask = batch[0]["attention_mask"].to(device, dtype=torch.long)
    soft_labels_1 = batch[4].to(device, dtype=torch.float64)
    # soft_labels = batch[2].to(device, dtype=torch.float64)
    # hard_labels = batch[1].to(device, dtype=torch.float64)
    hard_label = batch[3].to(device, dtype=torch.float64)

    # predict
    optimizer.zero_grad()

    pred_hl, pred_sl = model(input_ids=input_ids, attention_mask=attention_mask)

    # Loss: the soft-label BCE is weighted twice as much as the hard-label BCE.
    loss = bce_loss(pred_sl, soft_labels_1) * 2
    loss += bce_loss(pred_hl, hard_label)
    loss /= 2

    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    # Log (read the scalar once to avoid a second device sync)
    batch_loss = loss.item()
    loss_batches += batch_loss
    epoch_loss += batch_loss

    if i % log_n_batches == 0:
      if i != 0:
        print(f"{e+1}: Last {log_n_batches} batches avg loss: {loss_batches/log_n_batches:>7f}  [{i}/{epoch_len}]")
        wandb.log({"train/loss_over_batches": loss_batches/log_n_batches})
        wandb.log({"train/epochs": e})
      loss_batches = 0

  # Mean over the number of batches. The last loop index `i` is one less than
  # that and would also be zero for a single-batch epoch.
  epoch_loss /= max(epoch_len, 1)
  print(f"Epoch [{e+1}/{num_epochs}] mean loss: {epoch_loss:>6f}")
  wandb.log({"train/epoch_loss": epoch_loss})

  # Eval error
  ce = ce_eval_func(model, dev_dataloader, dev_size, device=device)
  print(f"Epoch [{e+1}/{num_epochs}] Eval CE  : {ce:>6f}")
  wandb.log({"eval/epoch_ce": ce})

  # Stop after Eval CE raises 2 times in a row (Simple early stopping)
  if ce > last_ce:
    if eval_counter is True:
      print("Interrupt: Eval Error is raising")
      break
    eval_counter = True
  elif ce < smallest_ce:
    # New best dev CE: checkpoint the weights.
    torch.save(model.state_dict(), 'model.pt')
    print(f"Epoch [{e+1}/{num_epochs}] Save model state")
    eval_counter = False
    smallest_ce = ce
  else:
    eval_counter = False

  last_ce = ce



Model dependent improvements:

## Evaluation

In [None]:
# Rebuild the architecture on its OWN encoder instance before loading the best
# checkpoint. Wrapping `base_model` again would share the encoder object with
# `model`, so loading the checkpoint would also overwrite the encoder weights
# of the still-live training model.
best_base_model = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
model_best = MTLModel(best_base_model)
# map_location keeps the load working when the current device differs from the
# one the checkpoint was saved on.
model_best.load_state_dict(torch.load('model.pt', map_location=device))
model_best = model_best.to(device)

In [None]:
# Final Cross Entropy Error
# Evaluates the reloaded checkpoint (model_best) on the dev set and logs it
# under "dev/ce".
cross_error = ce_eval_func(model_best, dev_dataloader, dev_size, device=device)
print(f"CE error: {cross_error}")
wandb.log({"dev/ce": cross_error})

In [None]:
# Debug helper: run the first dev batch only, so the next cell can compare the
# predicted distribution with the soft labels.
model.eval()
cross_error = 0
epsilon = 1e-12
for i, batch in enumerate(dev_dataloader):
  input_ids = batch[0]["input_ids"].to(device, dtype = torch.long)
  attention_mask = batch[0]["attention_mask"].to(device, dtype = torch.long)
  soft_labels = batch[2].to(device)

  with torch.no_grad():
    # The model returns (hard prediction, soft prediction); only the soft head
    # is inspected here, as in ce_eval_func.
    _, pred = model(input_ids, attention_mask=attention_mask)
  # The soft head emits a single class-1 probability per sample, so build the
  # two-class distribution explicitly instead of applying a softmax.
  pred = pred.reshape(len(pred), 1)
  probabilities = torch.cat((1-pred, pred), dim=-1)
  predictions = torch.clip(probabilities, epsilon, 1. - epsilon)
  cross_error += -torch.sum(soft_labels * torch.log(predictions + 1e-9))
  break



In [None]:
# Show the predicted distribution followed by the annotated soft labels.
for shown in (predictions, soft_labels):
  print(shown)

### Finish

In [None]:
raise Exception

In [None]:
# torch.save(model.state_dict(), 'model.pt')
# model.load_state_dict(torch.load(PATH), strict=False)
# Attach the checkpoint file written by the training loop ("model.pt") to the
# current W&B run as a model artifact.
artifact = wandb.Artifact(name='model_param', type='model')
artifact.add_file(local_path="model.pt")
run.log_artifact(artifact);

In [None]:
wandb.finish()

## TSV files

In [None]:
raise Exception

In [None]:
import os
import csv

In [None]:
# Write one "<dataset>_results.tsv" per dev dataset. Each row holds the rounded
# prediction followed by the probabilities of class 0 and class 1.
filepaths = ["/content/ArMIS_results.tsv", "/content/ConvAbuse_results.tsv", "/content/HS-Brexit_results.tsv", "/content/MD-Agreement_results.tsv"]
epsilon = 1e-12

# Remove results from earlier runs.
for fp in filepaths:
  if os.path.exists(fp):
    os.remove(fp)

# NOTE(review): this uses the last training state (`model`); `model_best` holds
# the checkpoint with the lowest dev CE — confirm which one should be submitted.
model.eval()

for key in data_dict_dev.keys():
  data_dict_dev[key]["sl_1s"] = data_dict_dev[key]["soft_list"].apply(extract_soft_labels)
  tsv_dataset = CustomLabelDataset(data_dict_dev[key])
  # batch_size=1 and no shuffling: one output row per sample, in dataset order.
  tsv_dataloader = DataLoader(tsv_dataset, shuffle=False, batch_size=1)
  filepath_write = f"/content/{key}_results.tsv"

  with open(filepath_write, 'w', newline='') as tsvfile:
      writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
      for i, batch in enumerate(tqdm(tsv_dataloader, 0)):
        # CustomLabelDataset yields only input_ids and attention_mask, and
        # MTLModel.forward accepts exactly these two inputs.
        input_ids = batch[0]["input_ids"].to(device, dtype = torch.long)
        attention_mask = batch[0]["attention_mask"].to(device, dtype = torch.long)

        with torch.no_grad():
          # The model returns (hard prediction, soft prediction); the soft
          # head provides the class-1 probability.
          _, pred = model(input_ids, attention_mask=attention_mask)
        pred = pred.reshape(len(pred), 1)
        probability = torch.cat((1-pred, pred), dim=-1)
        prediction = torch.round(pred)
        probability = torch.clip(probability, epsilon, 1. - epsilon) # Really necessary?
        writer.writerow([int(prediction[0].item()), probability[0][0].item(), probability[0][1].item()])


In [None]:
from zipfile import ZipFile

filepath = "res.zip"

# Start from a clean archive.
if os.path.exists(filepath):
    os.remove(filepath)

# The result files are addressed by bare file name (relative to the working
# directory), in a fixed order.
result_files = (
    "MD-Agreement_results.tsv",
    "ArMIS_results.tsv",
    "HS-Brexit_results.tsv",
    "ConvAbuse_results.tsv",
)
with ZipFile(filepath, 'w') as zipObj:
  for result_name in result_files:
    zipObj.write(result_name)

In [None]:
from google.colab import files
files.download("res.zip")