<a href="https://colab.research.google.com/github/cicl-iscl/LeWiDi_SemEval2023/blob/main/Notebooks/MTL/MTL_1_header_both_tasks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install transformers
!pip install wandb

# MTL approach
- Use 1 header
- Train hard and soft labels for each second batch

&rarr; Codalab: Mean CE of 0.51 (best 21.12)

In [2]:
import wandb
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
from drive.MyDrive.cicl_data.helpers import read_data
# from drive.MyDrive.cicl_data.code import CustomLabelDataset

In [5]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
# from datasets import Dataset
from sklearn.metrics import f1_score
import torch.nn.functional as Fun
from torch.utils.data import Dataset, random_split, DataLoader
from torch.optim import AdamW
import torch.nn as nn

from tqdm.notebook import tqdm


In [6]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [7]:
data_dict = read_data()
df_all = pd.concat([data_dict[k] for k in data_dict.keys()])

### Pretrained model

In [8]:
# Maybe load from wandb in future
tokenizer = AutoTokenizer.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English", do_lower_case=True)

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/458k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/157 [00:00<?, ?B/s]

In [9]:
from transformers import BertModel
# base_model = BertModel.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English")
model = AutoModelForSequenceClassification.from_pretrained("lanwuwei/GigaBERT-v4-Arabic-and-English", num_labels=2)

# output of model: https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of the model checkpoint at lanwuwei/GigaBERT-v4-Arabic-and-English were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassifica

In [19]:
model = model.to(device)

In [20]:
for name, param in model.named_parameters():
  if name not in ["classifier.weight", "classifier.bias"]:
    param.requires_grad = False

## Prepare Data

In [11]:
class CustomLabelDataset(Dataset):
    def __init__(self, df_all):
        self.text = list(map(self.tokenize_func, df_all["text"]))
        self.soft_labels = df_all["soft_list"] 
        self.hard_labels = df_all["hard_label"]
        self.hard_labels_1h = Fun.one_hot(torch.tensor(df_all['hard_label'].values))

    def __len__(self):
        return len(self.text)
      
    def tokenize_func(self, text):
        return tokenizer(text, padding="max_length", truncation=True, max_length=240)

    def __getitem__(self, idx):
        input = {"attention_mask": torch.tensor(self.text[idx]["attention_mask"]),
                 "token_type_ids": torch.tensor(self.text[idx]["token_type_ids"]),
                 "input_ids": torch.tensor(self.text[idx]["input_ids"])}
        return input, self.hard_labels_1h[idx], torch.tensor(self.soft_labels[idx]), torch.tensor(self.hard_labels[idx])

In [12]:
# Init dataset
dataset = CustomLabelDataset(df_all)
batch_size = 4

train_size = int(len(dataset) * 0.9)
eval_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size])

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True)

eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size)

## Optimization

In [21]:
# Optimizer
num_epochs = 3

num_training_steps = num_epochs * len(train_dataloader)
optimizer = AdamW(model.parameters())
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [14]:
# Loss
loss_fn = nn.CrossEntropyLoss()

## Training

In [15]:
run = wandb.init(
    project="mtl-1l",
    config={
        "epochs": num_epochs,
        "batch_size": batch_size,
        "device": device
        },
      save_code = True,
      tags = ["bert_arabic_english", "MTL", "1_head", "CE_loss", "alternating"],
      )
wandb.watch(model, log_freq=100)

[34m[1mwandb[0m: Currently logged in as: [33msheuschk[0m. Use [1m`wandb login --relogin`[0m to force relogin


[]

In [22]:
def ce_eval_func(model, eval_dataloader, eval_size, epsilon=1e-12):
  model.eval()
  cross_error = 0

  for i, batch in enumerate(tqdm(eval_dataloader, 0)):
    input_ids = batch[0]["input_ids"].to(device, dtype = torch.long)
    attention_mask = batch[0]["attention_mask"].to(device, dtype = torch.long)
    token_type_ids = batch[0]["token_type_ids"].to(device, dtype = torch.long)
    soft_labels = batch[2].to(device)

    with torch.no_grad():
      pred = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    probabilities = torch.softmax(pred.logits, axis=-1)
    predictions = torch.clip(probabilities, epsilon, 1. - epsilon)
    cross_error += -torch.sum(soft_labels * torch.log(predictions + 1e-9))

  return cross_error / eval_size

In [25]:
# Train

for e in range(num_epochs):
  model.train()
  loss_batches = 0
  epoch_loss = 0
  epoch_len = len(train_dataloader)

  for i, batch in enumerate(train_dataloader):
    input_ids = batch[0]["input_ids"].to(device, dtype=torch.long)
    attention_mask = batch[0]["attention_mask"].to(device, dtype=torch.long)
    token_type_ids = batch[0]["token_type_ids"].to(device, dtype=torch.long)
    soft_labels, hard_labels = batch[2].to(device), batch[1].to(device, dtype=torch.float)

    # predict
    optimizer.zero_grad()
    pred = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

    # loss
    if i % 2 == 0:
      loss = loss_fn(pred.logits, hard_labels)      
    else:
      loss = loss_fn(pred.logits, soft_labels)
    
    loss.backward()
    optimizer.step()
    lr_scheduler.step()

    # Log
    loss_batches += loss.item()
    epoch_loss += loss.item()
    if i % 2 == 0:
      wandb.log({"train/hard_loss": loss.item()})      
    else:
      wandb.log({"train/soft_loss": loss.item()})
    
    log_n_batches = 200
    if i % log_n_batches == 0:
      if i == 0:
        log_n_batches = 1
      print(f"{e+1}: Last {log_n_batches} batches avg loss: {loss_batches/log_n_batches:>7f}  [{i}/{epoch_len}]")
      wandb.log({"train/loss_over_batches": loss_batches/log_n_batches})
      wandb.log({"train/epochs": e})
      loss_batches = 0
  
  epoch_loss /= i  # Not completely correct (Loss per batch but not every batch has same size)
  print(f"Epoch [{e+1}/{num_epochs}] mean loss: {epoch_loss:>7f}")
  wandb.log({"train/epoch_loss": epoch_loss})

  # Eval error
  ce = ce_eval_func(model, eval_dataloader, eval_size)
  print(f"Epoch [{e+1}/{num_epochs}] Eval CE: {ce:>6f}")
  wandb.log({"eval/epoch_ce": ce})
  

1: Last 1 batches avg loss: 1.281032  [0/2347]
1: Last 200 batches avg loss: 0.596349  [200/2347]
1: Last 200 batches avg loss: 0.548928  [400/2347]
1: Last 200 batches avg loss: 0.537066  [600/2347]
1: Last 200 batches avg loss: 0.534190  [800/2347]
1: Last 200 batches avg loss: 0.588616  [1000/2347]
1: Last 200 batches avg loss: 0.552204  [1200/2347]
1: Last 200 batches avg loss: 0.605685  [1400/2347]
1: Last 200 batches avg loss: 0.577308  [1600/2347]
1: Last 200 batches avg loss: 0.568807  [1800/2347]
1: Last 200 batches avg loss: 0.565667  [2000/2347]
1: Last 200 batches avg loss: 0.566932  [2200/2347]
Epoch [1/3] mean loss: 0.567199


  0%|          | 0/261 [00:00<?, ?it/s]

Epoch [1/3] CE: 0.577434
2: Last 1 batches avg loss: 0.589458  [0/2347]
2: Last 200 batches avg loss: 0.541446  [200/2347]
2: Last 200 batches avg loss: 0.552655  [400/2347]
2: Last 200 batches avg loss: 0.561119  [600/2347]
2: Last 200 batches avg loss: 0.533551  [800/2347]
2: Last 200 batches avg loss: 0.538394  [1000/2347]
2: Last 200 batches avg loss: 0.546680  [1200/2347]
2: Last 200 batches avg loss: 0.524455  [1400/2347]
2: Last 200 batches avg loss: 0.553437  [1600/2347]
2: Last 200 batches avg loss: 0.539862  [1800/2347]
2: Last 200 batches avg loss: 0.545046  [2000/2347]
2: Last 200 batches avg loss: 0.555062  [2200/2347]
Epoch [2/3] mean loss: 0.544944


  0%|          | 0/261 [00:00<?, ?it/s]

Epoch [2/3] CE: 0.541573
3: Last 1 batches avg loss: 0.287620  [0/2347]
3: Last 200 batches avg loss: 0.532516  [200/2347]
3: Last 200 batches avg loss: 0.496321  [400/2347]
3: Last 200 batches avg loss: 0.533861  [600/2347]
3: Last 200 batches avg loss: 0.512489  [800/2347]
3: Last 200 batches avg loss: 0.502177  [1000/2347]
3: Last 200 batches avg loss: 0.559179  [1200/2347]
3: Last 200 batches avg loss: 0.557357  [1400/2347]
3: Last 200 batches avg loss: 0.548351  [1600/2347]
3: Last 200 batches avg loss: 0.540538  [1800/2347]
3: Last 200 batches avg loss: 0.548753  [2000/2347]
3: Last 200 batches avg loss: 0.527700  [2200/2347]
Epoch [3/3] mean loss: 0.533183


  0%|          | 0/261 [00:00<?, ?it/s]

Epoch [3/3] CE: 0.533956


Model dependent improvements:
- Weight error for 0 and 1, because data is unbalanced
- Weight loss of hard and soft labels (F.E. Only fine tune on soft after training on hard)
- As Evaluation error is still going down, I think I can go for more epochs

## Evaluation

In [26]:
# initialize Evaluation dataset
data_dict_dev = read_data("dev")
df_dev = pd.concat([data_dict_dev[k] for k in data_dict_dev.keys()])

dev_dataset = CustomLabelDataset(df_dev)
dev_batch_size = 4
dev_size = len(dev_dataset)

dev_dataloader = DataLoader(
    dev_dataset,
    batch_size=dev_batch_size)

In [27]:
# Cross Entropy Error
cross_error = ce_eval_func(model, dev_dataloader, dev_size)
print(f"CE error: {cross_error}")
wandb.log({"dev/ce": cross_error})


  0%|          | 0/557 [00:00<?, ?it/s]

CE error: 0.5225147008895874


In [28]:
def f1_eval_func(model, eval_dataloader, eval_size):
  model.eval()
  f1_error = 0

  for i, batch in enumerate(tqdm(eval_dataloader, 0)):
    input_ids = batch[0]["input_ids"].to(device, dtype = torch.long)
    attention_mask = batch[0]["attention_mask"].to(device, dtype = torch.long)
    token_type_ids = batch[0]["token_type_ids"].to(device, dtype = torch.long)
    hard_labels = batch[3].to(device)

    with torch.no_grad():
      pred = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    f1_error += f1_score(hard_labels.cpu(), pred.logits.argmax(1).cpu(), average='micro')

  return f1_error / i

In [29]:
# F1 micro Error

f1 = f1_eval_func(model, dev_dataloader, dev_size)
print(f"F1 error: {f1}")
wandb.log({"dev/f1": f1})

  0%|          | 0/557 [00:00<?, ?it/s]

F1 error: 0.7468525179856115


In [30]:
# Read example results/ logits of model
model.eval()
cross_error = 0
epsilon = 1e-12
for i, batch in enumerate(eval_dataloader):
  input_ids = batch[0]["input_ids"].to(device, dtype = torch.long)
  attention_mask = batch[0]["attention_mask"].to(device, dtype = torch.long)
  token_type_ids = batch[0]["token_type_ids"].to(device, dtype = torch.long)
  soft_labels = batch[2].to(device)

  with torch.no_grad():
    pred = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
  probabilities = torch.softmax(pred.logits, axis=-1)
  predictions = torch.clip(probabilities, epsilon, 1. - epsilon)
  cross_error += -torch.sum(soft_labels * torch.log(predictions + 1e-9))
  break



In [31]:
print(predictions)
print(soft_labels)

tensor([[0.8690, 0.1310],
        [0.7764, 0.2236],
        [0.8556, 0.1444],
        [0.7293, 0.2707]], device='cuda:0')
tensor([[1.0000, 0.0000],
        [0.6700, 0.3300],
        [1.0000, 0.0000],
        [1.0000, 0.0000]], device='cuda:0')


### Finish

In [None]:
# Save parameters

In [32]:
torch.save(model.parameters, 'model.pt')
artifact = wandb.Artifact(name='model_param', type='model')
artifact.add_file(local_path="model.pt")
run.log_artifact(artifact);

In [33]:
wandb.finish()

0,1
dev/ce,▁
dev/f1,▁
eval/epoch_ce,█▂▁
train/epoch_loss,█▃▁
train/epochs,▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅████████████
train/hard_loss,▂▅▂▃▁▃▃▂▄▄▂▃▃▂▂▃▃▁▂▄▃▄▃▂▂▅▃▂▂█▃▂▄▂▃▂▂▅▂▁
train/loss_over_batches,█▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▁▃▂▃▃▃▃▃▃▃▃▃
train/soft_loss,▃▇▂▂▂▃▃▅▂▄▃▁▂▅█▁▃▁▂▃▃▄▄▄▃▂▂▃▃▄▁▂▂▂▃▁▂▃▃▃

0,1
dev/ce,0.52251
dev/f1,0.74685
eval/epoch_ce,0.53396
train/epoch_loss,0.53318
train/epochs,2.0
train/hard_loss,0.31125
train/loss_over_batches,0.5277
train/soft_loss,0.64505


The approach is completely trivial, but works quite fine

## TSV files


In [37]:
import os
import csv

In [38]:
filepaths = ["/content/ArMIS_results.tsv", "/content/ConvAbuse_results.tsv", "/content/HS-Brexit_results.tsv", "/content/MD-Agreement_results.tsv"]
epsilon = 1e-12

for fp in filepaths:
  if os.path.exists(fp):
    os.remove(fp)

for key in data_dict_dev.keys():
  tsv_dataset = CustomLabelDataset(data_dict_dev[key])
  tsv_dataloader = DataLoader(tsv_dataset, shuffle=False, batch_size=1)
  filepath_write = f"/content/{key}_results.tsv"

  with open(filepath_write, 'w', newline='') as tsvfile:
      writer = csv.writer(tsvfile, delimiter='\t', lineterminator='\n')
      for i, batch in enumerate(tqdm(tsv_dataloader, 0)):
        input_ids = batch[0]["input_ids"].to(device, dtype = torch.long)
        attention_mask = batch[0]["attention_mask"].to(device, dtype = torch.long)
        token_type_ids = batch[0]["token_type_ids"].to(device, dtype = torch.long)

        with torch.no_grad():
          pred = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        logits = pred.logits
        prediction = torch.argmax(logits, dim=-1)
        probability = torch.softmax(logits, axis=-1)
        probability = torch.clip(probability, epsilon, 1. - epsilon) # Really necessary?
        writer.writerow([prediction[0].item(), probability[0][0].item(), probability[0][1].item()])


  0%|          | 0/141 [00:00<?, ?it/s]

  0%|          | 0/1104 [00:00<?, ?it/s]

  0%|          | 0/812 [00:00<?, ?it/s]

  0%|          | 0/168 [00:00<?, ?it/s]

In [39]:
from zipfile import ZipFile

filepath = "res.zip" 

if os.path.exists(filepath):
    os.remove(filepath)

#loop over filepath names throws an string index out of range for whatever reason(also can't use content here, not sure why)
with ZipFile(filepath, 'w') as zipObj:
  zipObj.write("MD-Agreement_results.tsv")
  zipObj.write("ArMIS_results.tsv")
  zipObj.write("HS-Brexit_results.tsv")
  zipObj.write("ConvAbuse_results.tsv")

In [40]:
from google.colab import files
files.download("res.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>