<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#multi-task-dataloader" data-toc-modified-id="multi-task-dataloader-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>multi-task dataloader</a></span></li><li><span><a href="#multi-task-model" data-toc-modified-id="multi-task-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>multi-task model</a></span></li><li><span><a href="#train" data-toc-modified-id="train-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>train</a></span><ul class="toc-item"><li><span><a href="#single-task" data-toc-modified-id="single-task-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>single task</a></span><ul class="toc-item"><li><span><a href="#freeze" data-toc-modified-id="freeze-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>freeze</a></span></li><li><span><a href="#unfreeze" data-toc-modified-id="unfreeze-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>unfreeze</a></span></li></ul></li></ul></li></ul></div>

In [1]:
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler
import torchmetrics
import numpy as np
import collections
import json
from tqdm.auto import tqdm, trange
import matplotlib.pyplot as plt
from transformers import *
from transformers.modeling_outputs import SequenceClassifierOutput
import sys
import os

In [2]:
# https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
MODEL_CLASSES = {
    "bert": (BertConfig, BertPreTrainedModel, BertModel, BertTokenizer),
    "roberta": (RobertaConfig, RobertaPreTrainedModel, RobertaModel, RobertaTokenizer),
    "albert": (AlbertConfig, AlbertPreTrainedModel, AlbertModel, AlbertTokenizer),
    "distilbert": (DistilBertConfig, DistilBertPreTrainedModel, DistilBertModel, DistilBertTokenizer)
}

In [4]:
# model_type = 'distilbert'
# model_name = 'distilbert-base-uncased'

model_type = 'bert'
model_name = 'bert-base-uncased'
config_class, pretrained_model_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
with open('../data/xslue/tasks.json', 'r') as f:
    tasks = json.load(f)
tasks

{'CrowdFlower': 13,
 'DailyDialog': 7,
 'EmoBank_Valence': 1,
 'EmoBank_Arousal': 1,
 'EmoBank_Dominance': 1,
 'HateOffensive': 3,
 'PASTEL_age': 8,
 'PASTEL_country': 2,
 'PASTEL_education': 10,
 'PASTEL_ethnic': 10,
 'PASTEL_gender': 3,
 'PASTEL_politics': 3,
 'PASTEL_tod': 5,
 'SARC': 2,
 'SarcasmGhosh': 2,
 'SentiTreeBank': 1,
 'ShortHumor': 2,
 'ShortJokeKaggle': 2,
 'ShortRomance': 2,
 'StanfordPoliteness': 1,
 'TroFi': 2,
 'VUA': 2}

In [7]:
tasks2idx = {}
for i,k in enumerate(tasks):
    tasks2idx[k] = i
tasks2idx

{'CrowdFlower': 0,
 'DailyDialog': 1,
 'EmoBank_Valence': 2,
 'EmoBank_Arousal': 3,
 'EmoBank_Dominance': 4,
 'HateOffensive': 5,
 'PASTEL_age': 6,
 'PASTEL_country': 7,
 'PASTEL_education': 8,
 'PASTEL_ethnic': 9,
 'PASTEL_gender': 10,
 'PASTEL_politics': 11,
 'PASTEL_tod': 12,
 'SARC': 13,
 'SarcasmGhosh': 14,
 'SentiTreeBank': 15,
 'ShortHumor': 16,
 'ShortJokeKaggle': 17,
 'ShortRomance': 18,
 'StanfordPoliteness': 19,
 'TroFi': 20,
 'VUA': 21}

In [8]:
# task and their (train) dataset size 
selected_task = ['PASTEL_country', # 33224
#                  'SARC', # 205645
                 'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
                 'VUA', # 15157
                ] 


# multi-task dataloader

In [9]:
class MyDataset(Dataset): 
    # currently it's a Mapping-style dataset. Not sure if a Iterable-style dataset will be better
    def __init__(self, tsv_file):
        self.df = pd.read_csv(tsv_file, sep='\t')
        self.df = self.df.dropna()
        self.df = self.df.reset_index(drop=True)
        if self.df['label'].dtype == 'float64':
            self.df['label'] = self.df['label'].astype('float32')
            
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        dataslice = self.df.iloc[idx]
        sample = {'text':dataslice['text'], 'label':dataslice['label']}
        return sample


In [10]:
class MultiTaskTrainDataLoader():
    '''
    Each time, a random integer selects a dataset and load a batch of data {text, label} from it. Return i_task and data
    
    a iterator
    Known issue: large dataset may have not iterate once, small datasets may have been iterated many times
    '''
    
    def __init__(self, selected_task, batch_size, shuffle, num_workers):
        self.tasks = selected_task
        self.split = 'train'
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(self.tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in self.tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv'))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers)) 
            self.dataloaderiters.append(self.dataloaders[-1]._get_iterator())
            self.len += len(self.dataloaders[-1])
    def __len__(self):   
        return self.len

    def __iter__(self):
        self.n = 0
        return self
    
    def __next__(self):
        i_task = np.random.randint(self.num_tasks)
        if self.n < self.len:
            self.n += 1
        else:
            raise StopIteration
        dataloaderiter = self.dataloaderiters[i_task]    
        try: 
            batch = next(dataloaderiter)
        except StopIteration:
#             self.dataloaderiters[i_task]._reset(self.dataloaders[i_task])
#             dataloaderiter = self.dataloaderiters[i_task]
            self.dataloaderiters[i_task] = iter(self.dataloaders[i_task])
            dataloaderiter = self.dataloaderiters[i_task]
            batch = next(dataloaderiter)
        return i_task, batch

In [11]:
class MultiTaskTestDataLoader():
    '''
    For dev and test
    
    a generator
    '''
    
    def __init__(self, tasks, split, batch_size, shuffle, num_workers):
        assert split in ['dev', 'test'], 'not implemented'
        self.tasks = tasks
        self.split = split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv'))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers))
            self.len += len(self.dataloaders[-1])
        self.i_task = 0
    def __len__(self):   
        return self.len

    def __iter__(self):
        for i_task in range(self.num_tasks):
            dataloader = self.dataloaders[i_task]
            for batch in dataloader:
                yield i_task, batch
    
        

# multi-task model

In [12]:
class RegressionHead(nn.Module):
    def __init__(self, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(embedding_dim, 1)
        
        self.loss_fn = nn.MSELoss()
    def forward(self, sent_emb, label):
        output = self.hidden(self.dropout(sent_emb)).squeeze(1)

        loss = self.loss_fn(output, label)
        return output, loss

In [13]:
class ClassificationHead(nn.Module):
    def __init__(self, num_labels, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
        self.num_labels = num_labels
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(embedding_dim, self.num_labels)
        
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, sent_emb, label):
        output = self.hidden(self.dropout(sent_emb))
        
        loss = self.loss_fn(output.view(-1, self.num_labels), label.view(-1))
        return output, loss

In [21]:
class MultiTaskBert(pretrained_model_class):
    def __init__(self, config, selected_task):
        super().__init__(config)
        self.basemodel = model_class(config)
        self.style_heads = nn.ModuleList()
        for task in selected_task:
            if tasks[task] == 1:
                self.style_heads.append(RegressionHead())
            else:
                self.style_heads.append(ClassificationHead(tasks[task]))
    def forward(self, i_task=None, label=None, **kwargs):
        output = self.basemodel(**kwargs)
        if 'pooler_output' in output:
            sent_emb = output['pooler_output']
        else:
            sent_emb = output['last_hidden_state'][:,0,:]
        
        output, loss = self.style_heads[i_task](sent_emb, label)
        return output, loss

In [22]:
def print_loss(losses):
    for k in losses:
        print(f'{losses[k]:4.4f}', end=' ')
    print('')

In [23]:
def validate(model, dataloader):
    val_loss = collections.defaultdict(float)
    val_size = collections.defaultdict(int)
    overall_acc = torchmetrics.Accuracy() 
    task_accs = [torchmetrics.Accuracy() for i in range(len(selected_task))] 
    
    if dataloader == 'train':
        mt_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)
    else:
        mt_dataloader = MultiTaskTestDataLoader(selected_task, 'dev', 32, False, 4)
    model.eval()
    for data in tqdm(mt_dataloader, leave=False):  
        i_task, batch = data
        label = batch['label'].to(device)
        size = len(label)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = model(**tokens, i_task=i_task,  label=label)
        overall_acc.update(output.to('cpu').detach(), label.to('cpu').detach())
        task_accs[i_task].update(output.to('cpu').detach(), label.to('cpu').detach())
        tokens = None
        output = None
        val_loss[i_task] += loss.detach().item()*size
        val_size[i_task] += size
    
    accs = []
    for i_task in val_loss:
        val_loss[i_task] /= val_size[i_task]
        accs.append(task_accs[i_task].compute())
    model.train()
    
    
    return val_loss, overall_acc.compute(), accs


# train

## single task

### freeze

In [17]:
selected_task = [
                 'ShortHumor', 
#                  'VUA',
                ] 


In [24]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [35]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
for param in mt_model.basemodel.parameters():
    param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:20<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.6652952216453065, 'train_acc': 0.6093754172325134, 'val_loss': 0.6672794208403976, 'val_acc': 0.6132122874259949}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 0.6327887569712298, 'train_acc': 0.6503795981407166, 'val_loss': 0.6334584649319728, 'val_acc': 0.656580924987793}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.6194691281748059, 'train_acc': 0.6582100987434387, 'val_loss': 0.6192063390457203, 'val_acc': 0.6591023802757263}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.6095730139999105, 'train_acc': 0.6636861562728882, 'val_loss': 0.608214221545319, 'val_acc': 0.6606152057647705}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa3a0772f70>Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa3a0772f70><function _MultiProcessingDataLoaderIter.__del__ at 0x7fa3a0772f70>

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__

Exception ignored in: Traceback (most recent call last):
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fa3a0772f70>  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__

        self._shutdown_workers()self._shutdown_workers()Traceback (most recent call last):
    
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-package

{'i_epoch': 4, 'train_loss': 0.6090051889946207, 'train_acc': 0.6636067628860474, 'val_loss': 0.607591244228188, 'val_acc': 0.6611195206642151}


In [36]:
# with onecycle lr_scheduler
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.665295,0.609375,0.667279,0.613212,0.0
1,,0.632789,0.65038,0.633458,0.656581,1.0
2,,0.619469,0.65821,0.619206,0.659102,2.0
3,,0.609573,0.663686,0.608214,0.660615,3.0
4,,0.609005,0.663607,0.607591,0.66112,4.0


In [27]:
# with 500 warm up linear lr_scheduler
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.635168,0.649798,0.63345,0.663137,0.0
1,,0.624064,0.655088,0.621221,0.664145,1.0
2,,0.616705,0.658263,0.613418,0.665154,2.0
3,,0.614793,0.657972,0.611683,0.666667,3.0
4,,0.610469,0.660088,0.607097,0.668684,4.0


### unfreeze

In [24]:
selected_task = [
                 'ShortHumor', 
#                  'VUA',
                ] 


In [25]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [None]:
torch.cuda.empty_cache()
num_epoch = 10

mt_model = MultiTaskBert(config, selected_task).to(device)
# for param in mt_model.basemodel.parameters():
#     param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.36868295300647363, 'train_acc': 0.8496600389480591, 'val_loss': 0.38348085836395135, 'val_acc': 0.8350983262062073}


  0%|          | 0/1182 [00:00<?, ?it/s]

In [None]:
df