<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#multi-task-dataloader" data-toc-modified-id="multi-task-dataloader-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>multi-task dataloader</a></span></li><li><span><a href="#multi-task-model" data-toc-modified-id="multi-task-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>multi-task model</a></span></li><li><span><a href="#train" data-toc-modified-id="train-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>train</a></span><ul class="toc-item"><li><span><a href="#single-task" data-toc-modified-id="single-task-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>single task</a></span><ul class="toc-item"><li><span><a href="#ShortHumor" data-toc-modified-id="ShortHumor-3.1.1"><span class="toc-item-num">3.1.1&nbsp;&nbsp;</span>ShortHumor</a></span><ul class="toc-item"><li><span><a href="#freeze" data-toc-modified-id="freeze-3.1.1.1"><span class="toc-item-num">3.1.1.1&nbsp;&nbsp;</span>freeze</a></span></li><li><span><a href="#unfreeze" data-toc-modified-id="unfreeze-3.1.1.2"><span class="toc-item-num">3.1.1.2&nbsp;&nbsp;</span>unfreeze</a></span></li></ul></li><li><span><a href="#SarcasmGhosh" data-toc-modified-id="SarcasmGhosh-3.1.2"><span class="toc-item-num">3.1.2&nbsp;&nbsp;</span>SarcasmGhosh</a></span><ul class="toc-item"><li><span><a href="#freeze" data-toc-modified-id="freeze-3.1.2.1"><span class="toc-item-num">3.1.2.1&nbsp;&nbsp;</span>freeze</a></span></li><li><span><a href="#unfreeze" data-toc-modified-id="unfreeze-3.1.2.2"><span class="toc-item-num">3.1.2.2&nbsp;&nbsp;</span>unfreeze</a></span></li></ul></li></ul></li></ul></li></ul></div>

In [1]:
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler
import torchmetrics
import numpy as np
import collections
import json
from tqdm.auto import tqdm, trange
import matplotlib.pyplot as plt
from transformers import *
from transformers.modeling_outputs import SequenceClassifierOutput
import sys
import os

In [2]:
# https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
MODEL_CLASSES = {
    "bert": (BertConfig, BertPreTrainedModel, BertModel, BertTokenizer),
    "roberta": (RobertaConfig, RobertaPreTrainedModel, RobertaModel, RobertaTokenizer),
    "albert": (AlbertConfig, AlbertPreTrainedModel, AlbertModel, AlbertTokenizer),
    "distilbert": (DistilBertConfig, DistilBertPreTrainedModel, DistilBertModel, DistilBertTokenizer)
}

In [4]:
# model_type = 'distilbert'
# model_name = 'distilbert-base-uncased'

model_type = 'bert'
model_name = 'bert-base-uncased'
config_class, pretrained_model_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
with open('../data/xslue/tasks.json', 'r') as f:
    tasks = json.load(f)
tasks

{'CrowdFlower': 13,
 'DailyDialog': 7,
 'EmoBank_Valence': 1,
 'EmoBank_Arousal': 1,
 'EmoBank_Dominance': 1,
 'HateOffensive': 3,
 'PASTEL_age': 8,
 'PASTEL_country': 2,
 'PASTEL_education': 10,
 'PASTEL_ethnic': 10,
 'PASTEL_gender': 3,
 'PASTEL_politics': 3,
 'PASTEL_tod': 5,
 'SARC': 2,
 'SarcasmGhosh': 2,
 'SentiTreeBank': 1,
 'ShortHumor': 2,
 'ShortJokeKaggle': 2,
 'ShortRomance': 2,
 'StanfordPoliteness': 1,
 'TroFi': 2,
 'VUA': 2}

In [7]:
tasks2idx = {}
for i,k in enumerate(tasks):
    tasks2idx[k] = i
tasks2idx

{'CrowdFlower': 0,
 'DailyDialog': 1,
 'EmoBank_Valence': 2,
 'EmoBank_Arousal': 3,
 'EmoBank_Dominance': 4,
 'HateOffensive': 5,
 'PASTEL_age': 6,
 'PASTEL_country': 7,
 'PASTEL_education': 8,
 'PASTEL_ethnic': 9,
 'PASTEL_gender': 10,
 'PASTEL_politics': 11,
 'PASTEL_tod': 12,
 'SARC': 13,
 'SarcasmGhosh': 14,
 'SentiTreeBank': 15,
 'ShortHumor': 16,
 'ShortJokeKaggle': 17,
 'ShortRomance': 18,
 'StanfordPoliteness': 19,
 'TroFi': 20,
 'VUA': 21}

In [8]:
# task and their (train) dataset size 
selected_task = ['PASTEL_country', # 33224
#                  'SARC', # 205645
                 'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
                 'VUA', # 15157
                ] 


# multi-task dataloader

In [9]:
class MyDataset(Dataset): 
    # currently it's a Mapping-style dataset. Not sure if a Iterable-style dataset will be better
    def __init__(self, tsv_file):
        self.df = pd.read_csv(tsv_file, sep='\t')
        self.df = self.df.dropna()
        self.df = self.df.reset_index(drop=True)
        if self.df['label'].dtype == 'float64':
            self.df['label'] = self.df['label'].astype('float32')
            
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        dataslice = self.df.iloc[idx]
        sample = {'text':dataslice['text'], 'label':dataslice['label']}
        return sample


In [10]:
class MultiTaskTrainDataLoader():
    '''
    Each time, a random integer selects a dataset and load a batch of data {text, label} from it. Return i_task and data
    
    a iterator
    Known issue: large dataset may have not iterate once, small datasets may have been iterated many times
    '''
    
    def __init__(self, selected_task, batch_size, shuffle, num_workers):
        self.tasks = selected_task
        self.split = 'train'
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(self.tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in self.tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv'))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers)) 
            self.dataloaderiters.append(self.dataloaders[-1]._get_iterator())
            self.len += len(self.dataloaders[-1])
    def __len__(self):   
        return self.len

    def __iter__(self):
        self.n = 0
        return self
    
    def __next__(self):
        i_task = np.random.randint(self.num_tasks)
        if self.n < self.len:
            self.n += 1
        else:
            raise StopIteration
        dataloaderiter = self.dataloaderiters[i_task]    
        try: 
            batch = next(dataloaderiter)
        except StopIteration:
#             self.dataloaderiters[i_task]._reset(self.dataloaders[i_task])
#             dataloaderiter = self.dataloaderiters[i_task]
            self.dataloaderiters[i_task] = iter(self.dataloaders[i_task])
            dataloaderiter = self.dataloaderiters[i_task]
            batch = next(dataloaderiter)
        return i_task, batch

In [11]:
class MultiTaskTestDataLoader():
    '''
    For dev and test
    
    a generator
    '''
    
    def __init__(self, tasks, split, batch_size, shuffle, num_workers):
        assert split in ['dev', 'test'], 'not implemented'
        self.tasks = tasks
        self.split = split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv'))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers))
            self.len += len(self.dataloaders[-1])
        self.i_task = 0
    def __len__(self):   
        return self.len

    def __iter__(self):
        for i_task in range(self.num_tasks):
            dataloader = self.dataloaders[i_task]
            for batch in dataloader:
                yield i_task, batch
    
        

# multi-task model

In [12]:
class RegressionHead(nn.Module):
    def __init__(self, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(embedding_dim, 1)
        
        self.loss_fn = nn.MSELoss()
    def forward(self, sent_emb, label):
        output = self.hidden(self.dropout(sent_emb)).squeeze(1)

        loss = self.loss_fn(output, label)
        return output, loss

In [13]:
class ClassificationHead(nn.Module):
    def __init__(self, num_labels, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
        self.num_labels = num_labels
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(embedding_dim, self.num_labels)
        
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, sent_emb, label):
        output = self.hidden(self.dropout(sent_emb))
        
        loss = self.loss_fn(output.view(-1, self.num_labels), label.view(-1))
        return output, loss

In [14]:
class MultiTaskBert(pretrained_model_class):
    def __init__(self, config, selected_task):
        super().__init__(config)
        self.basemodel = model_class.from_pretrained(model_name)
        self.style_heads = nn.ModuleList()
        for task in selected_task:
            if tasks[task] == 1:
                self.style_heads.append(RegressionHead())
            else:
                self.style_heads.append(ClassificationHead(tasks[task]))
    def forward(self, i_task=None, label=None, **kwargs):
        output = self.basemodel(**kwargs)
        if 'pooler_output' in output:
            sent_emb = output['pooler_output']
        else:
            sent_emb = output['last_hidden_state'][:,0,:]
        
        output, loss = self.style_heads[i_task](sent_emb, label) 
        return output, loss

In [15]:
def print_loss(losses):
    for k in losses:
        print(f'{losses[k]:4.4f}', end=' ')
    print('')

In [23]:
def validate(model, selected_task, split):
    val_loss = collections.defaultdict(float)
    val_size = collections.defaultdict(int)
    overall_acc = torchmetrics.Accuracy() 
    task_accs = [torchmetrics.Accuracy() for i in range(len(selected_task))] 
    
    if split == 'train':
        mt_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)
    else:
        mt_dataloader = MultiTaskTestDataLoader(selected_task, 'dev', 32, False, 4)
        
    model.eval()
    for data in tqdm(mt_dataloader, leave=False):  
        i_task, batch = data
        label = batch['label'].to(device)
        size = len(label)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = model(**tokens, i_task=i_task,  label=label)
        overall_acc.update(output.to('cpu').detach(), label.to('cpu').detach())
        task_accs[i_task].update(output.to('cpu').detach(), label.to('cpu').detach())
        tokens = None
        output = None
        val_loss[i_task] += loss.detach().item()*size
        val_size[i_task] += size
    
    accs = []
    for i_task in val_loss:
        val_loss[i_task] /= val_size[i_task]
        accs.append(task_accs[i_task].compute())
    model.train()
    
    
    return val_loss, overall_acc.compute(), accs


In [None]:
def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1):
    """
    Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after
    a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    def lr_lambda(current_step: int):
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        return max(
            0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
        )

    return lr_scheduler.LambdaLR(optimizer, lr_lambda, last_epoch)

# train

## single task

### ShortHumor

#### freeze

In [17]:
selected_task = [
                 'ShortHumor', 
#                  'VUA',
                ] 


In [18]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [21]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
for param in mt_model.basemodel.parameters():
    param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.6093603514596295, 'train_acc': 0.6768603920936584, 'val_loss': 0.6037419830616592, 'val_acc': 0.6787695288658142}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 0.5694271674443419, 'train_acc': 0.6997169256210327, 'val_loss': 0.5666932542697986, 'val_acc': 0.7009581327438354}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.542740888546511, 'train_acc': 0.7273352742195129, 'val_loss': 0.5414822714171262, 'val_acc': 0.7216339111328125}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.5291925388148391, 'train_acc': 0.7452183961868286, 'val_loss': 0.5293197461408133, 'val_acc': 0.7367624640464783}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 0.5281198126989516, 'train_acc': 0.7454035878181458, 'val_loss': 0.5283215008657266, 'val_acc': 0.738779604434967}


In [22]:
# with onecycle lr_scheduler
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.60936,0.67686,0.603742,0.67877,0.0
1,,0.569427,0.699717,0.566693,0.700958,1.0
2,,0.542741,0.727335,0.541482,0.721634,2.0
3,,0.529193,0.745218,0.52932,0.736762,3.0
4,,0.52812,0.745404,0.528322,0.73878,4.0


#### unfreeze

In [23]:
selected_task = [
                 'ShortHumor', 
#                  'VUA',
                ] 


In [24]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [25]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
# for param in mt_model.basemodel.parameters():
#     param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.07624205425341254, 'train_acc': 0.9742070436477661, 'val_loss': 0.1591183666442298, 'val_acc': 0.940494179725647}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550>
Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550>Exception ignored in: 
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550><function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550>Traceback (most recent call last):


Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del

{'i_epoch': 1, 'train_loss': 0.030277462488580556, 'train_acc': 0.9911113381385803, 'val_loss': 0.14881271859926823, 'val_acc': 0.9520927667617798}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.011882821528125562, 'train_acc': 0.9964551329612732, 'val_loss': 0.15233089646162015, 'val_acc': 0.9546142220497131}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.0017386663505057035, 'train_acc': 0.9996032118797302, 'val_loss': 0.14110398142522484, 'val_acc': 0.9667171239852905}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 0.000905192531714256, 'train_acc': 0.9998412728309631, 'val_loss': 0.15679824659722963, 'val_acc': 0.9636913537979126}


In [26]:
df # ShortHumor

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.076242,0.974207,0.159118,0.940494,0.0
1,,0.030277,0.991111,0.148813,0.952093,1.0
2,,0.011883,0.996455,0.152331,0.954614,2.0
3,,0.001739,0.999603,0.141104,0.966717,3.0
4,,0.000905,0.999841,0.156798,0.963691,4.0


### SarcasmGhosh

#### freeze

In [25]:
selected_task = [
                'SarcasmGhosh',
#                 'SARC',
                ] 


In [26]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [28]:
df = pd.read_csv('../data/xslue/processed/train/SarcasmGhosh.tsv', sep='\t')

In [27]:
for x in train_dataloader:
    print(x)
    break

(0, {'text': ['@0430yes i hope youre lurking rn. i want to listen to hallucination & wanna love you again live someday, pretty please?! 😭 😭 😭', "05 really taught me a valuable lesson I'm never gonna be late again! #Not", '@098BERRY Never had a voice to protest, so you fed me shit to digest. I wish I had a reason, my flaws are open season.', '@0hMySt4rs Rest in peace & love to you and your family', '100 days until Christmas! 🌲 #too soon #not ready yet', "@100_ThingsILove @WhatNinaSpotted yay! Can't wait to be reunited with you huni! Xx", "100 words short of the word requirement but I don't care, I'm going to bed. #rebel #ihatehistory", '@1010xlhacker it was nice hanging out this afternoon. I had a great time!', '10k walk this morning. We did an awesome job.', '10 minutes to eat #challengeaccepted', '10 Times One Direction Had the Perfect Love Song for Every Relationship :', "#10TurnOns Yet again if you list them on twitter you're garbage.", '@11aawwgg021 wow! sweet butt for all of them 

In [19]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
for param in mt_model.basemodel.parameters():
    param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

num_warmup_steps=500
num_training_steps=num_epoch*len(train_dataloader)
def lr_lambda(current_step: int):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(
        0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
    )
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda, -1)
# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
# scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, selected_task, split='train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, selected_task, split='val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 2.3325773350359724, 'train_acc': 0.4652840495109558, 'val_loss': 4.353245750458509, 'val_acc': 0.03819223493337631}


  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 2.197716439109672, 'train_acc': 0.4651834964752197, 'val_loss': 4.086464829083691, 'val_acc': 0.03819223493337631}


  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/1244 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
selected_task

['SarcasmGhosh']

In [21]:
train_loss

defaultdict(float, {0: 2.197716439109672})

In [None]:
df

#### unfreeze

In [12]:
selected_task = [
                'SarcasmGhosh',
                ] 


In [22]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [19]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
# for param in mt_model.basemodel.parameters():
#     param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/1244 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 4.307537162129135, 'train_acc': 0.46475616097450256, 'val_loss': 7.750239974564804, 'val_acc': 0.036919157952070236}


  0%|          | 0/1244 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,4.307537,0.464756,7.75024,0.036919,0.0
