<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#multi-task-dataloader" data-toc-modified-id="multi-task-dataloader-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>multi-task dataloader</a></span></li><li><span><a href="#multi-task-model" data-toc-modified-id="multi-task-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>multi-task model</a></span></li><li><span><a href="#bertology" data-toc-modified-id="bertology-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>bertology</a></span></li><li><span><a href="#train" data-toc-modified-id="train-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>train</a></span><ul class="toc-item"><li><span><a href="#single-task" data-toc-modified-id="single-task-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>single task</a></span><ul class="toc-item"><li><span><a href="#ShortHumor" data-toc-modified-id="ShortHumor-4.1.1"><span class="toc-item-num">4.1.1&nbsp;&nbsp;</span>ShortHumor</a></span><ul class="toc-item"><li><span><a href="#freeze" data-toc-modified-id="freeze-4.1.1.1"><span class="toc-item-num">4.1.1.1&nbsp;&nbsp;</span>freeze</a></span></li><li><span><a href="#unfreeze" data-toc-modified-id="unfreeze-4.1.1.2"><span class="toc-item-num">4.1.1.2&nbsp;&nbsp;</span>unfreeze</a></span></li></ul></li><li><span><a href="#SARC" data-toc-modified-id="SARC-4.1.2"><span class="toc-item-num">4.1.2&nbsp;&nbsp;</span>SARC</a></span><ul class="toc-item"><li><span><a href="#freeze" data-toc-modified-id="freeze-4.1.2.1"><span class="toc-item-num">4.1.2.1&nbsp;&nbsp;</span>freeze</a></span></li><li><span><a href="#unfreeze" data-toc-modified-id="unfreeze-4.1.2.2"><span class="toc-item-num">4.1.2.2&nbsp;&nbsp;</span>unfreeze</a></span></li></ul></li></ul></li><li><span><a href="#multi-task" data-toc-modified-id="multi-task-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>multi task</a></span><ul class="toc-item"><li><span><a href="#ShortHumor-+-SARC-freeze" data-toc-modified-id="ShortHumor-+-SARC-freeze-4.2.1"><span class="toc-item-num">4.2.1&nbsp;&nbsp;</span>ShortHumor + SARC freeze</a></span></li><li><span><a href="#ShortHumor-+-SARC-unfreeze" data-toc-modified-id="ShortHumor-+-SARC-unfreeze-4.2.2"><span class="toc-item-num">4.2.2&nbsp;&nbsp;</span>ShortHumor + SARC unfreeze</a></span></li><li><span><a href="#PASTEL_country-+-SARC-+-ShortHumor-freeze-all" data-toc-modified-id="PASTEL_country-+-SARC-+-ShortHumor-freeze-all-4.2.3"><span class="toc-item-num">4.2.3&nbsp;&nbsp;</span>PASTEL_country + SARC + ShortHumor freeze all</a></span></li><li><span><a href="#PASTEL_country-+-SARC-+-ShortHumor-freeze-all,-no-pooler" data-toc-modified-id="PASTEL_country-+-SARC-+-ShortHumor-freeze-all,-no-pooler-4.2.4"><span class="toc-item-num">4.2.4&nbsp;&nbsp;</span>PASTEL_country + SARC + ShortHumor freeze all, no pooler</a></span></li><li><span><a href="#PASTEL_country-+-SARC-+-ShortHumor-unfreeze-all,-no-pooler" data-toc-modified-id="PASTEL_country-+-SARC-+-ShortHumor-unfreeze-all,-no-pooler-4.2.5"><span class="toc-item-num">4.2.5&nbsp;&nbsp;</span>PASTEL_country + SARC + ShortHumor unfreeze all, no pooler</a></span></li></ul></li></ul></li></ul></div>

In [1]:
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler
import torchmetrics
import numpy as np
import collections
import json
from datetime import datetime
from tqdm.auto import tqdm, trange
import matplotlib.pyplot as plt
from transformers import *
from transformers.modeling_outputs import SequenceClassifierOutput
import sys
import os
from IPython.display import display

In [2]:
# https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
MODEL_CLASSES = {
    "bert": (BertConfig, BertPreTrainedModel, BertModel, BertTokenizer),
    "roberta": (RobertaConfig, RobertaPreTrainedModel, RobertaModel, RobertaTokenizer),
    "albert": (AlbertConfig, AlbertPreTrainedModel, AlbertModel, AlbertTokenizer),
    "distilbert": (DistilBertConfig, DistilBertPreTrainedModel, DistilBertModel, DistilBertTokenizer)
}

In [4]:
# model_type = 'distilbert'
# model_name = 'distilbert-base-uncased'

model_type = 'bert'
model_name = 'bert-base-uncased'
config_class, pretrained_model_class, model_class, tokenizer_class = MODEL_CLASSES[model_type]

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Dictionary: task_name: number_of_labels
with open('../data/xslue/tasks.json', 'r') as f:
    tasks = json.load(f)
tasks

{'CrowdFlower': 13,
 'DailyDialog': 7,
 'EmoBank_Valence': 1,
 'EmoBank_Arousal': 1,
 'EmoBank_Dominance': 1,
 'HateOffensive': 3,
 'PASTEL_age': 8,
 'PASTEL_country': 2,
 'PASTEL_education': 10,
 'PASTEL_ethnic': 10,
 'PASTEL_gender': 3,
 'PASTEL_politics': 3,
 'PASTEL_tod': 5,
 'SARC': 2,
 'SarcasmGhosh': 2,
 'SentiTreeBank': 1,
 'ShortHumor': 2,
 'ShortJokeKaggle': 2,
 'ShortRomance': 2,
 'StanfordPoliteness': 1,
 'TroFi': 2,
 'VUA': 2}

In [7]:
# Dictionary: task_name: task index
tasks2idx = {k:i for i,k in enumerate(tasks)}
tasks2idx

{'CrowdFlower': 0,
 'DailyDialog': 1,
 'EmoBank_Valence': 2,
 'EmoBank_Arousal': 3,
 'EmoBank_Dominance': 4,
 'HateOffensive': 5,
 'PASTEL_age': 6,
 'PASTEL_country': 7,
 'PASTEL_education': 8,
 'PASTEL_ethnic': 9,
 'PASTEL_gender': 10,
 'PASTEL_politics': 11,
 'PASTEL_tod': 12,
 'SARC': 13,
 'SarcasmGhosh': 14,
 'SentiTreeBank': 15,
 'ShortHumor': 16,
 'ShortJokeKaggle': 17,
 'ShortRomance': 18,
 'StanfordPoliteness': 19,
 'TroFi': 20,
 'VUA': 21}

In [8]:
# task and their (train) dataset size 
selected_task = ['PASTEL_country', # 33224
#                  'SARC', # 205645
                 'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
                 'VUA', # 15157
                ] 


# multi-task dataloader

Test/validation dataloader consume dataset one by one, where as the train dataloader do it randomly. So the train dataloader is more complicated than test/validation dataloader. It must be able to reset a dataset once it is exhausted.

In [9]:
class MyDataset(Dataset): 
    # currently it's a Mapping-style dataset. Not sure if a Iterable-style dataset will be better
    # limit: use to truncate dataset. This will drop rows after certain index. May influence label distribution.
    def __init__(self, tsv_file, limit=None):
        self.df = pd.read_csv(tsv_file, sep='\t')
        self.df = self.df.dropna()
        self.df = self.df.reset_index(drop=True)
        if self.df['label'].dtype == 'float64':
            self.df['label'] = self.df['label'].astype('float32')
        if limit:
            self.df = self.df.iloc[:limit]
            
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        dataslice = self.df.iloc[idx]
        sample = {'text':dataslice['text'], 'label':dataslice['label']}
        return sample


In [10]:
class MultiTaskTrainDataLoader():
    '''
    Each time, a random integer selects a dataset and load a batch of data {text, label} from it. Return i_task and data
    
    a iterator
    Known issue: large dataset may have not iterate once, small datasets may have been iterated many times
    '''
    
    def __init__(self, selected_task, batch_size, shuffle, num_workers, limit=None):
        self.tasks = selected_task
        self.split = 'train'
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(self.tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in self.tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv', limit=limit))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers)) 
            self.dataloaderiters.append(self.dataloaders[-1]._get_iterator())
            self.len += len(self.dataloaders[-1])
    def __len__(self):   
        return self.len

    def __iter__(self):
        self.n = 0
        return self
    
    def __next__(self):
        i_task = np.random.randint(self.num_tasks)
        if self.n < self.len:
            self.n += 1
        else:
            raise StopIteration
        dataloaderiter = self.dataloaderiters[i_task]    
        try: 
            batch = next(dataloaderiter)
        except StopIteration:
#             self.dataloaderiters[i_task]._reset(self.dataloaders[i_task])
#             dataloaderiter = self.dataloaderiters[i_task]
            self.dataloaderiters[i_task] = iter(self.dataloaders[i_task])
            dataloaderiter = self.dataloaderiters[i_task]
            batch = next(dataloaderiter)
        return i_task, batch

In [11]:
class MultiTaskTestDataLoader():
    '''
    For dev and test
    
    a generator
    '''
    
    def __init__(self, tasks, split, batch_size, shuffle, num_workers, limit=None):
        assert split in ['dev', 'test'], 'not implemented'
        self.tasks = tasks
        self.split = split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv', limit=limit))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers))
            self.len += len(self.dataloaders[-1])
        self.i_task = 0
    def __len__(self):   
        return self.len

    def __iter__(self):
        for i_task in range(self.num_tasks):
            dataloader = self.dataloaders[i_task]
            for batch in dataloader:
                yield i_task, batch
    
        

# multi-task model

Given selected tasks, the model will add corresponding classification heads on the top of pretrained bert/(other bert). 

In [108]:
class RegressionHead(nn.Module):
    def __init__(self, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(embedding_dim, 1)
        
        self.loss_fn = nn.MSELoss()
    def forward(self, sent_emb, label):
        output = self.hidden(self.dropout(sent_emb)).squeeze(1)

        loss = self.loss_fn(output, label)
        return output, loss

In [109]:
class ClassificationHead(nn.Module):
    def __init__(self, num_labels, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
        self.num_labels = num_labels
        self.dropout = nn.Dropout(0.1)
        self.hidden = nn.Linear(embedding_dim, self.num_labels)
        
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, sent_emb, label):
        output = self.hidden(self.dropout(sent_emb))
        
        loss = self.loss_fn(output.view(-1, self.num_labels), label.view(-1))
        return output, loss

In [110]:
class MultiTaskBert(pretrained_model_class):
    def __init__(self, config, selected_task, use_pooler=True):
        super().__init__(config)
        self.use_pooler = use_pooler
        self.basemodel = model_class.from_pretrained(model_name)
        self.style_heads = nn.ModuleList()
        
        for task in selected_task:
            if tasks[task] == 1:
                self.style_heads.append(RegressionHead())
            else:
                self.style_heads.append(ClassificationHead(tasks[task]))
    def forward(self, i_task=None, label=None, **kwargs):
        output = self.basemodel(**kwargs)
        if self.use_pooler and ('pooler_output' in output):
            sent_emb = output['pooler_output']
        else:
            sent_emb = output['last_hidden_state'][:,0,:]
        
        output, loss = self.style_heads[i_task](sent_emb, label) 
        return output, loss

In [111]:
def print_loss(losses):
    for k in losses:
        print(f'{losses[k]:4.4f}', end=' ')
    print('')

In [112]:
def validate(model, tokenizer, selected_task, split, limit=None):
    val_loss = collections.defaultdict(float)
    val_size = collections.defaultdict(int)
    overall_acc = torchmetrics.Accuracy() 
    task_accs = [torchmetrics.Accuracy() for i in range(len(selected_task))] 
    
    if split == 'train':
        mt_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4, limit=limit)
    else:
        mt_dataloader = MultiTaskTestDataLoader(selected_task, 'dev', 32, False, 4, limit=limit)
        
    model.eval()
    for data in tqdm(mt_dataloader, leave=False):  
        i_task, batch = data
        label = batch['label'].to(device)
        size = len(label)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = model(**tokens, i_task=i_task,  label=label)
        overall_acc.update(output.to('cpu').detach(), label.to('cpu').detach())
        task_accs[i_task].update(output.to('cpu').detach(), label.to('cpu').detach())
        tokens = None
        output = None
        val_loss[i_task] += loss.detach().item()*size
        val_size[i_task] += size
    
    accs = []
    for i_task in val_loss:
        val_loss[i_task] /= val_size[i_task]
        accs.append(task_accs[i_task].compute())
    model.train()
    
    
    return val_loss, overall_acc.compute(), accs


In [113]:
def train_multitask(selected_task, freeze_bert, use_pooler=True, num_epoch=5, limit=None, save=True):
    """
    selected_task = [
                    'ShortHumor',
                    'SARC',
                    ] 
    freeze_bert = False # True or False or int, represent freeze how many layers to freeze
    use_pooler=True
    num_epoch = 5
    limit = 30000 # set the maximum number of data in each dataset 
    save=True
    """
    excute_time = datetime.now() 
    result_folder = '../result'
    model_folder = f"{result_folder}/{'+'.join(selected_task)}/{excute_time.now().strftime('%Y%m%d-%H:%M:%S')}"
    torch.cuda.empty_cache()
    
    # load bert config
    config = config_class.from_pretrained(model_name) # bert config is actually a dataclass. can add any args you want
    config.freeze_bert = freeze_bert
    config.use_pooler = use_pooler
    config.num_epoch = num_epoch
    config.limit = limit
    # some other args that usually keep the same
    config.my_batchsize = 32
    config.my_max_length = 64
    
    tokenizer = tokenizer_class.from_pretrained(model_name)
    
    train_dataloader = MultiTaskTrainDataLoader(selected_task, config.my_batchsize, False, 4, limit = limit)
    
    # load pretrained bert and add classification heads to it
    mt_model = MultiTaskBert(config, selected_task, use_pooler).to(device)
    
    # freeze top layers
    if freeze_bert==True:
        for param in mt_model.basemodel.parameters():
            param.requires_grad = False
    elif isinstance(freeze_bert, int):
        if freeze_bert > 0:
            freeze_bert = freeze_bert - 12
        for param in mt_model.basemodel.parameters():
            param.requires_grad = False
        for layer in mt_model.basemodel.encoder.layer: 
            for param in layer.parameters():
                param.requires_grad = True    
        if 'pooler' in mt_model.basemodel._modules:
            for param in mt_model.basemodel.pooler.parameters():
                param.requires_grad = True    
        
    optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)

    scheduler = get_scheduler("linear",
                                optimizer=optimizer,
                                num_warmup_steps=500,
                                num_training_steps=num_epoch*len(train_dataloader))

    # create dataframes for logging
    columns = ['i_epoch', 'train_loss'] + [f'train_loss_{selected_task[i]}' for i in range(len(selected_task))]
    columns += ['train_acc'] + [f'train_acc_{selected_task[i]}' for i in range(len(selected_task))]
    columns += ['val_loss'] + [f'val_loss_{selected_task[i]}' for i in range(len(selected_task))]
    columns += ['val_acc'] + [f'val_acc_{selected_task[i]}' for i in range(len(selected_task))]
    df_evaluation = pd.DataFrame(columns=columns)
    df_loss_per_step = pd.DataFrame(columns=['i_epoch', 'i_iter', 'i_task', 'task_name', 'train_loss'])
    
    best_accuracy = 0.0
    progress_bar = tqdm(range(num_epoch*len(train_dataloader)))
    for i_epoch in range(num_epoch):
        for i_iter, data in enumerate(train_dataloader):  
            i_task, batch = data
            optimizer.zero_grad()
            label = batch['label'].to(device)
            del batch['label']
            tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=config.my_max_length).to(device)
            output, loss = mt_model(**tokens, i_task=i_task,  label=label)
            loss.backward()
            optimizer.step()
            scheduler.step()
            
            # log per step
            step_result = {'i_epoch':i_epoch, 'i_iter':i_iter, 'i_task':i_task, 'task_name':selected_task[i_task], 'train_loss':loss.item(),}
            df_loss_per_step = df_loss_per_step.append(step_result , ignore_index=True)
            progress_bar.update(1)
        
        # run evaluation on train and validation set 
        train_loss, train_overall_acc, train_task_accs = validate(mt_model, tokenizer, selected_task, 'train', limit = limit)
        val_loss, val_overall_acc, val_task_accs = validate(mt_model, tokenizer, selected_task, 'val', limit = limit)
        
        # save best model and corresponding opt and scheduler states to disk
        if save and val_overall_acc.item() > best_accuracy: 
            torch.save(mt_model.state_dict(), f"{model_folder}/pytorch_model.bin")
            torch.save(optimizer.state_dict(), f"{model_folder}/optimizer.pt")
            torch.save(scheduler.state_dict(), f"{model_folder}/scheduler.pt")
            
        # collect result
        result = {'i_epoch':i_epoch, f'train_loss':sum(train_loss.values()), 'train_acc':train_overall_acc.item(), 'val_loss':sum(val_loss.values()), 'val_acc':val_overall_acc.item()}
        result.update({f'train_loss_{selected_task[i]}':train_loss[i] for i in train_loss})
        result.update({f'train_acc_{selected_task[i]}':train_task_accs[i].item() for i in range(len(train_task_accs))})   
        result.update({f'val_loss_{selected_task[i]}':val_loss[i] for i in val_loss})
        result.update({f'val_acc_{selected_task[i]}':val_task_accs[i].item() for i in range(len(val_task_accs))})   
        df_evaluation = df_evaluation.append(result , ignore_index=True)
#         print('\n'.join([f"{k}:{v:.4}" if isinstance(v, float) else f"{k}:{v}" for k,v in result.items()]))
    
    # save to disk
    if save:
        config.to_json_file(f"{model_folder}/config.json")
        df_evaluation.to_csv(f"{model_folder}/evaluation.csv", index=False)
        df_loss_per_step.to_csv(f"{model_folder}/loss_per_step.csv", index=False)
    
    display(df_evaluation) 
#     display(df_loss_per_step) # this is too long, not approporate to show directly
    return df_evaluation, df_loss_per_step


# bertology

In [114]:
def entropy(p):
    """ Compute the entropy of a probability distribution """
    plogp = p * torch.log(p)
    plogp[p == 0] = 0
    return -plogp.sum(dim=-1)

In [115]:
def compute_heads_importance(
    model, tokenizer, eval_dataloader, model_dir, compute_entropy=True, compute_importance=True, head_mask=None, 
    dont_normalize_importance_by_layer = True, dont_normalize_global_importance=True
):
    """ This method shows how to compute:
        - head attention entropy
        - head importance scores according to http://arxiv.org/abs/1905.10650
    """
    # Prepare our tensors
    n_layers, n_heads = model.basemodel.config.num_hidden_layers, model.basemodel.config.num_attention_heads
    head_importance = torch.zeros(n_layers, n_heads).to(device)
    attn_entropy = torch.zeros(n_layers, n_heads).to(device)

    if head_mask is None:
        head_mask = torch.ones(n_layers, n_heads).to(device)
    head_mask.requires_grad_(requires_grad=True)
    preds = None
    labels = None
    tot_tokens = 0.0

    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
        i_task, batch = batch
        label_ids = batch['label'].to(device)
        size = len(label_ids)
        del batch['label']
        batch = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        input_ids, input_mask, segment_ids = batch['input_ids'], batch['attention_mask'], batch['token_type_ids']
        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
        print(i_task)
        outputs = model(i_task=i_task,
            input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, label=label_ids, head_mask=head_mask, 
            output_attentions = True, 
        )
        loss, logits, all_attentions = (
            outputs[1],
            outputs[0].logits,
            outputs[0].attentions,
        )  # Loss and logits are the first, attention the last
        loss.backward()  # Backpropagate to populate the gradients in the head mask

        if compute_entropy:
            for layer, attn in enumerate(all_attentions):
                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()

        if compute_importance:
            head_importance += head_mask.grad.abs().detach()

        # Also store our logits/labels if we want to compute metrics afterwards
        if preds is None:
            preds = logits.detach().cpu().numpy()
            labels = label_ids.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)

        tot_tokens += input_mask.float().detach().sum().data

    # Normalize
    attn_entropy /= tot_tokens
    head_importance /= tot_tokens
    # Layerwise importance normalization
    if not dont_normalize_importance_by_layer:
        exponent = 2
        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1 / exponent)
        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20

    if not dont_normalize_global_importance:
        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())

    # Print/save matrices
#     np.save(os.path.join(model_dir, "attn_entropy.npy"), attn_entropy.detach().cpu().numpy())
#     np.save(os.path.join(model_dir, "head_importance.npy"), head_importance.detach().cpu().numpy())

    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=device)
    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(
        head_importance.numel(), device=device
    )
    head_ranks = head_ranks.view_as(head_importance)

    return attn_entropy, head_importance, preds, labels

In [116]:
bert = MultiTaskBert(BertConfig.from_pretrained('bert-base-uncased'), selected_task, use_pooler).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [104]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [34]:
eval_dataloader = MultiTaskTestDataLoader(['ShortHumor'], 'dev', 32, False, 4, limit=200)

In [117]:
attn_entropy, head_importance, preds, labels = compute_heads_importance(
    bert, tokenizer, eval_dataloader, model_dir='a')

Iteration:   0%|          | 0/7 [00:00<?, ?it/s]

0


AttributeError: 'Tensor' object has no attribute 'logits'

In [74]:
head_importance.shape

torch.Size([12, 12])

In [76]:
attn_entropy.shape

torch.Size([12, 12])

# train

## single task

### ShortHumor

#### freeze

In [17]:
selected_task = [
                 'ShortHumor', 
#                  'VUA',
                ] 


In [18]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [21]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
for param in mt_model.basemodel.parameters():
    param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, selected_task, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, selected_task'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.6093603514596295, 'train_acc': 0.6768603920936584, 'val_loss': 0.6037419830616592, 'val_acc': 0.6787695288658142}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 0.5694271674443419, 'train_acc': 0.6997169256210327, 'val_loss': 0.5666932542697986, 'val_acc': 0.7009581327438354}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.542740888546511, 'train_acc': 0.7273352742195129, 'val_loss': 0.5414822714171262, 'val_acc': 0.7216339111328125}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.5291925388148391, 'train_acc': 0.7452183961868286, 'val_loss': 0.5293197461408133, 'val_acc': 0.7367624640464783}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 0.5281198126989516, 'train_acc': 0.7454035878181458, 'val_loss': 0.5283215008657266, 'val_acc': 0.738779604434967}


In [22]:
# with onecycle lr_scheduler
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.60936,0.67686,0.603742,0.67877,0.0
1,,0.569427,0.699717,0.566693,0.700958,1.0
2,,0.542741,0.727335,0.541482,0.721634,2.0
3,,0.529193,0.745218,0.52932,0.736762,3.0
4,,0.52812,0.745404,0.528322,0.73878,4.0


#### unfreeze

In [23]:
selected_task = [
                 'ShortHumor', 
#                  'VUA',
                ] 


In [24]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [25]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
# for param in mt_model.basemodel.parameters():
#     param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.07624205425341254, 'train_acc': 0.9742070436477661, 'val_loss': 0.1591183666442298, 'val_acc': 0.940494179725647}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550>
Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
    self._shutdown_workers()
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
    if w.is_alive():
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550>Exception ignored in: 
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550><function _MultiProcessingDataLoaderIter.__del__ at 0x7f19a2515550>Traceback (most recent call last):


Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del

{'i_epoch': 1, 'train_loss': 0.030277462488580556, 'train_acc': 0.9911113381385803, 'val_loss': 0.14881271859926823, 'val_acc': 0.9520927667617798}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.011882821528125562, 'train_acc': 0.9964551329612732, 'val_loss': 0.15233089646162015, 'val_acc': 0.9546142220497131}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.0017386663505057035, 'train_acc': 0.9996032118797302, 'val_loss': 0.14110398142522484, 'val_acc': 0.9667171239852905}


  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/1182 [00:00<?, ?it/s]

  0%|          | 0/62 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 0.000905192531714256, 'train_acc': 0.9998412728309631, 'val_loss': 0.15679824659722963, 'val_acc': 0.9636913537979126}


In [26]:
df # ShortHumor

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.076242,0.974207,0.159118,0.940494,0.0
1,,0.030277,0.991111,0.148813,0.952093,1.0
2,,0.011883,0.996455,0.152331,0.954614,2.0
3,,0.001739,0.999603,0.141104,0.966717,3.0
4,,0.000905,0.999841,0.156798,0.963691,4.0


### SARC

#### freeze

In [33]:
selected_task = [
#                 'SarcasmGhosh',
                'SARC',
                ] 


In [34]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [35]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
for param in mt_model.basemodel.parameters():
    param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

num_warmup_steps=500
num_training_steps=num_epoch*len(train_dataloader)
def lr_lambda(current_step: int):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    return max(
        0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))
    )
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda, -1)
# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
# scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, selected_task, split='train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, selected_task, split='val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.6752687707076435, 'train_acc': 0.5900925993919373, 'val_loss': 0.6748735675782087, 'val_acc': 0.5901769995689392}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 0.6693607444583342, 'train_acc': 0.5988796353340149, 'val_loss': 0.6689359912007867, 'val_acc': 0.6004862785339355}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.6668882590044554, 'train_acc': 0.6024197340011597, 'val_loss': 0.6664740512445728, 'val_acc': 0.6041431427001953}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.6650835614176025, 'train_acc': 0.6039320230484009, 'val_loss': 0.6646254126094839, 'val_acc': 0.6058354377746582}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

Exception ignored in: Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff312b82d30><function _MultiProcessingDataLoaderIter.__del__ at 0x7ff312b82d30>

Traceback (most recent call last):
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7ff312b82d30>  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__

Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
          File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
self._shutdown_workers()self._shutdown_workers()    

self._shutdown_workers()  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
  File "/home/jz17d/anaconda3/env

{'i_epoch': 4, 'train_loss': 0.6644202125413103, 'train_acc': 0.6025412678718567, 'val_loss': 0.6639701425878484, 'val_acc': 0.6020035147666931}


In [57]:
scheduler.state_dict()

{'base_lrs': [5e-05],
 'last_epoch': 9380,
 '_step_count': 9381,
 'verbose': False,
 '_get_lr_called_within_step': False,
 '_last_lr': [0.0],
 'lr_lambdas': [None]}

In [36]:
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.675269,0.590093,0.674874,0.590177,0.0
1,,0.669361,0.59888,0.668936,0.600486,1.0
2,,0.666888,0.60242,0.666474,0.604143,2.0
3,,0.665084,0.603932,0.664625,0.605835,3.0
4,,0.66442,0.602541,0.66397,0.602004,4.0


#### unfreeze

In [18]:
selected_task = [
#                 'SarcasmGhosh',
                'SARC',
    
                ] 


In [19]:
train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4)

config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

In [20]:
torch.cuda.empty_cache()
num_epoch = 5

mt_model = MultiTaskBert(config, selected_task).to(device)
# for param in mt_model.basemodel.parameters():
#     param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)
# optimizer = optim.SGD(mt_model.parameters(), lr=0.001)

# scheduler = lr_scheduler.LinearLR(optimizer, start_factor=0.01, total_iters=500)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, total_steps=num_epoch*len(train_dataloader)) 

df = pd.DataFrame(columns=['i_iter', 'train_loss', 'train_acc', 'val_loss', 'val_acc'])

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  

        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break

    train_loss, train_overall_acc, train_task_accs = validate(mt_model, selected_task,'train')
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, selected_task, 'val')
    result = {'i_epoch':i_epoch, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.5204895184541661, 'train_acc': 0.7440722584724426, 'val_loss': 0.5617923329948242, 'val_acc': 0.7100952863693237}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 0.4135907814818228, 'train_acc': 0.8152292370796204, 'val_loss': 0.5866683719014871, 'val_acc': 0.7142579555511475}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.26873430512040186, 'train_acc': 0.8927320837974548, 'val_loss': 0.713898151434713, 'val_acc': 0.7076444029808044}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.1376631868924303, 'train_acc': 0.9499280452728271, 'val_loss': 0.8818678910805079, 'val_acc': 0.7074499130249023}


  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/6427 [00:00<?, ?it/s]

  0%|          | 0/1607 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f43cc4f3d30>
Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
Exception ignored in: Exception ignored in:     self._shutdown_workers()
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f43cc4f3d30>  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1320, in _shutdown_workers
<function _MultiProcessingDataLoaderIter.__del__ at 0x7f43cc4f3d30>

Traceback (most recent call last):
    if w.is_alive():
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
Traceback (most recent call last):
  File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1328, in __del__
          File "/home/jz17d/anaconda3/envs/torch/lib/python3.9/site-packages/torch/util

{'i_epoch': 4, 'train_loss': 0.08787503195361171, 'train_acc': 0.9715479016304016, 'val_loss': 0.9523796078389406, 'val_acc': 0.7084224820137024}


In [21]:
df

Unnamed: 0,i_iter,train_loss,train_acc,val_loss,val_acc,i_epoch
0,,0.52049,0.744072,0.561792,0.710095,0.0
1,,0.413591,0.815229,0.586668,0.714258,1.0
2,,0.268734,0.892732,0.713898,0.707644,2.0
3,,0.137663,0.949928,0.881868,0.70745,3.0
4,,0.087875,0.971548,0.95238,0.708422,4.0


## multi task

### ShortHumor + SARC freeze

In [17]:
##################### settings #####################
selected_task = [
                'ShortHumor',
                'SARC',
                ] 
num_epoch = 5
freeze_bert = True
limit = 30000 # set the maximum number of data in each dataset 
####################################################

train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4, limit = limit)
config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

torch.cuda.empty_cache()

mt_model = MultiTaskBert(config, selected_task).to(device)
if freeze_bert:
    for param in mt_model.basemodel.parameters():
        param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)

scheduler = get_scheduler("linear",
                            optimizer=optimizer,
                            num_warmup_steps=500,
                            num_training_steps=num_epoch*len(train_dataloader))

# create df to save metrics
columns = ['i_epoch', 'train_loss'] + [f'train_loss_{selected_task[i]}' for i in range(len(selected_task))]
columns += ['train_acc'] + [f'train_acc_{selected_task[i]}' for i in range(len(selected_task))]
columns += ['val_loss'] + [f'val_loss_{selected_task[i]}' for i in range(len(selected_task))]
columns += ['val_acc'] + [f'val_acc_{selected_task[i]}' for i in range(len(selected_task))]
df = pd.DataFrame(columns=columns)

for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(tqdm(train_dataloader)):  
        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()

#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break
    
    # run evaluation on train and validation set 
    train_loss, train_overall_acc, train_task_accs = validate(mt_model, selected_task,'train', limit = limit)
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, selected_task, 'val', limit = limit)
    
    # collect result
    result = {'i_epoch':i_epoch, f'train_loss':sum(train_loss.values()), 'train_acc':train_overall_acc.item(), 'val_loss':sum(val_loss.values()), 'val_acc':val_overall_acc.item()}
    result.update({f'train_loss_{selected_task[i]}':train_loss[i] for i in train_loss})
    result.update({f'train_acc_{selected_task[i]}':train_task_accs[i].item() for i in range(len(train_task_accs))})   
    result.update({f'val_loss_{selected_task[i]}':val_loss[i] for i in val_loss})
    result.update({f'val_acc_{selected_task[i]}':val_task_accs[i].item() for i in range(len(val_task_accs))})   
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 1.2588409416063957, 'train_acc': 0.6209677457809448, 'val_loss': 1.2555277911088591, 'val_acc': 0.5460400581359863, 'train_loss_SARC': 0.689106840099025, 'train_loss_ShortHumor': 0.5697341015073706, 'train_acc_ShortHumor': 0.5322515964508057, 'train_acc_SARC': 0.7093533873558044, 'val_loss_ShortHumor': 0.5662161913774137, 'val_loss_SARC': 0.6893115997314453, 'val_acc_ShortHumor': 0.7044881582260132, 'val_acc_SARC': 0.5355666875839233}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 1.233905949858753, 'train_acc': 0.6412623524665833, 'val_loss': 1.233718678321877, 'val_acc': 0.5660507082939148, 'train_loss_ShortHumor': 0.5492658669096266, 'train_loss_SARC': 0.6846400829491264, 'train_acc_ShortHumor': 0.7260255813598633, 'train_acc_SARC': 0.5563634037971497, 'val_loss_ShortHumor': 0.5489392152532326, 'val_loss_SARC': 0.6847794630686442, 'val_acc_ShortHumor': 0.7221381664276123, 'val_acc_SARC': 0.555733323097229}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 1.2160551427783572, 'train_acc': 0.6440615653991699, 'val_loss': 1.2155566978646477, 'val_acc': 0.5639558434486389, 'train_loss_SARC': 0.6834724632036471, 'train_loss_ShortHumor': 0.5325826795747102, 'train_acc_ShortHumor': 0.5536498427391052, 'train_acc_SARC': 0.7403978705406189, 'val_loss_ShortHumor': 0.5320815655900202, 'val_loss_SARC': 0.6834751322746276, 'val_acc_ShortHumor': 0.7286939024925232, 'val_acc_SARC': 0.5530666708946228}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 1.2075021417780603, 'train_acc': 0.6585410833358765, 'val_loss': 1.2078232778113958, 'val_acc': 0.5783385038375854, 'train_loss_SARC': 0.6810706823840658, 'train_loss_ShortHumor': 0.5264314593939946, 'train_acc_ShortHumor': 0.5702371597290039, 'train_acc_SARC': 0.7459549307823181, 'val_loss_ShortHumor': 0.5266925804974513, 'val_loss_SARC': 0.6811306973139445, 'val_acc_ShortHumor': 0.7367624640464783, 'val_acc_SARC': 0.5678666830062866}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 1.205312835984088, 'train_acc': 0.6587243676185608, 'val_loss': 1.2054009467225453, 'val_acc': 0.5861238837242126, 'train_loss_SARC': 0.6804667345453256, 'train_loss_ShortHumor': 0.5248461014387623, 'train_acc_ShortHumor': 0.5764569640159607, 'train_acc_SARC': 0.7467577457427979, 'val_loss_ShortHumor': 0.5249551163138131, 'val_loss_SARC': 0.6804458304087321, 'val_acc_ShortHumor': 0.738779604434967, 'val_acc_SARC': 0.576033353805542}


In [18]:
df

Unnamed: 0,i_epoch,train_loss,train_loss_ShortHumor,train_loss_SARC,train_acc,train_acc_ShortHumor,train_acc_SARC,val_loss,val_lossShortHumor,val_lossSARC,val_acc,val_accShortHumor,val_accSARC,val_acc_SARC,val_acc_ShortHumor,val_loss_SARC,val_loss_ShortHumor
0,0.0,1.258841,0.569734,0.689107,0.620968,0.532252,0.709353,1.255528,,,0.54604,,,0.535567,0.704488,0.689312,0.566216
1,1.0,1.233906,0.549266,0.68464,0.641262,0.726026,0.556363,1.233719,,,0.566051,,,0.555733,0.722138,0.684779,0.548939
2,2.0,1.216055,0.532583,0.683472,0.644062,0.55365,0.740398,1.215557,,,0.563956,,,0.553067,0.728694,0.683475,0.532082
3,3.0,1.207502,0.526431,0.681071,0.658541,0.570237,0.745955,1.207823,,,0.578339,,,0.567867,0.736762,0.681131,0.526693
4,4.0,1.205313,0.524846,0.680467,0.658724,0.576457,0.746758,1.205401,,,0.586124,,,0.576033,0.73878,0.680446,0.524955


### ShortHumor + SARC unfreeze

In [19]:
##################### settings #####################
selected_task = [
                'ShortHumor',
                'SARC',
                ] 
num_epoch = 5
freeze_bert = False
limit = 30000 # set the maximum number of data in each dataset 
####################################################

train_dataloader = MultiTaskTrainDataLoader(selected_task, 32, False, 4, limit = limit)
config = config_class.from_pretrained(model_name)
tokenizer = tokenizer_class.from_pretrained(model_name)

torch.cuda.empty_cache()

mt_model = MultiTaskBert(config, selected_task).to(device)
if freeze_bert:
    for param in mt_model.basemodel.parameters():
        param.requires_grad = False

optimizer = optim.AdamW(mt_model.parameters(), lr=5e-5)

scheduler = get_scheduler("linear",
                            optimizer=optimizer,
                            num_warmup_steps=500,
                            num_training_steps=num_epoch*len(train_dataloader))

# create df to save metrics
columns = ['i_epoch', 'train_loss'] + [f'train_loss_{selected_task[i]}' for i in range(len(selected_task))]
columns += ['train_acc'] + [f'train_acc_{selected_task[i]}' for i in range(len(selected_task))]
columns += ['val_loss'] + [f'val_loss_{selected_task[i]}' for i in range(len(selected_task))]
columns += ['val_acc'] + [f'val_acc_{selected_task[i]}' for i in range(len(selected_task))]
df = pd.DataFrame(columns=columns)

progress_bar = tqdm(range(num_epoch*len(train_dataloader)))
for i_epoch in range(num_epoch):
    for i_iter, data in enumerate(train_dataloader):  
        i_task, batch = data
        optimizer.zero_grad()
        label = batch['label'].to(device)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
        loss.backward()
        optimizer.step()
        scheduler.step()
        progress_bar.update(1)
#         if i_iter in [0, 10, 25, 50, 100]:
#             train_loss, train_overall_acc, train_task_accs = validate(mt_model, 'train')
#             val_loss, val_overall_acc, val_task_accs = validate(mt_model, 'val')
#             result = {'i_iter':i_iter, 'train_loss':train_loss[0], 'train_acc':train_overall_acc.item(), 'val_loss':val_loss[0], 'val_acc':val_overall_acc.item()}
#             df = df.append(result , ignore_index=True)
#             print(result)
#         if i_iter>100:
#             break
    
    # run evaluation on train and validation set 
    train_loss, train_overall_acc, train_task_accs = validate(mt_model, selected_task,'train', limit = limit)
    val_loss, val_overall_acc, val_task_accs = validate(mt_model, selected_task, 'val', limit = limit)
    
    # collect result
    result = {'i_epoch':i_epoch, f'train_loss':sum(train_loss.values()), 'train_acc':train_overall_acc.item(), 'val_loss':sum(val_loss.values()), 'val_acc':val_overall_acc.item()}
    result.update({f'train_loss_{selected_task[i]}':train_loss[i] for i in train_loss})
    result.update({f'train_acc_{selected_task[i]}':train_task_accs[i].item() for i in range(len(train_task_accs))})   
    result.update({f'val_loss_{selected_task[i]}':val_loss[i] for i in val_loss})
    result.update({f'val_acc_{selected_task[i]}':val_task_accs[i].item() for i in range(len(val_task_accs))})   
    df = df.append(result , ignore_index=True)
    print(result)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 0.5597279640742069, 'train_acc': 0.8793488144874573, 'val_loss': 0.7281250772883059, 'val_acc': 0.704968273639679, 'train_loss_SARC': 0.4994337105401564, 'train_loss_ShortHumor': 0.0602942535340505, 'train_acc_ShortHumor': 0.7725638151168823, 'train_acc_SARC': 0.98238605260849, 'val_loss_ShortHumor': 0.14456106155013654, 'val_loss_SARC': 0.5835640157381694, 'val_acc_ShortHumor': 0.9379727840423584, 'val_acc_SARC': 0.6895666718482971}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 0.3492017807090173, 'train_acc': 0.9272527098655701, 'val_loss': 0.8544494809125447, 'val_acc': 0.6973704695701599, 'train_loss_ShortHumor': 0.024158018609146632, 'train_loss_SARC': 0.3250437620998707, 'train_acc_ShortHumor': 0.9923270344734192, 'train_acc_SARC': 0.8677259087562561, 'val_loss_ShortHumor': 0.18864742008592525, 'val_loss_SARC': 0.6658020608266194, 'val_acc_ShortHumor': 0.9485628008842468, 'val_acc_SARC': 0.6807666420936584}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 0.14585648414454147, 'train_acc': 0.97415691614151, 'val_loss': 1.120236882763575, 'val_acc': 0.6930869817733765, 'train_loss_SARC': 0.14131026145500633, 'train_loss_ShortHumor': 0.004546222689535153, 'train_acc_ShortHumor': 0.9495192170143127, 'train_acc_SARC': 0.9987027645111084, 'val_loss_ShortHumor': 0.1890764776363676, 'val_loss_SARC': 0.9311604051272074, 'val_acc_ShortHumor': 0.950075626373291, 'val_acc_SARC': 0.6761000156402588}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 0.06999924572020612, 'train_acc': 0.9894194602966309, 'val_loss': 1.2681802114517926, 'val_acc': 0.6880217790603638, 'train_loss_SARC': 0.06785879645183454, 'train_loss_ShortHumor': 0.0021404492683715828, 'train_acc_ShortHumor': 0.9789506196975708, 'train_acc_SARC': 0.9994137287139893, 'val_loss_ShortHumor': 0.20113971268648279, 'val_loss_SARC': 1.0670404987653097, 'val_acc_ShortHumor': 0.9581442475318909, 'val_acc_SARC': 0.6701666712760925}


  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1876 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 0.026761318635796446, 'train_acc': 0.9964342713356018, 'val_loss': 1.4792386742195043, 'val_acc': 0.6959322094917297, 'train_loss_ShortHumor': 0.0010979920536727916, 'train_loss_SARC': 0.025663326582123655, 'train_acc_ShortHumor': 0.9997988343238831, 'train_acc_SARC': 0.9931107759475708, 'val_loss_ShortHumor': 0.18398326656823877, 'val_loss_SARC': 1.2952554076512655, 'val_acc_ShortHumor': 0.9601613879203796, 'val_acc_SARC': 0.6784666776657104}


In [20]:
df

Unnamed: 0,i_epoch,train_loss,train_loss_ShortHumor,train_loss_SARC,train_acc,train_acc_ShortHumor,train_acc_SARC,val_loss,val_lossShortHumor,val_lossSARC,val_acc,val_accShortHumor,val_accSARC,val_acc_SARC,val_acc_ShortHumor,val_loss_SARC,val_loss_ShortHumor
0,0.0,0.559728,0.060294,0.499434,0.879349,0.772564,0.982386,0.728125,,,0.704968,,,0.689567,0.937973,0.583564,0.144561
1,1.0,0.349202,0.024158,0.325044,0.927253,0.992327,0.867726,0.854449,,,0.69737,,,0.680767,0.948563,0.665802,0.188647
2,2.0,0.145856,0.004546,0.14131,0.974157,0.949519,0.998703,1.120237,,,0.693087,,,0.6761,0.950076,0.93116,0.189076
3,3.0,0.069999,0.00214,0.067859,0.989419,0.978951,0.999414,1.26818,,,0.688022,,,0.670167,0.958144,1.06704,0.20114
4,4.0,0.026761,0.001098,0.025663,0.996434,0.999799,0.993111,1.479239,,,0.695932,,,0.678467,0.960161,1.295255,0.183983


### PASTEL_country + SARC + ShortHumor freeze all

In [53]:
selected_task = ['PASTEL_country', # 33224
                 'SARC', # 205645
#                  'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
#                  'VUA', # 15157
                ] 
freeze_bert = True
use_pooler=True
num_epoch=5
limit=30000

df = train_multitask(selected_task, freeze_bert, use_pooler=use_pooler, num_epoch=num_epoch, limit=limit)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/1130 [00:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 1.3478879763034892, 'train_acc': 0.7459903359413147, 'val_loss': 1.351353927016269, 'val_acc': 0.5846247673034668, 'train_loss_SARC': 0.6901458172895996, 'train_loss_PASTEL_country': 0.10328395434886294, 'train_loss_ShortHumor': 0.5544582046650268, 'train_acc_PASTEL_country': 0.5218007564544678, 'train_acc_SARC': 0.9789002537727356, 'train_acc_ShortHumor': 0.7239354252815247, 'val_loss_PASTEL_country': 0.11057325167809728, 'val_loss_SARC': 0.690218567721049, 'val_loss_ShortHumor': 0.5505621076171229, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.5213000178337097, 'val_acc_ShortHumor': 0.7221381664276123}


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/1130 [00:00<?, ?it/s]

{'i_epoch': 1, 'train_loss': 1.315136425109223, 'train_acc': 0.7626510858535767, 'val_loss': 1.3229866455977528, 'val_acc': 0.6278226971626282, 'train_loss_SARC': 0.6801249834743752, 'train_loss_ShortHumor': 0.5330846307033215, 'train_loss_PASTEL_country': 0.10192681093152628, 'train_acc_PASTEL_country': 0.5764173865318298, 'train_acc_SARC': 0.740421712398529, 'train_acc_ShortHumor': 0.9790101647377014, 'val_loss_PASTEL_country': 0.11147631177364302, 'val_loss_SARC': 0.6801381779670715, 'val_loss_ShortHumor': 0.5313721558570381, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.5729333162307739, 'val_acc_ShortHumor': 0.7281895875930786}


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/1130 [00:00<?, ?it/s]

{'i_epoch': 2, 'train_loss': 1.3030687483747263, 'train_acc': 0.7674493789672852, 'val_loss': 1.3115802746861982, 'val_acc': 0.629123330116272, 'train_loss_SARC': 0.6796582365760059, 'train_loss_PASTEL_country': 0.10222911398486915, 'train_loss_ShortHumor': 0.5211813978138512, 'train_acc_PASTEL_country': 0.5743288993835449, 'train_acc_SARC': 0.978828489780426, 'train_acc_ShortHumor': 0.7496123909950256, 'val_loss_PASTEL_country': 0.11145547760505649, 'val_loss_SARC': 0.6797234460512797, 'val_loss_ShortHumor': 0.5204013510298621, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.5738666653633118, 'val_acc_ShortHumor': 0.7377710342407227}


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/1130 [00:00<?, ?it/s]

{'i_epoch': 3, 'train_loss': 1.2915832654363686, 'train_acc': 0.7729625701904297, 'val_loss': 1.3019626897497782, 'val_acc': 0.6373146176338196, 'train_loss_ShortHumor': 0.5124990444973008, 'train_loss_PASTEL_country': 0.10152204740968009, 'train_loss_SARC': 0.6775621735293876, 'train_acc_PASTEL_country': 0.7571437954902649, 'train_acc_SARC': 0.9790216684341431, 'train_acc_ShortHumor': 0.5837439894676208, 'val_loss_PASTEL_country': 0.11151475002713342, 'val_loss_SARC': 0.6775517244021098, 'val_loss_ShortHumor': 0.5128962153205352, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.582966685295105, 'val_acc_ShortHumor': 0.7493696212768555}


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/1130 [00:00<?, ?it/s]

{'i_epoch': 4, 'train_loss': 1.2907954024404542, 'train_acc': 0.774362325668335, 'val_loss': 1.3010049818080396, 'val_acc': 0.6396945118904114, 'train_loss_ShortHumor': 0.5133960778228087, 'train_loss_PASTEL_country': 0.10103028968157811, 'train_loss_SARC': 0.6763690349360674, 'train_acc_PASTEL_country': 0.7556447386741638, 'train_acc_SARC': 0.9791115522384644, 'train_acc_ShortHumor': 0.5883961319923401, 'val_loss_PASTEL_country': 0.11138038554800912, 'val_loss_SARC': 0.6762410214424134, 'val_loss_ShortHumor': 0.5133835748176171, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.5862666964530945, 'val_acc_ShortHumor': 0.7428139448165894}


### PASTEL_country + SARC + ShortHumor freeze all, no pooler

In [18]:
selected_task = ['PASTEL_country', # 33224
                 'SARC', # 205645
#                  'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
#                  'VUA', # 15157
                ] 
freeze_bert = True
use_pooler=False
num_epoch=5
limit=30000

df = train_multitask(selected_task, freeze_bert, use_pooler=use_pooler, num_epoch=num_epoch, limit=limit)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/1130 [01:00<?, ?it/s]

{'i_epoch': 0, 'train_loss': 1.2177376822810455, 'train_acc': 0.8013020157814026, 'val_loss': 1.2296284687517374, 'val_acc': 0.6579034924507141, 'train_loss_PASTEL_country': 0.10260867291143409, 'train_loss_ShortHumor': 0.45339751350196145, 'train_loss_SARC': 0.66173149586765, 'train_acc_PASTEL_country': 0.9791006445884705, 'train_acc_SARC': 0.8129602670669556, 'train_acc_ShortHumor': 0.6054857969284058, 'val_loss_PASTEL_country': 0.11002312396138784, 'val_loss_SARC': 0.6615031373341879, 'val_loss_ShortHumor': 0.4581022074561617, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.604033350944519, 'val_acc_ShortHumor': 0.8058497309684753}


  0%|          | 0/2814 [00:00<?, ?it/s]

  0%|          | 0/2814 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [23]:
d = {'i_epoch': 0, 'train_loss': 1.2177376822810455, 'train_acc': 0.8013020157814026, 'val_loss': 1.2296284687517374, 'val_acc': 0.6579034924507141, 'train_loss_PASTEL_country': 0.10260867291143409, 'train_loss_ShortHumor': 0.45339751350196145, 'train_loss_SARC': 0.66173149586765, 'train_acc_PASTEL_country': 0.9791006445884705, 'train_acc_SARC': 0.8129602670669556, 'train_acc_ShortHumor': 0.6054857969284058, 'val_loss_PASTEL_country': 0.11002312396138784, 'val_loss_SARC': 0.6615031373341879, 'val_loss_ShortHumor': 0.4581022074561617, 'val_acc_PASTEL_country': 0.9764025807380676, 'val_acc_SARC': 0.604033350944519, 'val_acc_ShortHumor': 0.8058497309684753}
print('\n'.join([f"{k}:{v:.4}" if isinstance(v, float) else f"{k}:{v}" for k,v in d.items()]))

i_epoch:0
train_loss:1.218
train_acc:0.8013
val_loss:1.23
val_acc:0.6579
train_loss_PASTEL_country:0.1026
train_loss_ShortHumor:0.4534
train_loss_SARC:0.6617
train_acc_PASTEL_country:0.9791
train_acc_SARC:0.813
train_acc_ShortHumor:0.6055
val_loss_PASTEL_country:0.11
val_loss_SARC:0.6615
val_loss_ShortHumor:0.4581
val_acc_PASTEL_country:0.9764
val_acc_SARC:0.604
val_acc_ShortHumor:0.8058


### PASTEL_country + SARC + ShortHumor unfreeze all, no pooler

In [None]:
selected_task = ['PASTEL_country', # 33224
                 'SARC', # 205645
#                  'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
#                  'VUA', # 15157
                ] 
freeze_bert = False
use_pooler=False
num_epoch=5
limit=30000

df = train_multitask(selected_task, freeze_bert, use_pooler=use_pooler, num_epoch=num_epoch, limit=limit)