<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#multi-task-dataloader" data-toc-modified-id="multi-task-dataloader-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>multi-task dataloader</a></span></li><li><span><a href="#multi-task-model" data-toc-modified-id="multi-task-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>multi-task model</a></span></li><li><span><a href="#train" data-toc-modified-id="train-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>train</a></span></li><li><span><a href="#check-model-output" data-toc-modified-id="check-model-output-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>check model output</a></span></li><li><span><a href="#check-bert-output" data-toc-modified-id="check-bert-output-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>check bert output</a></span></li><li><span><a href="#senteval" data-toc-modified-id="senteval-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>senteval</a></span></li></ul></div>

In [1]:
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim import lr_scheduler
import torchmetrics
import numpy as np
import collections
import json
from tqdm.auto import tqdm, trange
import matplotlib.pyplot as plt
from transformers import AutoConfig, AutoTokenizer, BertModel, RobertaModel
import sys
import os

In [2]:
# https://github.com/huggingface/transformers/issues/5486
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
with open('../data/xslue/tasks.json', 'r') as f:
    tasks = json.load(f)
tasks

{'CrowdFlower': 13,
 'DailyDialog': 7,
 'EmoBank_Valence': 1,
 'EmoBank_Arousal': 1,
 'EmoBank_Dominance': 1,
 'HateOffensive': 3,
 'PASTEL_age': 8,
 'PASTEL_country': 2,
 'PASTEL_education': 10,
 'PASTEL_ethnic': 10,
 'PASTEL_gender': 3,
 'PASTEL_politics': 3,
 'PASTEL_tod': 5,
 'SARC': 2,
 'SarcasmGhosh': 2,
 'SentiTreeBank': 1,
 'ShortHumor': 2,
 'ShortJokeKaggle': 2,
 'ShortRomance': 2,
 'StanfordPoliteness': 1,
 'TroFi': 2,
 'VUA': 2}

In [5]:
tasks2idx = {}
for i,k in enumerate(tasks):
    tasks2idx[k] = i
tasks2idx

{'CrowdFlower': 0,
 'DailyDialog': 1,
 'EmoBank_Valence': 2,
 'EmoBank_Arousal': 3,
 'EmoBank_Dominance': 4,
 'HateOffensive': 5,
 'PASTEL_age': 6,
 'PASTEL_country': 7,
 'PASTEL_education': 8,
 'PASTEL_ethnic': 9,
 'PASTEL_gender': 10,
 'PASTEL_politics': 11,
 'PASTEL_tod': 12,
 'SARC': 13,
 'SarcasmGhosh': 14,
 'SentiTreeBank': 15,
 'ShortHumor': 16,
 'ShortJokeKaggle': 17,
 'ShortRomance': 18,
 'StanfordPoliteness': 19,
 'TroFi': 20,
 'VUA': 21}

In [6]:
# task and their (train) dataset size 
selected_task = ['PASTEL_country', # 33224
#                  'SARC', # 205645
                 'SarcasmGhosh', # 39780
                 'ShortHumor', # 37801
#                  'ShortJokeKaggle', # 406682
#                  'ShortRomance', # 1902
#                  'TroFi', # 3335
                 'VUA', # 15157
                ] 


# multi-task dataloader

In [7]:
class MyDataset(Dataset): 
    # currently it's a Mapping-style dataset. Not sure if a Iterable-style dataset will be better
    def __init__(self, tsv_file):
        self.df = pd.read_csv(tsv_file, sep='\t')
        self.df = self.df.dropna()
        self.df = self.df.reset_index(drop=True)
        if self.df['label'].dtype == 'float64':
            self.df['label'] = self.df['label'].astype('float32')
            
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        dataslice = self.df.iloc[idx]
        sample = {'text':dataslice['text'], 'label':dataslice['label']}
        return sample


In [8]:
class MultiTaskTrainDataLoader():
    '''
    Each time, a random integer selects a dataset and load a batch of data {text, label} from it. Return i_task and data
    
    a iterator
    Known issue: large dataset may have not iterate once, small datasets may have been iterated many times
    '''
    
    def __init__(self, tasks, batch_size, shuffle, num_workers):
        self.tasks = tasks
        self.split = 'train'
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv'))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers)) 
            self.dataloaderiters.append(self.dataloaders[-1]._get_iterator())
            self.len += len(self.dataloaders[-1])
    def __len__(self):   
        return self.len

    def __iter__(self):
        self.n = 0
        return self
    
    def __next__(self):
        i_task = np.random.randint(self.num_tasks)
        if self.n < self.len:
            self.n += 1
        else:
            raise StopIteration
        dataloaderiter = self.dataloaderiters[i_task]    
        try: 
            batch = next(dataloaderiter)
        except StopIteration:
#             self.dataloaderiters[i_task]._reset(self.dataloaders[i_task])
#             dataloaderiter = self.dataloaderiters[i_task]
            self.dataloaderiters[i_task] = iter(self.dataloaders[i_task])
            dataloaderiter = self.dataloaderiters[i_task]
            batch = next(dataloaderiter)
        return i_task, batch

In [9]:
class MultiTaskTestDataLoader():
    '''
    For dev and test
    
    a generator
    '''
    
    def __init__(self, tasks, split, batch_size, shuffle, num_workers):
        assert split in ['dev', 'test'], 'not implemented'
        self.tasks = tasks
        self.split = split
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.num_workers = num_workers
        
        self.num_tasks = len(tasks)
        self.datasets = []
        self.dataloaders = []
        self.dataloaderiters = []
        self.len = 0
        for task in tasks:
            self.datasets.append(MyDataset(f'../data/xslue/processed/{self.split}/{task}.tsv'))
            self.dataloaders.append(DataLoader(self.datasets[-1], batch_size=self.batch_size, shuffle=self.shuffle, num_workers=self.num_workers))
            self.len += len(self.dataloaders[-1])
        self.i_task = 0
    def __len__(self):   
        return self.len

    def __iter__(self):
        for i_task in range(self.num_tasks):
            dataloader = self.dataloaders[i_task]
            for batch in dataloader:
                yield i_task, batch
    
        

# multi-task model

In [10]:
from transformers.models.bert.modeling_bert import BertPreTrainedModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

In [11]:
class RegressionHead(nn.Module):
    def __init__(self, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
#         self.hidden1 = nn.Linear(embedding_dim, hidden_dim)
#         self.hidden2 = nn.Linear(hidden_dim, 1)
        self.hidden = nn.Linear(embedding_dim, 1)
        self.activation = nn.Sigmoid()
        
        self.loss_fn = nn.MSELoss()
    def forward(self, sent_emb, label):
#         output = self.activation(self.hidden2(self.hidden1(sent_emb))).squeeze(1)
        output = self.activation(self.hidden(sent_emb)).squeeze(1)
        
        loss = self.loss_fn(output, label)
        return output, loss

In [12]:
class ClassificationHead(nn.Module):
    def __init__(self, num_labels, embedding_dim = 768, hidden_dim = 128):
        super().__init__()
#         self.hidden1 = nn.Linear(embedding_dim, hidden_dim)
#         self.hidden2 = nn.Linear(hidden_dim, num_labels)
        self.hidden = nn.Linear(embedding_dim, num_labels)
        self.activation = nn.Softmax(dim=1)
        
        self.loss_fn = nn.CrossEntropyLoss()
    def forward(self, sent_emb, label):
#         output = self.activation(self.hidden2(self.hidden1(sent_emb)))
        output = self.activation(self.hidden(sent_emb))
        
        loss = self.loss_fn(output, label)
        return output, loss

In [13]:
class MultiTaskBert(BertPreTrainedModel):
    def __init__(self, config, selected_task, use_pooler=True):
        super().__init__(config)
        self.use_pooler = use_pooler
        self.basemodel = BertModel(config)
        self.style_heads = nn.ModuleList()
        for task in selected_task:
            if tasks[task] == 1:
                self.style_heads.append(RegressionHead())
            else:
                self.style_heads.append(ClassificationHead(tasks[task]))
    def forward(self, input_ids, token_type_ids, attention_mask, i_task=None, label=None, return_sent_emb=False):
        output = self.basemodel(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        if self.use_pooler:
            sent_emb = output['pooler_output']
        else:
            sent_emb = output['last_hidden_state'][:,0,:]
        
        if return_sent_emb:
            return sent_emb  
        output, loss = self.style_heads[i_task](sent_emb, label)
        return output, loss

In [14]:
class MultiTaskRoberta(RobertaPreTrainedModel):
    def __init__(self, config, selected_task, use_pooler=True):
        super().__init__(config)
        self.use_pooler = use_pooler
        self.basemodel = RobertaModel(config)
        self.style_heads = nn.ModuleList()
        for task in selected_task:
            if tasks[task] == 1:
                self.style_heads.append(RegressionHead())
            else:
                self.style_heads.append(ClassificationHead(tasks[task]))
    def forward(self, input_ids, attention_mask, i_task=None, label=None, return_sent_emb=False):
        output = self.basemodel(input_ids=input_ids, attention_mask=attention_mask)
        if self.use_pooler:
            sent_emb = output['pooler_output']
        else:
            sent_emb = output['last_hidden_state'][:,0,:]
        if return_sent_emb:
            return sent_emb
        
        output, loss = self.style_heads[i_task](sent_emb, label)
        
        return output, loss, sent_emb

In [15]:
def print_loss(losses):
    for k in losses:
        print(f'{losses[k]:4.4f}', end=' ')
    print('')

In [16]:
def validate(mt_val_dataloader):
    val_loss = collections.defaultdict(float)
    val_size = collections.defaultdict(int)
    acc = torchmetrics.Accuracy() # todo
    mt_model.eval()
    for data in tqdm(mt_val_dataloader):  
        i_task, batch = data
        label = batch['label'].to(device)
        size = len(label)
        del batch['label']
        tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
        output, loss = mt_model(**tokens, i_task=i_task,  label=label)
#         acc.update(preds, target)
        tokens = None
        output = None
        val_loss[i_task] += loss.detach().item()*size
        val_size[i_task] += size
    for i_task in val_loss:
        val_loss[i_task] /= val_size[i_task]
    mt_model.train()

    return val_loss


# train

In [17]:
# larger batch_size will definitely lead to memory issue
mt_dataloader = MultiTaskTrainDataLoader(selected_task, batch_size = 16, shuffle = True, num_workers = 4)
mt_dev_dataloader = MultiTaskTestDataLoader(selected_task, split='dev', batch_size = 16, shuffle = True, num_workers = 4)
mt_test_dataloader = MultiTaskTestDataLoader(selected_task, split='test', batch_size = 16, shuffle = True, num_workers = 4)

In [18]:
base_model = "bert-base-uncased"
# base_model = 'roberta-base'

config = AutoConfig.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

mt_model = MultiTaskBert(config, selected_task, use_pooler=False).to(device)
# mt_model = MultiTaskRoberta(config, tasks).to(device)


In [19]:
for param in mt_model.basemodel.parameters():
    param.requires_grad = False

In [20]:
optimizer = optim.AdamW(mt_model.parameters(), lr=0.001)
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=0.001, total_steps=len(mt_dataloader)) 

In [45]:
torch.cuda.empty_cache()

losses = collections.defaultdict(list)
df_dev_loss = pd.DataFrame(columns=np.arange(0,len(selected_task)))
acc_objs = [torchmetrics.Accuracy() for i in range(len(selected_task))]

accs = collections.defaultdict(list) # collect per 300 steps

test_embs = []

for i_iter, data in enumerate(tqdm(mt_dataloader)):  
    if i_iter == 1000:
        for param in mt_model.basemodel.parameters():
            param.requires_grad = True
    i_task, batch = data
    optimizer.zero_grad()
    label = batch['label'].to(device)
    del batch['label']
    tokens = tokenizer(**batch, return_tensors='pt', padding=True, truncation=True, max_length=64).to(device)
    output, loss = mt_model(**tokens, i_task=i_task,  label=label)
    loss.backward()
    optimizer.step()
    scheduler.step()
    acc_objs[i_task].update(output.to('cpu').detach(), label.to('cpu').detach())
    
    losses[i_task].append(loss.detach().item())
    
    if i_iter in [0,10,100,200,210,300,400,500]:
        sent_emb = mt_model(**tokens, i_task=i_task,  label=label, return_sent_emb=True)
        test_embs.append(sent_emb)
        
#     if i_iter > 1500:
#         break

    if i_iter%300 == 0 and i_iter != 0:
        for i_task in range(4):
            accs[i_task].append(acc_objs[i_task].compute().detach().item())
        acc_objs = [torchmetrics.Accuracy() for i in range(len(selected_task))]
        
        torch.cuda.empty_cache()
        dev_loss = validate(mt_dev_dataloader)
        df_dev_loss = df_dev_loss.append(dev_loss , ignore_index=True)
        print(f'#####training iter {i_iter}/{len(mt_dataloader)}')
        print_loss(dev_loss)

  0%|          | 0/7875 [00:00<?, ?it/s]

  0%|          | 0/586 [00:00<?, ?it/s]

#####training iter 300/7875
0.3372 0.6045 0.6850 0.5837 


  0%|          | 0/586 [00:00<?, ?it/s]

#####training iter 600/7875
0.3370 0.5008 0.7140 0.5840 


  0%|          | 0/586 [00:00<?, ?it/s]

#####training iter 900/7875
0.3369 0.4808 0.6568 0.5837 


  0%|          | 0/586 [00:00<?, ?it/s]

#####training iter 1200/7875
0.3369 1.2763 0.8150 0.5837 


  0%|          | 0/586 [00:00<?, ?it/s]

#####training iter 1500/7875
0.3369 1.2763 0.8150 0.5837 


KeyboardInterrupt: 

In [46]:
accs

defaultdict(list,
            {0: [0.9794303774833679,
              0.9725000262260437,
              0.9756944179534912,
              0.9754746556282043,
              0.9795082211494446],
             1: [0.5719085931777954,
              0.6089527010917664,
              0.663281261920929,
              0.5308098793029785,
              0.45892858505249023],
             2: [0.5615234375,
              0.5859375,
              0.5838607549667358,
              0.5496794581413269,
              0.5157967209815979],
             3: [0.7038461565971375,
              0.7053571343421936,
              0.7273550629615784,
              0.7196180820465088,
              0.7259615659713745]})

In [41]:
accc = torchmetrics.Accuracy()

In [43]:
accc.update(output.to('cpu').detach(), label.to('cpu').detach())

In [35]:
accc.update(torch.Tensor([1.,1,1,0,0]), torch.LongTensor([1,1,1,0,0]))

In [44]:
accc.compute()

tensor(0.6250)

In [21]:
df_dev_loss.columns = selected_task
df_dev_loss

Unnamed: 0,PASTEL_country,SarcasmGhosh,ShortHumor,VUA
0,0.33686,1.276343,0.811491,0.583713
1,0.336859,1.276343,0.811497,0.583713
2,0.336859,1.276343,0.811497,0.583713
3,1.289664,0.350181,0.811497,0.583714
4,1.289664,0.350181,0.811497,0.583714
5,1.289664,0.350181,0.811497,0.583714
6,1.289664,0.350181,0.811497,0.583714
7,1.289664,0.350181,0.811497,0.583714
8,1.289664,0.350181,0.811497,0.583714
9,1.289664,0.350181,0.811497,0.583714


# check model output

In [22]:
accs[3]

[0.5625,
 0.4375,
 0.5,
 0.4375,
 0.6875,
 0.5625,
 0.625,
 0.9375,
 0.5625,
 0.5625,
 0.8125,
 0.6875,
 0.8125,
 0.6875,
 0.75,
 0.8125,
 0.8125,
 0.8125,
 0.75,
 0.5625,
 0.6875,
 0.625,
 0.75,
 0.6875,
 0.6875,
 0.625,
 0.75,
 0.6875,
 0.5625,
 0.6875,
 0.9375,
 0.75,
 0.75,
 0.75,
 0.6875,
 0.5625,
 0.625,
 0.6875,
 0.625,
 0.9375,
 0.625,
 0.6875,
 0.6875,
 0.75,
 0.6875,
 0.6875,
 0.75,
 0.6875,
 0.6875,
 0.6875,
 0.75,
 0.8125,
 0.75,
 0.8125,
 0.625,
 0.6875,
 0.75,
 0.6875,
 0.75,
 0.75,
 0.6875,
 0.75,
 0.6875,
 0.8125,
 0.6875,
 0.5625,
 0.75,
 0.6875,
 0.8125,
 0.5,
 0.875,
 0.75,
 0.625,
 0.75,
 0.75,
 0.6875,
 0.6875,
 0.8125,
 0.75,
 0.8125,
 0.75,
 0.6875,
 0.75,
 0.75,
 0.6875,
 0.8125,
 0.6875,
 0.75,
 0.6875,
 0.6875,
 0.75,
 0.625,
 0.875,
 0.75,
 0.8125,
 0.5625,
 0.625,
 0.6875,
 0.75,
 0.6875,
 0.9375,
 0.5625,
 0.6875,
 0.8125,
 0.625,
 0.6875,
 0.8125,
 0.6875,
 1.0,
 0.5,
 0.8125,
 0.8125,
 0.8125,
 0.75,
 0.75,
 0.9375,
 0.8125,
 0.8125,
 0.5,
 0.6875,
 0.687

In [23]:
for acc in acc_objs:
    print(acc.compute())

tensor(0.9551)
tensor(0.5574)
tensor(0.5461)
tensor(0.7150)


In [33]:
for acc in acc_objs:
    print(acc.compute())

tensor(0.9789)
tensor(0.5158)
tensor(0.4825)
tensor(0.7243)


In [24]:
mt_model.eval()

MultiTaskBert(
  (basemodel): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [25]:
logits, loss = mt_model(**tokens, i_task=i_task,  label=label)

In [26]:
logits

tensor([[0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937],
        [0.1063, 0.8937]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [25]:
label

tensor([1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0], device='cuda:0')

In [26]:
nn.CrossEntropyLoss()(logits, label)

tensor(0.8756, device='cuda:0', grad_fn=<NllLossBackward0>)

In [27]:
loss

tensor(0.8756, device='cuda:0', grad_fn=<NllLossBackward0>)

# check bert output

In [30]:
output = mt_model.basemodel(**tokens)

In [37]:
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1633, -0.6806,  0.7816,  ..., -0.4631, -0.7018, -1.1974],
         [-0.1623, -0.6788,  0.7804,  ..., -0.4576, -0.7000, -1.1939],
         [-0.1627, -0.6795,  0.7822,  ..., -0.4595, -0.6993, -1.1916],
         ...,
         [-0.1626, -0.6791,  0.7818,  ..., -0.4588, -0.6999, -1.1938],
         [-0.1611, -0.6782,  0.7817,  ..., -0.4582, -0.6971, -1.1924],
         [-0.1617, -0.6792,  0.7808,  ..., -0.4583, -0.6982, -1.1930]],

        [[-0.1632, -0.6810,  0.7818,  ..., -0.4628, -0.7019, -1.1979],
         [-0.1630, -0.6798,  0.7813,  ..., -0.4565, -0.6979, -1.1942],
         [-0.1622, -0.6793,  0.7822,  ..., -0.4578, -0.7010, -1.1931],
         ...,
         [-0.1626, -0.6796,  0.7821,  ..., -0.4586, -0.7001, -1.1943],
         [-0.1610, -0.6787,  0.7821,  ..., -0.4581, -0.6973, -1.1930],
         [-0.1617, -0.6798,  0.7812,  ..., -0.4581, -0.6984, -1.1936]],

        [[-0.1632, -0.6806,  0.7818,  ..., -0.4629, -

In [33]:
label

tensor([1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0], device='cuda:0')

In [35]:
mt_model.style_heads[i_task](output['last_hidden_state'][:,0,:], label)

(tensor([[3.4035e-04, 9.9966e-01],
         [3.4037e-04, 9.9966e-01],
         [3.4035e-04, 9.9966e-01],
         [3.4037e-04, 9.9966e-01],
         [3.4031e-04, 9.9966e-01],
         [3.4029e-04, 9.9966e-01],
         [3.4035e-04, 9.9966e-01],
         [3.4036e-04, 9.9966e-01],
         [3.4035e-04, 9.9966e-01],
         [3.4033e-04, 9.9966e-01],
         [3.4035e-04, 9.9966e-01],
         [3.4032e-04, 9.9966e-01],
         [3.4036e-04, 9.9966e-01],
         [3.4035e-04, 9.9966e-01],
         [3.4031e-04, 9.9966e-01],
         [3.4030e-04, 9.9966e-01]], device='cuda:0', grad_fn=<SoftmaxBackward0>),
 tensor(0.8756, device='cuda:0', grad_fn=<NllLossBackward0>))

In [86]:
logits, loss = mt_model(**tokens, i_task=i_task,  label=label)

In [88]:
logits, loss

(tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
        device='cuda:0', grad_fn=<SqueezeBackward1>),
 tensor(0.2813, device='cuda:0', grad_fn=<MseLossBackward0>))

In [24]:
label

tensor([ 8, 12, 12, 12,  7, 10,  7, 10, 12, 10], device='cuda:0')

In [23]:
sent_emb = mt_model(**tokens, i_task=i_task,  label=label, return_sent_emb=True)
sent_emb

tensor([[ 5.0471e-01,  1.2167e+00, -4.7918e-01,  ..., -1.3093e-01,
         -4.7092e-03,  1.0579e+00],
        [ 4.8265e-01,  1.1955e+00, -4.8833e-01,  ..., -9.8464e-02,
         -2.6491e-02,  1.0363e+00],
        [ 4.9495e-01,  1.2084e+00, -4.8237e-01,  ..., -8.0255e-02,
          1.3555e-02,  1.0508e+00],
        ...,
        [ 5.0307e-01,  3.3286e-01, -4.7845e-01,  ..., -1.3443e-01,
          1.0501e-02,  1.0487e+00],
        [ 5.2614e-01,  1.2204e+00, -4.6653e-01,  ..., -7.9352e-02,
         -9.1928e-03,  1.0598e+00],
        [ 5.0942e-01,  1.2224e+00, -4.7927e-01,  ..., -1.2814e-01,
          1.0450e-03,  1.0637e+00]], device='cuda:0', grad_fn=<SliceBackward0>)

In [36]:
test_embs

[tensor([[ 0.5493, -1.0068,  0.6620,  ...,  0.4401, -0.6234, -1.8955],
         [ 0.4866, -0.3081,  1.4383,  ...,  0.5462, -0.6992, -1.7907],
         [ 0.8709, -0.7770,  1.1853,  ..., -0.2183, -0.8382, -1.6414],
         ...,
         [ 0.0543, -0.6668,  1.2341,  ...,  1.3756, -0.6340, -1.6918],
         [-0.3885, -0.9061,  1.8967,  ...,  0.3281, -0.7763, -1.8667],
         [ 0.4483, -1.2293,  1.1639,  ...,  0.1616, -0.7883, -1.9392]],
        device='cuda:0', grad_fn=<SliceBackward0>),
 tensor([[ 1.1720,  0.1778,  0.5938,  ..., -0.9601, -0.2259, -0.2412],
         [ 1.1684,  0.2329,  0.4642,  ..., -1.0706,  0.3184, -0.2005],
         [ 0.6869, -0.0212,  0.8063,  ..., -0.9281, -0.1159, -0.3850],
         ...,
         [ 1.3084,  0.0476,  0.4350,  ..., -0.6799, -0.5986, -0.4706],
         [ 1.2512,  0.1888,  0.0546,  ..., -0.9272, -0.0723, -0.0089],
         [ 0.7005,  0.3047,  0.6693,  ..., -0.7450, -0.3109, -0.1338]],
        device='cuda:0', grad_fn=<SliceBackward0>),
 tensor([[ 0.8

torch metrics
https://torchmetrics.readthedocs.io/en/stable/pages/overview.html

In [51]:
metric = torchmetrics.Accuracy()

In [60]:
acc = metric(torch.softmax(output, dim=-1).cpu(), torch.LongTensor([6, 7,  7, 12,  5, 12, 10,  5,  8,  8, 8, 8,  9,  5,  7, 3]))
acc.item()

0.1875

In [63]:
total_train_accuracy = metric.compute()

In [65]:
total_train_accuracy

tensor(0.0833)

In [22]:
df_dev.columns = tasks.items()
df_dev

Unnamed: 0_level_0,CrowdFlower,DailyDialog,EmoBank_Valence,EmoBank_Arousal,EmoBank_Dominance,HateOffensive,PASTEL_age,PASTEL_country,PASTEL_education,PASTEL_ethnic,...,PASTEL_tod,SARC,SarcasmGhosh,SentiTreeBank,ShortHumor,ShortJokeKaggle,ShortRomance,StanfordPoliteness,TroFi,VUA
Unnamed: 0_level_1,13,7,1,1,1,3,8,2,10,10,...,5,2,2,1,2,2,2,1,2,2
0,2.312776,0.832633,0.01766,0.064081,0.007568,0.701575,1.832837,0.693147,2.570806,1.628639,...,1.576885,1.125411,0.200766,0.065751,0.693147,0.693147,0.693147,0.012265,0.693147,0.693147
1,2.312776,0.832633,0.035017,0.012292,0.02621,0.701575,1.832837,0.693147,2.570806,1.628639,...,1.576885,1.125411,0.200766,1.380987,0.693147,0.693147,0.693147,0.023418,0.693147,0.693147
2,2.312776,0.832633,0.016358,0.056178,0.019477,0.701575,1.832837,0.693147,2.570802,1.628639,...,1.576885,1.125411,0.200766,0.075151,0.693147,0.693147,0.693147,0.01274,0.693147,0.693147
3,3.281408,3.575869,10356.68317,3.35056,202.949844,2.296593,2.876076,0.693147,2.906558,1.622863,...,1.576885,0.693147,0.693147,0.243174,0.693147,0.693147,0.693147,117.78529,0.693147,1.586025
4,3.19974,3.575869,0.013024,0.009631,0.009826,2.296593,2.929388,0.693147,2.869238,1.804569,...,1.576885,0.693147,0.693147,0.073756,0.693147,0.693147,0.693147,0.012265,0.693147,0.693147
5,3.19974,3.575869,0.01752,0.009645,0.007379,2.296593,2.929388,0.693147,2.869238,1.804569,...,1.576885,0.693147,0.693147,0.082852,0.693147,0.693147,0.693147,0.02192,0.693147,0.693147
6,3.19974,3.575869,44.343632,39.389981,0.009247,2.296593,2.929388,0.693147,2.869238,1.804569,...,1.576885,0.693147,0.693147,0.066447,0.693147,0.693147,0.693147,0.040102,0.693147,0.693147
7,3.19974,3.575869,0.013002,0.009748,0.008249,2.296593,2.929388,0.693147,2.869238,1.804569,...,1.576885,0.693147,0.693147,243.021282,0.693147,0.693147,0.693147,0.014671,0.693147,0.693147
8,2.531922,3.575869,0.014539,0.009767,0.007498,1.902214,2.563523,0.693147,2.282474,1.289514,...,1.970627,0.693147,0.693147,0.063148,0.693147,1.125329,0.693147,0.011814,0.693147,0.693147
9,2.531922,3.056301,0.013438,0.010019,0.00848,1.902214,2.563523,0.693147,2.282474,1.289514,...,1.970627,0.693147,0.693147,0.06321,0.693147,1.125329,0.693147,0.012274,0.693147,0.693147


In [23]:
PATH = './mt_model_runs/mt_2.bin'
torch.save(mt_model.state_dict(), PATH)

In [24]:
import json

with open('losses.json', 'w') as f:
    json.dump(losses, f)


In [25]:
df_dev.to_csv('dev_losses.csv')

# senteval

In [None]:
# sys.path.append(f'{os.getcwd()}/../SentEval')
# PATH_TO_DATA = f'{os.getcwd()}/../SentEval/data'

# # Import SentEval
# import senteval

In [25]:
def prepare(params, samples):
    return
def batcher(params, batch):
    sentences = [' '.join(s) for s in batch]
    batch = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
    )
    
    for k in batch:
        batch[k] = batch[k].to(device)
    with torch.no_grad():
        sent_emb = mt_model(**batch, return_sent_emb=True)
    
    return sent_emb.cpu()

# Set params for SentEval (fastmode)
params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
                                    'tenacity': 3, 'epoch_size': 2}

task_set = 'sts'
if task_set == 'sts':
    senteval_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
elif task_set == 'transfer':
    senteval_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']
elif task_set == 'full':
    senteval_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
    senteval_tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']

se = senteval.engine.SE(params, batcher, prepare)
mt_model.eval()
results = se.eval(senteval_tasks)

In [26]:
results

{'STS12': {'MSRpar': {'pearson': (0.04511842111176674, 0.21713343634765625),
   'spearman': SpearmanrResult(correlation=0.05829879048588996, pvalue=0.11065140077710642),
   'nsamples': 750},
  'MSRvid': {'pearson': (-0.04719690809332701, 0.19666754119304142),
   'spearman': SpearmanrResult(correlation=-0.05791055536831769, pvalue=0.11305028382924873),
   'nsamples': 750},
  'SMTeuroparl': {'pearson': (0.05465063166493182, 0.24259241634429948),
   'spearman': SpearmanrResult(correlation=0.04601455241561682, pvalue=0.3252817731161748),
   'nsamples': 459},
  'surprise.OnWN': {'pearson': (0.028684333440108387, 0.43280310929667104),
   'spearman': SpearmanrResult(correlation=0.0486239359795535, pvalue=0.183456625977699),
   'nsamples': 750},
  'surprise.SMTnews': {'pearson': (0.004681029491783591, 0.9257362156479326),
   'spearman': SpearmanrResult(correlation=-0.013892916608515183, pvalue=0.7820461848374392),
   'nsamples': 399},
  'all': {'pearson': {'all': 0.014868264700974198,
    'mea