In [1]:
# Development of the predictive model

In [2]:
# First load the dataset
import pandas as pd

dataset = pd.read_csv('./data/develop_set_bayes_risk.csv', sep="\t")

In [3]:
# Next load the model
from utils.config_utils import parse_config
parsed_config = parse_config('./configs/helsinki-taboeta-de-en.yml', pretrained=True)

model = parsed_config["model"]
tokenizer = parsed_config["tokenizer"]
model = model.to("cuda")
model.eval()

Using custom data configuration de-en-lang1=de,lang2=en
Reusing dataset tatoeba (C:\Users\gerso\.cache\huggingface\datasets\tatoeba\de-en-lang1=de,lang2=en\0.0.0\b3ea9c6bb2af47699c5fc0a155643f5a0da287c7095ea14824ee0a8afd74daf6)


  0%|          | 0/1 [00:00<?, ?it/s]

[276634, 20738, 2500, 2500, 5000]


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [4]:
import math
import ast
# Next we need to create a proper dataset from it, first we split every columns
columns = ["source", "hypothesis", "avg", "std"]
expanded_dataset = pd.DataFrame(columns=columns)

number_of_hypothesis = int((len(dataset.columns) - 1)/2)

for i, row in dataset.iterrows():
    source = row["source"]
    for j in range(number_of_hypothesis):
        hypothesis = row["hypothesis_{}".format(j)]
        x = row["avg_std_{}".format(j)]

        if type(x) != float: # Bit of a hack but we have either a string containing a tuple or x is a float with value NaN
            x = ast.literal_eval(x)
            avg = float(x[0])
            std = float(x[0])
            expanded_dataset = expanded_dataset.append({"source": source, "hypothesis": hypothesis, "avg": avg, "std": std}, ignore_index=True)
print(expanded_dataset.head())

                                source                     hypothesis  \
0  Ich spreche überhaupt kein Deutsch.   I can't speak German at all.   
1  Ich spreche überhaupt kein Deutsch.  I cannot speak German at all.   
2  Ich spreche überhaupt kein Deutsch.   I don't speak German at all.   
3  Ich spreche überhaupt kein Deutsch.    I don't speak a French run.   
4  Ich spreche überhaupt kein Deutsch.    I don't know German by all.   

        avg       std  
0  1.223635  1.223635  
1  0.749200  0.749200  
2  1.228001  1.228001  
3  0.021646  0.021646  
4  0.585859  0.585859  


In [5]:
# Then we creat a dataset from it
from datasets import tqdm, Dataset

dataset = Dataset.from_pandas(expanded_dataset)


In [6]:
# Next we need to create a dataloader for it

# First define the collate function

def get_proprocess_function(tokenizer):
    def preprocess_function(examples, tokenizer,):
        source =examples["source"]
        targets = examples["hypothesis"]
        model_inputs = tokenizer(source,  truncation=True, )
        # Setup the tokenizer for targets
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, truncation=True, )

        model_inputs["labels"] = labels["input_ids"]

        return model_inputs
    return lambda examples: preprocess_function(examples, tokenizer)

preprocess_function = get_proprocess_function(tokenizer)
dataset = dataset.map(preprocess_function, batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [7]:
from transformers import DataCollatorForSeq2Seq
import torch
def get_collate_fn(model, tokenizer,):
    data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer,
                                           padding=True, return_tensors="pt")

    keys = [
        "input_ids",
        "attention_mask",
        "labels"
    ]

    def collate_fn(batch):
        
        new_batch = [{k: s[k] for k in keys} for s in batch]
        x_new = data_collator(new_batch)

        sources = [s["source"] for s in batch]
        hypothesis = [s["hypothesis"] for s in batch]
        
        #Group the averages and the standard deviations
        avgs = torch.Tensor([s["avg"] for s in batch])
        std = torch.Tensor([s["std"] for s in batch])
        
        return x_new, (sources, hypothesis), (avgs, std)

    return collate_fn

collate_fn = get_collate_fn(model, tokenizer,)

In [8]:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=32, shuffle=True)

In [9]:
#Check the output of 1 batch
for i, batch in enumerate(dataloader):
    print(batch)
    break

({'input_ids': tensor([[  149,  2670,    63,     5,    57,   783,   277,  4538,    18,     3,
             0, 58100, 58100, 58100, 58100, 58100, 58100, 58100],
        [ 2136,   109,  1680,     2,    44,   244,   718,   139,   567,  1830,
         16977,     3,     0, 58100, 58100, 58100, 58100, 58100],
        [  105, 26012,    51, 22740,     3,     0, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100, 58100, 58100, 58100, 58100],
        [17167,     2,   107,   334,    84,   260,  9497,   244,  9181,   524,
          2209,   827,   104,   759,    31,    68,     0, 58100],
        [ 2136,    29,     9,  2081, 26855,     5,     9,  1614,    10,    44,
           810,   110,    89,     3,     0, 58100, 58100, 58100],
        [   42,  4313,    74,     2,    78,  1762,    76,   106,  2695,  2187,
             3,     0, 58100, 58100, 58100, 58100, 58100, 58100],
        [   55,   644, 18751,    74,     3,     0, 58100, 58100, 58100, 58100,
         58100, 58100, 58100, 58100,

In [10]:
print(model)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(58101, 512, padding_idx=58100)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(58101, 512, padding_idx=58100)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0): MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        )
   

In [11]:
# Next we create the predictive model 
from torch import nn

def avg_pooling(hidden_state, attention_mask):
    attention_hidden_state = hidden_state * attention_mask.unsqueeze(dim=-1)
    sum_hidden_state = torch.sum(attention_hidden_state, dim=1)
    n = torch.sum(attention_mask.unsqueeze(dim=-1), dim=1)
    avg_hidden_state = sum_hidden_state / n
    return avg_hidden_state

class PredictiveModel(nn.Module):
    
    def __init__(self, NMT_model, embedding_size=512):
        super().__init__()
        self.NMT_model = NMT_model
        # Then we create a two layer feed forward network

        # NMT model does not require a gradient
        self.NMT_model.requires_grad = False
        
        self.linear_layers = nn.Sequential(nn.Linear(512 * 2, 512), torch.nn.SiLU() , nn.Linear(512, 2)) 
        
        self.softplus = nn.Softplus()
        self.padding_id = -100
        
    def forward(self, input_ids,  attention_mask=None, labels=None, decoder_input_ids=None):
        nmt_out = self.NMT_model.forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_input_ids=decoder_input_ids, output_hidden_states=True, output_attentions=True)
        encoder_last_hidden_state = nmt_out["encoder_last_hidden_state"]
        decoder_last_hidden_state = nmt_out["decoder_hidden_states"][-1]
        logits = nmt_out["logits"]
       
        # Next perform average pooling
        # first apply attention_mask to encoder_last_hidden_state
  
        avg_encoder_hidden_state = avg_pooling(encoder_last_hidden_state, attention_mask)
        # Then devide 
        
        attention_mask_decoder = (self.padding_id != labels).long()
        avg_decoder_hidden_state = avg_pooling(decoder_last_hidden_state, attention_mask_decoder)
 
        # Concat the two
        hidden_states_concat = torch.cat([avg_encoder_hidden_state, avg_decoder_hidden_state], dim=-1)
        
        x = self.linear_layers(hidden_states_concat)
        #Next we apply softplus to the second part of x
        avg = x[:,0:1]
        std = x[:, 1:]

        std = self.softplus(std)
        return avg, std
predictive_model = PredictiveModel(model).to("cuda")  

In [12]:
### Lastly we can start the training
import torch.optim as optim
criterion = nn.GaussianNLLLoss()

predictive_model = PredictiveModel(model).to("cuda")  
optimizer = optim.Adam(predictive_model.linear_layers.parameters(), lr=0.0001,)
n_epochs = 10


for epoch in range(n_epochs):
    i = 0
    running_loss = 0.0
    for x, (sources, targets), (avg, std) in tqdm(dataloader):
        x = {k: v.to("cuda") for k, v in x.items()}
        (predicted_avg, predicted_std) = predictive_model.forward(**x)
        
        avg = avg.to("cuda")
        predicted_avg = predicted_avg.flatten()
        
        var = predicted_std.flatten()       
        loss = criterion(predicted_avg, avg, var=var)
    
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 20 == 19:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 20:.3f}')

            running_loss = 0.0
        i += 1

  0%|          | 0/47 [00:00<?, ?it/s]

[1,    20] loss: 0.173
[1,    40] loss: 0.503


  0%|          | 0/47 [00:00<?, ?it/s]

[2,    20] loss: 0.286
[2,    40] loss: 0.761


  0%|          | 0/47 [00:00<?, ?it/s]

[3,    20] loss: 0.705
[3,    40] loss: 0.545


  0%|          | 0/47 [00:00<?, ?it/s]

[4,    20] loss: 1.049
[4,    40] loss: 0.804


  0%|          | 0/47 [00:00<?, ?it/s]

[5,    20] loss: 0.328
[5,    40] loss: 0.359


  0%|          | 0/47 [00:00<?, ?it/s]

[6,    20] loss: 0.348
[6,    40] loss: 0.252


  0%|          | 0/47 [00:00<?, ?it/s]

[7,    20] loss: -0.006
[7,    40] loss: 0.085


  0%|          | 0/47 [00:00<?, ?it/s]

[8,    20] loss: -0.013
[8,    40] loss: -0.078


  0%|          | 0/47 [00:00<?, ?it/s]

[9,    20] loss: 0.375
[9,    40] loss: -0.019


  0%|          | 0/47 [00:00<?, ?it/s]

[10,    20] loss: -0.059
[10,    40] loss: 0.038
