In [2]:
!nvidia-smi -L

GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-561632b4-1f11-6299-970f-4598d3b5aed8)


In [1]:
%cd /content/drive/MyDrive/illuin/entretien

/content/drive/MyDrive/illuin/entretien


# Install and import packages

In [3]:
%%capture
!pip install transformers

In [4]:
import re

import json

import numpy as np

from random import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler, RandomSampler

from transformers import AutoTokenizer, AutoModel, BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm.notebook import tqdm, trange

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
WEIGHTS_PATH = os.path.join(".", "weights", "miniBERT.pt")

# Loading dataset

In [6]:
def load_file(file_path):
    dataset = {"question":[], "title":[], "answer":[], "passage":[]}

    with open(file_path, "r") as json_file:
        json_list = list(json_file)

    for json_str in tqdm(json_list):
        result = json.loads(json_str)
        for key in dataset.keys():
            dataset[key] += [result[key]]
    return dataset

In [7]:
train = load_file("train.jsonl")
dev = load_file("dev.jsonl")

  0%|          | 0/9427 [00:00<?, ?it/s]

  0%|          | 0/3270 [00:00<?, ?it/s]

In [8]:
corpus = train["passage"] + dev["passage"]

# Define metric

In [26]:
def metric(true_passage, predicted_passage, alpha=0.9):
    n, m = len(true_passage), len(predicted_passage)
    assert n == m, "true_passage and predicted_passage don't have the same shape ({}, {})".format(n, m)

    score = 0
    for i in range(n):
        true = true_passage[i]
        pred = predicted_passage[i] # pred is a list

        index = pred.index(true)

        score += alpha**index
    return score/n

# Supervised pipeline

In [9]:
bert_model_name = "bert-base-uncased"

In [10]:
bert = AutoModel.from_pretrained(bert_model_name)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
def get_extended_attention_mask(attention_mask, input_shape, device=torch.device('cpu')):
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError(
            f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
        )
    extended_attention_mask = extended_attention_mask.float()  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask

In [13]:
def resize_bert(bert, keep_layers):
    state_dict = dict(bert.state_dict())
    new_state_dict = {}
    
    for elem in state_dict:
        if "layer" in elem:
            elem_split = elem.split(".")
            layer_num = int(elem_split[elem_split.index("layer") + 1])
            
            if layer_num >= keep_layers:
                continue
        new_state_dict[elem] = state_dict[elem]
    
    config = bert.config; config.num_hidden_layers = keep_layers
    new_bert = BertModel(config)
    new_bert.load_state_dict(new_state_dict)
    
    return new_bert

In [14]:
resized_bert = resize_bert(bert, keep_layers=5)

In [15]:
class SentenceEncoder(nn.Module):
    def __init__(self, bert=resized_bert):
        super().__init__()
        self.bert = bert
        self.num_layers = self.bert.config.num_hidden_layers
        self.hidden_size = self.bert.config.hidden_size

    def forward(self, input_ids, attention_mask): 
        hidden_state = self.bert.embeddings(input_ids)
        ext_attention_mask = get_extended_attention_mask(attention_mask, hidden_state.shape).detach().data
        for i in range(self.bert.config.num_hidden_layers):
            hidden_state, = self.bert.encoder.layer[i](hidden_state, attention_mask=ext_attention_mask)
        return hidden_state[:, 0]

# Training

In [None]:
encoded_passage = tokenizer(train["passage"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded_question = tokenizer(train["question"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")

In [None]:
# we will assign randomly a "wrong" pair passage for each question

# num_samples = len(corpus)
# shuffled_order = list(range(num_samples))
# shuffle(shuffled_order)

# wrong_pair = {}
# for key in encoded_question.keys():
#     wrong_pair[key] = encoded_question[key][shuffled_order]

In [None]:
# Select a batch size for training
batch_size = 8

# Create an iterator of our data with torch DataLoader 
train_dataset = TensorDataset(encoded_passage['input_ids'], 
                              encoded_passage['attention_mask'],
                              encoded_question['input_ids'],
                              encoded_question['attention_mask'],
                              )
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

In [None]:
%%capture
model = SentenceEncoder()
model.to(device)

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=5, num_training_steps=20)

EPOCHS = 20
criterion_COS = nn.CosineEmbeddingLoss(reduction="mean")


for epoch in range(EPOCHS):
    print(" EPOCH {}".format(epoch))
   
    model.train()
    for step, batch in tqdm(enumerate(train_dataloader)):
        batch = tuple(t.to(device) for t in batch)
        passage_input_ids, passage_attn_mask, question_input_ids, question_attn_mask = batch
        b_size = passage_input_ids.size(0)
        disorder = (torch.arange(start=0, end=b_size) + 1) % b_size

        pair_passage = model(passage_input_ids, passage_attn_mask)
        pair_question = model(question_input_ids, question_attn_mask)
        wrong_pair = pair_passage[disorder]

        target_plus = pair_passage.new(b_size).fill_(1)
        target_minus = pair_passage.new(b_size).fill_(-1)

        loss = 0
        loss += criterion_COS(pair_passage, pair_question, target_plus)
        loss += criterion_COS(wrong_pair, pair_question, target_minus)

        loss.backward()
        optimizer.step()
    scheduler.step()

 EPOCH 0


0it [00:00, ?it/s]

 EPOCH 1


0it [00:00, ?it/s]

 EPOCH 2


0it [00:00, ?it/s]

 EPOCH 3


0it [00:00, ?it/s]

 EPOCH 4


0it [00:00, ?it/s]

 EPOCH 5


0it [00:00, ?it/s]

 EPOCH 6


0it [00:00, ?it/s]

 EPOCH 7


0it [00:00, ?it/s]

 EPOCH 8


0it [00:00, ?it/s]

 EPOCH 9


0it [00:00, ?it/s]

 EPOCH 10


0it [00:00, ?it/s]

 EPOCH 11


0it [00:00, ?it/s]

 EPOCH 12


0it [00:00, ?it/s]

 EPOCH 13


0it [00:00, ?it/s]

 EPOCH 14


0it [00:00, ?it/s]

# Training metrics

In [16]:
encoded_passage = tokenizer(corpus, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded_question = tokenizer(train["question"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")

In [17]:
# Select a batch size for training
batch_size = 8

# Create an iterator of our data with torch DataLoader 
corpus_dataset = TensorDataset(encoded_passage['input_ids'], encoded_passage['attention_mask'])
corpus_sampler = SequentialSampler(corpus_dataset)
corpus_dataloader = DataLoader(corpus_dataset, sampler=corpus_sampler, batch_size=batch_size)

# Create an iterator of our data with torch DataLoader 
question_dataset = TensorDataset(encoded_question['input_ids'], encoded_question['attention_mask'])
question_sampler = SequentialSampler(question_dataset)
question_dataloader = DataLoader(question_dataset, sampler=question_sampler, batch_size=batch_size)

In [18]:
%%capture
model = SentenceEncoder()
model.load_state_dict(torch.load(WEIGHTS_PATH, map_location="cpu"))
model.to(device)

In [20]:
model.eval()

for step, batch in tqdm(enumerate(corpus_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_passage_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_passage_hidden_state = torch.cat([cls_passage_hidden_state, x], axis=0)

0it [00:00, ?it/s]

In [21]:
model.eval()

for step, batch in tqdm(enumerate(question_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_question_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_question_hidden_state = torch.cat([cls_question_hidden_state, x], axis=0)

0it [00:00, ?it/s]

In [22]:
cls_passage_hidden_state.shape

torch.Size([12697, 768])

In [23]:
cls_question_hidden_state.shape

torch.Size([9427, 768])

In [25]:
question_context_similarity = cls_passage_hidden_state @ cls_question_hidden_state.T

In [29]:
true_passage = list(range(len(train["question"])))
predicted_passage = []

for i in trange(question_context_similarity.shape[1]):
    doc_scores = question_context_similarity[:, i].tolist()
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

train_metric = metric(true_passage, predicted_passage)

  0%|          | 0/9427 [00:00<?, ?it/s]

In [30]:
print(train_metric)

0.3415752491672092


# Dev metrics

In [31]:
encoded_passage = tokenizer(corpus, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded_question = tokenizer(dev["question"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")

In [32]:
# Select a batch size for training
batch_size = 8

# Create an iterator of our data with torch DataLoader 
corpus_dataset = TensorDataset(encoded_passage['input_ids'], encoded_passage['attention_mask'])
corpus_sampler = SequentialSampler(corpus_dataset)
corpus_dataloader = DataLoader(corpus_dataset, sampler=corpus_sampler, batch_size=batch_size)

# Create an iterator of our data with torch DataLoader 
question_dataset = TensorDataset(encoded_question['input_ids'], encoded_question['attention_mask'])
question_sampler = SequentialSampler(question_dataset)
question_dataloader = DataLoader(question_dataset, sampler=question_sampler, batch_size=batch_size)

In [33]:
%%capture
model = SentenceEncoder()
model.load_state_dict(torch.load(WEIGHTS_PATH, map_location="cpu"))
model.to(device)

In [34]:
model.eval()

for step, batch in tqdm(enumerate(corpus_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_passage_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_passage_hidden_state = torch.cat([cls_passage_hidden_state, x], axis=0)

0it [00:00, ?it/s]

In [35]:
model.eval()

for step, batch in tqdm(enumerate(question_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_question_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_question_hidden_state = torch.cat([cls_question_hidden_state, x], axis=0)

0it [00:00, ?it/s]

In [36]:
cls_passage_hidden_state.shape

torch.Size([12697, 768])

In [37]:
cls_question_hidden_state.shape

torch.Size([3270, 768])

In [38]:
question_context_similarity = cls_passage_hidden_state @ cls_question_hidden_state.T

In [39]:
true_passage = len(train["question"]) + np.arange(len(dev["question"]))
predicted_passage = []

for i in trange(question_context_similarity.shape[1]):
    doc_scores = question_context_similarity[:, i].tolist()
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

dev_metric = metric(true_passage, predicted_passage)

  0%|          | 0/3270 [00:00<?, ?it/s]

In [40]:
print(dev_metric)

0.18012681226204022
