In [1]:
%cd /content/drive/MyDrive/illuin/entretien

/content/drive/MyDrive/illuin/entretien


# Install and import packages

In [2]:
%%capture
!pip install transformers
!pip install rank-bm25

In [3]:
import os

import re

import json

import numpy as np
from scipy.spatial.distance import cosine

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

from rank_bm25 import BM25Okapi

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

from transformers import AutoTokenizer, AutoModel

from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm, trange

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
STOP_WORDS = set(stopwords.words('english'))

# loading files

In [6]:
def load_file(file_path):
    dataset = {"question":[], "title":[], "answer":[], "passage":[]}

    with open(file_path, "r") as json_file:
        json_list = list(json_file)

    for json_str in tqdm(json_list):
        result = json.loads(json_str)
        for key in dataset.keys():
            dataset[key] += [result[key]]
    return dataset

In [7]:
train = load_file("train.jsonl")
dev = load_file("dev.jsonl")

100%|██████████| 9427/9427 [00:00<00:00, 162089.15it/s]
100%|██████████| 3270/3270 [00:00<00:00, 166531.58it/s]


In [8]:
corpus = train["passage"] + dev["passage"]

# Define Metrics

In [9]:
def metric(true_passage, predicted_passage, alpha=0.9):
    n, m = len(true_passage), len(predicted_passage)
    assert n == m, "true_passage and predicted_passage don't have the same shape ({}, {})".format(n, m)

    score = 0
    for i in range(n):
        true = true_passage[i]
        pred = predicted_passage[i] # pred is a list

        index = pred.index(true)

        score += alpha**index
    return score/n

# BM25

In [10]:
token_pattern = re.compile(r"(?u)\b\w\w+\b")
tokenize = token_pattern.findall

In [11]:
tokenized_corpus = [tokenize(sentence) for sentence in tqdm(corpus)]

100%|██████████| 12697/12697 [00:00<00:00, 35257.83it/s]


In [12]:
bm25 = BM25Okapi(tokenized_corpus)

In [13]:
tokenized_query = [tokenize(query) for query in train["question"]]

In [14]:
true_passage = list(range(len(train["question"])))
predicted_passage = []

for query in tqdm(train["question"]):
    tokenized_query = tokenize(query)
    doc_scores = bm25.get_scores(tokenized_query)
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

train_metric = metric(true_passage, predicted_passage)

100%|██████████| 9427/9427 [06:12<00:00, 25.31it/s]


In [15]:
print(train_metric)

0.3706849969266636


In [16]:
true_passage = len(train["passage"]) + np.arange(len(dev["question"]))
predicted_passage = []

for query in tqdm(dev["question"]):
    tokenized_query = tokenize(query)
    doc_scores = bm25.get_scores(tokenized_query)
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

dev_metric = metric(true_passage, predicted_passage)

100%|██████████| 3270/3270 [02:06<00:00, 25.86it/s]


In [17]:
print(dev_metric)

0.3910267717623277


# TF-IDF

### dot product similarity

In [18]:
tfidf = TfidfVectorizer().fit(corpus)

tfidf_passage = tfidf.transform(corpus)
tfidf_question = tfidf.transform(train["question"])

In [19]:
print(tfidf_passage.shape)
print(tfidf_question.shape)

(12697, 47576)
(9427, 47576)


In [24]:
question_context_similarity = (tfidf_passage @ tfidf_question.T).toarray()

In [30]:
true_passage = list(range(len(train["question"])))
predicted_passage = []

for i in trange(question_context_similarity.shape[1]):
    doc_scores = question_context_similarity[:, i]
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

train_metric = metric(true_passage, predicted_passage)

100%|██████████| 9427/9427 [01:14<00:00, 126.18it/s]


In [None]:
print(train_metric)

0.6973080144299679


In [31]:
## and now for dev questions

tfidf_question = tfidf.transform(dev["question"])

In [32]:
question_context_similarity = (tfidf_passage @ tfidf_question.T).toarray()

In [33]:
question_context_similarity.shape

(12697, 3270)

In [34]:
true_passage = len(train["passage"]) + np.arange(len(dev["question"]))
predicted_passage = []

for i in trange(question_context_similarity.shape[1]):
    doc_scores = question_context_similarity[:, i]
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

dev_metric = metric(true_passage, predicted_passage)

100%|██████████| 3270/3270 [00:23<00:00, 136.56it/s]


In [35]:
print(dev_metric)

0.6351215591225176


# cosine similarity

In [36]:
tfidf = TfidfVectorizer().fit(corpus)

tfidf_passage = tfidf.transform(corpus)
tfidf_question = tfidf.transform(train["question"])

In [37]:
print(tfidf_passage.shape)
print(tfidf_question.shape)

(12697, 47576)
(9427, 47576)


In [38]:
question_context_similarity = (tfidf_passage @ tfidf_question.T).toarray()

In [39]:
tfidf_passage_L2_norm = (tfidf_passage.toarray()**2).sum(axis=1)
tfidf_question_L2_norm = (tfidf_question.toarray()**2).sum(axis=1)

In [40]:
print(tfidf_passage_L2_norm.shape)
print(tfidf_question_L2_norm.shape)

(12697,)
(9427,)


In [41]:
question_context_cos_similarity = question_context_similarity.copy()
question_context_cos_similarity /= tfidf_passage_L2_norm.reshape(-1, 1)
question_context_cos_similarity /= tfidf_question_L2_norm.reshape(1, -1)

In [42]:
true_passage = list(range(len(train["question"])))
predicted_passage = []

for i in trange(question_context_cos_similarity.shape[1]):
    doc_scores = question_context_cos_similarity[:, i]
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

train_metric = metric(true_passage, predicted_passage)

100%|██████████| 9427/9427 [01:10<00:00, 133.91it/s]


In [43]:
print(train_metric)

0.6719255060284609


In [44]:
## and now for dev questions
tfidf_question = tfidf.transform(dev["question"])

In [45]:
question_context_similarity = (tfidf_passage @ tfidf_question.T).toarray()

In [47]:
tfidf_passage_L2_norm = (tfidf_passage.toarray()**2).sum(axis=1)
tfidf_question_L2_norm = (tfidf_question.toarray()**2).sum(axis=1)

In [48]:
question_context_cos_similarity = question_context_similarity.copy()
question_context_cos_similarity /= tfidf_passage_L2_norm.reshape(-1, 1)
question_context_cos_similarity /= tfidf_question_L2_norm.reshape(1, -1)

In [49]:
true_passage = len(train["passage"]) + np.arange(len(dev["question"]))
predicted_passage = []

for i in trange(question_context_cos_similarity.shape[1]):
    doc_scores = question_context_cos_similarity[:, i]
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

dev_metric = metric(true_passage, predicted_passage)

100%|██████████| 3270/3270 [00:24<00:00, 135.83it/s]


In [50]:
print(dev_metric)

0.6351215591225176


# Remove stop-words, Stem and TF-IDF

In [62]:
stemmer = nltk.stem.PorterStemmer()

In [63]:
token_pattern = re.compile(r"(?u)\b\w\w+\b")
tokenize = token_pattern.findall

In [64]:
# remove stop words, set the tokenized sentence since order doesn't count
modified_corpus = [" ".join(list(set(tokenize(passage)) - STOP_WORDS)) for passage in corpus]
modified_question = [" ".join(list(set(tokenize(question)) - STOP_WORDS)) for question in train["question"]]

# stem documents
stemmed_corpus = [" ".join([stemmer.stem(w) for w in tokenize(passage)]) for passage in modified_corpus]
stemmed_question = [" ".join([stemmer.stem(w) for w in tokenize(question)]) for question in modified_question]

In [65]:
tfidf = TfidfVectorizer().fit(stemmed_corpus)

tfidf_passage = tfidf.transform(stemmed_corpus)
tfidf_question = tfidf.transform(stemmed_question)

In [66]:
print(tfidf_passage.shape)
print(tfidf_question.shape)

(12697, 34735)
(9427, 34735)


In [67]:
question_context_similarity = (tfidf_passage @ tfidf_question.T).toarray()

In [68]:
true_passage = list(range(len(train["question"])))
predicted_passage = []

for i in trange(question_context_similarity.shape[1]):
    doc_scores = question_context_similarity[:, i]
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

train_metric = metric(true_passage, predicted_passage)

100%|██████████| 9427/9427 [00:44<00:00, 209.98it/s]


In [70]:
print(train_metric)

0.6882624414191127


In [71]:
## now for the dev set

# remove stop words, set the tokenized sentence since order doesn't count
modified_question = [" ".join(list(set(tokenize(question)) - STOP_WORDS)) for question in dev["question"]]

# stem documents
stemmed_question = [" ".join([stemmer.stem(w) for w in tokenize(question)]) for question in modified_question]

In [72]:
tfidf_passage = tfidf.transform(stemmed_corpus)
tfidf_question = tfidf.transform(stemmed_question)

In [73]:
question_context_similarity = (tfidf_passage @ tfidf_question.T).toarray()

In [74]:
question_context_similarity.shape

(12697, 3270)

In [75]:
true_passage = len(train["passage"]) + np.arange(len(dev["question"]))
predicted_passage = []

for i in trange(question_context_similarity.shape[1]):
    doc_scores = question_context_similarity[:, i]
    prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
    predicted_passage += [prediction_order]

dev_metric = metric(true_passage, predicted_passage)

100%|██████████| 3270/3270 [00:15<00:00, 216.59it/s]


In [76]:
print(dev_metric)

0.6518668438637109


# Transformers

### Get the [CLS] hidden state for each layer

In [90]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [87]:
bert_model_name = "bert-base-uncased"

In [88]:
bert = AutoModel.from_pretrained(bert_model_name)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [89]:
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [99]:
encoded_passage = tokenizer(corpus, padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded_question = tokenizer(train["question"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded_dev_question = tokenizer(dev["question"], padding="max_length", max_length=512, truncation=True, return_tensors="pt")

In [100]:
# Select a batch size for training
batch_size = 8

# Create an iterator of passages with torch DataLoader 
passage_dataset = TensorDataset(encoded_passage['input_ids'], encoded_passage['attention_mask'])
passage_sampler = SequentialSampler(passage_dataset)
passage_dataloader = DataLoader(passage_dataset, sampler=passage_sampler, batch_size=batch_size)

# Create an iterator of train questions with torch DataLoader 
question_dataset = TensorDataset(encoded_question['input_ids'], encoded_question['attention_mask'])
question_sampler = SequentialSampler(question_dataset)
question_dataloader = DataLoader(question_dataset, sampler=question_sampler, batch_size=batch_size)

# Create an iterator of dev questions with torch DataLoader
dev_question_dataset = TensorDataset(encoded_dev_question['input_ids'], encoded_dev_question['attention_mask'])
dev_question_sampler = SequentialSampler(dev_question_dataset)
dev_question_dataloader = DataLoader(dev_question_dataset, sampler=dev_question_sampler, batch_size=batch_size)

In [93]:
def get_extended_attention_mask(attention_mask, input_shape, device=torch.device('cpu')):
    if attention_mask.dim() == 3:
        extended_attention_mask = attention_mask[:, None, :, :]
    elif attention_mask.dim() == 2:
        extended_attention_mask = attention_mask[:, None, None, :]
    else:
        raise ValueError(
            f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
        )
    extended_attention_mask = extended_attention_mask.float()  # fp16 compatibility
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
    return extended_attention_mask

In [94]:
class SentenceEncoder(nn.Module):
    def __init__(self, bert=bert):
        super().__init__()
        self.bert = bert
        self.num_layers = self.bert.config.num_hidden_layers
        self.hidden_size = self.bert.config.hidden_size

    def forward(self, input_ids, attention_mask): 
        hidden_state = self.bert.embeddings(input_ids)
        ext_attention_mask = get_extended_attention_mask(attention_mask, hidden_state.shape).detach().data

        for i in range(self.num_layers):
            hidden_state, = self.bert.encoder.layer[i](hidden_state, attention_mask=ext_attention_mask)
            if i == 0:
                cls_hidden_state = hidden_state[:, 0]
            else:
                cls_hidden_state = torch.cat([cls_hidden_state, hidden_state[:, 0]], axis=0)
        cls_hidden_state = cls_hidden_state.view(self.num_layers, -1, self.hidden_size).permute(1, 0, 2)
        return cls_hidden_state

In [95]:
%%capture
model = SentenceEncoder()
model.to(device)
model.eval()

In [None]:
for step, batch in tqdm(enumerate(passage_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_passage_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_passage_hidden_state = torch.cat([cls_passage_hidden_state, x], axis=0)

1588it [07:53,  3.36it/s]


In [None]:
for step, batch in tqdm(enumerate(question_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_question_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_question_hidden_state = torch.cat([cls_question_hidden_state, x], axis=0)

1179it [05:39,  3.47it/s]


In [101]:
for step, batch in tqdm(enumerate(dev_question_dataloader)):

    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask = batch

    if step == 0:
        cls_dev_question_hidden_state = model(b_input_ids, b_input_mask).detach().cpu()
    else:
        x = model(b_input_ids, b_input_mask).detach().cpu()
        cls_dev_question_hidden_state = torch.cat([cls_dev_question_hidden_state, x], axis=0)

409it [01:05,  6.28it/s]


In [None]:
cls_passage_hidden_state = cls_passage_hidden_state.permute(1, 0, 2)
cls_question_hidden_state = cls_question_hidden_state.permute(1, 0, 2)
cls_dev_question_hidden_state = cls_dev_question_hidden_state.permute(1, 0, 2)

In [None]:
torch.save(cls_passage_hidden_state, "cls_passage_hidden_state.pt")
torch.save(cls_question_hidden_state, "cls_question_hidden_state.pt")
torch.save(cls_dev_question_hidden_state, "cls_dev_question_hidden_state.pt")

### Now, compute similarities and deduce best layer

In [106]:
cls_passage_hidden_state = torch.load("cls_passage_hidden_state.pt", map_location="cpu")
cls_question_hidden_state = torch.load("cls_question_hidden_state.pt", map_location="cpu")
cls_dev_question_hidden_state = torch.load("cls_dev_question_hidden_state.pt", map_location="cpu")

#### on train set

In [78]:
cls_passage_hidden_state.shape, cls_question_hidden_state.shape

(torch.Size([12, 12697, 768]), torch.Size([12, 9427, 768]))

In [85]:
scores_all_layers = torch.matmul(cls_passage_hidden_state, cls_question_hidden_state.transpose(-1, -2))

In [86]:
scores_all_layers.shape

torch.Size([12, 12697, 9427])

In [None]:
num_layers = bert.config.num_hidden_layers
train_metric = []
for i in range(num_layers):
    question_context_similarity = scores_all_layers[i].numpy()

    true_passage = list(range(len(train["question"])))
    predicted_passage = []
    for i in trange(question_context_similarity.shape[1]):
        doc_scores = question_context_similarity[:, i]
        prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
        predicted_passage += [prediction_order]

    train_metric += [metric(true_passage, predicted_passage)]

100%|██████████| 9427/9427 [01:11<00:00, 132.23it/s]
100%|██████████| 9427/9427 [01:11<00:00, 132.69it/s]
100%|██████████| 9427/9427 [01:13<00:00, 129.07it/s]
100%|██████████| 9427/9427 [01:10<00:00, 133.28it/s]
100%|██████████| 9427/9427 [01:12<00:00, 129.44it/s]
100%|██████████| 9427/9427 [01:13<00:00, 127.47it/s]
100%|██████████| 9427/9427 [01:14<00:00, 126.36it/s]
100%|██████████| 9427/9427 [01:11<00:00, 131.34it/s]
100%|██████████| 9427/9427 [01:12<00:00, 129.77it/s]
100%|██████████| 9427/9427 [01:11<00:00, 132.22it/s]
100%|██████████| 9427/9427 [01:11<00:00, 131.48it/s]
100%|██████████| 9427/9427 [01:09<00:00, 134.71it/s]


In [None]:
train_metric

[0.0038878329328788626,
 0.004839848182465071,
 0.00163963605077251,
 0.11804055094440544,
 0.1481323875615838,
 0.03922110216422828,
 0.012475439630225744,
 0.004424516373907048,
 0.004395380622490834,
 0.009521469503107172,
 0.026410994832704583,
 0.03679354248809049]

#### and now on dev set

In [107]:
cls_passage_hidden_state.shape, cls_dev_question_hidden_state.shape

(torch.Size([12, 12697, 768]), torch.Size([12, 3270, 768]))

In [108]:
scores_all_layers = torch.matmul(cls_passage_hidden_state, cls_dev_question_hidden_state.transpose(-1, -2))

In [109]:
scores_all_layers.shape

torch.Size([12, 12697, 3270])

In [110]:
num_layers = bert.config.num_hidden_layers
dev_metric = []

for i in range(num_layers):
    question_context_similarity = scores_all_layers[i].numpy()

    true_passage = len(train["question"]) + np.arange(len(dev["question"]))
    predicted_passage = []
    for i in trange(question_context_similarity.shape[1]):
        doc_scores = question_context_similarity[:, i]
        prediction_order = sorted(list(range(len(doc_scores))), key=lambda i:doc_scores[i], reverse=True)
        predicted_passage += [prediction_order]

    dev_metric += [metric(true_passage, predicted_passage)]

100%|██████████| 3270/3270 [00:24<00:00, 133.82it/s]
100%|██████████| 3270/3270 [00:24<00:00, 132.45it/s]
100%|██████████| 3270/3270 [00:25<00:00, 128.42it/s]
100%|██████████| 3270/3270 [00:24<00:00, 134.21it/s]
100%|██████████| 3270/3270 [00:24<00:00, 131.35it/s]
100%|██████████| 3270/3270 [00:24<00:00, 135.14it/s]
100%|██████████| 3270/3270 [00:24<00:00, 134.15it/s]
100%|██████████| 3270/3270 [00:24<00:00, 132.56it/s]
100%|██████████| 3270/3270 [00:24<00:00, 132.71it/s]
100%|██████████| 3270/3270 [00:24<00:00, 134.75it/s]
100%|██████████| 3270/3270 [00:24<00:00, 131.90it/s]
100%|██████████| 3270/3270 [00:24<00:00, 135.40it/s]


In [111]:
dev_metric

[0.002275058041746314,
 0.004471908165488861,
 0.0018376450320966645,
 0.11106991816553806,
 0.13072829232883756,
 0.035530416994016045,
 0.010297887967341055,
 0.003383275735245169,
 0.003481309998721146,
 0.0072295131794347005,
 0.019944033470609118,
 0.03688572374707861]