## Load the SQuAD dataset

In [1]:
import os
# Show the kernel's working directory; relative paths used later in the
# notebook are resolved against it
os.getcwd()

'C:\\Users\\giuli\\Documents\\ragcache\\code'

In [2]:
from datasets import load_dataset
# Load the SQuAD train and validation splits in one call; the two returned
# datasets are unpacked in the same order as the names listed in `split`
train_ds, val_ds = load_dataset("squad", split=['train', 'validation'])

In [3]:
# Display both splits to inspect their features and row counts
train_ds, val_ds

(Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 87599
 }),
 Dataset({
     features: ['id', 'title', 'context', 'question', 'answers'],
     num_rows: 10570
 }))

## Dataset creation

In [4]:
from tqdm.auto import tqdm
from langchain_community.embeddings import HuggingFaceEmbeddings # embedder
import pandas as pd

In [5]:
import faiss
import numpy as np

def faiss_knn(xb, xq, k = 3, d = None):
    '''
    Exact k-nearest-neighbour search with a flat (brute-force) L2 FAISS index.

    input:
        xb: 2-D array of database vectors, one row per vector
        xq: 2-D array of query vectors, same number of columns as xb
        k:  number of neighbours returned for every query
        d:  vector dimensionality; None (default) takes it from xb.shape[1]
    output: integer array with one row per query holding the row indices
            (into xb) of its k nearest neighbours, closest first
    '''
    # FAISS only accepts C-contiguous float32 matrices
    xb = np.ascontiguousarray(xb, dtype=np.float32)
    xq = np.ascontiguousarray(xq, dtype=np.float32)
    if d is None:
        # infer the dimensionality instead of assuming a 1024-wide embedder
        d = xb.shape[1]
    index = faiss.IndexFlatL2(d)
    index.add(xb)
    # search() returns (distances, indices); only the indices are needed
    _, neighbours = index.search(xq, k)
    return neighbours

#1. embed questions 

#2. compute knn (k=3) to extract pairs and label them

#3. build dataset

In [25]:
def create_pairs(dataset_split, embedding_model, k = 6, d = None, n_samples = 1999, seed = None):
    '''
    Build labelled question pairs from a SQuAD-style split.

    input:
        dataset_split:   object exposing to_pandas(), returning a frame with
                         'context' and 'question' columns
        embedding_model: object exposing embed_documents(list_of_str), returning
                         one vector per input string
        k:               number of neighbours retrieved per question (the
                         question itself is expected at position 0)
        d:               embedding dimensionality; None (default) infers it from
                         the computed embeddings
        n_samples:       number of questions sampled as pair centres; capped at
                         the number of distinct questions
        seed:            optional seed for the sampling; None draws from NumPy's
                         global random state
    output: dataset pairs D = (q0, q1, y), a list of tuples where y is 1 when the
            two questions share the same context and 0 otherwise
    '''

    # Several questions share one context: give every question the id of its context
    df = dataset_split.to_pandas()
    df['p_id'] = df['context'].factorize()[0]
    df = df.drop_duplicates(subset='question', keep="first").reset_index(drop=True)
    if len(df) == 0:
        return []

    # After de-duplication the row position is the identifier used by the KNN
    # matrix, so plain arrays give constant-time lookups
    questions = df['question'].to_numpy()
    passage_ids = df['p_id'].to_numpy()

    print('stage faiss knn')
    # embeddings
    q_embeddings = embedding_model.embed_documents(questions.tolist())
    # setup faiss knn
    xb = np.array(q_embeddings, dtype = np.float32)
    if d is None:
        d = xb.shape[1]
    neighbours = faiss_knn(xb = xb, xq = xb, k = k, d = d)

    # We sample n_samples prompts uniformly at random, and for each prompt we pair
    # it with its farthest neighbours: position 0 is the prompt itself, so with
    # k = 6 positions 3..5 are the farthest three among the five nearest neighbours.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_samples = min(n_samples, len(df))
    centers = rng.choice(len(df), size=n_samples, replace=False)

    # create dataset pairs and label whether each prompt pair can be answered by the same corpus (0 or 1)
    print('stage dataset creation')
    dataset = []
    for center in tqdm(centers):
        question0 = questions[center]
        passage_id0 = passage_ids[center]
        for neigh in neighbours[center, 3:]:
            if neigh < 0:
                # FAISS pads with -1 when fewer than k vectors are indexed
                continue
            # append (q0, q1, y)
            label = int(passage_id0 == passage_ids[neigh])
            dataset.append( (question0, questions[neigh], label) )

    return dataset

### KNN 

In [26]:
# Checkpoint used to embed the questions before the nearest-neighbour search
EMBEDDING_MODEL_NAME = 'intfloat/e5-large-v2'

embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    multi_process=True,
    model_kwargs={"device": "cuda"},
    # With unit-length vectors, ranking by L2 distance matches ranking by cosine similarity
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)

In [27]:
# Build the labelled question pairs from the SQuAD training split,
# retrieving k = 6 neighbours per question
train = create_pairs(train_ds, embedding_model, k = 6)

stage faiss knn
stage dataset creation


  0%|          | 0/1999 [00:00<?, ?it/s]

In [28]:
import pickle
# Persist the generated pairs so they can be reloaded without recomputing the embeddings
with open(r'C:/Users/giuli/Documents/ragcache/data/train_squadv0.pickle', 'wb') as pickle_file:
    pickle.dump(train, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [29]:
from collections import Counter
# Class balance of the generated pairs (third tuple element is the label)
Counter(label for _, _, label in train)

Counter({0: 5294, 1: 703})

## Training

In [45]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import LoggingHandler, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from sentence_transformers.readers import InputExample
import logging
from datetime import datetime
import os
import csv
from zipfile import ZipFile

In [35]:
from sklearn.model_selection import train_test_split

# prepare data: one row per pair, column 2 holds the label used for stratification
train_df = pd.DataFrame(train)
train_s, test_s = train_test_split(
    train_df,
    test_size=0.3,
    stratify=train_df[2],
    random_state=0,
)

In [36]:
# Map the row labels of each split back to the original pair tuples.
# NOTE: train_s is rebound from a DataFrame to a list here, so this cell cannot
# be re-run without first re-running the split cell.
train_s = [train[row_id] for row_id in train_s.index]
val_s = [train[row_id] for row_id in test_s.index]

In [76]:
# Label distribution of the training and validation pairs
Counter(pair[2] for pair in train_s), Counter(pair[2] for pair in val_s)

(Counter({0: 3705, 1: 492}), Counter({0: 1589, 1: 211}))

In [85]:
def _to_input_examples(pairs):
    '''
    Wrap (question_a, question_b, label) tuples into InputExample objects.

    The full question strings are passed as texts; indexing a question with
    [0] would keep only its first character.
    '''
    return [InputExample(texts=[question_a, question_b], label=label)
            for question_a, question_b, label in pairs]

train_samples = _to_input_examples(train_s)

dev_samples = _to_input_examples(val_s)

In [86]:
train_batch_size = 32
num_epochs = 5

# model_name has to be bound before it is used to build the save path,
# otherwise this cell raises NameError on a fresh kernel
model_name = "distilroberta-base"
model_save_path = r'C:/Users/giuli/Documents/ragcache/models/' + model_name + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Cross-encoder with a single output used as the score of a question pair
model = CrossEncoder(model_name, num_labels=1)

# We wrap train_samples (which is a List[InputExample]) into a pytorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [81]:
# We add an evaluator, which evaluates the performance during training
# on the held-out dev_samples pairs
evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples, name="dev")

In [82]:
# Configure the training
# Number of warm-up steps: 10% of all training steps (batches per epoch * epochs)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
warmup_steps

66

In [83]:
import torch
def sigmoid(x):
    '''
    Hard 0/1 decision derived from a rescaled logistic function.

    input: x, torch tensor
    output: tensor of the same shape; x is divided by the constant (0.01 - 80),
            squashed with the logistic function and rounded to the nearest integer
    '''
    # NOTE(review): the divisor is negative (-79.99), so the sign of x is flipped
    # before squashing — confirm that this is intended
    rescaled = x / (0.01 - 80)
    squashed = torch.sigmoid(rescaled)
    return torch.round(squashed)

In [84]:
# Train the model
# The evaluator is handed to fit() and results are written to model_save_path;
# the activation_fct / loss_fct overrides below are commented out, so the
# library defaults are used
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    #activation_fct = sigmoid,
    #loss_fct = torch.nn.CrossEntropyLoss()
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/132 [00:00<?, ?it/s]

Iteration:   0%|          | 0/132 [00:00<?, ?it/s]

In [88]:
# Check if dataset exists. If not, download and extract it
dataset_path = "quora-dataset/"

if not os.path.exists(dataset_path):
    zip_save_path = "quora-IR-dataset.zip"
    util.http_get(url="https://sbert.net/datasets/quora-IR-dataset.zip", path=zip_save_path)
    # Named `archive` so the builtin zip() is not shadowed for the rest of the notebook
    with ZipFile(zip_save_path, "r") as archive:
        archive.extractall(dataset_path)


def _read_quora_pairs(tsv_path, both_directions=False):
    '''
    Read a Quora classification TSV into a list of InputExample objects.

    input:
        tsv_path:        path of a tab-separated file with the columns
                         question1, question2 and is_duplicate
        both_directions: when True every pair is also added with the two
                         questions swapped (same label)
    output: list of InputExample, in file order
    '''
    samples = []
    with open(tsv_path, "r", encoding="utf8") as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in reader:
            label = int(row["is_duplicate"])
            samples.append(InputExample(texts=[row["question1"], row["question2"]], label=label))
            if both_directions:
                samples.append(InputExample(texts=[row["question2"], row["question1"]], label=label))
    return samples


# Read the quora dataset split for classification
# (training pairs are added in both question orders, dev pairs only once)
train_samples = _read_quora_pairs(
    os.path.join(dataset_path, "classification", "train_pairs.tsv"), both_directions=True
)

dev_samples = _read_quora_pairs(os.path.join(dataset_path, "classification", "dev_pairs.tsv"))

  0%|          | 0.00/93.6M [00:00<?, ?B/s]

In [102]:
# Label distribution of the Quora training examples
Counter(example.label for example in train_samples)

Counter({0: 349308, 1: 207326})

In [103]:
# Label distribution of the Quora dev examples
Counter(example.label for example in dev_samples)

Counter({0: 24012, 1: 12959})

In [101]:
# Configuration
train_batch_size = 16
num_epochs = 4
model_save_path = r'C:/Users/giuli/Documents/ragcache/models/training_quora-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# distilroberta-base with a single label: one score per question pair
# indicating how similar the two questions are
model = CrossEncoder("distilroberta-base", num_labels=1)

# Only the first quarter of the training examples is used, wrapped into a
# pytorch DataLoader (train_samples is a List[InputExample])
train_dataloader = DataLoader(
    train_samples[:len(train_samples) // 4],
    shuffle=True,
    batch_size=train_batch_size,
)

# Evaluator run during training, on the first quarter of the dev examples
evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    dev_samples[:len(dev_samples) // 4],
    name="Quora-dev",
)

# Warm-up steps: 10% of all training steps (batches per epoch * epochs)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

# Train the model, evaluating every 5000 steps
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    evaluation_steps=5000,
    output_path=model_save_path,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8698 [00:00<?, ?it/s]