# Advanced Information Retrieval - Group Project

## Authors
* Felix Moitzi
* David Rainer
* Michael Unterberger

## Short Description

Training two Bi-Encoders to create similar encodings for queries and the documents relevant to them. One is trained using only positive samples, the other incorporates negative sampling. Afterwards, the Bi-Encoders will be used to build basic, fast and scalable IR engines, whose performance will then be tested.

## Notes For the Boyz

### Dataset

* Kleine Beschreibung zum Dataset: https://ir-datasets.com/cranfield.html
* Weitere Datasets die wir in Zukunft vllt. stattdessen verwenden/hinzufügen können: https://ir-datasets.com/index.html

* Für den Anfang ist ein Binary-Approach (1 = relevant, 0 = irrelevant) das beste
* Meine recommendation Werte 4 und 3 sind positiv/relevant, -1 und 1 sind negativ/irrelevant, Label 2 müssen wir noch besprechen
* Aufbereiten der Daten für Negative Sampling in der Form: Query, Relevantes Doc, Irrelevantes Doc
* Aufbereiten der Daten für normales Training

* Should def. use dropout


In [None]:
!pip install ir_datasets
!pip install faiss-gpu

Collecting ir_datasets
  Downloading ir_datasets-0.5.5-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.0/318.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting inscriptis>=2.2.0 (from ir_datasets)
  Downloading inscriptis-2.3.2-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting trec-car-tools>=2.5.4 (from ir_datasets)
  Downloading trec_car_tools-2.6-py3-none-any.whl (8.4 kB)
Collecting lz4>=3.1.10 (from ir_datasets)
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting warc3-wet>=0.2.3 (from ir_datasets)
  Downloading warc3_wet-0.2.3-py3-none-any.whl (13 kB)
Collecting warc3-wet-clueweb09>=0.2.5 (from ir_datasets)
  Downloading warc3-wet-clueweb09-0.2.5.tar.

## Imports

In [None]:

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import numpy as np
import pandas as pd
import ir_datasets
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import faiss

import os

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("Using CPU")

Using GPU: Tesla T4


In [None]:
from google.colab import drive

# mount Google Drive to save model parameters later
drive.mount('/content/drive')

# Create a folder in the root directory
!mkdir -p "/content/drive/My Drive/My Folder"

Mounted at /content/drive


In [None]:
import random

def progress_bar(progress, total):
    """Print a 100-cell, randomly coloured progress bar on the current line.

    Parameters:
    - progress: amount of work done so far
    - total: amount of work overall; 0 is rendered as a completed bar

    The line is rewritten in place (leading carriage return) until the
    percentage reaches 100, at which point a newline ends it.
    """
    # An empty workload has nothing left to do; show it as complete instead of
    # dividing by zero.
    percent = 100.0 if total == 0 else 100 * (progress / float(total))

    # The bar is exactly 100 cells wide, so clamp the filled part to that
    # range even when progress is negative or overshoots total.
    filled = max(0, min(100, int(percent)))

    # ANSI escape codes for some rainbow colors
    colors = (
        '\033[31m',  # Red
        '\033[32m',  # Green
        '\033[33m',  # Yellow
        '\033[34m',  # Blue
        '\033[35m',  # Magenta
        '\033[36m',  # Cyan
        '\033[91m',  # Bright Red
        '\033[92m',  # Bright Green
        '\033[93m',  # Bright Yellow
        '\033[94m',  # Bright Blue
        '\033[95m',  # Bright Magenta
        '\033[96m',  # Bright Cyan
    )

    # One random color per filled cell, then reset the color before the
    # unfilled remainder.
    bar = ''.join(random.choice(colors) + '█' for _ in range(filled))
    bar += '\033[0m'
    bar += '-' * (100 - filled)

    if percent >= 100:
        print(f"\r|{bar}| {percent:.2f}%", end="\n")
    else:
        print(f"\r|{bar}| {percent:.2f}%", end="", flush=True)

## Load Dataset & Preprocessing

In [None]:
# Handle to the Cranfield collection; the cells below read its qrels, documents and queries.
dataset = ir_datasets.load("cranfield")

In [None]:

def getQrels(relevance_threshold, dataset):
    """Partition the dataset's relevance judgements around a threshold.

    Parameters:
    - relevance_threshold: label value separating the two groups
    - dataset: object whose qrels_iter() yields judgements with a .relevance

    Returns:
    - (positives, negatives): judgements strictly above / strictly below the
      threshold, in iteration order. Judgements equal to the threshold are
      in neither list.
    """
    above, below = [], []
    for judgement in dataset.qrels_iter():
        score = judgement.relevance
        if score > relevance_threshold:
            above.append(judgement)
        elif score < relevance_threshold:
            below.append(judgement)
    return above, below

def _split_qrels(qrels):
    """Randomly split a list of judgements into train / eval / test parts.

    Eval and test each receive len(qrels) // 10 distinct, randomly chosen
    entries; everything else goes to train in its original order.
    """
    total = len(qrels)
    holdout_size = total // 10

    # A single permutation yields all held-out indices without repeats, so no
    # retry loop or linear "already used?" scan is needed.
    picked = np.random.permutation(total)[:2 * holdout_size]
    eval_part = [qrels[i] for i in picked[:holdout_size]]
    test_part = [qrels[i] for i in picked[holdout_size:]]

    held_out = {int(i) for i in picked}
    train_part = [qrel for i, qrel in enumerate(qrels) if i not in held_out]
    return train_part, eval_part, test_part


def getPositiveData(positive_qrels):
    """Split the positive judgements into train, eval and test lists.

    Parameters:
    - positive_qrels: list of relevant judgements

    Returns:
    - (pos_train, pos_eval, pos_test); eval and test each hold a random 10%
      (rounded down) of the input, train holds the rest.
    """
    print(f"Number of positive qrels: {len(positive_qrels)}")
    return _split_qrels(positive_qrels)


def getNegativeData(negative_qrels):
    """Split the negative judgements into train, eval and test lists.

    Parameters:
    - negative_qrels: list of non-relevant judgements

    Returns:
    - (neg_train, neg_eval, neg_test); eval and test each hold a random 10%
      (rounded down) of the input, train holds the rest.
    """
    print(f"Number of negative qrels: {len(negative_qrels)}")
    return _split_qrels(negative_qrels)


def loadDocuments(ds):
    """Build a lookup table from integer document id to document text.

    Parameters:
    - ds: object whose docs_iter() yields items with .doc_id and .text

    Returns:
    - dict mapping int(doc_id) to text
    """
    # Built as a comprehension so no local has to shadow the builtin `dict`.
    return {int(doc.doc_id): doc.text for doc in ds.docs_iter()}

def loadQueries(ds):
    """Build a lookup table from integer query id to query text.

    Parameters:
    - ds: object whose queries_iter() yields items with .query_id and .text

    Returns:
    - dict mapping int(query_id) to text
    """
    # Built as a comprehension so no local has to shadow the builtin `dict`.
    return {int(query.query_id): query.text for query in ds.queries_iter()}

def checkIfInPositives(neg_doc_ids, pos_doc_ids):
    """Return True when at least one id in neg_doc_ids also occurs in pos_doc_ids."""
    return any(candidate in pos_doc_ids for candidate in neg_doc_ids)


# returns dict with query id as key and list of tuples with pos and negative doc ids as values
def getPositiveNegativeDocIDs(processed_queries, processed_documents, pos_train_qrel, neg_train_qrel):
    """Pair every usable positive judgement with one randomly drawn negative doc.

    Parameters:
    - processed_queries: mapping keyed by integer query id
    - processed_documents: mapping keyed by integer document id
    - pos_train_qrel: judgements (.query_id, .doc_id) marking relevant docs
    - neg_train_qrel: judgements (.query_id, .doc_id) marking non-relevant docs

    Judgements whose query id or doc id is missing from the two mappings are
    ignored.

    Returns:
    - dict: query id -> list of (positive doc id, negative doc id), one tuple
      per positive judgement of that query.

    Raises:
    - ValueError: a query has positives but no negatives of its own, and no
      other query's negative list is free of that query's positive docs.
    """
    def group_by_query(qrels):
        # query id -> [doc id, ...], keeping only ids present in both lookups
        grouped = {}
        for judgement in qrels:
            query_id = int(judgement.query_id)
            doc_id = int(judgement.doc_id)
            if query_id not in processed_queries or doc_id not in processed_documents:
                continue
            grouped.setdefault(query_id, []).append(doc_id)
        return grouped

    positives = group_by_query(pos_train_qrel)
    negatives = group_by_query(neg_train_qrel)

    # Built once; these are the queries whose negatives can be borrowed.
    donor_query_ids = list(negatives)

    data = {}
    for query_id, pos_doc_ids in positives.items():
        neg_doc_ids = negatives.get(query_id)

        if neg_doc_ids is None:
            # This query has no judged negatives: borrow the negative list of
            # a random other query, provided none of its docs is a positive
            # here. Walking a random permutation tries each donor at most
            # once, so the search always terminates.
            pos_lookup = set(pos_doc_ids)
            for donor_idx in np.random.permutation(len(donor_query_ids)):
                candidate = negatives[donor_query_ids[donor_idx]]
                if pos_lookup.isdisjoint(candidate):
                    neg_doc_ids = candidate
                    break
                print("Found neg id in positives")
            else:
                raise ValueError(
                    f"no negative documents available for query id {query_id}"
                )

        # One random negative per positive judgement.
        data[query_id] = [
            (pos_doc_id, neg_doc_ids[np.random.randint(0, len(neg_doc_ids))])
            for pos_doc_id in pos_doc_ids
        ]
    return data

def convertMapToListTriples(map):
    """Flatten {query id: [(pos id, neg id), ...]} into (query id, pos id, neg id) triples.

    Triples follow the mapping's key order and, within a key, the order of
    its pair list.
    """
    return [
        (query_id, pair[0], pair[1])
        for query_id, pairs in map.items()
        for pair in pairs
    ]





In [None]:
# Labels above 2 count as relevant, labels below 2 as non-relevant;
# judgements labelled exactly 2 end up in neither list (see getQrels).
positive_qrels, negative_qrels = getQrels(2, dataset)
# Random train / eval / test split, done separately for positives and negatives.
pos_train_qrel, pos_eval_qrel, pos_test_qrel = getPositiveData(positive_qrels)
neg_train_qrel, neg_eval_qrel, neg_test_qrel = getNegativeData(negative_qrels)
# Lookup tables keyed by integer id: doc id -> text, query id -> text.
documents = loadDocuments(dataset)
queries = loadQueries(dataset)

[INFO] [starting] http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz
[INFO] [finished] http://ir.dcs.gla.ac.uk/resources/test_collections/cran/cran.tar.gz: [00:00] [507kB] [570kB/s]


Number of positive qrels: 1097
Number of negative qrels: 353


In [None]:
class CranfieldDatasetPOS_NEG(Dataset):
    """Dataset of (query text, relevant doc text, non-relevant doc text) samples.

    The negative document of every sample is drawn once, at construction
    time, by getPositiveNegativeDocIDs.
    """

    def __init__(self, queries, documents, pos_qrel, neg_qrel):
        self.queries = queries
        self.documents = documents
        # query id -> [(positive doc id, negative doc id), ...]
        sampled = getPositiveNegativeDocIDs(queries, documents, pos_qrel, neg_qrel)
        self.query_idx_pos_neg_doc_ids_map = sampled
        # Flat (query id, positive doc id, negative doc id) list for index access.
        self.tripples_list = convertMapToListTriples(sampled)
        print(f"Dataloader initialized with {len(sampled)} query ids")

    def __len__(self):
        """Number of samples (one per positive judgement)."""
        return len(self.tripples_list)

    def __getitem__(self, idx):
        """Return the texts of the idx-th (query, positive doc, negative doc) sample."""
        triple = self.tripples_list[idx]
        return (
            self.queries[triple[0]],
            self.documents[triple[1]],
            self.documents[triple[2]],
        )

class CranfieldDatasetPOS(Dataset):
    """Dataset of (query text, relevant doc text) samples.

    Built from the same sampled triples as the positive/negative variant;
    the negative doc id of each triple is simply not returned.
    """

    def __init__(self, queries, documents, pos_qrel, neg_qrel):
        self.queries = queries
        self.documents = documents
        # query id -> [(positive doc id, negative doc id), ...]
        sampled = getPositiveNegativeDocIDs(queries, documents, pos_qrel, neg_qrel)
        self.query_idx_pos_neg_doc_ids_map = sampled
        # Flat (query id, positive doc id, negative doc id) list for index access.
        self.tripples_list = convertMapToListTriples(sampled)
        print(f"Dataloader initialized with {len(sampled)} query ids")

    def __len__(self):
        """Number of samples (one per positive judgement)."""
        return len(self.tripples_list)

    def __getitem__(self, idx):
        """Return the texts of the idx-th (query, positive doc) sample."""
        triple = self.tripples_list[idx]
        return self.queries[triple[0]], self.documents[triple[1]]



## Create the Datasets

In [None]:
# Training datasets: pairs for the positive-only model, triples for the
# negative-sampling model. Each constructor draws its negatives independently.
cranfield_dataset_pos = CranfieldDatasetPOS(queries, documents, pos_train_qrel, neg_train_qrel)
cranfield_dataset_pos_neg = CranfieldDatasetPOS_NEG(queries, documents, pos_train_qrel, neg_train_qrel)

# Evaluation split, used after every training epoch.
cranfield_dataset_pos_eval = CranfieldDatasetPOS(queries, documents, pos_eval_qrel, neg_eval_qrel)
cranfield_dataset_pos_neg_eval = CranfieldDatasetPOS_NEG(queries, documents, pos_eval_qrel, neg_eval_qrel)

# Test split, used once after training.
cranfield_dataset_pos_test = CranfieldDatasetPOS(queries, documents, pos_test_qrel, neg_test_qrel)
cranfield_dataset_pos_neg_test = CranfieldDatasetPOS_NEG(queries, documents, pos_test_qrel, neg_test_qrel)

Dataloader initialized with 129 query ids
Dataloader initialized with 129 query ids
Dataloader initialized with 51 query ids
Dataloader initialized with 51 query ids
Dataloader initialized with 52 query ids
Dataloader initialized with 52 query ids


## Create the Model

This is one Branch of the Bi-Encoder Structure.
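It...

* Encodes the input sentence using a tokenizer
* Passes the tokens through DistilBERT and mean-pools the token embeddings (ignoring padding) into a single vector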

In [None]:
from transformers import DistilBertModel, DistilBertTokenizer
#from transformers import BertModel, BertTokenizer

'''
class BertEncoder(torch.nn.Module):
    def __init__(self, model_name):
        super(BertEncoder, self).__init__()
        self.encoder = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.encoder(input_ids=input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids)
        # Use the pooled output
        pooled_output = outputs.pooler_output
        return pooled_output

    def encode(self, text, max_length=512):
        inputs = self.tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        inputs = {key: value.to(device) for key, value in inputs.items()}
        return self.forward(**inputs)
'''

class DistilBertEncoder(torch.nn.Module):
    """One branch of the bi-encoder: tokenizer + DistilBERT + mean pooling.

    Parameters:
    - model_name: name or path passed to from_pretrained for both the model
      and its tokenizer
    """

    def __init__(self, model_name):
        super().__init__()
        self.encoder = DistilBertModel.from_pretrained(model_name)
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        """Return one mean-pooled embedding per input sequence.

        Parameters:
        - input_ids: token ids as produced by the tokenizer
        - attention_mask: 1 for real tokens, 0 for padding (tokenizer output)

        Returns:
        - Tensor with one row per sequence: the average of its token
          embeddings, with masked-out positions excluded.
        """
        # Exactly one encoder pass per call; a second identical pass would
        # only double the compute and the memory held for backpropagation.
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

        # Mean pooling over the positions the attention mask marks as real.
        token_embeddings = outputs.last_hidden_state
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        summed = torch.sum(token_embeddings * mask, 1)
        # Clamp so an all-zero mask cannot cause a division by zero.
        counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / counts
        #return outputs.last_hidden_state[:, 0, :]  # Get cls token representation -> acts as aggregate of sequence information

    def encode(self, text, max_length=512):
        """Tokenize text and return its embedding(s).

        Parameters:
        - text: a string or a batch of strings
        - max_length: inputs longer than this many tokens are truncated

        Returns:
        - Output of forward() for the tokenized batch.
        """
        inputs = self.tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
        # Send the inputs to wherever this module's weights currently are, so
        # encode() keeps working after the model is moved between CPU and GPU.
        target_device = next(self.encoder.parameters()).device
        inputs = {key: value.to(target_device) for key, value in inputs.items()}
        return self.forward(**inputs)


## Set up Training-utilities

### Create Combined Optimizers for the Encoder Pairs

It is necessary to optimize both encoders of the Bi-Encoder architecture at the same time. So in the first step we take the parameters of the query- and document-encoder and combine them into a list, which will be passed to the optimizer constructor.

### Define Standard Similarity Loss Function
* query_embeddings: Embeddings for the queries.
* positive_embeddings: Embeddings for the positive documents.

returns Mean loss for the batch.

In [None]:
def positive_samples_loss(query_embeddings, positive_embeddings):
    """Mean squared gap between each pair's cosine similarity and 1.

    Parameters:
    - query_embeddings: Tensor of query embeddings
    - positive_embeddings: Tensor of embeddings of the matching relevant docs

    Returns:
    - A scalar: 0 when every query points in the same direction as its
      document, growing as the cosine similarities fall below 1.
    """
    cos = F.cosine_similarity(query_embeddings, positive_embeddings)
    target = torch.ones_like(cos)
    return F.mse_loss(cos, target)

### Define Contrastive Loss Function
Calculate contrastive loss using cosine similarity.
    
* query_embeddings: Embeddings for the query.
* positive_embeddings: Embeddings for the positive documents.
* negative_embeddings: Embeddings for the negative documents.
* margin: Margin by which positive and negative pairs should be separated.

returns Mean contrastive loss over the batch.

In [None]:
def contrastive_loss(query_embeddings, positive_embeddings, negative_embeddings, margin=0.2):
    """Sigmoid-squashed ranking loss on cosine similarities.

    Parameters:
    - query_embeddings: Tensor of query embeddings
    - positive_embeddings: Tensor of embeddings of relevant documents
    - negative_embeddings: Tensor of embeddings of non-relevant documents
    - margin: constant added to the score before the sigmoid (default 0.2)

    Returns:
    - A scalar: the batch mean of
      sigmoid(neg_sim + margin - pos_sim + doc_sim).
    """
    # Cosine similarity between query and positive samples
    pos_similarity = F.cosine_similarity(query_embeddings, positive_embeddings)

    # Cosine similarity between query and negative samples
    neg_similarity = F.cosine_similarity(query_embeddings, negative_embeddings)

    # Cosine similarity between the positive and the negative document.
    # NOTE(review): this term also pushes the two documents apart; it is not
    # part of a plain margin ranking loss -- confirm it is intended.
    doc_similarity = F.cosine_similarity(positive_embeddings, negative_embeddings)

    # The score falls as pos_similarity rises and as neg_similarity and
    # doc_similarity fall. The sigmoid keeps every per-sample loss strictly
    # between 0 and 1, so unlike a hinge loss it never reaches exactly 0.
    losses = torch.sigmoid(neg_similarity + margin - pos_similarity + doc_similarity)

    return losses.mean()

In [None]:
def triplet_loss(query_embeddings, positive_embeddings, negative_embeddings, margin=0.2):
    """
    Hinge-style triplet loss on squared Euclidean distances.

    Parameters:
    - query_embeddings: Tensor of query embeddings
    - positive_embeddings: Tensor of embeddings of documents relevant to the queries
    - negative_embeddings: Tensor of embeddings of documents not relevant to the queries
    - margin: required gap between the two distances (default is 0.2)

    Returns:
    - A scalar: the batch mean of max(0, d(q, pos) - d(q, neg) + margin)
    """

    def squared_distance(left, right):
        # Row-wise squared Euclidean distance
        return (left - right).pow(2).sum(1)

    to_positive = squared_distance(query_embeddings, positive_embeddings)
    to_negative = squared_distance(query_embeddings, negative_embeddings)

    # A triplet whose negative is already further away than positive + margin
    # contributes nothing.
    violation = to_positive - to_negative + margin
    return violation.clamp(min=0).mean()


## Training Loops

### Only Positive Training & Test Loop

In [None]:
def train_pos(dataloader, q_enc, doc_enc, loss_function, optimizer):
    """Run one training epoch on (query, positive doc) batches.

    Parameters:
    - dataloader: yields (queries, positive docs) batches
    - q_enc: encoder applied to the queries
    - doc_enc: encoder applied to the documents
    - loss_function: called as loss_function(query_emb, positive_doc_emb)
    - optimizer: optimizer stepped once per batch

    Returns:
    - The loss averaged over all batches (also printed), matching what
      test_pos returns.

    Raises:
    - ZeroDivisionError: if the dataloader has no batches.
    """
    q_enc.train()
    doc_enc.train()
    total_loss = 0
    num_batches = len(dataloader)

    print("Train loop")
    for idx, (queries, pos_docs) in enumerate(dataloader):
        progress_bar(idx, num_batches)

        # encoding
        query_embeddings = q_enc.encode(queries)
        pos_doc_embeddings = doc_enc.encode(pos_docs)

        # loss between each query and its positive document
        loss = loss_function(query_embeddings, pos_doc_embeddings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    averaged_loss = total_loss / num_batches
    progress_bar(1, 1)
    print(f"Loss: {averaged_loss}")
    print("\n")
    return averaged_loss

In [None]:
def test_pos(dataloader, q_enc, doc_enc, loss_function):
    """Evaluate both encoders on (query, positive doc) batches without gradients.

    Parameters:
    - dataloader: yields (queries, positive docs) batches
    - q_enc: encoder applied to the queries
    - doc_enc: encoder applied to the documents
    - loss_function: called as loss_function(query_emb, positive_doc_emb)

    Returns:
    - The loss averaged over all batches (also printed).
    """
    for encoder in (q_enc, doc_enc):
        encoder.eval()

    running_loss = 0
    print("Test loop")
    with torch.no_grad():
        for step, (queries, pos_docs) in enumerate(dataloader):
            progress_bar(step, len(dataloader))
            batch_loss = loss_function(q_enc.encode(queries), doc_enc.encode(pos_docs))
            running_loss += batch_loss.item()
        # Draw the completed bar before reporting the average.
        progress_bar(1, 1)
        averaged_loss = running_loss / len(dataloader)
        print(f"Loss: {averaged_loss}")
        print("\n")
    return averaged_loss

### Contrastive Architecture Training & Test loop (negative samples)

In [None]:
def train_neg(dataloader, q_enc, doc_enc, loss_function, optimizer):
    """Run one training epoch on (query, positive doc, negative doc) batches.

    Parameters:
    - dataloader: yields (queries, positive docs, negative docs) batches
    - q_enc: encoder applied to the queries
    - doc_enc: encoder applied to both kinds of documents
    - loss_function: called as
      loss_function(query_emb, positive_doc_emb, negative_doc_emb)
    - optimizer: optimizer stepped once per batch

    Returns:
    - The loss averaged over all batches (also printed), matching what
      test_neg returns.

    Raises:
    - ZeroDivisionError: if the dataloader has no batches.
    """
    q_enc.train()
    doc_enc.train()

    total_loss = 0
    num_batches = len(dataloader)

    print("Train loop")
    for idx, (queries, pos_docs, neg_docs) in enumerate(dataloader):
        progress_bar(idx, num_batches)

        # encoding
        query_embeddings = q_enc.encode(queries)
        pos_doc_embeddings = doc_enc.encode(pos_docs)
        neg_doc_embeddings = doc_enc.encode(neg_docs)

        # loss over the (query, positive, negative) triple
        loss = loss_function(query_embeddings, pos_doc_embeddings, neg_doc_embeddings)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    averaged_loss = total_loss / num_batches
    progress_bar(1, 1)
    print(f"Loss: {averaged_loss}")
    print("\n")
    return averaged_loss

In [None]:
def test_neg(dataloader, q_enc, doc_enc, loss_function):
    """Evaluate both encoders on (query, positive doc, negative doc) batches without gradients.

    Parameters:
    - dataloader: yields (queries, positive docs, negative docs) batches
    - q_enc: encoder applied to the queries
    - doc_enc: encoder applied to both kinds of documents
    - loss_function: called as
      loss_function(query_emb, positive_doc_emb, negative_doc_emb)

    Returns:
    - The loss averaged over all batches (also printed).
    """
    for encoder in (q_enc, doc_enc):
        encoder.eval()

    running_loss = 0
    print("Test loop")
    with torch.no_grad():
        for step, (queries, pos_docs, neg_docs) in enumerate(dataloader):
            progress_bar(step, len(dataloader))
            batch_loss = loss_function(
                q_enc.encode(queries),
                doc_enc.encode(pos_docs),
                doc_enc.encode(neg_docs),
            )
            running_loss += batch_loss.item()
        # Draw the completed bar before reporting the average.
        progress_bar(1, 1)
        averaged_loss = running_loss / len(dataloader)
        print(f"Loss: {averaged_loss}")
        print("\n")
    return averaged_loss

## Full Training & Test Loops

In [None]:
epochs = 4
# All DistilBERT weights are updated by Adam, so the step size has to be in
# the usual fine-tuning range (around 1e-5 to 5e-5). The run recorded in this
# notebook used 0.01 and ended with both models emitting the same embedding
# for different inputs, most likely because that step size destroys the
# pretrained weights.
learning_rate = 2e-5
batch_size = 1

#model_name = "bert-base-cased"
model_name = "distilbert-base-uncased"

In [None]:
# Placeholder values assigned right before the encoders are created.
# NOTE(review): presumably meant to drop references to models from an earlier
# run of this cell before new ones are allocated -- confirm.
query_encoder = 0
document_encoder = 0
# Two separate DistilBERT branches: one for queries, one for documents.
query_encoder = DistilBertEncoder(model_name)
document_encoder = DistilBertEncoder(model_name)

# One optimizer over the parameters of both branches, so they are updated together.
combined_parameters_pos = list(query_encoder.parameters()) +  list(document_encoder.parameters())
optimizer_only_positives = torch.optim.Adam(combined_parameters_pos, lr=learning_rate)

# Loaders over the positive-only train / eval / test datasets.
cranfield_dataloader_only_positives = DataLoader(cranfield_dataset_pos, batch_size=batch_size, shuffle=True)
cranfield_dataloader_only_positives_eval = DataLoader(cranfield_dataset_pos_eval, batch_size=batch_size, shuffle=True)
cranfield_dataloader_only_positives_test = DataLoader(cranfield_dataset_pos_test, batch_size=batch_size, shuffle=True)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Sanity check: print a few (query, positive doc) samples together with the
# first 8 embedding dimensions produced by each encoder.
query_encoder.eval()
document_encoder.eval()

query_encoder.to(device)
document_encoder.to(device)

num_elements_to_inspect = 2
count = 0

# Inference only: no_grad keeps autograd from recording a graph for outputs
# that are printed and thrown away.
with torch.no_grad():
    for data in cranfield_dataloader_only_positives:
        queries, pos_docs = data
        print(tuple((queries[0][:100], )))
        print(tuple((pos_docs[0][:100], )))

        print(query_encoder.encode(queries)[0,:8].tolist())
        print(document_encoder.encode(pos_docs)[0,:8].tolist())
        count += 1
        if count >= num_elements_to_inspect:
            break

# Move the models off the GPU again and release cached GPU memory.
query_encoder.to('cpu')
document_encoder.to('cpu')
torch.cuda.empty_cache()

('what are the structural and aeroelastic problems associated with flight\nof high speed aircraft .',)
('some experimental studies of panel flutter at mach\n1 .3.\n  experimental studies of panel flutter wer',)
[-0.17295166850090027, 0.23104381561279297, -0.0543174184858799, 0.20713810622692108, 0.23888719081878662, -0.11587613821029663, -0.15136225521564484, 0.2122560739517212]
[-0.3552234172821045, 0.0623839907348156, 0.12600970268249512, 0.13550697267055511, 0.1580219268798828, 0.07086699455976486, 0.054786477237939835, 0.10012637078762054]
('does a practical flow follow the theoretical concepts for the\ninteraction between adjacent blade row',)
('second approximation to laminar compressible boundary\nlayer on flat plate in slip flow .\n  the first',)
[-0.048189930617809296, -0.010743442922830582, 0.2458885908126831, 0.09528541564941406, -0.08225603401660919, -0.30938518047332764, 0.08063427358865738, -0.08666608482599258]
[-0.50718092918396, -0.15564967691898346, 0.302255779504776,

In [None]:
print("Training the first Architecture on only positives\n")

# Both encoders go to the compute device before the loops call encode().
query_encoder.to(device)
document_encoder.to(device)

for e in range(epochs):
    print(f"Epoch {e+1}\n-------------------------------")
    # One optimization pass over the training pairs, then a no-gradient pass over the eval split.
    train_pos(cranfield_dataloader_only_positives, query_encoder, document_encoder, positive_samples_loss, optimizer_only_positives)
    test_pos(cranfield_dataloader_only_positives_eval, query_encoder, document_encoder, positive_samples_loss)
print("Done!")
# Final loss on the test split.
test_pos(cranfield_dataloader_only_positives_test, query_encoder, document_encoder, positive_samples_loss)

# Move the models off the GPU again and release cached GPU memory.
query_encoder.to('cpu')
document_encoder.to('cpu')
torch.cuda.empty_cache()

Training the first Architecture on only positives

Epoch 1
-------------------------------
Train loop
|[92m█[95m█[95m█[32m█[91m█[93m█[93m█[32m█[32m█[91m█[91m█[96m█[96m█[94m█[31m█[93m█[96m█[33m█[36m█[34m█[36m█[33m█[36m█[94m█[93m█[34m█[33m█[96m█[95m█[34m█[31m█[34m█[93m█[92m█[36m█[34m█[91m█[33m█[36m█[95m█[96m█[92m█[94m█[94m█[93m█[32m█[91m█[35m█[93m█[31m█[93m█[92m█[34m█[33m█[91m█[32m█[33m█[34m█[32m█[36m█[94m█[36m█[34m█[34m█[91m█[31m█[35m█[34m█[94m█[94m█[31m█[93m█[31m█[96m█[91m█[95m█[91m█[96m█[36m█[96m█[34m█[92m█[31m█[35m█[35m█[34m█[94m█[36m█[91m█[36m█[36m█[91m█[35m█[32m█[92m█[31m█[34m█[34m█[93m█[95m█[0m| 100.00%
Loss: 0.024140278594558386


Test loop
|[92m█[36m█[33m█[91m█[92m█[31m█[33m█[34m█[36m█[95m█[36m█[92m█[32m█[36m█[92m█[92m█[31m█[94m█[93m█[32m█[92m█[33m█[35m█[92m█[33m█[95m█[91m█[33m█[31m█[33m█[92m█[34m█[92m█[32m█[91m█[96m█[32m█[31m█[34m█[96m█[9

In [None]:
torch.cuda.empty_cache()

# Placeholder values assigned right before the encoders are created.
# NOTE(review): presumably meant to drop references to models from an earlier
# run of this cell before new ones are allocated -- confirm.
query_encoder_ns = 0
document_encoder_ns = 0
# Fresh pair of DistilBERT branches for the negative-sampling model.
query_encoder_ns = DistilBertEncoder(model_name)
document_encoder_ns = DistilBertEncoder(model_name)

# One optimizer over the parameters of both branches, so they are updated together.
combined_parameters_ns = list(query_encoder_ns.parameters()) +  list(document_encoder_ns.parameters())
optimizer_negative_sampling = torch.optim.Adam(combined_parameters_ns, lr=learning_rate)

# Loaders over the (query, positive, negative) train / eval / test datasets.
cranfield_dataloader = DataLoader(cranfield_dataset_pos_neg, batch_size=batch_size, shuffle=True)
cranfield_dataloader_eval = DataLoader(cranfield_dataset_pos_neg_eval, batch_size=batch_size, shuffle=True)
cranfield_dataloader_test = DataLoader(cranfield_dataset_pos_neg_test, batch_size=batch_size, shuffle=True)

In [None]:
# Sanity check: print a few (query, positive doc, negative doc) samples
# together with the first 8 embedding dimensions produced by each encoder.
query_encoder_ns.to(device)
document_encoder_ns.to(device)

query_encoder_ns.eval()
document_encoder_ns.eval()

num_elements_to_inspect = 2
count = 0

# Inference only: no_grad keeps autograd from recording a graph for outputs
# that are printed and thrown away.
with torch.no_grad():
    for data in cranfield_dataloader:
        queries, pos_docs, neg_docs = data
        print(tuple((queries[0][:100], )))
        print(tuple((pos_docs[0][:100], )))
        print(tuple((neg_docs[0][:100], )))

        print(query_encoder_ns.encode(queries)[0,:8].tolist())
        print(document_encoder_ns.encode(pos_docs)[0,:8].tolist())
        print(document_encoder_ns.encode(neg_docs)[0,:8].tolist())
        count += 1
        if count >= num_elements_to_inspect:
            break

# Move the models off the GPU again and release cached GPU memory.
query_encoder_ns.to('cpu')
document_encoder_ns.to('cpu')
torch.cuda.empty_cache()

('thrust vector control by fluid injection -dash papers .',)
('hypersonic flight and the re-entry problem .\npaper reviews the possibilities and some of the main pr',)
('analytical study of the tumbling motions of vehicles\nentering planetary atmospheres .\n  the tumbling',)
[-0.10048697143793106, -0.007908533327281475, 0.051538050174713135, 0.2094869166612625, 0.15603360533714294, -0.1208542212843895, -0.025827020406723022, -0.057776033878326416]
[-0.40410688519477844, 0.21665048599243164, 0.09774568676948547, 0.09566722065210342, 0.18966011703014374, -0.11638510972261429, -0.0655122697353363, 0.013937010429799557]
[-0.5247375965118408, 0.1381705105304718, 0.19366703927516937, -0.008524060249328613, 0.3644755184650421, 0.011582950130105019, 0.0257875993847847, -0.004634075798094273]
('has the effect of the change of initial pressure due to deformation,  on\nthe frequencies of vibratio',)
('investigation of separated flows in supersonic and subsonic\nstreams with emphasis on the effect

In [None]:
print("Training the second Architecture with negative sampling\n")

# Both encoders go to the compute device before the loops call encode().
query_encoder_ns.to(device)
document_encoder_ns.to(device)

for e in range(epochs):
    print(f"Epoch {e+1}\n-------------------------------")
    # triplet_loss (not contrastive_loss) is the loss passed in; the loops call
    # it with three arguments, so its default margin of 0.2 applies.
    train_neg(cranfield_dataloader, query_encoder_ns, document_encoder_ns, triplet_loss, optimizer_negative_sampling)
    test_neg(cranfield_dataloader_eval, query_encoder_ns, document_encoder_ns, triplet_loss)
print("Done!")
# Final loss on the test split.
test_neg(cranfield_dataloader_test, query_encoder_ns, document_encoder_ns,  triplet_loss)

# Move the models off the GPU again and release cached GPU memory.
query_encoder_ns.to('cpu')
document_encoder_ns.to('cpu')
torch.cuda.empty_cache()

Training the second Architecture with negative sampling

Epoch 1
-------------------------------
Train loop
|[34m█[33m█[34m█[91m█[93m█[36m█[92m█[92m█[35m█[94m█[32m█[92m█[93m█[36m█[91m█[92m█[34m█[34m█[31m█[34m█[31m█[96m█[95m█[33m█[93m█[91m█[95m█[33m█[94m█[95m█[91m█[93m█[35m█[91m█[32m█[93m█[32m█[36m█[36m█[96m█[95m█[32m█[95m█[91m█[36m█[96m█[36m█[36m█[31m█[95m█[96m█[94m█[36m█[34m█[95m█[31m█[36m█[93m█[32m█[94m█[95m█[32m█[94m█[32m█[32m█[35m█[96m█[35m█[96m█[91m█[34m█[96m█[92m█[34m█[36m█[33m█[34m█[31m█[35m█[96m█[91m█[95m█[94m█[31m█[32m█[96m█[31m█[32m█[92m█[92m█[96m█[32m█[92m█[92m█[36m█[91m█[35m█[92m█[31m█[96m█[0m| 100.00%
Loss: 0.9852082478318306


Test loop
|[96m█[91m█[33m█[92m█[33m█[35m█[36m█[96m█[31m█[35m█[32m█[35m█[33m█[94m█[93m█[92m█[32m█[36m█[93m█[35m█[31m█[34m█[91m█[93m█[31m█[94m█[94m█[93m█[96m█[95m█[95m█[95m█[95m█[93m█[93m█[33m█[95m█[33m█[93m█[35m

In [None]:
# Post-training check: print a few (query, positive doc) samples together
# with the first 8 embedding dimensions produced by each trained encoder.
query_encoder.eval()
document_encoder.eval()

query_encoder.to(device)
document_encoder.to(device)

num_elements_to_inspect = 2
count = 0

# Inference only: no_grad keeps autograd from recording a graph for outputs
# that are printed and thrown away.
with torch.no_grad():
    for data in cranfield_dataloader_only_positives:
        queries, pos_docs = data
        print(tuple((queries[0][:100], )))
        print(tuple((pos_docs[0][:100], )))

        print(query_encoder.encode(queries)[0,:8].tolist())
        print(document_encoder.encode(pos_docs)[0,:8].tolist())
        count += 1
        if count >= num_elements_to_inspect:
            break

# Move the models off the GPU again and release cached GPU memory.
query_encoder.to('cpu')
document_encoder.to('cpu')
torch.cuda.empty_cache()

('how can one detect transition phenomena in boundary layers .',)
('similar solutions for the compressible laminar boundary\nlayer with heat transfer and pressure gradie',)
[-1.282213807106018, -0.2630314230918884, 1.2923930883407593, -0.013666282407939434, -0.22253215312957764, 0.177545428276062, 1.1341073513031006, -1.111574649810791]
[-1.2279917001724243, -0.24122574925422668, 1.2325270175933838, -0.02225184440612793, -0.20768707990646362, 0.17943784594535828, 1.0934123992919922, -1.0641497373580933]
('what are the details of the rigorous kinetic theory of gases .\n(chapman-enskog theory) .',)
('aerodynamic characteristics of two winged reentry vehicles at supersonic\n and hypersonic speeds .\nte',)
[-1.282213568687439, -0.2630314528942108, 1.2923930883407593, -0.013666295446455479, -0.22253216803073883, 0.177545428276062, 1.1341073513031006, -1.1115745306015015]
[-1.2279919385910034, -0.24122577905654907, 1.2325271368026733, -0.022251775488257408, -0.20768707990646362, 0.1794378012

In [None]:
# Post-training check: print a few (query, positive doc, negative doc) samples
# together with the first 8 embedding dimensions produced by each trained encoder.
query_encoder_ns.to(device)
document_encoder_ns.to(device)

query_encoder_ns.eval()
document_encoder_ns.eval()

num_elements_to_inspect = 2
count = 0

# Inference only: no_grad keeps autograd from recording a graph for outputs
# that are printed and thrown away.
with torch.no_grad():
    for data in cranfield_dataloader:
        queries, pos_docs, neg_docs = data
        print(tuple((queries[0][:100], )))
        print(tuple((pos_docs[0][:100], )))
        print(tuple((neg_docs[0][:100], )))

        print(query_encoder_ns.encode(queries)[0,:8].tolist())
        print(document_encoder_ns.encode(pos_docs)[0,:8].tolist())
        print(document_encoder_ns.encode(neg_docs)[0,:8].tolist())
        count += 1
        if count >= num_elements_to_inspect:
            break

# Move the models off the GPU again and release cached GPU memory.
query_encoder_ns.to('cpu')
document_encoder_ns.to('cpu')
torch.cuda.empty_cache()

('papers on flow visualization on slender conical wings .',)
("on squire's test of the compressibility transformation .\n  discussion of a previous application, by ",)
('the conpressibility transformation and the turbulent\nboundary layer equations .\n  the compressibilit',)
[-0.1029333844780922, 0.16007693111896515, 0.19593186676502228, 0.23063045740127563, 0.16873995959758759, 0.08783576637506485, -0.6180490255355835, 0.4978106915950775]
[0.2566417455673218, 0.1001395583152771, 0.032748639583587646, 0.09801079332828522, 0.16181576251983643, 0.42660829424858093, 0.06478621810674667, -0.04375353828072548]
[0.25664180517196655, 0.10013951361179352, 0.03274863213300705, 0.09801080822944641, 0.16181577742099762, 0.4266083836555481, 0.06478626281023026, -0.043753571808338165]
('can the three-point boundary-value problem for the blasius equation\nbe integrated numerically,  usin',)
('laminar heat-transfer and pressure measurements over blunt-nosed\ncones at large angle of attack .\nte',)
('

# Hyperparameter tuning


In [None]:
from sklearn.model_selection import ParameterGrid

# Candidate values for the grid search; batch size is currently excluded.
param_grid = {
    'learning_rate': [0.001, 0.01, 0.1],
    'epochs': [2, 5, 10],
    'margin': [0.1, 0.2, 0.5],
    #'batch_size': [16, 32, 64],
}

# NOTE(review): the loop body only prints each combination; no model is built
# or trained here yet.
# NOTE(review): the loop assigns to the module-level names `epochs` and
# `learning_rate`, so cells run afterwards see the last grid values.
for params in ParameterGrid(param_grid):
    epochs = params['epochs']
    learning_rate = params['learning_rate']
    margin = params['margin']

    # `batch_size` is the value set in the earlier hyperparameter cell; `margin` is read but not printed.
    print(f"\nCurrent Parameters: Epochs: {epochs}, Learning Rate: {learning_rate}, Batch Size: {batch_size}")



FAISS KNN-IDX

In [None]:
from transformers import DistilBertConfig

config = DistilBertConfig.from_pretrained(model_name)
# NOTE(review): randomly initialised model that nothing below uses; kept only
# so the name `model` stays defined -- confirm it can be dropped.
model = DistilBertModel(config)

# Initialize FAISS
pos_d = config.dim
neg_d = config.dim


def _embed_corpus(encoder, texts, chunk_size=32):
    """Encode texts with encoder and return a float32 matrix, one row per text."""
    encoder.to(device)
    encoder.eval()
    chunks = []
    # Inference only; encode in chunks so the whole corpus never has to fit
    # into a single forward pass.
    with torch.no_grad():
        for start in range(0, len(texts), chunk_size):
            vectors = encoder.encode(texts[start:start + chunk_size])
            chunks.append(vectors.cpu().numpy())
    encoder.to('cpu')
    torch.cuda.empty_cache()
    # FAISS expects a C-contiguous float32 array of shape (n, d).
    return np.ascontiguousarray(np.concatenate(chunks, axis=0), dtype=np.float32)


# Row i of both indexes belongs to document faiss_doc_ids[i].
faiss_doc_ids = list(documents)
corpus_texts = [documents[doc_id] for doc_id in faiss_doc_ids]

# Document embeddings from the positive-only model and from the
# negative-sampling model.
embeddings_pos = _embed_corpus(document_encoder, corpus_texts)
embeddings_neg = _embed_corpus(document_encoder_ns, corpus_texts)

index_pos = faiss.IndexFlatL2(pos_d)
index_pos.add(embeddings_pos)

index_neg = faiss.IndexFlatL2(neg_d)
index_neg.add(embeddings_neg)