In [1]:
import logging
import math
import os
import re
from datetime import datetime

import arabicstopwords.arabicstopwords as ar_stp
import numpy as np
import pandas as pd
import pyterrier as pt
import torch
from sentence_transformers import LoggingHandler, SentenceTransformer, CrossEncoder, util, InputExample
from sentence_transformers import models, losses
from sentence_transformers import evaluation

from snowballstemmer import stemmer
from torch import nn
# from simcse import SimSCE
from torch.utils.data import DataLoader

In [2]:
print(torch.__version__)
print(torch.cuda.is_available())
# print(torch.cuda.device_count())
# print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

2.0.1+cu118
True
NVIDIA GeForce RTX 3090


# Get Data

In [None]:
data_path = "../data"
index_path = os.path.join(data_path, "QPC_Index/data.properties")

query_train_path = os.path.join(data_path, "QQA23_TaskA_train.tsv")
query_dev_path = os.path.join(data_path, "QQA23_TaskA_dev.tsv")

passage_path = os.path.join(data_path, "Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv")

qp_pair_train_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_train.gold")
qp_pair_dev_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_dev.gold")

## Read file

In [4]:
# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    else:
        if names != "":
            df = pd.read_csv(input_file, sep=sep, names=names,encoding="utf-8")
        else:
            df = pd.read_csv(input_file, sep=sep,encoding="utf-8")
    return df

## Read Query-Passage Pair

In [5]:
qrels_columns = ["qid", "Q0", "docid", "relevance"]

def read_qrels_file(qrels_file):
    # split_token = '\t' if format_checker.is_tab_sparated(qrels_file) else  "\s+"
    df_qrels = pd.read_csv(qrels_file, sep='\t', names=qrels_columns)
    df_qrels["qid"] = df_qrels["qid"].astype(str)
    df_qrels["docid"] = df_qrels["docid"].astype(str)
    return df_qrels

## Load the index

In [6]:
def load_index(index_path):
    if not pt.started():
        pt.init(helper_version="0.0.6")

    try:
        index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
        return index
    except Exception as e:
        print('Cannot load the index, check exception details {}'.format(e))
        return []

## Cleaning
Clean text from urls, handles, special characters, tabs, line jumps, extra white space, and puntuations.

In [7]:
# Clean text from urls, handles, special characters, tabs, line jumps, and extra white space.
def clean(text):
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text) # remove special characters
    text = re.sub(r"\t", " ", text)  # remove tabs
    text = re.sub(r"\n", " ", text)  # remove line jump
    text = re.sub(r"\s+", " ", text)  # remove extra white space
    text = re.sub(r'[^\w\s]', '', text) # Removing punctuations in string using regex
    text = text.strip()
    return text

## Preprocessing
Preprocess the arabic input text by performing normalization, stemming, and removing stop words.

In [8]:
# arabic stemmer
ar_stemmer = stemmer("arabic")

# remove arabic stop words
def ar_remove_stop_words(sentence):
    terms=[]
    stopWords= set(ar_stp.stopwords_list())
    for term in sentence.split() : 
        if term not in stopWords :
            terms.append(term)
    return " ".join(terms)


# normalize the arabic text
def normalize_arabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return(text)

# stem the arabic text
def ar_stem(sentence):
    return " ".join([ar_stemmer.stemWord(i) for i in sentence.split()])


# apply all preprocessing steps needed for Arabic text
def preprocess_arabic(text): 
    text = normalize_arabic(text)
    text = ar_remove_stop_words(text)
    text = ar_stem(text)
    return text

In [9]:
def prepare_data(path, column, id_type, id_column='docno'):
        df = read_file(path, names=['docno', 'text'])

        print("Cleaning passages")
        # apply the cleaning functions on the queries/questions
        df[column] = df['text'].apply(clean)

        # apply normalization, stemming and stop word removal
        print("Preprocessing - Applying normalization, stemming and stop word removal")
        df[column] = df[column].apply(preprocess_arabic)

        df[id_type] = df[id_column].astype(str) # convert the id column to string
        df = df[[id_type, 'text', column]] # keep the columns needed for search

        print("Done with preparation!")
        return df


### Index

In [10]:
index = load_index(index_path=index_path)

# print(index.getCollectionStatistics().toString())
# print(index.getMetaIndex().getKeys())

# for kv in index.getLexicon():
#     print((kv.getKey())+"\t"+ kv.getValue().toString())
# index.getLexicon()["فاعل"].toString()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.6



Index was loaded successfully from this path:  ../data\QPC_Index/data.properties


### Load Passage and Query

In [11]:
df_passage = prepare_data(passage_path, 'passage', 'pid')

df_query_train = prepare_data(query_train_path, 'query', 'qid')
df_query_dev = prepare_data(query_dev_path, 'query', 'qid')

df_qppair_train = read_qrels_file(qp_pair_train_path)

df_qppair_dev = read_qrels_file(qp_pair_dev_path)


Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!
Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!
Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


# Model

## Sentence Embedding


In [None]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

### Retrieval

In [None]:
def save_query_passage_retrieval(result, tag, run_save=False, df_query= df_query_train):
    if tag == "BM25":
        result["Q0"] = ["Q0"] * len(result)
        result["tag"] = [tag] * len(result)
        result['qid'] = result["qid"]
        result['pid'] = result["docno"]
        tag = "BM25_Final"
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    elif tag == "SimCSE_biencoder" or tag == "SimCSE_bm25_biencoder":
        np_result = np.array(result).flatten()
        print(len(np_result))
        result = pd.DataFrame()

        result["qid"] = df_query["qid"].tolist() * top_k
        result = result.sort_values(by=['qid']).reset_index(drop=True)
        result["Q0"] = ["Q0"] * len(np_result)
        result["pid"] = [df_passage.iloc[x['corpus_id']]['pid'] for x in np_result]
        result["rank"] = list(range(1, top_k+1)) * len(df_query)
        result["score"] = [x['score'] for x in np_result]
        result["tag"] = [tag] * len(np_result)

    elif tag == "SimCSE_cross":
        result['tag'] = tag
        result['Q0'] = 'Q0'
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    if run_save:
        run_save_path = os.path.join(data_path, f"runs/{tag}.tsv")
        result.to_csv(run_save_path, sep="\t", index=False, header=False)

    return result

In [None]:
#TODO : test with different models
# https://www.sbert.net/docs/pretrained_models.html

# bi_model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1" ## train : 0.48, dev : 0.12
# bi_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
bi_model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
# bi_model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"

# Best model for asymetric similarity
#https://www.sbert.net/docs/pretrained-models/msmarco-v3.html
# bi_model_name = "sentence-transformers/msmarco-distilbert-base-v4"

#bi_model_name = "aubmindlab/bert-base-arabert"

# bi_model_name = "aubmindlab/bert-large-arabertv02" # run 0 :
# bi_model_name = "wissamantoun/araelectra-base-artydiqa" # run 1 : train:0.51 but dev near 0
#bi_model_name = "salti/AraElectra-base-finetuned-ARCD" # run 2 : pAP@10 = 0.397
# bi_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA" # run 3 : pAP@10 = 0.435
#bi_model_name = "timpal0l/mdeberta-v3-base-squad2" # run 4 : pAP@10 = 0.367


# i guess this is not a good model for this task
#bi_model_name = "gfdgdfgdg/arap_qa_bert" # run 5 : pAP@10 = 0.184
#bi_model_name = "gfdgdfgdg/arap_qa_bert_large_v2" # run6 : pAP@10 = 0.372
#bi_model_name = "gfdgdfgdg/arap_qa_bert_v2" # run7 : pAP@10 = 0.344
#bi_model_name = "zohaib99k/Bert_Arabic-SQuADv2-QA" # run8 : pAP@10 = 0.435
#bi_model_name = "arabi-elidrisi/ArabicDistilBERT_QA" #run 9 : pAP@10 = 0.343
#bi_model_name = "MMars/Question_Answering_AraBERT_xtreme_ar" #run 10 : pAP@10 = 0.337
# bi_model_name = "abdalrahmanshahrour/ArabicQA" # run 11 : pAP@10 = 0.304
# bi_model_name = "abdalrahmanshahrour/xtremeQA-ar" # run 12 : pAP@10 = 0.120

model_save_path = os.path.join(data_path, f'model/training_simcse-{bi_model_name}-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

In [None]:
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
#TODO : change max_seq_length to 384 or 512
word_embedding_model = models.Transformer(bi_model_name, max_seq_length=512)
print("word_embedding_model Max Sequence Length:", word_embedding_model.max_seq_length)
print("word_embedding_model dimension", word_embedding_model.get_word_embedding_dimension())

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
print("pooling_model sentence embedding dimension", pooling_model.get_sentence_embedding_dimension())

#TODO : change out_features to 512
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

# bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [None]:
#TODO : Data augmentation
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/data_augmentation/train_sts_indomain_nlpaug.py
# data augmentation
# from nlpaug.util.file.download import DownloadUtil
# DownloadUtil.download_word2vec(dest_dir='.') # Download word2vec model
# import nlpaug.augmenter.word as naw
# aug = naw.WordEmbsAug(model_type='word2vec', model_path='GoogleNews-vectors-negative300.bin', action="substitute")
# augmented_text = aug.augment(text)


In [None]:
train_samples_passage = []
for _, row in df_passage.iterrows():
    train_samples_passage.append(InputExample(texts=[row['passage'], row['passage']]))

len(train_samples_passage)

In [None]:
train_samples_qp = []
for _, row in df_qppair_train.iterrows():
    query_id = row['qid']

    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    passage_id = row['docid']
    if passage_id == '-1':
        continue
    else:
        passage = df_passage[df_passage['pid'] == passage_id]['passage'].tolist()[0]
        label = row['relevance']
        #positive sample
        train_samples_qp.append(InputExample(texts=[query, passage], label=label))
        train_samples_qp.append(InputExample(texts=[passage, query], label=label))
print(len(train_samples_qp))

In [None]:
train_samples_qp_contrastive = []
top_k = 75
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=75)
print(top_k)
for _, row in df_qppair_train.groupby('qid'):
    # print(row)
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    bm25_related_passage = BM25_model.search(query)['docno'].tolist()
    positive_passage = row['docid'].tolist()
    negative_passage = list(set(bm25_related_passage) - set(positive_passage))
    # print(bm25_related_passage)
    # print(possitive_passage)
    # print(negative_passage)

    for pos_passage in positive_passage:
        if pos_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
            label = 1
            #positive sample
            train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

    for neg_passage in negative_passage:
        if neg_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == neg_passage]['passage'].tolist()[0]
            label = 0
            #positive sample
            train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

len(train_samples_qp_contrastive)

In [None]:
train_samples_qp_multiple_negative_ranking = []
for _, row in df_qppair_train.groupby('qid'):
    # print(row)
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    positive_passage = row['docid'].tolist()
    for pos_passage in positive_passage:
        if pos_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
            label = 1
            #positive sample
            train_samples_qp_multiple_negative_ranking.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_multiple_negative_ranking.append(InputExample(texts=[passage, query], label=label))

print(len(train_samples_qp_multiple_negative_ranking))

In [36]:
train_samples_qp_triple = []
print(top_k)
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=20)
for _, row in df_qppair_train.groupby('qid'):
    # print(row)
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    bm25_related_passage = BM25_model.search(query)['docno'].tolist()
    possitive_passage = row['docid'].tolist()
    negative_passage = list(set(bm25_related_passage) - set(possitive_passage))
    # print(bm25_related_passage)
    # print(possitive_passage)
    # print(negative_passage)

    for pos_passage_id in possitive_passage:
        for neg_passage_id in negative_passage:
            if pos_passage_id == '-1':
                continue
            else:
                pos_passage = df_passage[df_passage['pid'] == pos_passage_id]['passage'].tolist()[0]
                neg_passage = df_passage[df_passage['pid'] == neg_passage_id]['passage'].tolist()[0]
                train_samples_qp_triple.append(InputExample(texts=[query, pos_passage, neg_passage]))

print(len(train_samples_qp_triple))

10
14047


#### Train Bi-Encoder

In [41]:
num_epochs = 3
train_batch_size = 64

# train_passage_dataloader = DataLoader(train_samples_passage, shuffle=True, batch_size=train_batch_size)
# train_qp_dataloader = DataLoader(train_samples_qp, shuffle=True, batch_size=train_batch_size)
train_qp_contrastive_dataloader = DataLoader(train_samples_qp_contrastive, shuffle=True, batch_size=train_batch_size)
# train_qp_triple_dataloader = DataLoader(train_samples_qp_triple, shuffle=True, batch_size=train_batch_size)
train_qp_multiple_negative_ranking_dataloader = DataLoader(train_samples_qp_multiple_negative_ranking, shuffle=True, batch_size=train_batch_size)

train_biencoder_loss_MultipleNegativesRanking = losses.MultipleNegativesRankingLoss(bi_encoder)
train_biencoder_loss_Contrastive = losses.ContrastiveLoss(bi_encoder)
# train_biencoder_loss = losses.OnlineContrastiveLoss(bi_encoder)
# train_biencoder_loss_triple = losses.TripletLoss(model=bi_encoder)
# train_biencoder_loss_CosineSimilarity = losses.CosineSimilarityLoss(bi_encoder)

In [42]:
'''
model_save_path = ../data\\model/training_simcse-sentence-transformers/distiluse-base-multilingual-cased-v1-2023-08-12_07-37-04
train  : 0.78 but worse on test
'''


'\nmodel_save_path = ../data\\model/training_simcse-sentence-transformers/distiluse-base-multilingual-cased-v1-2023-08-12_07-37-04\ntrain  : 0.78 but worse on test\n'

In [None]:
#  asymmetric semantic search
warmup_steps = math.ceil(len(train_qp_contrastive_dataloader) * num_epochs * 0.02) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# logging.info("Performance before training")
# dev_evaluator(model)
#RerankingEvaluator
#InformationRetrievalEvaluator

# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/quora_duplicate_questions/training_OnlineContrastiveLoss.py

dev_evaluator = evaluation.InformationRetrievalEvaluator(
                                df_query_dev.groupby('qid')['query'].apply(str).to_dict(),
                                df_passage.groupby('pid')['passage'].apply(str).to_dict(),
                                df_qppair_dev.groupby('qid')['docid'].apply(set).to_dict(),
                                accuracy_at_k = [10],
                                precision_recall_at_k  = [10],
                                map_at_k = [10], mrr_at_k=[10]
                                #, score_functions='cos_sim'
                                )

#multi-task training
bi_encoder.fit(
          train_objectives=[
                        # (train_passage_dataloader, train_biencoder_loss_MultipleNegativesRanking), #SimCSE
                        (train_qp_contrastive_dataloader, train_biencoder_loss_Contrastive),
                        (train_qp_multiple_negative_ranking_dataloader, train_biencoder_loss_MultipleNegativesRanking),
                        # (train_qp_triple_dataloader, train_biencoder_loss_triple)
                        # (train_qp_contrastive_dataloader, train_biencoder_loss_CosineSimilarity)
                        # (train_qp_triple_dataloader, train_biencoder_loss)
              ],
              evaluator=dev_evaluator,
              epochs=num_epochs,
              evaluation_steps=301,
              warmup_steps=warmup_steps,
              output_path=model_save_path
              )

2023-08-13 10:27:50 - Warmup-steps: 19


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

2023-08-13 10:35:49 - Information Retrieval Evaluation on  dataset after epoch 0:
2023-08-13 10:35:50 - Queries: 25
2023-08-13 10:35:50 - Corpus: 1266

2023-08-13 10:35:50 - Score-Function: cos_sim
2023-08-13 10:35:50 - Accuracy@10: 20.00%
2023-08-13 10:35:50 - Precision@10: 2.80%
2023-08-13 10:35:50 - Recall@10: 8.45%
2023-08-13 10:35:50 - MRR@10: 0.0890
2023-08-13 10:35:50 - NDCG@10: 0.0572
2023-08-13 10:35:50 - MAP@10: 0.0322
2023-08-13 10:35:50 - Score-Function: dot_score
2023-08-13 10:35:50 - Accuracy@10: 12.00%
2023-08-13 10:35:50 - Precision@10: 2.00%
2023-08-13 10:35:50 - Recall@10: 4.48%
2023-08-13 10:35:50 - MRR@10: 0.0467
2023-08-13 10:35:50 - NDCG@10: 0.0359
2023-08-13 10:35:50 - MAP@10: 0.0219
2023-08-13 10:35:50 - Save model to ../data\model/training_simcse-sentence-transformers/paraphrase-multilingual-mpnet-base-v2-2023-08-13_10-06-48


Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

2023-08-13 10:43:12 - Information Retrieval Evaluation on  dataset after epoch 1:
2023-08-13 10:43:13 - Queries: 25
2023-08-13 10:43:13 - Corpus: 1266

2023-08-13 10:43:13 - Score-Function: cos_sim
2023-08-13 10:43:13 - Accuracy@10: 28.00%
2023-08-13 10:43:13 - Precision@10: 3.60%
2023-08-13 10:43:13 - Recall@10: 13.02%
2023-08-13 10:43:13 - MRR@10: 0.1044
2023-08-13 10:43:13 - NDCG@10: 0.0784
2023-08-13 10:43:13 - MAP@10: 0.0436
2023-08-13 10:43:13 - Score-Function: dot_score
2023-08-13 10:43:13 - Accuracy@10: 12.00%
2023-08-13 10:43:13 - Precision@10: 2.00%
2023-08-13 10:43:13 - Recall@10: 4.48%
2023-08-13 10:43:13 - MRR@10: 0.0480
2023-08-13 10:43:13 - NDCG@10: 0.0372
2023-08-13 10:43:13 - MAP@10: 0.0238
2023-08-13 10:43:13 - Save model to ../data\model/training_simcse-sentence-transformers/paraphrase-multilingual-mpnet-base-v2-2023-08-13_10-06-48


Iteration:   0%|          | 0/30 [00:00<?, ?it/s]

In [None]:
passage_embeddings = bi_encoder.encode(df_passage['passage'].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_train_embeddings = bi_encoder.encode(df_query_train['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_dev_embeddings = bi_encoder.encode(df_query_dev['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)

df_passage['embedding'] = passage_embeddings.cpu().numpy().tolist()
df_query_train['embedding'] = query_train_embeddings.cpu().numpy().tolist()
df_query_dev['embedding'] = query_dev_embeddings.cpu().numpy().tolist()

#### Semantic Search

In [None]:
#bi_encoder.save("sentence-transformers/paraphrase-multilingual-mpnet-base-v2_dev_0.16_train_0.50_contrastive_multiple_neg_ranking_loss_MULTIASK")

In [None]:
tag = "SimCSE_biencoder"
top_k = 10
hits = []
hits = util.semantic_search(query_train_embeddings, passage_embeddings, top_k=top_k)
df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query_train)

print(bi_model_name, "all passage embeddings")
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_biencoder.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

# df_run

In [None]:
# passage_embeddings = bi_encoder.encode(df_passage['passage'].tolist(), convert_to_tensor=True, show_progress_bar=True)
# query_train_embeddings = bi_encoder.encode(df_query_train['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)
# query_dev_embeddings = bi_encoder.encode(df_query_dev['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)
#
# df_passage['embedding'] = passage_embeddings.cpu().numpy().tolist()
# df_query_train['embedding'] = query_train_embeddings.cpu().numpy().tolist()
# df_query_dev['embedding'] = query_dev_embeddings.cpu().numpy().tolist()

tag = "SimCSE_biencoder"
top_k = 10
hits = []
hits = util.semantic_search(query_dev_embeddings, passage_embeddings, top_k=top_k)
# hits = util.cos_sim(query_dev_embeddings, passage_embeddings)
df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query_dev)

print(bi_model_name, "all passage embeddings")
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_biencoder.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

df_run

In [None]:
tag = "SimCSE_bm25_biencoder"
top_k = 10
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=15)
bm25_biencoder_hit= []
for query in df_query_dev['query'].tolist():
    bm25_result = BM25_model.search(query)
    bm25_related_passage = bm25_result['docno'].tolist()
    passage =  df_passage[df_passage['pid'].isin(bm25_related_passage)]['passage'].tolist()
    # print(len(passage))
    try:
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        # TODO
        '''
        check similarity between query and passage, with methods is better for this task?
        util.dot_score
        util.cos_sim
        util.pairwise_dot_score
        util.pairwise_cos_sim
        '''
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]

        mapping = {index : row['docno'] for index, row in bm25_result.iterrows()}

    except:
        #len passage is 0 but why ?
        print(f"len passage : {len(passage)}, qury : {query}")
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage = df_passage['passage'].tolist()
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]
        mapping = {index : row['pid'] for index, row in df_passage.iterrows()}

    for i in range(len(hit)):
        hit[i]['corpus_id'] = mapping[hit[i]['corpus_id']]

    hit = sorted(hit, key=lambda x: x['score'], reverse=True)

    bm25_biencoder_hit.append(hit)


In [None]:
bm25_biencoder_hit_df = pd.DataFrame()
for i in range(len(bm25_biencoder_hit)):
    for j in range(len(bm25_biencoder_hit[i])):
        new_record = pd.DataFrame([{"qid": df_query_dev['qid'].tolist()[i],
                "Q0": "Q0",
                "pid": bm25_biencoder_hit[i][j]['corpus_id'],
                "rank": j,
                "score": bm25_biencoder_hit[i][j]['score'],
                "tag": tag
            }])
        bm25_biencoder_hit_df = pd.concat([bm25_biencoder_hit_df, new_record], ignore_index=True)
bm25_biencoder_hit_df

In [None]:
bm25_biencoder_hit_df.to_csv("../data/runs/BM25_BiEncoder.tsv", sep="\t", index=False, header=False)

In [None]:
print("BM25 hits with BiEncoder")
!python QQA23_TaskA_eval.py \
    -r "../data/runs/BM25_BiEncoder.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

### Re-ranker (Cross-Encoder)

In [None]:
#TODO:
'''
https://github.com/Guzpenha/transformer_rankers
https://colab.research.google.com/drive/1wGmaO3emC7Sg-tA7nGehIQ2vjOLN9S5e?usp=sharing#scrollTo=y9ps5zmOHxe4
'''

# cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# cross_model_name = "cross-encoder/ms-marco-TinyBERT-L-2"
# cross_model_name = "distilroberta-base"
# cross_model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
cross_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-CLS"

#### Train Cross-Encoder

In [None]:
# Fine-tune cross-encoder on the query-passage
torch.cuda.empty_cache()
#https://sbert.net/docs/package_reference/cross_encoder.html

cross_encoder = CrossEncoder(cross_model_name)
# warmup_steps = math.ceil(len(train_qp_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))
#
# cross_encoder.fit(train_qp_dataloader,
#           # evaluator=dev_evaluator,
#           epochs=num_epochs,
#           evaluation_steps=100,
#           warmup_steps=warmup_steps,
#           output_path=model_save_path
#           )

In [None]:
tag = "SimCSE_cross"
df_final = pd.DataFrame()
for qid, filter_passages in df_run.groupby('qid'):
    cross_inp = []
    q = df_query_dev[df_query_dev['qid'] == qid]
    for x in filter_passages.values:
        p = df_passage[df_passage['pid'] == x[2]]
        cross_inp.append([q['query'].tolist()[0], p['passage'].tolist()[0]])

    similarity_scores = cross_encoder.predict(cross_inp, show_progress_bar=False)

    # Sort the scores in decreasing order
    sim_scores_argsort = reversed(np.argsort(similarity_scores))

    print(len(similarity_scores))
    print(len(filter_passages['pid'].values))
    df_temp = pd.DataFrame({'qid': qid, 'pid': filter_passages['pid'].values, 'score': similarity_scores})
    df_temp = df_temp.sort_values(by=['score'], ascending=False)[:10]
    df_temp['rank'] = range(1, len(df_temp) + 1)

    df_final = pd.concat([df_final, df_temp], ignore_index=True)

df_final = save_query_passage_retrieval(df_final, tag, run_save=True)

print("cross-encoder model name", cross_model_name)
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_cross.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"
df_final

In [None]:
tag = "BM25"

df_query = prepare_query_for_search(query_train_path)
# df_query = prepare_query_for_search(query_dev_path)

df_run = BM25_model.transform(df_query)
df_run = save_query_passage_retrieval(df_run, tag, run_save=True)
df_run

In [None]:
sample = "عمران"
BM25_model.search(sample)

# Evaluation

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/BM25_Final.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/bigIR_BM25.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

Here, we are just evaluating the perfect run for the dev set

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/dev_perfect.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"