In [97]:
import re
import os
import pandas as pd
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as ar_stp
from functools import reduce
import numpy as np
import pyterrier as pt
import torch

In [98]:
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

2.0.1+cu118
True
1
0
NVIDIA GeForce RTX 3090


# Get Data

In [99]:
data_path = "../data"
index_path = os.path.join(data_path, "QPC_Index/data.properties")

query_train_path = os.path.join(data_path, "QQA23_TaskA_train.tsv")
query_dev_path = os.path.join(data_path, "QQA23_TaskA_dev.tsv")

passage_path = os.path.join(data_path, "Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv")

qp_pair_train_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_train.gold")
qp_pair_dev_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_dev.gold")

## Read file

In [100]:
# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    else:
        if names != "":
            df = pd.read_csv(input_file, sep=sep, names=names,encoding="utf-8")
        else:
            df = pd.read_csv(input_file, sep=sep,encoding="utf-8")
    return df

## Read Query-Passage Pair

In [101]:
qrels_columns = ["qid", "Q0", "docid", "relevance"]

def read_qrels_file(qrels_file):
    # split_token = '\t' if format_checker.is_tab_sparated(qrels_file) else  "\s+"
    df_qrels = pd.read_csv(qrels_file, sep='\t', names=qrels_columns)
    df_qrels["qid"] = df_qrels["qid"].astype(str)
    df_qrels["docid"] = df_qrels["docid"].astype(str)
    return df_qrels

## Load the index

In [102]:
def load_index(index_path):
    if not pt.started():
        pt.init(helper_version="0.0.6")

    try:
        index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
        return index
    except Exception as e:
        print('Cannot load the index, check exception details {}'.format(e))
        return []

## Cleaning
Clean text from urls, handles, special characters, tabs, line jumps, extra white space, and puntuations.

In [103]:
# Clean text from urls, handles, special characters, tabs, line jumps, and extra white space.
def clean(text):
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text) # remove special characters
    text = re.sub(r"\t", " ", text)  # remove tabs
    text = re.sub(r"\n", " ", text)  # remove line jump
    text = re.sub(r"\s+", " ", text)  # remove extra white space
    text = re.sub(r'[^\w\s]', '', text) # Removing punctuations in string using regex
    text = text.strip()
    return text

## Preprocessing
Preprocess the arabic input text by performing normalization, stemming, and removing stop words.

In [104]:
# arabic stemmer
ar_stemmer = stemmer("arabic")

# remove arabic stop words
def ar_remove_stop_words(sentence):
    terms=[]
    stopWords= set(ar_stp.stopwords_list())
    for term in sentence.split() : 
        if term not in stopWords :
            terms.append(term)
    return " ".join(terms)


# normalize the arabic text
def normalize_arabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return(text)

# stem the arabic text
def ar_stem(sentence):
    return " ".join([ar_stemmer.stemWord(i) for i in sentence.split()])


# apply all preprocessing steps needed for Arabic text
def preprocess_arabic(text): 
    text = normalize_arabic(text)
    text = ar_remove_stop_words(text)
    text = ar_stem(text)
    return text

In [105]:
def prepare_data(path, column, id_type, id_column='docno'):
        df = read_file(path, names=['docno', 'text'])

        print("Cleaning passages")
        # apply the cleaning functions on the queries/questions
        df[column] = df['text'].apply(clean)

        # apply normalization, stemming and stop word removal
        print("Preprocessing - Applying normalization, stemming and stop word removal")
        df[column] = df[column].apply(preprocess_arabic)

        df[id_type] = df[id_column].astype(str) # convert the id column to string
        df = df[[id_type, 'text', column]] # keep the columns needed for search

        print("Done with preparation!")
        return df


### Index

In [106]:
index = load_index(index_path=index_path)

# print(index.getCollectionStatistics().toString())
# print(index.getMetaIndex().getKeys())

# for kv in index.getLexicon():
#     print((kv.getKey())+"\t"+ kv.getValue().toString())
# index.getLexicon()["فاعل"].toString()

Index was loaded successfully from this path:  ../data\QPC_Index/data.properties


### Passage

In [107]:
df_passage = prepare_data(passage_path, 'passage', 'pid')
df_passage

Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,pid,text,passage
0,1:1-4,بسم الله الرحمن الرحيم. الحمد لله رب العالمين....,بسم الله رحم رحيم حمد لله عالم رحم رحيم مال يو...
1,1:5-6,إياك نعبد وإياك نستعين. اهدنا الصراط المستقيم.,ايا نعبد واي نستع اهد صراط مستقيم
2,1:7-7,صراط الذين أنعمت عليهم غير المغضوب عليهم ولا ا...,صراط انعم مغضوب ضال
3,2:1-2,الم. ذلك الكتاب لا ريب فيه هدى للمتقين.,الم كتاب ريب هد متق
4,2:3-5,الذين يؤمنون بالغيب ويقيمون الصلاة ومما رزقناه...,يءمن غيب يقيم صلاه رزق ينفق يءمن انزل اليك انز...
...,...,...,...
1261,110:1-3,إذا جاء نصر الله والفتح. ورأيت الناس يدخلون في...,اذا جاء نصر الله والفتح راي ناس يدخل دين الله ...
1262,111:1-5,تبت يدا أبي لهب وتب. ما أغنى عنه ماله وما كسب....,تبت يدا اب وتب اغن مال كسب سيصل نار وامرا حمال...
1263,112:1-4,قل هو الله أحد. الله الصمد. لم يلد ولم يولد. و...,قل الله احد الله صمد يلد يولد يكن كفو احد
1264,113:1-5,قل أعوذ برب الفلق. من شر ما خلق. ومن شر غاسق إ...,قل اعوذ برب فلق شر خلق شر غاسق اذا وقب شر نفاث...


### Query

In [108]:
df_query_train = prepare_data(query_train_path, 'query', 'qid')
df_query_train

Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,qid,text,query
0,101,من هم قوم شعيب؟,قوم شعيب
1,102,من هم قوم موسى؟,قوم موس
2,103,من بنى الكعبة؟,بن كعبه
3,105,من هو النبي المعروف بالصبر؟,النب معروف صبر
4,106,من كفل السيدة مريم؟,كفل سيده مريم
...,...,...,...
169,422,ما هي الأماكن التي ذُكرت في القرآن كأماكن مقدسة؟,اماك ذكر القر كام مقدس
170,423,لماذا لم يتم حذف الآيات المنسوخة من القرآن؟,لماذ يتم حذف الا منسوخه القر
171,425,هل سيدنا محمد هو أفضل الأنبياء؟,سيد محمد افضل انبياء
172,426,هل حذر القرآن المؤمنين من اتخاذ أهل الكتاب أول...,حذر القر مءمن اتخاذ اهل كتاب اولياء


In [109]:
df_query_dev = prepare_data(query_dev_path, 'query', 'qid')
df_query_dev

Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,qid,text,query
0,114,من الذي خسف الله به الأرض؟,خسف الله ارض
1,124,كم مدة عدة الأرملة؟,مده عده ارمله
2,126,ما هي شجرة الزقوم؟,شجر زقوم
3,135,ما هي وصايا لقمان لابنه؟,صاي لقمان
4,156,من هن المحرمات من النساء في الزواج؟,محرم نساء زواج
5,157,ما هي منزلة من يقتل في سبيل الله؟,منزل يقتل سبيل الله
6,207,من هلك من أهل سيدنا نوح عليه السلام في الطوفان؟,هلك اهل سيد نوح سلام طوف
7,224,لماذا ألقي سيدنا يوسف عليه السلام في الجب؟,لماذ الق سيد يوسف سلام الجب
8,234,كم دامت دعوة نوح لقومه؟,دام دعو نوح لقوم
9,241,من هم الملائكة المذكورون في القرآن؟,ملاءكه مذكور القر


### Query-Passage Pair

In [110]:
df_qppair_train = read_qrels_file(qp_pair_train_path)
df_qppair_train

Unnamed: 0,qid,Q0,docid,relevance
0,101,0,7:85-93,1
1,101,0,11:84-88,1
2,101,0,26:176-191,1
3,101,0,29:36-37,1
4,102,0,7:103-108,1
...,...,...,...,...
967,374,0,-1,1
968,391,0,-1,1
969,392,0,-1,1
970,394,0,-1,1


In [111]:
df_qppair_dev = read_qrels_file(qp_pair_dev_path)
df_qppair_dev

Unnamed: 0,qid,Q0,docid,relevance
0,114,0,28:76-80,1
1,124,0,2:234-237,1
2,126,0,37:62-74,1
3,126,0,44:40-50,1
4,126,0,56:41-56,1
...,...,...,...,...
155,428,0,47:20-24,1
156,322,0,-1,1
157,260,0,-1,1
158,384,0,-1,1


# Model

## Sentence Embedding


In [112]:
# from simcse import SimSCE
from torch.utils.data import DataLoader
from torch import nn
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, CrossEncoder, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

import logging
from datetime import datetime
import math

In [113]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

### Retrieval

In [114]:
def save_query_passage_retrieval(result, tag, run_save=False):
    if tag == "BM25":
        result["Q0"] = ["Q0"] * len(result)
        result["tag"] = [tag] * len(result)
        result['qid'] = result["qid"]
        result['pid'] = result["docno"]
        tag = "BM25_Final"
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    elif tag == "SimCSE_biencoder" or tag == "SimCSE_bm25_biencoder":
        np_result = np.array(result).flatten()
        result = pd.DataFrame()

        result["qid"] = df_query_train["qid"].tolist() * top_k
        result = result.sort_values(by=['qid']).reset_index(drop=True)
        result["Q0"] = ["Q0"] * len(np_result)
        result["pid"] = [df_passage.iloc[x['corpus_id']]['pid'] for x in np_result]
        result["rank"] = list(range(1, top_k+1)) * len(df_query_train)
        result["score"] = [x['score'] for x in np_result]
        result["tag"] = [tag] * len(np_result)

    elif tag == "SimCSE_cross":
        result['tag'] = tag
        result['Q0'] = 'Q0'
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    if run_save:
        run_save_path = os.path.join(data_path, f"runs/{tag}.tsv")
        result.to_csv(run_save_path, sep="\t", index=False, header=False)

    return result

###  SimCSE (Bi-Encoder)

In [115]:
#TODO : test with different models

#bi_model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
#bi_model_name = "aubmindlab/bert-large-arabertv02" # run 0 : pAP@10 = 0.289
#bi_model_name = "wissamantoun/araelectra-base-artydiqa" # run 1 : pAP@10 = 0.437
#bi_model_name = "salti/AraElectra-base-finetuned-ARCD" # run 2 : pAP@10 = 0.397
bi_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA" # run 3 : pAP@10 = 0.435
#bi_model_name = "timpal0l/mdeberta-v3-base-squad2" # run 4 : pAP@10 = 0.367
#bi_model_name = "gfdgdfgdg/arap_qa_bert" # run 5 : pAP@10 = 0.184
#bi_model_name = "gfdgdfgdg/arap_qa_bert_large_v2" # run6 : pAP@10 = 0.372
#bi_model_name = "gfdgdfgdg/arap_qa_bert_v2" # run7 : pAP@10 = 0.344
#bi_model_name = "zohaib99k/Bert_Arabic-SQuADv2-QA" # run8 : pAP@10 = 0.435
#bi_model_name = "arabi-elidrisi/ArabicDistilBERT_QA" #run 9 : pAP@10 = 0.343
#bi_model_name = "MMars/Question_Answering_AraBERT_xtreme_ar" #run 10 : pAP@10 = 0.337
# bi_model_name = "abdalrahmanshahrour/ArabicQA" # run 11 : pAP@10 = 0.304
# bi_model_name = "abdalrahmanshahrour/xtremeQA-ar" # run 12 : pAP@10 = 0.120

model_save_path = os.path.join(data_path, f'model/training_simcse-{bi_model_name}-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

In [116]:
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(bi_model_name, max_seq_length=256)
print("word_embedding_model Max Sequence Length:", word_embedding_model.max_seq_length)
print("word_embedding_model dimension", word_embedding_model.get_word_embedding_dimension())

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
print("pooling_model sentence embedding dimension", pooling_model.get_sentence_embedding_dimension())

dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

# bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

word_embedding_model Max Sequence Length: 256
word_embedding_model dimension 768
pooling_model sentence embedding dimension 768
2023-08-11 10:12:43 - Use pytorch device: cuda


In [118]:
train_samples_passage = []
#     add query train to train samples to make a more generalized embedding model
for _, row in df_passage.iterrows():
    train_samples_passage.append(InputExample(texts=[row['passage'], row['passage']]))

In [119]:
train_samples_qp = []
for _, row in df_qppair_train.iterrows():
    query_id = row['qid']

    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    passage_id = row['docid']
    if passage_id == '-1':
        continue
    else:
        passage = df_passage[df_passage['pid'] == passage_id]['passage'].tolist()[0]
        label = row['relevance']
        #positive sample
        train_samples_qp.append(InputExample(texts=[query, passage], label=label))
        train_samples_qp.append(InputExample(texts=[passage, query], label=label))

In [134]:
top_k = 75
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=top_k)

train_samples_qp_contrastive = []
for _, row in df_qppair_train.groupby('qid'):
    # print(row)
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    bm25_related_passage = BM25_model.search(query)['docno'].tolist()
    possitive_passage = row['docid'].tolist()
    negative_passage = list(set(bm25_related_passage) - set(possitive_passage))
    # print(bm25_related_passage)
    # print(possitive_passage)
    # print(negative_passage)

    for pos_passage in possitive_passage:
        if pos_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
            label = 1
            #positive sample
            train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

    for neg_passage in negative_passage:
        if neg_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == neg_passage]['passage'].tolist()[0]
            label = 0
            #positive sample
            train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))


In [131]:
top_k = 50
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=top_k)

train_samples_qp_triple = []
for _, row in df_qppair_train.groupby('qid'):
    # print(row)
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    bm25_related_passage = BM25_model.search(query)['docno'].tolist()
    possitive_passage = row['docid'].tolist()
    negative_passage = list(set(bm25_related_passage) - set(possitive_passage))
    # print(bm25_related_passage)
    # print(possitive_passage)
    # print(negative_passage)

    for pos_passage_id in possitive_passage:
        for neg_passage_id in negative_passage:
            if pos_passage_id == '-1':
                continue
            else:

                pos_passage = df_passage[df_passage['pid'] == pos_passage_id]['passage'].tolist()[0]
                neg_passage = df_passage[df_passage['pid'] == neg_passage_id]['passage'].tolist()[0]
                train_samples_qp_triple.append(InputExample(texts=[query, pos_passage, neg_passage]))


#### Train Bi-Encoder

In [136]:
len(train_samples_qp_triple)

32541

In [137]:
# Configure the training
num_epochs = 10
#TODO: change the batch size to 128 or 256 or 512 to run faster
train_batch_size = 32

# add query train to train samples to make a more generalized embedding model
# train_biencoder_dataloader = DataLoader(train_samples_passage, shuffle=True, batch_size=train_batch_size)
# train_qp_dataloader = DataLoader(train_samples_qp, shuffle=True, batch_size=train_batch_size)
train_qp_contrastive_dataloader = DataLoader(train_samples_qp_contrastive, shuffle=True, batch_size=train_batch_size)
#TODO : change Data to triple to see how performance changes, and change the loss function, also change num_epoch to 5
# train_qp_triple_dataloader = DataLoader(train_samples_qp_triple, shuffle=True, batch_size=train_batch_size)

# train_biencoder_loss = losses.MultipleNegativesRankingLoss(bi_encoder)
train_biencoder_loss = losses.ContrastiveLoss(bi_encoder)
# train_biencoder_loss = losses.TripletLoss(model=bi_encoder)
# train_biencoder_loss = losses.CosineSimilarityLoss(bi_encoder)
# train_biencoder_loss = losses.OnlineContrastiveLoss(bi_encoder)

In [138]:
warmup_steps = math.ceil(len(train_qp_contrastive_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# logging.info("Performance before training")
# dev_evaluator(model)

bi_encoder.fit(train_objectives=[(train_qp_contrastive_dataloader, train_biencoder_loss)],
          # evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

2023-08-11 10:19:35 - Warmup-steps: 601


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

Iteration:   0%|          | 0/601 [00:00<?, ?it/s]

2023-08-11 11:50:58 - Save model to ../data\model/training_simcse-ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA-2023-08-11_10-12-42


In [139]:
passage_embeddings = bi_encoder.encode(df_passage['passage'].tolist(), convert_to_tensor=True, show_progress_bar=True)

query_train_embeddings = bi_encoder.encode(df_query_train['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_dev_embeddings = bi_encoder.encode(df_query_dev['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)

df_passage['embedding'] = passage_embeddings.cpu().numpy().tolist()
df_query_train['embedding'] = query_train_embeddings.cpu().numpy().tolist()
df_query_dev['embedding'] = query_dev_embeddings.cpu().numpy().tolist()

# for sentence, embedding in zip(df_passage, passage_embeddings):
#     print("Sentence:", sentence)
#     print("Embedding:", embedding)
#     print("")

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [140]:
# test_samples = []
# with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
#     reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
#     for row in reader:
#         if row['split'] == 'test':
#             score = float(row['score']) / 5.0 #Normalize score to range 0 ... 1
#             test_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))
#
# model = SentenceTransformer(model_save_path)
# test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test')
# test_evaluator(model, output_path=model_save_path)


#### Semantic Search

In [141]:
tag = "SimCSE_biencoder"

hits = util.semantic_search(query_train_embeddings, passage_embeddings, top_k=top_k)
df_run = save_query_passage_retrieval(hits, tag, run_save=True)

print(bi_model_name, train_biencoder_loss)
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_biencoder.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

df_run

ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA ContrastiveLoss(
  (model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: ElectraModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Dense({'in_features': 768, 'out_features': 256, 'bias': True, 'activation_function': 'torch.nn.modules.activation.Tanh'})
  )
)
Format check: Passed
     map  recip_rank
0.152326    0.233085


Unnamed: 0,qid,Q0,pid,rank,score,tag
0,101,Q0,17:1-1,1,0.880060,SimCSE_biencoder
1,101,Q0,20:25-37,2,0.840154,SimCSE_biencoder
2,101,Q0,29:36-37,3,0.827499,SimCSE_biencoder
3,101,Q0,7:85-93,4,0.813496,SimCSE_biencoder
4,101,Q0,21:87-88,5,0.813406,SimCSE_biencoder
...,...,...,...,...,...,...
13045,427,Q0,62:9-11,71,0.593552,SimCSE_biencoder
13046,427,Q0,1:7-7,72,0.591879,SimCSE_biencoder
13047,427,Q0,110:1-3,73,0.591441,SimCSE_biencoder
13048,427,Q0,109:1-6,74,0.589989,SimCSE_biencoder


In [142]:
#Ghazaleh write below code

In [143]:
tag = "SimCSE_bm25_biencoder"
top_k = 10
bm25_biencoder_hit= []
for query in df_query_train['query'].tolist():
    bm25_result = BM25_model.search(query)
    bm25_related_passage = bm25_result['docno'].tolist()
    passage =  df_passage[df_passage['pid'].isin(bm25_related_passage)]['passage'].tolist()
    try:
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        # TODO
        '''
        check similarity between query and passage, with methods is better for this task?
        util.dot_score
        util.cos_sim
        util.pairwise_dot_score
        util.pairwise_cos_sim
        '''
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]

        mapping = {index : row['docno'] for index, row in bm25_result.iterrows()}

    except:
        #len passage is 0 but why ?
        print(f"len passage : {len(passage)}, qury : {query}")
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage = df_passage['passage'].tolist()
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]
        mapping = {index : row['pid'] for index, row in df_passage.iterrows()}

    for i in range(len(hit)):
        hit[i]['corpus_id'] = mapping[hit[i]['corpus_id']]

    hit = sorted(hit, key=lambda x: x['score'], reverse=True)

    bm25_biencoder_hit.append(hit)


len passage : 0, qury : معن جاثيه
len passage : 0, qury : عقوب سارق
len passage : 0, qury : مطفف
len passage : 0, qury : تجوز صدقه


In [144]:
bm25_biencoder_hit_df = pd.DataFrame()
for i in range(len(bm25_biencoder_hit)):
    for j in range(len(bm25_biencoder_hit[i])):
        new_record = pd.DataFrame([{"qid": df_query_train['qid'].tolist()[i],
                "Q0": "Q0",
                "pid": bm25_biencoder_hit[i][j]['corpus_id'],
                "rank": j,
                "score": bm25_biencoder_hit[i][j]['score'],
                "tag": tag
            }])
        bm25_biencoder_hit_df = pd.concat([bm25_biencoder_hit_df, new_record], ignore_index=True)


In [145]:
bm25_biencoder_hit_df

Unnamed: 0,qid,Q0,pid,rank,score,tag
0,101,Q0,7:73-79,0,0.827499,SimCSE_bm25_biencoder
1,101,Q0,46:29-32,1,0.813496,SimCSE_bm25_biencoder
2,101,Q0,20:86-94,2,0.769314,SimCSE_bm25_biencoder
3,101,Q0,13:8-11,3,0.762408,SimCSE_bm25_biencoder
4,101,Q0,10:79-86,4,0.745519,SimCSE_bm25_biencoder
...,...,...,...,...,...,...
1586,427,Q0,2:104-105,5,0.520318,SimCSE_bm25_biencoder
1587,427,Q0,18:1-6,6,0.514551,SimCSE_bm25_biencoder
1588,427,Q0,24:30-31,7,0.507530,SimCSE_bm25_biencoder
1589,427,Q0,3:98-101,8,0.500259,SimCSE_bm25_biencoder


In [146]:
bm25_biencoder_hit_df.to_csv("../data/runs/BM25_BiEncoder.tsv", sep="\t", index=False, header=False)

In [147]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/BM25_BiEncoder.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

Format check: Passed
     map  recip_rank
0.091142    0.184013


### Re-ranker (Cross-Encoder)

In [None]:
# cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# cross_model_name = "cross-encoder/ms-marco-TinyBERT-L-2"
cross_model_name = "distilroberta-base"

In [None]:
cross_encoder = CrossEncoder(cross_model_name)

#### Train Cross-Encoder

In [None]:
# Fine-tune cross-encoder on the query-passage
train_cross_loss = losses.ContrastiveLoss(cross_encoder)

torch.cuda.empty_cache()

warmup_steps = math.ceil(len(train_qp_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

cross_encoder.fit(train_qp_dataloader,
          # evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )


In [None]:
tag = "SimCSE_cross"

df_final = pd.DataFrame()

for qid, filter_passages in df_run.groupby('qid'):
    cross_inp = []
    q = df_query_train[df_query_train['qid'] == qid]
    for x in filter_passages.values:
        p = df_passage[df_passage['pid'] == x[2]]
        cross_inp.append([q['query'].tolist()[0], p['passage'].tolist()[0]])

    cross_scores = cross_encoder.predict(cross_inp, show_progress_bar=False)
    df_run = pd.DataFrame({'qid': qid, 'pid': filter_passages['pid'].values, 'score': cross_scores})
    df_run = df_run.sort_values(by=['score'], ascending=False)[:10]
    df_run['rank'] = range(1, len(df_run) + 1)

    df_final = pd.concat([df_final, df_run], ignore_index=True)

df_final = save_query_passage_retrieval(df_final, tag, run_save=True)

print("cross-encoder model name", cross_model_name)
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_cross.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"
df_final

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_cross.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

## BM25 - Search
Search in the index and find the relevant passages.

In [None]:
# initialize the BM25 retrieval model
top_k = 15
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=top_k)
# wmodel="TF_IDF"

In [None]:
 # define some global constants
TEXT = "text"
QUERY = "query"
LABEL = "label"
RANK = "rank"
TAG = "tag"
SCORE = "score"
QID = "qid"
DOC_NO = "docno"
DOCID = "docid"

def prepare_query_for_search(query_path, query_column=TEXT,
                        id_column=DOC_NO):

        names = [DOC_NO, TEXT]
        print("Cleaning queries and applying preprocessing steps")
        df_query = read_file(query_path, names=names)
        # apply the cleaning functions on the queries/questions
        df_query[QUERY] =df_query[query_column].apply(clean)

        # apply normalization, stemming and stop word removal
        print("Applying normalization, stemming and stop word removal")
        df_query[QUERY] =df_query[QUERY].apply(preprocess_arabic)

        df_query[QID] = df_query[id_column].astype(str) # convert the id column to string
        df_query = df_query[[QID, QUERY]] # keep the columns needed for search
        print("Done with preparation!")
        return df_query


In [None]:
tag = "BM25"

df_query = prepare_query_for_search(query_train_path)
# df_query = prepare_query_for_search(query_dev_path)

# search using BM25 model
df_run = BM25_model.transform(df_query)
# df_run
# save the run in trec format to a file
df_run = save_query_passage_retrieval(df_run, tag, run_save=True)
df_run
# df_run

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/BM25_Final.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

In [None]:
sample = "عمران"
BM25_model.search(sample)

In [None]:
def batch_emb(l1, l2):
    l2 = sentence_embedding.encode(l2)
    l1.extend(l2)
    return l1

def get_embedding(text, n=50):
    batched_num = [text[i:i + n] for i in range(0, len(text), n)]
    texts_embed = reduce(batch_emb, batched_num, [])
    print(len(texts_embed))
    return texts_embed



## RocketQA

# Evaluation

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/bigIR_BM25.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

Here, we are just evaluating the perfect run for the dev set

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/dev_perfect.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"