In [1]:
import logging
import math
import os
import re
from datetime import datetime

import arabicstopwords.arabicstopwords as ar_stp
import nltk
import numpy as np
import pandas as pd
import pyterrier as pt
import torch
from datasets import load_dataset
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers import evaluation
from sentence_transformers import models, losses, datasets
from snowballstemmer import stemmer
from torch import nn
# from simcse import SimSCE
from torch.utils.data import DataLoader

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yegmo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# torch.cude.empty_cache
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

In [3]:
print(torch.__version__)
print(torch.cuda.is_available())
# print(torch.cuda.device_count())
# print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

2.0.1+cu118
True
NVIDIA GeForce RTX 3090


# Get Data

In [4]:
data_path = "../data"
index_path = os.path.join(data_path, "QPC_Index/data.properties")

query_train_path = os.path.join(data_path, "QQA23_TaskA_train.tsv")
query_dev_path = os.path.join(data_path, "QQA23_TaskA_dev.tsv")
query_test_path = os.path.join(data_path, "QQA23_TaskA_test.tsv")

passage_path = os.path.join(data_path, "Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv")

qp_pair_train_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_train.gold")
qp_pair_dev_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_dev.gold")

task_B_train_path = os.path.join(data_path, "Task B data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl")


## Read file

In [5]:
# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    else:
        if names != "":
            df = pd.read_csv(input_file, sep=sep, names=names,encoding="utf-8")
        else:
            df = pd.read_csv(input_file, sep=sep,encoding="utf-8")
    return df

In [6]:
qrels_columns = ["qid", "Q0", "docid", "relevance"]

def read_qrels_file(qrels_file):
    # split_token = '\t' if format_checker.is_tab_sparated(qrels_file) else  "\s+"
    df_qrels = pd.read_csv(qrels_file, sep='\t', names=qrels_columns)
    df_qrels["qid"] = df_qrels["qid"].astype(str)
    df_qrels["docid"] = df_qrels["docid"].astype(str)
    return df_qrels

In [7]:
def load_index(index_path):
    if not pt.started():
        pt.init(helper_version="0.0.6")

    try:
        index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
        return index
    except Exception as e:
        print('Cannot load the index, check exception details {}'.format(e))
        return []

## Cleaning & Preprocessing
Clean text from urls, handles, special characters, tabs, line jumps, extra white space, and puntuations.
Preprocess the arabic input text by performing normalization, stemming, and removing stop words.

In [8]:
# Clean text from urls, handles, special characters, tabs, line jumps, and extra white space.
def clean(text):
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text) # remove special characters
    text = re.sub(r"\t", " ", text)  # remove tabs
    text = re.sub(r"\n", " ", text)  # remove line jump
    text = re.sub(r"\s+", " ", text)  # remove extra white space
    text = re.sub(r'[^\w\s]', '', text) # Removing punctuations in string using regex
    text = text.strip()
    return text

In [9]:
# arabic stemmer
ar_stemmer = stemmer("arabic")

# remove arabic stop words
def ar_remove_stop_words(sentence):
    terms=[]
    stopWords= set(ar_stp.stopwords_list())
    for term in sentence.split() : 
        if term not in stopWords :
            terms.append(term)
    return " ".join(terms)


# normalize the arabic text
def normalize_arabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return(text)

# stem the arabic text
def ar_stem(sentence):
    return " ".join([ar_stemmer.stemWord(i) for i in sentence.split()])


# apply all preprocessing steps needed for Arabic text
def preprocess_arabic(text): 
    text = normalize_arabic(text)
    text = ar_remove_stop_words(text)
    text = ar_stem(text)
    return text

In [10]:
def prepare_data(path, column, id_type, id_column='docno'):
        df = read_file(path, names=['docno', 'text'])

        df[column] = df['text'].apply(clean)
        df[column] = df[column].apply(preprocess_arabic)

        df[id_type] = df[id_column].astype(str) # convert the id column to string
        df = df[[id_type, 'text', column]] # keep the columns needed for search

        print(f"Done with preparation with {column}!")
        return df

## Loading

In [11]:
index = load_index(index_path=index_path)

# print(index.getCollectionStatistics().toString())
# print(index.getMetaIndex().getKeys())
# for kv in index.getLexicon():
#     print((kv.getKey())+"\t"+ kv.getValue().toString())
# index.getLexicon()["فاعل"].toString()

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.6



Index was loaded successfully from this path:  ../data\QPC_Index/data.properties


In [12]:
df_passage = prepare_data(passage_path, 'passage', 'pid')

df_query_train = prepare_data(query_train_path, 'query', 'qid')
df_query_dev = prepare_data(query_dev_path, 'query', 'qid')
df_query_test = prepare_data(query_test_path, 'query', 'qid')

df_qppair_train = read_qrels_file(qp_pair_train_path)

df_qppair_dev = read_qrels_file(qp_pair_dev_path)


Done with preparation with passage!
Done with preparation with query!
Done with preparation with query!
Done with preparation with query!


# Model - Sentence Embedding

In [13]:
def save_query_passage_retrieval(result, tag, run_save=False, df_query=df_query_train, top_k=10):
    if tag == "BM25":
        result["Q0"] = ["Q0"] * len(result)
        result["tag"] = [tag] * len(result)
        result['qid'] = result["qid"]
        result['pid'] = result["docno"]
        tag = "BM25_Final"
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    elif "SimCSE_bi" in tag:
        np_result = np.array(result).flatten()
        result = pd.DataFrame()

        result["qid"] = df_query["qid"].tolist() * top_k
        result = result.sort_values(by=['qid']).reset_index(drop=True)
        result["Q0"] = ["Q0"] * len(result)
        result["pid"] = [df_passage.iloc[x['corpus_id']]['pid'] for x in np_result]
        result["rank"] = list(range(1, top_k+1)) * len(df_query)
        result["score"] = [x['score'] for x in np_result]
        result["tag"] = [tag] * len(np_result)

    elif tag == "SimCSE_bmbiencd":
        df_result = pd.DataFrame()
        for i in range(len(bm25_biencoder_hit)):
            for j in range(len(bm25_biencoder_hit[i])):
                new_record = pd.DataFrame([{"qid": df_query_dev['qid'].tolist()[i],
                        "Q0": "Q0",
                        "pid": bm25_biencoder_hit[i][j]['corpus_id'],
                        "rank": j,
                        "score": bm25_biencoder_hit[i][j]['score'],
                        "tag": tag
                    }])
                df_result = pd.concat([df_result, new_record], ignore_index=True)
        result = df_result
        print(type(result))

    elif tag == "SimCSE_cross":
        result['tag'] = tag
        result['Q0'] = 'Q0'
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    if run_save:
        run_save_path = os.path.join(data_path, f"runs/{tag}.tsv")
        # print(run_save_path)
        result.to_csv(run_save_path, sep="\t", index=False, header=False)

    return result

## Convert Data to SentenceTransformer InputFormat

#### Simple passage-passage pair

In [14]:
train_samples_passage = []
for _, row in df_passage.iterrows():
    train_samples_passage.append(InputExample(texts=[row['passage'], row['passage']]))

print("len(train_samples_passage) =", len(train_samples_passage))

len(train_samples_passage) = 1266


#### query-passage double pair with relevance label = 1 (positive)

In [15]:
train_samples_qp = []
for _, row in df_qppair_train.iterrows():
    query_id = row['qid']
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    passage_id = row['docid']

    if passage_id == '-1':
        continue
    else:
        passage = df_passage[df_passage['pid'] == passage_id]['passage'].tolist()[0]
        label = row['relevance']
        #positive sample
        train_samples_qp.append(InputExample(texts=[query, passage], label=label))
        train_samples_qp.append(InputExample(texts=[passage, query], label=label))

print("len(train_samples_qp) =", len(train_samples_qp))

len(train_samples_qp) = 1892


#### contrastive: query-passage double pair with relevance label = 1 (positive) and not found BM25 top-k passages with relevance label = 0 (negative)

In [16]:
train_samples_qp_contrastive = []

top_k = 1000
print("top_k =", top_k)
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=top_k)

for _, row in df_qppair_train.groupby('qid'):
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    bm25_related_passage = BM25_model.search(query)['docno'].tolist()
    positive_passage = row['docid'].tolist()
    negative_passage = list(set(bm25_related_passage) - set(positive_passage))

    for pos_passage in positive_passage:
        if pos_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
            label = 1
            #positive sample
            train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

    for neg_passage in negative_passage:
        if neg_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == neg_passage]['passage'].tolist()[0]
            label = 0
            #positive sample
            train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

print("len(train_samples_qp_contrastive) =", len(train_samples_qp_contrastive))

top_k = 1000
len(train_samples_qp_contrastive) = 87784


#### multiple negative ranking: query-passage double pair with relevance label = 1 (positive)

In [17]:
train_samples_qp_multiple_negative_ranking = []
for _, row in df_qppair_train.groupby('qid'):
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    positive_passage = row['docid'].tolist()
    for pos_passage in positive_passage:
        if pos_passage == '-1':
            continue
        else:
            passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
            label = 1
            #positive sample
            train_samples_qp_multiple_negative_ranking.append(InputExample(texts=[query, passage], label=label))
            train_samples_qp_multiple_negative_ranking.append(InputExample(texts=[passage, query], label=label))

print("len(train_samples_qp_multiple_negative_ranking) =", len(train_samples_qp_multiple_negative_ranking))

len(train_samples_qp_multiple_negative_ranking) = 1892


#### triple: query-positive passage-negative passage; negative passage is not found in BM25 top-k passages

In [18]:
top_k = 100
print("top_k =", top_k)
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=top_k)

train_samples_qp_triple = []

for _, row in df_qppair_train.groupby('qid'):
    # print(row)
    query_id = row['qid'].tolist()[0]
    query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
    bm25_related_passage = BM25_model.search(query)['docno'].tolist()
    positive_passage = row['docid'].tolist()
    negative_passage = list(set(bm25_related_passage) - set(positive_passage))
    # print(bm25_related_passage)
    # print(possitive_passage)
    # print(negative_passage)

    for pos_passage_id in positive_passage:
        for neg_passage_id in negative_passage:
            if pos_passage_id == '-1':
                continue
            else:
                pos_passage = df_passage[df_passage['pid'] == pos_passage_id]['passage'].tolist()[0]
                neg_passage = df_passage[df_passage['pid'] == neg_passage_id]['passage'].tolist()[0]
                train_samples_qp_triple.append(InputExample(texts=[query, pos_passage, neg_passage]))

print("len train_samples_qp_triple =", len(train_samples_qp_triple))

top_k = 100
len train_samples_qp_triple = 59120


## Additional Data

### Mr. TyDi

In [19]:
train_samples_mr_tydi_triple = []
#https://huggingface.co/datasets/castorini/mr-tydi
mr_tydi_dataset = load_dataset("castorini/mr-tydi", "arabic", split="train").to_pandas()

for _, row in mr_tydi_dataset.iterrows():
    query = row['query']
    positive_passages = row['positive_passages']
    negative_passages = row['negative_passages']

    for pos_passage in positive_passages:
        for neg_passage in negative_passages:
            train_samples_mr_tydi_triple.append(InputExample(texts=[query, pos_passage, neg_passage]))

print("len(train_samples_mr_tydi_triple) =", len(train_samples_mr_tydi_triple))

len(train_samples_mr_tydi_triple) = 362146


### Task B Data

In [20]:
import json

def load_jsonl(input_path) -> list:
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.rstrip('\n|\r')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data


train_samples_qp_task_B = []
train_passage_question_objects_task_B  = load_jsonl(task_B_train_path)
for passage_question_object in train_passage_question_objects_task_B:
    query = passage_question_object["question"]
    passage = passage_question_object["passage"]
    if len(passage_question_object["answers"]) > 0:
        train_samples_qp_task_B.append(InputExample(texts=[query, passage], label=1))
    else:
        train_samples_qp_task_B.append(InputExample(texts=[query, passage], label=0))

train_passage_question_objects_task_B = pd.DataFrame(train_passage_question_objects_task_B)
print("len(train_samples_qp_task_B) =", len(train_samples_qp_task_B))

Loaded 992 records from ../data\Task B data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl
len(train_samples_qp_task_B) = 992


## Fine Tuning

### TSDAE

In [None]:
run_tsdae = True
model_name = 'aubmindlab/bert-base-arabert'
batch_size = 8
num_epochs = 8

if run_tsdae:
    # Define your sentence transformer model using CLS pooling


    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Define a list with sentences (1k - 100k sentences)
    train_sentences = df_passage['passage'].tolist()

    # Create the special denoising dataset that adds noise on-the-fly
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)

    # DataLoader to batch your data
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, batch_size=batch_size, shuffle=True)

    # Use the denoising auto-encoder loss
    train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
    constraint_loss = losses.ContrastiveLoss(model)

    # Call the fit method
    model.fit(
        train_objectives=[(train_dataloader, train_loss), (train_samples_qp_task_B_dataloader, constraint_loss)],
        epochs=num_epochs,
        weight_decay=0,
        scheduler='constantlr',
        optimizer_params={'lr': 3e-5},
        show_progress_bar=True
    )

    model.save(os.path.join(data_path, f'fine_tune/tsdae-model-v1'))

### SimCSE

In [None]:
run_simcse = False
model_name = 'aubmindlab/bert-base-arabert'
batch_size = 128
num_epochs = 5

if run_simcse:
    # Define your sentence transformer model using CLS pooling
    word_embedding_model = models.Transformer(model_name, max_seq_length=164)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Define a list with sentences (1k - 100k sentences)
    train_sentences = df_passage['passage'].tolist()

    # Convert train sentences to sentence pairs
    train_data = [InputExample(texts=[s, s]) for s in train_sentences]

    # DataLoader to batch your data
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, batch_size=batch_size, shuffle=True)

    # Use the MultipleNegativesRankingLoss loss
    train_loss = losses.MultipleNegativesRankingLoss(model)
    constraint_loss = losses.ContrastiveLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss), (train_samples_qp_task_B_dataloader, constraint_loss)],
        epochs=num_epochs,
        show_progress_bar=True
    )

    # model.save(os.path.join(data_path, f'fine_tune/simcse-model'))
    # model.save(os.path.join(data_path, 'fine_tune/simcse-model-v1'))
    # model.save(os.path.join(data_path, 'fine_tune/simcse-model-v2'))
    # model.save(os.path.join(data_path, 'fine_tune/simcse-model-v3'))
    # model.save(os.path.join(data_path, 'fine_tune/simcse-model-v4'))
    model.save(os.path.join(data_path, 'fine_tune/simcse-model-v5'))



### CT (Semantic Re-Tuning With Contrastive Tension (CT))

In [None]:
run_ct = False
if run_ct:
    # Define your sentence transformer model using CLS pooling
    model_name = 'aubmindlab/bert-base-arabert'
    word_embedding_model = models.Transformer(model_name, max_seq_length=164)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Define a list with sentences (1k - 100k sentences)
    train_sentences = df_passage['passage'].tolist()

    # For ContrastiveTension we need a special data loader to construct batches with the desired properties
    train_dataloader =  losses.ContrastiveTensionDataLoader(train_sentences, batch_size=64, pos_neg_ratio=8)

    # As loss, we losses.ContrastiveTensionLoss
    train_loss = losses.ContrastiveTensionLoss(model)
    num_epochs = 8
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=num_epochs,
              optimizer_class=torch.optim.RMSprop,
              optimizer_params={'lr': 5e-5},
              show_progress_bar=True,
              use_amp=False  # Set to True, if your GPU supports FP16 cores
              )

    model.save(data_path, f'fine_tune/ct-model')

#### CT In Batch Negative

In [None]:
# # Define your sentence transformer model using CLS pooling
# model_name = 'aubmindlab/bert-base-arabert'
# word_embedding_model = models.Transformer(model_name, max_seq_length=164)
# # Apply mean pooling to get one fixed sized sentence vector
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
#
# # Define a list with sentences (1k - 100k sentences)
# train_sentences = df_passage['passage'].tolist()
#
# # A regular torch DataLoader and as loss we use losses.ContrastiveTensionLossInBatchNegatives
# train_dataloader = DataLoader(train_sentences, batch_size=64, shuffle=True, drop_last=True)
# train_loss = losses.ContrastiveTensionLossInBatchNegatives(model)
#
# num_epochs = 8
# warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))
#
# # Train the model
# model.fit(train_objectives=[(train_dataloader, train_loss)],
#           epochs=num_epochs,
#           optimizer_class=torch.optim.RMSprop,
#           optimizer_params={'lr': 5e-5},
#           show_progress_bar=True,
#           use_amp=False  # Set to True, if your GPU supports FP16 cores
#           )
#
# # model.save(data_path, f'fine_tune/ct-model')
# model.save(data_path, f'fine_tune/ct-in-batch-negative-model')

## Train Bi-Encoder

In [43]:
# https://www.sbert.net/docs/pretrained_models.html
# https://www.sbert.net/docs/pretrained-models/msmarco-v3.html

num_epochs = 3
train_batch_size = 64

################# fine tune #################
# bi_model_name = "data_path, f'model/fine_tune/tsdae-model"
# bi_model_name = "fine_tune/tsdae-model"


bi_model_name = os.path.join(data_path, "fine_tune/simcse-model") # Best
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v1")
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v2")
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v4")
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v5")

# bi_model_name = os.path.join(data_path, "fine_tune/tsdae-model-v1")


# bi_model_name = "fine_tine/ct-model"
# bi_model_name = "fine_tine/simcse-model-paraphrase-multilingual-mpnet-base-v2"

####################################### Our Best Models #######################################
# bi_model_name = "aubmindlab/bert-base-arabert"
# bi_model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
#
####################################### TODO: try other models
# bi_model_name = "xlm-roberta-base"
# bi_model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
# bi_model_name = "sentence-transformers/msmarco-distilbert-base-v4"

# bi_model_name = "aubmindlab/bert-base-arabertv2"
# bi_model_name = "aubmindlab/araelectra-base-discriminator"
# bi_model_name = "aubmindlab/bert-base-arabertv01"
# bi_model_name = "aubmindlab/bert-base-arabertv2"
# bi_model_name = "aubmindlab/bert-base-arabertv02"

# i guess this is not a good model for this task but we should try
#bi_model_name = "gfdgdfgdg/arap_qa_bert" # run 5 : pAP@10 = 0.184
#bi_model_name = "gfdgdfgdg/arap_qa_bert_large_v2" # run6 : pAP@10 = 0.372
#bi_model_name = "gfdgdfgdg/arap_qa_bert_v2" # run7 : pAP@10 = 0.344
#bi_model_name = "zohaib99k/Bert_Arabic-SQuADv2-QA" # run8 : pAP@10 = 0.435
#bi_model_name = "arabi-elidrisi/ArabicDistilBERT_QA" #run 9 : pAP@10 = 0.343
#bi_model_name = "MMars/Question_Answering_AraBERT_xtreme_ar" #run 10 : pAP@10 = 0.337
# bi_model_name = "abdalrahmanshahrour/ArabicQA" # run 11 : pAP@10 = 0.304
# bi_model_name = "abdalrahmanshahrour/xtremeQA-ar" # run 12 : pAP@10 = 0.120


####################################### Checked models #######################################
# bi_model_name = "aubmindlab/bert-large-arabertv02"

# bi_model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1" ## train : 0.48, dev : 0.12
# bi_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# bi_model_name = "wissamantoun/araelectra-base-artydiqa" # run 1 : train:0.51 but dev near 0
# bi_model_name = "salti/AraElectra-base-finetuned-ARCD" # run 2 : pAP@10 = 0.397
# bi_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA" # run 3 : pAP@10 = 0.435
# bi_model_name = "timpal0l/mdeberta-v3-base-squad2" # run 4 : pAP@10 = 0.367

model_save_path = os.path.join(data_path, f'model/training_simcse-{bi_model_name}-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

In [44]:
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
#TODO : change max_seq_length to 384 or 512
max_seq_length = 128
word_embedding_model = models.Transformer(bi_model_name, max_seq_length=128)
print("word_embedding_model Max Sequence Length:", word_embedding_model.max_seq_length)
print("word_embedding_model dimension", word_embedding_model.get_word_embedding_dimension())

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
print("pooling_model sentence embedding dimension", pooling_model.get_sentence_embedding_dimension())

#TODO : change out_features to 512
dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=512, activation_function=nn.Tanh())

# bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

word_embedding_model Max Sequence Length: 128
word_embedding_model dimension 768
pooling_model sentence embedding dimension 768
2023-08-17 04:18:50 - Use pytorch device: cuda


In [47]:
# train_passage_dataloader = DataLoader(train_samples_passage, shuffle=True, batch_size=train_batch_size)
# train_qp_dataloader = DataLoader(train_samples_qp, shuffle=True, batch_size=train_batch_size)
train_qp_contrastive_dataloader = DataLoader(train_samples_qp_contrastive, shuffle=True, batch_size=train_batch_size)
train_qp_triple_dataloader = DataLoader(train_samples_qp_triple, shuffle=True, batch_size=train_batch_size)
# train_qp_multiple_negative_ranking_dataloader = DataLoader(train_samples_qp_multiple_negative_ranking, shuffle=True, batch_size=train_batch_size)
train_qp_mr_tydi_triple = DataLoader(train_samples_mr_tydi_triple, shuffle=True, batch_size=train_batch_size)
train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, shuffle=True, batch_size=train_batch_size)

train_biencoder_loss_MultipleNegativesRanking = losses.MultipleNegativesRankingLoss(bi_encoder)
# train_biencoder_loss_Contrastive = losses.ContrastiveLoss(bi_encoder)
train_biencoder_loss_Triple = losses.TripletLoss(model=bi_encoder)
train_biencoder_loss_OnlineContrastive = losses.OnlineContrastiveLoss(bi_encoder)
# train_biencoder_loss_CosineSimilarity = losses.CosineSimilarityLoss(bi_encoder)

In [48]:
#TODO : https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/make_multilingual.py
#teacher_model_name = 'fine_tine/simcse-model'   #Our monolingual teacher model, we want to convert to multiple languages
#student_model_name = 'xlm-roberta-base'       #Multilingual base model we use to imitate the teacher model

In [49]:
torch.cuda.empty_cache()
#  asymmetric semantic search
warmup_steps = math.ceil(len(train_qp_triple_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# logging.info("Performance before training")
# dev_evaluator(model)
#RerankingEvaluator
#InformationRetrievalEvaluator
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/quora_duplicate_questions/training_OnlineContrastiveLoss.py


#Task B data -> good results

# bi_encoder.fit(
#           train_objectives=[
#                         (train_samples_qp_task_B_dataloader, train_biencoder_loss_MultipleNegativesRanking)],  epochs=3)

dev_evaluator = evaluation.InformationRetrievalEvaluator(
                                df_query_dev.groupby('qid')['query'].apply(str).to_dict(),
                                df_passage.groupby('pid')['passage'].apply(str).to_dict(),
                                df_qppair_dev.groupby('qid')['docid'].apply(set).to_dict(),
                                accuracy_at_k = [10],
                                precision_recall_at_k  = [10],
                                map_at_k = [10], mrr_at_k=[10]
                                #, score_functions='cos_sim'
                                )

# multi-task training
print("train_batch_size", train_batch_size)
bi_encoder.fit(
          train_objectives=[
                        # (train_passage_dataloader, train_biencoder_loss_MultipleNegativesRanking), #SimCSE
                        (train_qp_contrastive_dataloader, train_biencoder_loss_OnlineContrastive),
                        # (train_qp_multiple_negative_ranking_dataloader, train_biencoder_loss_MultipleNegativesRanking),

                        (train_qp_mr_tydi_triple, train_biencoder_loss_MultipleNegativesRanking),
                        # (train_samples_qp_task_B_dataloader, train_biencoder_loss_MultipleNegativesRanking),
                        # (train_qp_triple_dataloader, train_biencoder_loss_Triple)
              ],
              evaluator=dev_evaluator,
              epochs=num_epochs,
              evaluation_steps=301,
              warmup_steps=warmup_steps,
              # output_path="D:/IR/Quran_QA/Task A/data/fine_tune_simcse-model_MultiTaskTraining_3epoch_64batchsize_contraintive_multiple_negative_ranking"
              )

2023-08-17 04:19:10 - Warmup-steps: 278
train_batch_size 64


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

2023-08-17 04:22:35 - Information Retrieval Evaluation on  dataset in epoch 0 after 301 steps:
2023-08-17 04:22:36 - Queries: 25
2023-08-17 04:22:36 - Corpus: 1266

2023-08-17 04:22:36 - Score-Function: cos_sim
2023-08-17 04:22:36 - Accuracy@10: 8.00%
2023-08-17 04:22:36 - Precision@10: 1.20%
2023-08-17 04:22:36 - Recall@10: 1.64%
2023-08-17 04:22:36 - MRR@10: 0.0140
2023-08-17 04:22:36 - NDCG@10: 0.0108
2023-08-17 04:22:36 - MAP@10: 0.0031
2023-08-17 04:22:36 - Score-Function: dot_score
2023-08-17 04:22:36 - Accuracy@10: 0.00%
2023-08-17 04:22:36 - Precision@10: 0.00%
2023-08-17 04:22:36 - Recall@10: 0.00%
2023-08-17 04:22:36 - MRR@10: 0.0000
2023-08-17 04:22:36 - NDCG@10: 0.0000
2023-08-17 04:22:36 - MAP@10: 0.0000
2023-08-17 04:26:00 - Information Retrieval Evaluation on  dataset in epoch 0 after 602 steps:
2023-08-17 04:26:01 - Queries: 25
2023-08-17 04:26:01 - Corpus: 1266

2023-08-17 04:26:01 - Score-Function: cos_sim
2023-08-17 04:26:01 - Accuracy@10: 20.00%
2023-08-17 04:26:01 

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

2023-08-17 04:38:11 - Information Retrieval Evaluation on  dataset in epoch 1 after 301 steps:
2023-08-17 04:38:11 - Queries: 25
2023-08-17 04:38:11 - Corpus: 1266

2023-08-17 04:38:11 - Score-Function: cos_sim
2023-08-17 04:38:11 - Accuracy@10: 12.00%
2023-08-17 04:38:11 - Precision@10: 2.00%
2023-08-17 04:38:11 - Recall@10: 2.82%
2023-08-17 04:38:11 - MRR@10: 0.0700
2023-08-17 04:38:11 - NDCG@10: 0.0315
2023-08-17 04:38:11 - MAP@10: 0.0151
2023-08-17 04:38:11 - Score-Function: dot_score
2023-08-17 04:38:11 - Accuracy@10: 12.00%
2023-08-17 04:38:11 - Precision@10: 1.20%
2023-08-17 04:38:11 - Recall@10: 0.88%
2023-08-17 04:38:11 - MRR@10: 0.0297
2023-08-17 04:38:11 - NDCG@10: 0.0117
2023-08-17 04:38:11 - MAP@10: 0.0031
2023-08-17 04:41:36 - Information Retrieval Evaluation on  dataset in epoch 1 after 602 steps:
2023-08-17 04:41:36 - Queries: 25
2023-08-17 04:41:36 - Corpus: 1266

2023-08-17 04:41:36 - Score-Function: cos_sim
2023-08-17 04:41:36 - Accuracy@10: 12.00%
2023-08-17 04:41:3

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

2023-08-17 04:53:45 - Information Retrieval Evaluation on  dataset in epoch 2 after 301 steps:
2023-08-17 04:53:46 - Queries: 25
2023-08-17 04:53:46 - Corpus: 1266

2023-08-17 04:53:46 - Score-Function: cos_sim
2023-08-17 04:53:46 - Accuracy@10: 8.00%
2023-08-17 04:53:46 - Precision@10: 1.20%
2023-08-17 04:53:46 - Recall@10: 1.35%
2023-08-17 04:53:46 - MRR@10: 0.0267
2023-08-17 04:53:46 - NDCG@10: 0.0134
2023-08-17 04:53:46 - MAP@10: 0.0048
2023-08-17 04:53:46 - Score-Function: dot_score
2023-08-17 04:53:46 - Accuracy@10: 20.00%
2023-08-17 04:53:46 - Precision@10: 2.00%
2023-08-17 04:53:46 - Recall@10: 3.55%
2023-08-17 04:53:46 - MRR@10: 0.0407
2023-08-17 04:53:46 - NDCG@10: 0.0247
2023-08-17 04:53:46 - MAP@10: 0.0073
2023-08-17 04:57:10 - Information Retrieval Evaluation on  dataset in epoch 2 after 602 steps:
2023-08-17 04:57:11 - Queries: 25
2023-08-17 04:57:11 - Corpus: 1266

2023-08-17 04:57:11 - Score-Function: cos_sim
2023-08-17 04:57:11 - Accuracy@10: 8.00%
2023-08-17 04:57:11 

In [50]:
passage_embeddings = bi_encoder.encode(df_passage['passage'].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_train_embeddings = bi_encoder.encode(df_query_train['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_dev_embeddings = bi_encoder.encode(df_query_dev['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_tset_embeddings = bi_encoder.encode(df_query_test['query'].tolist(), convert_to_tensor=True, show_progress_bar=True)

df_passage['embedding'] = passage_embeddings.cpu().numpy().tolist()
df_query_train['embedding'] = query_train_embeddings.cpu().numpy().tolist()
df_query_dev['embedding'] = query_dev_embeddings.cpu().numpy().tolist()
df_query_test['embedding'] = query_tset_embeddings.cpu().numpy().tolist()

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

#### Semantic Search

In [51]:
tag = "SimCSE_bitrain40"
hits = util.semantic_search(query_train_embeddings, passage_embeddings, top_k=10)
df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query_train)

print(bi_model_name, "all passage embeddings", "\nnum_epochs =", num_epochs,)
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_bitrain40.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_train.gold"
# df_run

../data\fine_tune/simcse-model all passage embeddings 
num_epochs = 3
Format check: Passed
 map_cut_10  recip_rank
   0.524258    0.657998


In [52]:
tag = "SimCSE_bidev40"
top_k = 10
hits = util.semantic_search(query_dev_embeddings, passage_embeddings, top_k=top_k)
# hits = util.cos_sim(query_dev_embeddings, passage_embeddings)
df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query_dev, top_k=top_k)

print(bi_model_name, "all passage embeddings")
! python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_bidev40.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

../data\fine_tune/simcse-model all passage embeddings
Format check: Passed
 map_cut_10  recip_rank
   0.108253    0.169333


### Test data

In [29]:
tag = "SimCSE_bitest40"
hits = util.semantic_search(query_tset_embeddings, passage_embeddings, top_k=top_k)
# hits = util.cos_sim(query_dev_embeddings, passage_embeddings)
df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query_test, top_k=top_k)


In [30]:
tag = "SimCSE_bmbiencd"
bm_num_results = 15
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=bm_num_results)

bm25_biencoder_hit= []

for query in df_query_dev['query'].tolist():
    bm25_result = BM25_model.search(query)
    bm25_related_passage = bm25_result['docno'].tolist()
    passage =  df_passage[df_passage['pid'].isin(bm25_related_passage)]['passage'].tolist()

    try:
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        # TODO
        '''
        check similarity between query and passage, with methods is better for this task?
        util.dot_score
        util.cos_sim
        util.pairwise_dot_score
        util.pairwise_cos_sim
        '''
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=10)[0]
        mapping = {index : row['docno'] for index, row in bm25_result.iterrows()}

    except:
        #len passage is 0 but why ?
        print(f"len passage : {len(passage)}, qury : {query}")
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage = df_passage['passage'].tolist()
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)

        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]
        mapping = {index : row['pid'] for index, row in df_passage.iterrows()}

    for i in range(len(hit)):
        hit[i]['corpus_id'] = mapping[hit[i]['corpus_id']]
    hit = sorted(hit, key=lambda x: x['score'], reverse=True)
    bm25_biencoder_hit.append(hit)

df_run = save_query_passage_retrieval(bm25_biencoder_hit, tag, run_save=True, df_query=df_query_dev)

print(bi_model_name, "BM25 hits with BiEncoder")
!python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_bmbiencd.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"
# df_run

<class 'pandas.core.frame.DataFrame'>
../data\fine_tune/simcse-model BM25 hits with BiEncoder
Format check: Passed
 map_cut_10  recip_rank
   0.088159    0.212381


## Re-ranker (Cross-Encoder)

In [31]:
#TODO:
'''
https://github.com/Guzpenha/transformer_rankers
https://colab.research.google.com/drive/1wGmaO3emC7Sg-tA7nGehIQ2vjOLN9S5e?usp=sharing#scrollTo=y9ps5zmOHxe4
'''

# cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# cross_model_name = "cross-encoder/ms-marco-TinyBERT-L-2"
# cross_model_name = "distilroberta-base"
# cross_model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
# cross_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-CLS"

'\nhttps://github.com/Guzpenha/transformer_rankers\nhttps://colab.research.google.com/drive/1wGmaO3emC7Sg-tA7nGehIQ2vjOLN9S5e?usp=sharing#scrollTo=y9ps5zmOHxe4\n'

In [32]:
# torch.cuda.empty_cache()
#https://sbert.net/docs/package_reference/cross_encoder.html

# cross_encoder = CrossEncoder(cross_model_name)
# warmup_steps = math.ceil(len(train_qp_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
# logging.info("Warmup-steps: {}".format(warmup_steps))
#
# cross_encoder.fit(train_qp_dataloader,
#           # evaluator=dev_evaluator,
#           epochs=num_epochs,
#           evaluation_steps=100,
#           warmup_steps=warmup_steps,
#           output_path=model_save_path
#           )

In [33]:
# tag = "SimCSE_cross"
# df_final = pd.DataFrame()
# for qid, filter_passages in df_run.groupby('qid'):
#     cross_inp = []
#     q = df_query_dev[df_query_dev['qid'] == qid]
#     for x in filter_passages.values:
#         p = df_passage[df_passage['pid'] == x[2]]
#         cross_inp.append([q['query'].tolist()[0], p['passage'].tolist()[0]])
#
#     similarity_scores = cross_encoder.predict(cross_inp, show_progress_bar=False)
#
#     # Sort the scores in decreasing order
#     sim_scores_argsort = reversed(np.argsort(similarity_scores))
#
#     print(len(similarity_scores))
#     print(len(filter_passages['pid'].values))
#     df_temp = pd.DataFrame({'qid': qid, 'pid': filter_passages['pid'].values, 'score': similarity_scores})
#     df_temp = df_temp.sort_values(by=['score'], ascending=False)[:10]
#     df_temp['rank'] = range(1, len(df_temp) + 1)
#
#     df_final = pd.concat([df_final, df_temp], ignore_index=True)
#
# df_final = save_query_passage_retrieval(df_final, tag, run_save=True)
#
# print("cross-encoder model name", cross_model_name)
# ! python QQA23_TaskA_eval.py \
#     -r "../data/runs/SimCSE_cross.tsv" \
#     -q "../data/qrels/QQA23_TaskA_qrels_train.gold"
# df_final

# Evaluation

In [34]:
# ! python QQA23_TaskA_eval.py \
#     -r "../data/runs/BM25_Final.tsv" \
#     -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

In [35]:
# ! python QQA23_TaskA_eval.py \
#     -r "../data/runs/bigIR_BM25.tsv" \
#     -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

In [36]:
# ! python QQA23_TaskA_eval.py \
#     -r "../data/runs/dev_perfect.tsv" \
#     -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"