# Task A: Arabic Passage Retrieval

# Import Libraries

In [1]:

import math
import os
import re

import arabicstopwords.arabicstopwords as ar_stp
import nltk
import numpy as np
import pandas as pd
import pyterrier as pt
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util, InputExample
from sentence_transformers import evaluation
from sentence_transformers import models, losses, datasets
from snowballstemmer import stemmer
from torch import nn
# from simcse import SimSCE
from torch.utils.data import DataLoader

nltk.download('punkt')

print(torch.__version__)
print(torch.cuda.is_available())
# print(torch.cuda.device_count())
# print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

2.0.1+cu118
True
NVIDIA GeForce RTX 3090


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yegmo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Get Data

In [2]:
data_path = "../data"
index_path = os.path.join(data_path, "QPC_Index/data.properties")

query_train_path = os.path.join(data_path, "QQA23_TaskA_train.tsv")
query_dev_path = os.path.join(data_path, "QQA23_TaskA_dev.tsv")
query_test_path = os.path.join(data_path, "QQA23_TaskA_test.tsv")

passage_path = os.path.join(data_path, "Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv")

qp_pair_train_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_train.gold")
qp_pair_dev_path = os.path.join(data_path, "qrels\QQA23_TaskA_qrels_dev.gold")

task_B_train_path = os.path.join(data_path, "Task B data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl")

## Read file

In [3]:
def read_file(input_file, sep="\t", names=""):
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    else:
        if names != "":
            df = pd.read_csv(input_file, sep=sep, names=names, encoding="utf-8")
        else:
            df = pd.read_csv(input_file, sep=sep, encoding="utf-8")
    return df


qrels_columns = ["qid", "Q0", "docid", "relevance"]


def read_qrels_file(qrels_file_path):
    # split_token = '\t' if format_checker.is_tab_sparated(qrels_file) else  "\s+"
    df_qrels = pd.read_csv(qrels_file_path, sep='\t', names=qrels_columns)
    df_qrels["qid"] = df_qrels["qid"].astype(str)
    df_qrels["docid"] = df_qrels["docid"].astype(str)
    return df_qrels

## Load Index

In [4]:
def load_index(index_path):
    if not pt.started():
        pt.init(helper_version="0.0.6")

    try:
        index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
        return index
    except Exception as e:
        print('Cannot load the index, check exception details {}'.format(e))
        return []

In [5]:
# load index for BM25 retrieval
index = load_index(index_path=index_path)

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.6



Index was loaded successfully from this path:  ../data\QPC_Index/data.properties


## Cleaning & Preprocessing
Clean text from urls, handles, special characters, tabs, line jumps, extra white space, and puntuations.
Preprocess the arabic input text by performing normalization, stemming, and removing stop words.

In [6]:
def clean(text):
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text)  # remove special characters
    text = re.sub(r"\t", " ", text)  # remove tabs
    text = re.sub(r"\n", " ", text)  # remove line jump
    text = re.sub(r"\s+", " ", text)  # remove extra white space
    text = re.sub(r'[^\w\s]', '', text)  # Removing punctuations in string using regex
    text = text.strip()
    return text

In [7]:
def ar_remove_stop_words(sentence):
    terms = []
    stopWords = set(ar_stp.stopwords_list())
    for term in sentence.split():
        if term not in stopWords:
            terms.append(term)
    return " ".join(terms)


def normalize_arabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return (text)


ar_stemmer = stemmer("arabic")


def ar_stem(sentence):
    return " ".join([ar_stemmer.stemWord(i) for i in sentence.split()])


def preprocess_arabic(text):
    text = normalize_arabic(text)
    text = ar_remove_stop_words(text)
    text = ar_stem(text)
    return text


def prepare_data(path, column, id_type, id_column='docno'):
    df = read_file(path, names=['docno', 'text'])

    df[column] = df['text'].apply(clean)
    df[column] = df[column].apply(preprocess_arabic)

    df[id_type] = df[id_column].astype(str)  # convert the id column to string
    df = df[[id_type, 'text', column]]  # keep the columns needed for search

    return df

In [8]:
df_passage = prepare_data(passage_path, 'passage', 'pid')

df_query_train = prepare_data(query_train_path, 'query', 'qid')
df_query_dev = prepare_data(query_dev_path, 'query', 'qid')
df_query_test = prepare_data(query_test_path, 'query', 'qid')

df_qppair_train = read_qrels_file(qp_pair_train_path)
df_qppair_dev = read_qrels_file(qp_pair_dev_path)

# Model - Sentence Embedding

## Convert Data to SentenceTransformer InputFormat

In [9]:
# contrastive: query-passage double pair with relevance label = 1 (positive) and not found BM25 top-k passages with relevance label = 0 (negative)
def get_qp_contrastive_data():
    top_k = 1000
    print("top_k =", top_k)
    train_samples_qp_contrastive = []

    BM25_model = pt.BatchRetrieve(index, controls={"wmodel": "BM25"}, num_results=top_k)

    for _, row in df_qppair_train.groupby('qid'):
        query_id = row['qid'].tolist()[0]
        query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
        bm25_related_passage = BM25_model.search(query)['docno'].tolist()
        positive_passage = row['docid'].tolist()
        negative_passage = list(set(bm25_related_passage) - set(positive_passage))

        for pos_passage in positive_passage:
            if pos_passage == '-1':
                continue
            else:
                passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
                label = 1
                #positive sample
                train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
                train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

        for neg_passage in negative_passage:
            if neg_passage == '-1':
                continue
            else:
                passage = df_passage[df_passage['pid'] == neg_passage]['passage'].tolist()[0]
                label = 0
                #negative sample
                train_samples_qp_contrastive.append(InputExample(texts=[query, passage], label=label))
                train_samples_qp_contrastive.append(InputExample(texts=[passage, query], label=label))

    print("len(train_samples_qp_contrastive) =", len(train_samples_qp_contrastive))
    return train_samples_qp_contrastive

In [10]:
# multiple negative ranking: query-passage double pair with relevance label = 1 (positive)
def get_qp_multi_neg_data():
    train_samples_qp_multiple_negative_ranking = []

    for _, row in df_qppair_train.groupby('qid'):
        query_id = row['qid'].tolist()[0]
        query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
        positive_passage = row['docid'].tolist()
        for pos_passage in positive_passage:
            if pos_passage == '-1':
                continue
            else:
                passage = df_passage[df_passage['pid'] == pos_passage]['passage'].tolist()[0]
                label = 1
                #positive sample
                train_samples_qp_multiple_negative_ranking.append(InputExample(texts=[query, passage], label=label))
                train_samples_qp_multiple_negative_ranking.append(InputExample(texts=[passage, query], label=label))
    print("len(train_samples_qp_multiple_negative_ranking) =", len(train_samples_qp_multiple_negative_ranking))
    return train_samples_qp_multiple_negative_ranking

In [11]:
# triple: query-positive passage-negative passage; negative passage is not found in BM25 top-k passages
def get_qp_triple_data():
    top_k = 100
    print("top_k =", top_k)
    train_samples_qp_triple = []

    BM25_model = pt.BatchRetrieve(index, controls={"wmodel": "BM25"}, num_results=top_k)

    for _, row in df_qppair_train.groupby('qid'):
        query_id = row['qid'].tolist()[0]
        query = df_query_train[df_query_train['qid'] == query_id]['query'].tolist()[0]
        bm25_related_passage = BM25_model.search(query)['docno'].tolist()
        positive_passage = row['docid'].tolist()
        negative_passage = list(set(bm25_related_passage) - set(positive_passage))

        for pos_passage_id in positive_passage:
            for neg_passage_id in negative_passage:
                if pos_passage_id == '-1':
                    continue
                else:
                    pos_passage = df_passage[df_passage['pid'] == pos_passage_id]['passage'].tolist()[0]
                    neg_passage = df_passage[df_passage['pid'] == neg_passage_id]['passage'].tolist()[0]
                    train_samples_qp_triple.append(InputExample(texts=[query, pos_passage, neg_passage]))
    print("len train_samples_qp_triple =", len(train_samples_qp_triple))
    return train_samples_qp_triple

### Additional Data

#### Mr. TyDi

In [12]:
#https://huggingface.co/datasets/castorini/mr-tydi
def load_mr_tydi_data():
    mr_tydi_dataset = load_dataset("castorini/mr-tydi", "arabic", split="train").to_pandas()

    train_samples_mr_tydi_triple = []
    for _, row in mr_tydi_dataset.iterrows():
        query = row['query']
        positive_passages = row['positive_passages']
        negative_passages = row['negative_passages']

        for pos_passage in positive_passages:
            for neg_passage in negative_passages:
                train_samples_mr_tydi_triple.append(InputExample(texts=[query, pos_passage, neg_passage]))

    print("len(train_samples_mr_tydi_triple) =", len(train_samples_mr_tydi_triple))
    return train_samples_mr_tydi_triple

#### Task B Data

In [13]:
def load_task_b_data(task_B_train_path):
    import json

    train_passage_question_objects_task_B = []
    with open(task_B_train_path, 'r', encoding='utf-8') as f:
        for line in f:
            train_passage_question_objects_task_B.append(json.loads(line.rstrip('\n|\r')))
        print('Loaded {} records from {}'.format(len(train_passage_question_objects_task_B), task_B_train_path))

    train_samples_qp_task_B = []
    for passage_question_object in train_passage_question_objects_task_B:
        query = passage_question_object["question"]
        passage = passage_question_object["passage"]
        if len(passage_question_object["answers"]) > 0:
            train_samples_qp_task_B.append(InputExample(texts=[query, passage], label=1))
        else:
            train_samples_qp_task_B.append(InputExample(texts=[query, passage], label=0))

    train_passage_question_objects_task_B = pd.DataFrame(train_passage_question_objects_task_B)

    print("len(train_samples_qp_task_B) =", len(train_samples_qp_task_B))
    return train_samples_qp_task_B

## Pre-Training

### TSDAE

In [14]:
def run_tsdae(model_name, train_samples_qp_task_B, batch_size, num_epochs, save_name):
    word_embedding_model = models.Transformer(model_name)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 'cls')
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Define a list with sentences (1k - 100k sentences)
    train_sentences = df_passage['passage'].tolist()
    # Create the special denoising dataset that adds noise on-the-fly
    train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, batch_size=batch_size, shuffle=True)

    # Use the denoising auto-encoder loss
    train_loss = losses.DenoisingAutoEncoderLoss(model, decoder_name_or_path=model_name, tie_encoder_decoder=True)
    constraint_loss = losses.ContrastiveLoss(model)

    model.fit(
        train_objectives=[(train_dataloader, train_loss), (train_samples_qp_task_B_dataloader, constraint_loss)],
        epochs=num_epochs,
        weight_decay=0,
        scheduler='constantlr',
        optimizer_params={'lr': 3e-5},
        show_progress_bar=True
    )

    model_name = os.path.join(data_path, f'fine_tune/tsdae-model-{save_name}')
    model.save(model_name)

    return model_name

### SimCSE

In [29]:
def run_simcse(model_name, batch_size, num_epochs, save_name, train_samples_qp_task_B=None):
    # Define your sentence transformer model using CLS pooling
    word_embedding_model = models.Transformer(model_name, max_seq_length=164)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    train_objectives = []

    # Define a list with sentences (1k - 100k sentences)
    train_sentences = df_passage['passage'].tolist()
    # Convert train sentences to sentence pairs
    train_data = [InputExample(texts=[s, s]) for s in train_sentences]
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    train_loss = losses.MultipleNegativesRankingLoss(model)
    train_objectives.append((train_dataloader, train_loss))

    if train_samples_qp_task_B:
        train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, batch_size=batch_size, shuffle=True)
        constraint_loss = losses.ContrastiveLoss(model)
        train_objectives.append((train_samples_qp_task_B_dataloader, constraint_loss))

    model.fit(
        train_objectives=train_objectives,
        epochs=num_epochs,
        show_progress_bar=True)

    model_name = os.path.join(data_path, f'fine_tune/simcse-model-{save_name}')
    model.save(model_name)

    return model_name

# Bi-Encoder

### Fine-Tuning

In [16]:
def build_biencoder(sentence_embedder, max_seq_len):
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(sentence_embedder, max_seq_length=max_seq_len)
    print("word_embedding_model Max Sequence Length:", word_embedding_model.max_seq_length)
    print("word_embedding_model dimension", word_embedding_model.get_word_embedding_dimension())

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    print("pooling_model sentence embedding dimension", pooling_model.get_sentence_embedding_dimension())

    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=512,
                               activation_function=nn.Tanh())
    bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

    return bi_encoder

In [17]:
def train_biencoder(bi_encoder, train_objectives, batch_size, num_epochs, warmup_steps):
    torch.cuda.empty_cache()

    dev_evaluator = evaluation.InformationRetrievalEvaluator(
        df_query_dev.groupby('qid')['query'].apply(str).to_dict(),
        df_passage.groupby('pid')['passage'].apply(str).to_dict(),
        df_qppair_dev.groupby('qid')['docid'].apply(set).to_dict(),
        accuracy_at_k=[10],
        precision_recall_at_k=[10],
        map_at_k=[10], mrr_at_k=[10]
    )

    # multi-task training
    print("train_batch_size", batch_size)
    bi_encoder.fit(
        train_objectives=train_objectives,
        evaluator=dev_evaluator,
        epochs=num_epochs,
        evaluation_steps=301,
        warmup_steps=warmup_steps,
        # output_path="D:/IR/Quran_QA/Task A/data/fine_tune_simcse-model_MultiTaskTraining_3epoch_64batchsize_contraintive_multiple_negative_ranking"
    )

    return bi_encoder

### Encode

In [18]:
def encode_by_biencoder(encoder):
    passage_embeddings = encoder.encode(df_passage['passage'].tolist(), convert_to_tensor=True,
                                        show_progress_bar=True)
    query_train_embeddings = encoder.encode(df_query_train['query'].tolist(), convert_to_tensor=True,
                                            show_progress_bar=True)
    query_dev_embeddings = encoder.encode(df_query_dev['query'].tolist(), convert_to_tensor=True,
                                          show_progress_bar=True)
    query_tset_embeddings = encoder.encode(df_query_test['query'].tolist(), convert_to_tensor=True,
                                           show_progress_bar=True)

    # won't be returned therefore ignored
    # df_passage['embedding'] = passage_embeddings.cpu().numpy().tolist()
    # df_query_train['embedding'] = query_train_embeddings.cpu().numpy().tolist()
    # df_query_dev['embedding'] = query_dev_embeddings.cpu().numpy().tolist()
    # df_query_test['embedding'] = query_tset_embeddings.cpu().numpy().tolist()

    return passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings

### Evaluate - Semantic Search

In [19]:
def evaluate_biencoder(query_embeddings, passage_embeddings, df_query, tag):
    hits = util.semantic_search(query_embeddings, passage_embeddings, top_k=10)
    df_run = save_query_passage_retrieval(hits, tag, run_save=True, df_query=df_query, top_k=10)
    return df_run

In [20]:
def save_query_passage_retrieval(result, tag, run_save=False, df_query=df_query_train, top_k=10):
    if "bienc_" in tag:
        np_result = np.array(result).flatten()
        result = pd.DataFrame()

        result["qid"] = df_query["qid"].tolist() * top_k
        result = result.sort_values(by=['qid']).reset_index(drop=True)
        result["Q0"] = ["Q0"] * len(result)
        result["pid"] = [df_passage.iloc[x['corpus_id']]['pid'] for x in np_result]
        result["rank"] = list(range(1, top_k + 1)) * len(df_query)
        result["score"] = [x['score'] for x in np_result]
        result["tag"] = [tag] * len(np_result)

    elif tag == "SimCSE_bmbiencd":
        df_result = pd.DataFrame()
        for i in range(len(bm25_biencoder_hit)):
            for j in range(len(bm25_biencoder_hit[i])):
                new_record = pd.DataFrame([{"qid": df_query_dev['qid'].tolist()[i],
                                            "Q0": "Q0",
                                            "pid": bm25_biencoder_hit[i][j]['corpus_id'],
                                            "rank": j,
                                            "score": bm25_biencoder_hit[i][j]['score'],
                                            "tag": tag
                                            }])
                df_result = pd.concat([df_result, new_record], ignore_index=True)
        result = df_result
        print(type(result))

    elif tag == "BM25":
        result["Q0"] = ["Q0"] * len(result)
        result["tag"] = [tag] * len(result)
        result['qid'] = result["qid"]
        result['pid'] = result["docno"]
        tag = "BM25_Final"
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    elif tag == "biencoder_cross":
        result['tag'] = tag
        result['Q0'] = 'Q0'
        result = result[["qid", "Q0", "pid", "rank", "score", "tag"]]

    if run_save:
        run_save_path = os.path.join(data_path, f"runs/{tag}.tsv")
        # print(run_save_path)
        result.to_csv(run_save_path, sep="\t", index=False, header=False)

    return result

#  GYM Runs

In [22]:
def gym_run1():
    model_name = 'aubmindlab/bert-base-arabert'
    train_samples_qp_task_B = load_task_b_data(task_B_train_path)
    sentence_embedder_name = run_tsdae(model_name, train_samples_qp_task_B, batch_size=8, num_epochs=3,
                                       save_name="run1")

    max_seq_len = 128
    bi_encoder = build_biencoder(sentence_embedder_name, max_seq_len)

    train_batch_size = 64
    train_samples_qp_contrastive = get_qp_contrastive_data()
    train_samples_mr_tydi_triple = load_mr_tydi_data()
    train_qp_contrastive_dataloader = DataLoader(train_samples_qp_contrastive, shuffle=True,
                                                 batch_size=train_batch_size)
    train_qp_mr_tydi_triple = DataLoader(train_samples_mr_tydi_triple, shuffle=True, batch_size=train_batch_size)

    # train_biencoder_loss_OnlineContrastive = losses.OnlineContrastiveLoss(bi_encoder)
    train_biencoder_loss_Contrastive = losses.ContrastiveLoss(bi_encoder)
    train_biencoder_loss_MultipleNegativesRanking = losses.MultipleNegativesRankingLoss(bi_encoder)

    num_epochs = 3
    warmup_steps = math.ceil(len(train_qp_contrastive_dataloader) * num_epochs * 0.1)  #10% of train data for warm-up

    train_objectives = [
        # (train_qp_contrastive_dataloader, train_biencoder_loss_OnlineContrastive),
        (train_qp_contrastive_dataloader, train_biencoder_loss_Contrastive),
        (train_qp_mr_tydi_triple, train_biencoder_loss_MultipleNegativesRanking),
    ]
    bi_encoder = train_biencoder(bi_encoder, train_objectives, train_batch_size, num_epochs, warmup_steps)
    # model_save_path = os.path.join(data_path, f'model/biencoder-tsdae-run1')

    passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings = encode_by_biencoder(
        bi_encoder)

    df_run_train = evaluate_biencoder(query_train_embeddings, passage_embeddings, df_query_train, tag="bienc_train1")
    ! python QQA23_TaskA_eval.py \
            -r "../data/runs/bienc_train1.tsv" \
            -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

    df_run_dev = evaluate_biencoder(query_dev_embeddings, passage_embeddings, df_query_dev, tag="bienc_dev1")
    ! python QQA23_TaskA_eval.py \
            -r "../data/runs/bienc_dev1.tsv" \
            -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

    hits = util.semantic_search(query_tset_embeddings, passage_embeddings, top_k=10)
    df_run = save_query_passage_retrieval(hits, "bienc_test1", run_save=True, df_query=df_query_test, top_k=10)

    return bi_encoder


bi_encoder_run1 = gym_run1()

Loaded 992 records from ../data\Task B data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl
len(train_samples_qp_task_B) = 992


When tie_encoder_decoder=True, the decoder_name_or_path will be invalid.
Some weights of BertLMHeadModel were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['bert.encoder.layer.4.crossattention.self.key.weight', 'bert.encoder.layer.11.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.10.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.3.crossattention.self.query.bias', 'bert.encoder.layer.7.crossattention.self.key.weight', 'bert.encoder.layer.7.crossattention.self.value.weight', 'bert.encoder.layer.9.crossattention.self.key.bias', 'bert.encoder.layer.6.crossattention.self.value.bias', 'bert.encoder.layer.5.crossattention.output.dense.weight', 'bert.encoder.layer.11.crossattention.self.value.weight', 'bert.encoder.layer.4.crossattention.self.key.bias', 'bert.encoder.layer.4.crossattention.self.query.weight', 'bert.encoder.layer.8.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.6.crossattention.outpu

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

Iteration:   0%|          | 0/124 [00:00<?, ?it/s]

word_embedding_model Max Sequence Length: 128
word_embedding_model dimension 768
pooling_model sentence embedding dimension 768
top_k = 1000
len(train_samples_qp_contrastive) = 87784
len(train_samples_mr_tydi_triple) = 362146
train_batch_size 64


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Format check: Passed
 map_cut_10  recip_rank
   0.679667    0.792241
Format check: Passed
 map_cut_10  recip_rank
    0.14628        0.26


In [30]:
def gym_run0():
    model_name = 'aubmindlab/bert-base-arabert'
    sentence_embedder_name = run_simcse(model_name, batch_size=128, num_epochs=5, save_name="run0")

    max_seq_len = 128
    bi_encoder = build_biencoder(sentence_embedder_name, max_seq_len)

    train_batch_size = 64
    train_samples_qp_contrastive = get_qp_contrastive_data()
    train_samples_mr_tydi_triple = load_mr_tydi_data()
    train_qp_contrastive_dataloader = DataLoader(train_samples_qp_contrastive, shuffle=True,
                                                 batch_size=train_batch_size)
    train_qp_mr_tydi_triple = DataLoader(train_samples_mr_tydi_triple, shuffle=True, batch_size=train_batch_size)

    # train_biencoder_loss_OnlineContrastive = losses.OnlineContrastiveLoss(bi_encoder)
    train_biencoder_loss_Contrastive = losses.ContrastiveLoss(bi_encoder)
    train_biencoder_loss_MultipleNegativesRanking = losses.MultipleNegativesRankingLoss(bi_encoder)

    num_epochs = 3
    warmup_steps = math.ceil(len(train_qp_contrastive_dataloader) * num_epochs * 0.1)  #10% of train data for warm-up

    train_objectives = [
        # (train_qp_contrastive_dataloader, train_biencoder_loss_OnlineContrastive),
        (train_qp_contrastive_dataloader, train_biencoder_loss_Contrastive),
        (train_qp_mr_tydi_triple, train_biencoder_loss_MultipleNegativesRanking),
    ]
    bi_encoder = train_biencoder(bi_encoder, train_objectives, train_batch_size, num_epochs, warmup_steps)
    # model_save_path = os.path.join(data_path, f'model/biencoder-simsce-run0')

    passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings = encode_by_biencoder(
        bi_encoder)

    df_run_train = evaluate_biencoder(query_train_embeddings, passage_embeddings, df_query_train, tag="bienc_train0")
    ! python QQA23_TaskA_eval.py \
            -r "../data/runs/bienc_train0.tsv" \
            -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

    df_run_dev = evaluate_biencoder(query_dev_embeddings, passage_embeddings, df_query_dev, tag="bienc_dev0")
    ! python QQA23_TaskA_eval.py \
            -r "../data/runs/bienc_dev0.tsv" \
            -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

    hits = util.semantic_search(query_tset_embeddings, passage_embeddings, top_k=10)
    df_run = save_query_passage_retrieval(hits, "bienc_test0", run_save=True, df_query=df_query_test, top_k=10)

    return bi_encoder


bi_encoder_run0 = gym_run0()

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

word_embedding_model Max Sequence Length: 128
word_embedding_model dimension 768
pooling_model sentence embedding dimension 768
top_k = 1000
len(train_samples_qp_contrastive) = 87784
len(train_samples_mr_tydi_triple) = 362146
train_batch_size 64


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1372 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Format check: Passed
 map_cut_10  recip_rank
   0.645399    0.756034
Format check: Passed
 map_cut_10  recip_rank
   0.098728    0.206381


In [31]:
def gym_run2():
    model_name = 'aubmindlab/bert-base-arabert'
    train_samples_qp_task_B = load_task_b_data(task_B_train_path)
    sentence_embedder_name = run_simcse(model_name, batch_size=128, num_epochs=5, save_name="run2")

    max_seq_len = 128
    bi_encoder = build_biencoder(sentence_embedder_name, max_seq_len)

    train_batch_size = 64
    train_samples_qp_triple = get_qp_triple_data()
    train_samples_mr_tydi_triple = load_mr_tydi_data()
    train_qp_triple_dataloader = DataLoader(train_samples_qp_triple, shuffle=True, batch_size=train_batch_size)
    train_qp_mr_tydi_triple = DataLoader(train_samples_mr_tydi_triple, shuffle=True, batch_size=train_batch_size)

    train_biencoder_loss_Triple = losses.TripletLoss(bi_encoder)
    train_biencoder_loss_MultipleNegativesRanking = losses.MultipleNegativesRankingLoss(bi_encoder)

    num_epochs = 3
    warmup_steps = math.ceil(len(train_qp_triple_dataloader) * num_epochs * 0.1)  #10% of train data for warm-up

    train_objectives = [
        (train_qp_triple_dataloader, train_biencoder_loss_Triple),
        (train_qp_mr_tydi_triple, train_biencoder_loss_MultipleNegativesRanking),
    ]
    bi_encoder = train_biencoder(bi_encoder, train_objectives, train_batch_size, num_epochs, warmup_steps)
    # model_save_path = os.path.join(data_path, f'model/biencoder-simsce-run2')

    passage_embeddings, query_train_embeddings, query_dev_embeddings, query_tset_embeddings = encode_by_biencoder(
        bi_encoder)

    df_run_train = evaluate_biencoder(query_train_embeddings, passage_embeddings, df_query_train, tag="bienc_train2")
    ! python QQA23_TaskA_eval.py \
            -r "../data/runs/bienc_train2.tsv" \
            -q "../data/qrels/QQA23_TaskA_qrels_train.gold"

    df_run_dev = evaluate_biencoder(query_dev_embeddings, passage_embeddings, df_query_dev, tag="bienc_dev2")
    ! python QQA23_TaskA_eval.py \
            -r "../data/runs/bienc_dev2.tsv" \
            -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

    hits = util.semantic_search(query_tset_embeddings, passage_embeddings, top_k=10)
    df_run = save_query_passage_retrieval(hits, "bienc_test2", run_save=True, df_query=df_query_test, top_k=10)

    return bi_encoder


bi_encoder_run2 = gym_run2()

Loaded 992 records from ../data\Task B data/QQA23_TaskB_qrcd_v1.2_train_preprocessed.jsonl
len(train_samples_qp_task_B) = 992


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/10 [00:00<?, ?it/s]

word_embedding_model Max Sequence Length: 128
word_embedding_model dimension 768
pooling_model sentence embedding dimension 768
top_k = 100
len train_samples_qp_triple = 59120
len(train_samples_mr_tydi_triple) = 362146
train_batch_size 64


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/924 [00:00<?, ?it/s]

Iteration:   0%|          | 0/924 [00:00<?, ?it/s]

Iteration:   0%|          | 0/924 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Format check: Passed
 map_cut_10  recip_rank
   0.496633    0.614637
Format check: Passed
 map_cut_10  recip_rank
   0.062572    0.091111


# Scratchpad

In [None]:
tag = "SimCSE_bmbiencd"
bm_num_results = 15

BM25_model = pt.BatchRetrieve(index, controls={"wmodel": "BM25"}, num_results=bm_num_results)

bm25_biencoder_hit = []

for query in df_query_dev['query'].tolist():
    bm25_result = BM25_model.search(query)
    bm25_related_passage = bm25_result['docno'].tolist()
    passage = df_passage[df_passage['pid'].isin(bm25_related_passage)]['passage'].tolist()

    try:
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)
        # TODO
        '''
        check similarity between query and passage, with methods is better for this task?
        util.dot_score
        util.cos_sim
        util.pairwise_dot_score
        util.pairwise_cos_sim
        '''
        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=10)[0]
        mapping = {index: row['docno'] for index, row in bm25_result.iterrows()}

    except:
        #len passage is 0 but why ?
        print(f"len passage : {len(passage)}, qury : {query}")
        query_embedding = bi_encoder.encode(query, convert_to_tensor=True, show_progress_bar=False)
        passage = df_passage['passage'].tolist()
        passage_embeddings = bi_encoder.encode(passage, convert_to_tensor=True, show_progress_bar=False)

        hit = util.semantic_search(query_embedding, passage_embeddings, top_k=top_k)[0]
        mapping = {index: row['pid'] for index, row in df_passage.iterrows()}

    for i in range(len(hit)):
        hit[i]['corpus_id'] = mapping[hit[i]['corpus_id']]
    hit = sorted(hit, key=lambda x: x['score'], reverse=True)
    bm25_biencoder_hit.append(hit)

df_run = save_query_passage_retrieval(bm25_biencoder_hit, tag, run_save=True, df_query=df_query_dev)

print(bi_model_name, "BM25 hits with BiEncoder")
!python QQA23_TaskA_eval.py \
    -r "../data/runs/SimCSE_bmbiencd.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"
# df_run

In [None]:
def help_variables():
    train_samples_qp_contrastive = get_qp_contrastive_data()
    train_samples_qp_multiple_negative_ranking = get_qp_multi_neg_data()
    train_samples_qp_triple = get_qp_triple_data()
    train_samples_mr_tydi_triple = load_mr_tydi_data()
    train_samples_qp_task_B = load_task_b_data(task_B_train_path)

    train_passage_dataloader = DataLoader(train_samples_passage, shuffle=True, batch_size=train_batch_size)
    train_qp_dataloader = DataLoader(train_samples_qp, shuffle=True, batch_size=train_batch_size)
    train_qp_contrastive_dataloader = DataLoader(train_samples_qp_contrastive, shuffle=True,
                                                 batch_size=train_batch_size)
    train_qp_triple_dataloader = DataLoader(train_samples_qp_triple, shuffle=True, batch_size=train_batch_size)
    train_qp_multiple_negative_ranking_dataloader = DataLoader(train_samples_qp_multiple_negative_ranking, shuffle=True,
                                                               batch_size=train_batch_size)
    train_qp_mr_tydi_triple = DataLoader(train_samples_mr_tydi_triple, shuffle=True, batch_size=train_batch_size)
    train_samples_qp_task_B_dataloader = DataLoader(train_samples_qp_task_B, shuffle=True, batch_size=train_batch_size)

    train_biencoder_loss_MultipleNegativesRanking = losses.MultipleNegativesRankingLoss(bi_encoder)
    train_biencoder_loss_Contrastive = losses.ContrastiveLoss(bi_encoder)
    train_biencoder_loss_Triple = losses.TripletLoss(model=bi_encoder)
    train_biencoder_loss_OnlineContrastive = losses.OnlineContrastiveLoss(bi_encoder)
    train_biencoder_loss_CosineSimilarity = losses.CosineSimilarityLoss(bi_encoder)

    train_objectives = [
        (train_passage_dataloader, train_biencoder_loss_MultipleNegativesRanking),  #SimCSE
        (train_qp_contrastive_dataloader, train_biencoder_loss_OnlineContrastive),
        (train_qp_multiple_negative_ranking_dataloader, train_biencoder_loss_MultipleNegativesRanking),

        (train_qp_mr_tydi_triple, train_biencoder_loss_MultipleNegativesRanking),
        (train_samples_qp_task_B_dataloader, train_biencoder_loss_MultipleNegativesRanking),
        (train_qp_triple_dataloader, train_biencoder_loss_Triple)
    ]

In [None]:
# https://www.sbert.net/docs/pretrained_models.html
# https://www.sbert.net/docs/pretrained-models/msmarco-v3.html

################# fine tune #################
# bi_model_name = "data_path, f'model/fine_tune/tsdae-model"
# bi_model_name = "fine_tune/tsdae-model"

# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model") # Best
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v1")
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v2")
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v4")
# bi_model_name = os.path.join(data_path, "fine_tune/simcse-model-v5")

# bi_model_name = os.path.join(data_path, "fine_tune/tsdae-model-v1")


# bi_model_name = "fine_tine/ct-model"
# bi_model_name = "fine_tine/simcse-model-paraphrase-multilingual-mpnet-base-v2"

####################################### Our Best Models #######################################
# bi_model_name = "aubmindlab/bert-base-arabert"
# bi_model_name = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# bi_model_name = "xlm-roberta-base"
# bi_model_name = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"
# bi_model_name = "sentence-transformers/msmarco-distilbert-base-v4"

# bi_model_name = "aubmindlab/bert-base-arabertv2"
# bi_model_name = "aubmindlab/araelectra-base-discriminator"
# bi_model_name = "aubmindlab/bert-base-arabertv01"
# bi_model_name = "aubmindlab/bert-base-arabertv2"
# bi_model_name = "aubmindlab/bert-base-arabertv02"

# i guess this is not a good model for this task but we should try
#bi_model_name = "gfdgdfgdg/arap_qa_bert" # run 5 : pAP@10 = 0.184
#bi_model_name = "gfdgdfgdg/arap_qa_bert_large_v2" # run6 : pAP@10 = 0.372
#bi_model_name = "gfdgdfgdg/arap_qa_bert_v2" # run7 : pAP@10 = 0.344
#bi_model_name = "zohaib99k/Bert_Arabic-SQuADv2-QA" # run8 : pAP@10 = 0.435
#bi_model_name = "arabi-elidrisi/ArabicDistilBERT_QA" #run 9 : pAP@10 = 0.343
#bi_model_name = "MMars/Question_Answering_AraBERT_xtreme_ar" #run 10 : pAP@10 = 0.337
# bi_model_name = "abdalrahmanshahrour/ArabicQA" # run 11 : pAP@10 = 0.304
# bi_model_name = "abdalrahmanshahrour/xtremeQA-ar" # run 12 : pAP@10 = 0.120


####################################### Checked models #######################################
# bi_model_name = "aubmindlab/bert-large-arabertv02"

# bi_model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1" ## train : 0.48, dev : 0.12
# bi_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

# bi_model_name = "wissamantoun/araelectra-base-artydiqa" # run 1 : train:0.51 but dev near 0
# bi_model_name = "salti/AraElectra-base-finetuned-ARCD" # run 2 : pAP@10 = 0.397
# bi_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA" # run 3 : pAP@10 = 0.435
# bi_model_name = "timpal0l/mdeberta-v3-base-squad2" # run 4 : pAP@10 = 0.367


#TODO : https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/multilingual/make_multilingual.py
#teacher_model_name = 'fine_tine/simcse-model'   #Our monolingual teacher model, we want to convert to multiple languages
#student_model_name = 'xlm-roberta-base'       #Multilingual base model we use to imitate the teacher model

# model_save_path = os.path.join(data_path, f'model/training_simcse-{bi_model_name}-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

## Re-ranker (Cross-Encoder)

In [None]:
'''
https://github.com/Guzpenha/transformer_rankers
https://colab.research.google.com/drive/1wGmaO3emC7Sg-tA7nGehIQ2vjOLN9S5e?usp=sharing#scrollTo=y9ps5zmOHxe4
'''

# cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# cross_model_name = "cross-encoder/ms-marco-TinyBERT-L-2"
# cross_model_name = "distilroberta-base"
# cross_model_name = "amberoad/bert-multilingual-passage-reranking-msmarco"
# cross_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-CLS"