In [1]:
import re
import os
import pandas as pd
from snowballstemmer import stemmer
import arabicstopwords.arabicstopwords as ar_stp
from functools import reduce
import numpy as np
import pyterrier as pt
import torch

In [2]:
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))

2.0.1+cu118
True
1
0
NVIDIA GeForce RTX 3090


# Get Data

In [3]:
data_path = "../data"
index_path = os.path.join(data_path ,"QPC_Index/data.properties")

query_train_path = os.path.join(data_path, "QQA23_TaskA_train.tsv")
query_dev_path = os.path.join(data_path, "QQA23_TaskA_dev.tsv")

passage_path = os.path.join(data_path, "Thematic_QPC/QQA23_TaskA_QPC_v1.1.tsv")

In [4]:
# define some global constants
TEXT = "text"
PASSAGE = "passage"
QUERY = "query"
LABEL = "label"
RANK = "rank"
TAG = "tag"
SCORE = "score"
PID = 'pid'
QID = "qid"
DOC_NO = "docno"
DOCID = "docid"

## Read file

In [5]:
# read file based on its extension (tsv or xlsx)
def read_file(input_file, sep="\t", names = ""):
    if input_file.endswith(".xlsx"):
        df = pd.read_excel(input_file)
    else:
        if names != "":
            df = pd.read_csv(input_file, sep=sep, names=names,encoding="utf-8")
        else:
            df = pd.read_csv(input_file, sep=sep,encoding="utf-8")
    return df

## Load the index

In [6]:
def load_index(index_path):
    if not pt.started():
        pt.init(helper_version="0.0.6")

    try:
        index = pt.IndexFactory.of(index_path)
        print("Index was loaded successfully from this path: ", index_path)
        return index
    except Exception as e:
        print('Cannot load the index, check exception details {}'.format(e))
        return []

In [7]:
index = load_index(index_path=index_path)

print(index.getCollectionStatistics().toString())
print(index.getMetaIndex().getKeys())

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.6



Index was loaded successfully from this path:  ../data\QPC_Index/data.properties
Number of documents: 1266
Number of terms: 8278
Number of postings: 46732
Number of fields: 1
Number of tokens: 55596
Field names: [text]
Positions:   false

['docno']


In [8]:
for kv in index.getLexicon():
    print((kv.getKey())+"\t"+ kv.getValue().toString())

# index.getLexicon()["فاعل"].toString()

ءاد	term4208 Nt=3 TF=3 maxTF=1 @{0 0 0} TFf=3
ءتين	term1555 Nt=2 TF=2 maxTF=1 @{0 6 4} TFf=2
ءسم	term584 Nt=3 TF=3 maxTF=1 @{0 10 4} TFf=3
ءيس	term3457 Nt=1 TF=1 maxTF=1 @{0 15 0} TFf=1
اءت	term2907 Nt=12 TF=13 maxTF=2 @{0 17 4} TFf=13
اءتم	term1502 Nt=1 TF=1 maxTF=1 @{0 38 2} TFf=1
اءتي	term6790 Nt=1 TF=1 maxTF=1 @{0 40 2} TFf=1
اءذ	term3745 Nt=2 TF=2 maxTF=1 @{0 43 0} TFf=2
اءفك	term6583 Nt=1 TF=1 maxTF=1 @{0 48 2} TFf=1
اءم	term3677 Nt=5 TF=5 maxTF=1 @{0 51 0} TFf=5
اءن	term2803 Nt=6 TF=6 maxTF=1 @{0 60 6} TFf=6
اءنا	term6084 Nt=2 TF=2 maxTF=1 @{0 72 6} TFf=2
اءنبء	term1565 Nt=1 TF=1 maxTF=1 @{0 77 4} TFf=1
ااتخذ	term6489 Nt=1 TF=1 maxTF=1 @{0 79 4} TFf=1
ااذ	term4422 Nt=11 TF=11 maxTF=1 @{0 82 2} TFf=11
اارباب	term4287 Nt=1 TF=1 maxTF=1 @{0 102 2} TFf=1
ااسجد	term4873 Nt=1 TF=1 maxTF=1 @{0 104 6} TFf=1
ااسلم	term1588 Nt=1 TF=1 maxTF=1 @{0 107 4} TFf=1
ااشفق	term7413 Nt=1 TF=1 maxTF=1 @{0 109 4} TFf=1
ااشكر	term6061 Nt=1 TF=1 maxTF=1 @{0 112 4} TFf=1
ااعجم	term6811 Nt=1 TF=1 maxTF=1

## Cleaning
Clean text from urls, handles, special characters, tabs, line jumps, extra white space, and puntuations.

In [9]:
# Clean text from urls, handles, special characters, tabs, line jumps, and extra white space.
def clean(text):
    text = re.sub(r"http\S+", " ", text)  # remove urls
    text = re.sub(r"@[\w]*", " ", text)  # remove handles
    text = re.sub(r"[\.\,\#_\|\:\?\?\/\=]", " ", text) # remove special characters
    text = re.sub(r"\t", " ", text)  # remove tabs
    text = re.sub(r"\n", " ", text)  # remove line jump
    text = re.sub(r"\s+", " ", text)  # remove extra white space
    text = re.sub(r'[^\w\s]', '', text) # Removing punctuations in string using regex
    text = text.strip()
    return text

## Preprocessing
Preprocess the arabic input text by performing normalization, stemming, and removing stop words.

In [10]:
# arabic stemmer
ar_stemmer = stemmer("arabic")

# remove arabic stop words
def ar_remove_stop_words(sentence):
    terms=[]
    stopWords= set(ar_stp.stopwords_list())
    for term in sentence.split() : 
        if term not in stopWords :
            terms.append(term)
    return " ".join(terms)


# normalize the arabic text
def normalize_arabic(text):
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    return(text)

# stem the arabic text
def ar_stem(sentence):
    return " ".join([ar_stemmer.stemWord(i) for i in sentence.split()])


# apply all preprocessing steps needed for Arabic text
def preprocess_arabic(text): 
    text = normalize_arabic(text)
    text = ar_remove_stop_words(text)
    text = ar_stem(text)
    return text

In [44]:
def prepare_data(path, column, id_type, id_column=DOC_NO):
        df = read_file(path, names=[DOC_NO, TEXT])

        print("Cleaning passages")
        # apply the cleaning functions on the queries/questions
        df[column] = df[TEXT].apply(clean)

        # apply normalization, stemming and stop word removal
        print("Preprocessing - Applying normalization, stemming and stop word removal")
        df[column] = df[column].apply(preprocess_arabic)

        df[id_type] = df[id_column].astype(str) # convert the id column to string
        df = df[[id_type, TEXT, column]] # keep the columns needed for search

        print("Done with preparation!")
        return df


### Passage

In [45]:
df_passage = prepare_data(passage_path, PASSAGE, PID)
df_passage

Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,pid,text,passage
0,1:1-4,بسم الله الرحمن الرحيم. الحمد لله رب العالمين....,بسم الله رحم رحيم حمد لله عالم رحم رحيم مال يو...
1,1:5-6,إياك نعبد وإياك نستعين. اهدنا الصراط المستقيم.,ايا نعبد واي نستع اهد صراط مستقيم
2,1:7-7,صراط الذين أنعمت عليهم غير المغضوب عليهم ولا ا...,صراط انعم مغضوب ضال
3,2:1-2,الم. ذلك الكتاب لا ريب فيه هدى للمتقين.,الم كتاب ريب هد متق
4,2:3-5,الذين يؤمنون بالغيب ويقيمون الصلاة ومما رزقناه...,يءمن غيب يقيم صلاه رزق ينفق يءمن انزل اليك انز...
...,...,...,...
1261,110:1-3,إذا جاء نصر الله والفتح. ورأيت الناس يدخلون في...,اذا جاء نصر الله والفتح راي ناس يدخل دين الله ...
1262,111:1-5,تبت يدا أبي لهب وتب. ما أغنى عنه ماله وما كسب....,تبت يدا اب وتب اغن مال كسب سيصل نار وامرا حمال...
1263,112:1-4,قل هو الله أحد. الله الصمد. لم يلد ولم يولد. و...,قل الله احد الله صمد يلد يولد يكن كفو احد
1264,113:1-5,قل أعوذ برب الفلق. من شر ما خلق. ومن شر غاسق إ...,قل اعوذ برب فلق شر خلق شر غاسق اذا وقب شر نفاث...


### Query

In [46]:
df_query_train = prepare_data(query_train_path, QUERY, QID)
df_query_train

Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,qid,text,query
0,101,من هم قوم شعيب؟,قوم شعيب
1,102,من هم قوم موسى؟,قوم موس
2,103,من بنى الكعبة؟,بن كعبه
3,105,من هو النبي المعروف بالصبر؟,النب معروف صبر
4,106,من كفل السيدة مريم؟,كفل سيده مريم
...,...,...,...
169,422,ما هي الأماكن التي ذُكرت في القرآن كأماكن مقدسة؟,اماك ذكر القر كام مقدس
170,423,لماذا لم يتم حذف الآيات المنسوخة من القرآن؟,لماذ يتم حذف الا منسوخه القر
171,425,هل سيدنا محمد هو أفضل الأنبياء؟,سيد محمد افضل انبياء
172,426,هل حذر القرآن المؤمنين من اتخاذ أهل الكتاب أول...,حذر القر مءمن اتخاذ اهل كتاب اولياء


In [47]:
df_query_dev = prepare_data(query_dev_path, QUERY, QID)
df_query_dev

Cleaning passages
Preprocessing - Applying normalization, stemming and stop word removal
Done with preparation!


Unnamed: 0,qid,text,query
0,114,من الذي خسف الله به الأرض؟,خسف الله ارض
1,124,كم مدة عدة الأرملة؟,مده عده ارمله
2,126,ما هي شجرة الزقوم؟,شجر زقوم
3,135,ما هي وصايا لقمان لابنه؟,صاي لقمان
4,156,من هن المحرمات من النساء في الزواج؟,محرم نساء زواج
5,157,ما هي منزلة من يقتل في سبيل الله؟,منزل يقتل سبيل الله
6,207,من هلك من أهل سيدنا نوح عليه السلام في الطوفان؟,هلك اهل سيد نوح سلام طوف
7,224,لماذا ألقي سيدنا يوسف عليه السلام في الجب؟,لماذ الق سيد يوسف سلام الجب
8,234,كم دامت دعوة نوح لقومه؟,دام دعو نوح لقوم
9,241,من هم الملائكة المذكورون في القرآن؟,ملاءكه مذكور القر


# Model

## Save Run

In [15]:
def save_query_passage_retrieval(result, tag, run_save_path=None):
    result["Q0"] = ["Q0"] * len(result)
    result["tag"] = [tag] * len(result)
    result['question-id'] = result["qid"]
    result['passage-id'] = result["docno"]
    result = result[["question-id", "Q0", "passage-id", "rank", "score", "tag"]]

    if run_save_path:
        result.to_csv(run_save_path, sep="\t", index=False, header=False)
    return result

# SimCSE

In [16]:
# from simcse import SimSCE
from torch.utils.data import DataLoader
from torch import nn
from sentence_transformers import models, losses
from sentence_transformers import LoggingHandler, SentenceTransformer, CrossEncoder, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

import logging
from datetime import datetime
import math

In [17]:
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

In [18]:
bi_model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
#bi_model_name = "aubmindlab/bert-large-arabertv02" # run 0 : pAP@10 = 0.289
# bi_model_name = "wissamantoun/araelectra-base-artydiqa" # run 1 : pAP@10 = 0.437
# bi_model_name = "salti/AraElectra-base-finetuned-ARCD" # run 2 : pAP@10 = 0.397
# bi_model_name = "ZeyadAhmed/AraElectra-Arabic-SQuADv2-QA" # run 3 : pAP@10 = 0.435
#bi_model_name = "timpal0l/mdeberta-v3-base-squad2" # run 4 : pAP@10 = 0.367
#bi_model_name = "gfdgdfgdg/arap_qa_bert" # run 5 : pAP@10 = 0.184
#bi_model_name = "gfdgdfgdg/arap_qa_bert_large_v2" # run6 : pAP@10 = 0.372
#bi_model_name = "gfdgdfgdg/arap_qa_bert_v2" # run7 : pAP@10 = 0.344
#bi_model_name = "zohaib99k/Bert_Arabic-SQuADv2-QA" # run8 : pAP@10 = 0.435
#bi_model_name = "arabi-elidrisi/ArabicDistilBERT_QA" #run 9 : pAP@10 = 0.343
#bi_model_name = "MMars/Question_Answering_AraBERT_xtreme_ar" #run 10 : pAP@10 = 0.337
# bi_model_name = "abdalrahmanshahrour/ArabicQA" # run 11 : pAP@10 = 0.304
# bi_model_name = "abdalrahmanshahrour/xtremeQA-ar" # run 12 : pAP@10 = 0.120

cross_model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
# cross_model_name = "cross-encoder/ms-marco-TinyBERT-L-2"

model_save_path = os.path.join(data_path, f'model/training_simcse-{bi_model_name}-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

In [19]:
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(bi_model_name, max_seq_length=64)
# Apply mean pooling to get one fixed sized sentence vector
print("word_embedding_model Max Sequence Length:", word_embedding_model.max_seq_length)
print("word_embedding_model dimension", word_embedding_model.get_word_embedding_dimension())

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
print("pooling_model sentence embedding dimension", pooling_model.get_sentence_embedding_dimension())

dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(), out_features=256, activation_function=nn.Tanh())

# bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

word_embedding_model Max Sequence Length: 64
word_embedding_model dimension 768
pooling_model sentence embedding dimension 768
2023-08-07 11:28:17 - Use pytorch device: cuda


In [52]:
# Configure the training
train_batch_size = 32
num_epochs = 1
top_k = 10


In [53]:
train_samples = []
for _, row in df_passage.iterrows():
    train_samples.append(InputExample(texts=[row['passage'], row['passage']]))
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

In [54]:
# Use MultipleNegativesRankingLoss for SimCSE
train_loss = losses.MultipleNegativesRankingLoss(bi_encoder)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# logging.info("Performance before training")
# dev_evaluator(model)

# Train the model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
          # evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=100,
          warmup_steps=warmup_steps,
          output_path=model_save_path
          )

2023-08-07 12:37:40 - Warmup-steps: 4


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/40 [00:00<?, ?it/s]

2023-08-07 12:37:46 - Save model to ../data\model/training_simcse-sentence-transformers/distiluse-base-multilingual-cased-v1-2023-08-07_11-28-16


In [55]:
passage_embeddings = bi_encoder.encode(df_passage[PASSAGE].tolist(), convert_to_tensor=True, show_progress_bar=True)
query_embeddings = bi_encoder.encode(df_query_train[QUERY].tolist(), convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

In [56]:
for sentence, embedding in zip(df_passage, passage_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

Sentence: pid
Embedding: tensor([ 0.0099, -0.0128, -0.0419,  0.0571, -0.0321,  0.0006, -0.0041, -0.0243,
         0.0441, -0.0336, -0.0450, -0.0178, -0.0406,  0.0262,  0.0133, -0.0490,
        -0.0158,  0.0244,  0.0462,  0.0230,  0.0014,  0.0776, -0.0403,  0.0181,
         0.0002,  0.0181,  0.0439,  0.0806,  0.0584,  0.0118,  0.0230, -0.0232,
         0.0059,  0.0426,  0.0107,  0.0298,  0.0122,  0.0322,  0.0015,  0.0409,
        -0.0010, -0.0197,  0.0044,  0.0504,  0.0196,  0.0246, -0.0201, -0.0006,
         0.0166, -0.0827, -0.0156, -0.0296, -0.0083,  0.0232,  0.0707, -0.0217,
         0.0190,  0.0019,  0.0359,  0.0411,  0.0346, -0.0224, -0.0376, -0.0338,
        -0.0333, -0.0126, -0.0153,  0.0008, -0.0537, -0.0681, -0.0273,  0.0537,
        -0.0043,  0.0048, -0.0123, -0.0330,  0.0215,  0.0350, -0.0343,  0.0252,
        -0.0764, -0.0079, -0.0025, -0.0198, -0.0113,  0.0107,  0.0600, -0.0253,
         0.0150,  0.0011,  0.0325, -0.0072, -0.0258, -0.0232,  0.0338, -0.0102,
        -0.0058

In [42]:
cross_encoder = CrossEncoder(cross_model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

2023-08-07 11:23:45 - Use pytorch device: cuda


In [47]:
print("bi-encoder model name", bi_model_name)
print("cross-encoder model name", cross_model_name)

bi-encoder model name sentence-transformers/distiluse-base-multilingual-cased-v1 
cross encoder model name cross-encoder/ms-marco-MiniLM-L-6-v2


## BM25 - Search
Search in the index and find the relevant passages.

In [19]:
# initialize the BM25 retrieval model
BM25_model = pt.BatchRetrieve(index, controls = {"wmodel": "BM25"}, num_results=5)
# wmodel="TF_IDF"

In [29]:
tag="BM25"
save_path = os.path.join(data_path, f"runs/{tag}.tsv")

# search using BM25 model
df_run = BM25_model.transform(df_query_train)

# save the run in trec format to a file
df_run = save_query_passage_retrieval(df_run, tag, save_path)

df_run

Unnamed: 0,question-id,Q0,passage-id,rank,score,tag
0,101,Q0,7:85-93,0,14.236550,BM25
1,101,Q0,29:36-37,1,13.002848,BM25
2,101,Q0,11:89-95,2,12.651979,BM25
3,101,Q0,11:84-88,3,11.658893,BM25
4,101,Q0,26:176-191,4,6.080667,BM25
...,...,...,...,...,...,...
812,427,Q0,46:29-32,0,13.713442,BM25
813,427,Q0,7:96-102,1,11.975072,BM25
814,427,Q0,33:25-27,2,11.300501,BM25
815,427,Q0,17:9-11,3,10.401655,BM25


In [28]:
sample = "عمران"
BM25_model.search(sample)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,1115,66:10-12,0,8.576149,عمران
1,1,116,3:33-41,1,8.129249,عمران


## Sentence Embedding

In [21]:
from sentence_transformers import SentenceTransformer
# sentence_embedding = SentenceTransformer('multi-qa-MiniLM-L6-dot-v1')
sentence_embedding = SentenceTransformer('all-MiniLM-L6-v2')


In [22]:
def batch_emb(l1, l2):
    l2 = sentence_embedding.encode(l2)
    l1.extend(l2)
    return l1

def get_embedding(text, n=50):
    batched_num = [text[i:i + n] for i in range(0, len(text), n)]
    texts_embed = reduce(batch_emb, batched_num, [])
    print(len(texts_embed))
    return texts_embed



## RocketQA

# Evaluation

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/bigIR_BM25.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"

Here, we are just evaluating the perfect run for the dev set

In [None]:
! python QQA23_TaskA_eval.py \
    -r "../data/runs/dev_perfect.tsv" \
    -q "../data/qrels/QQA23_TaskA_qrels_dev.gold"