In [1]:
!pip install transformers
!pip install tqdm



In [2]:
import json
import os
import pickle
import re
from collections import Counter

import numpy as np
import pandas as pd
import torch
import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel

In [3]:
# Pick the compute device once for the whole notebook: CUDA when PyTorch
# reports it as available, the CPU otherwise.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f'Using device: {device}')

Using device: cpu


In [4]:
class NeuralEmbedder():
  """Turns text into a dense vector with a Hugging Face tokenizer/encoder pair.

  Relies on the notebook-level ``device`` for model and input placement.
  """

  def __init__(self, model_name, tokenizer_name):
    """Load the tokenizer and the encoder.

    Args:
      model_name: checkpoint name/path passed to ``AutoModel.from_pretrained``.
      tokenizer_name: checkpoint name/path passed to
        ``AutoTokenizer.from_pretrained``.
    """
    # The two arguments used to be swapped (tokenizer loaded from model_name and
    # model from tokenizer_name); harmless only while both names are identical.
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    self.bert_model = AutoModel.from_pretrained(model_name).to(device)

  def embed(self, text):
    """Return the embedding of ``text`` as a NumPy array.

    The input is truncated to ``max_length=512`` (a limit counted in tokens, not
    characters), so long documents lose some information.

    Returns:
      ``output[0][:, 0, :]`` of the model, moved to the CPU and converted to
      NumPy, i.e. position 0 along the second axis of the first model output
      (presumably the [CLS] hidden state - confirm for the chosen checkpoint).
    """
    encoded = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)
    # Inference only: do not record an autograd graph, even if the caller forgot
    # to wrap the call in torch.no_grad().
    with torch.no_grad():
      output = self.bert_model(**encoded)
    return output[0][:, 0, :].detach().cpu().numpy()

In [46]:
class NeuralSearchEngine:
    """Dense retrieval over a list of documents using dot-product scores.

    The engine embeds every document with ``embedder.embed``, stacks the vectors
    into ``index_l`` (one row per document) and ranks documents for a query by
    the dot product between the query vector and each row.
    """

    def __init__(self, embedder, index_file='new_index.npy', docs_file='new_docs.pickle'):
        """Store the embedder and the cache locations; nothing is loaded yet.

        Args:
            embedder: object exposing ``embed(text)`` that returns an array.
            index_file: path of the ``.npy`` cache read/written by ``index``.
            docs_file: path where ``index`` pickles the documents after a build.
        """
        self.embedder = embedder
        self.index_file = index_file
        self.docs_file = docs_file
        self.index_l = None
        self.documents = []
        print('Initiated!')

    def _encode(self, text):
        """Embed ``text`` and return it as a 2-D ``(rows, dim)`` array.

        The embedding width is taken from the embedder output instead of being
        hard-coded to 768, so encoders with another hidden size work as well.
        """
        with torch.no_grad():
            encoded = np.asarray(self.embedder.embed(text))
        return encoded.reshape(-1, encoded.shape[-1])

    def index(self, documents):
        """Build the index for ``documents`` or reuse the cache in ``index_file``.

        A cached index is reused only if it is 2-D and has exactly one row per
        document; otherwise it is stale (built from another corpus) and is
        rebuilt, because ``search`` maps row numbers back to ``documents``.

        Raises:
            ValueError: if an index has to be built from an empty collection.
        """
        print('Indexing')
        self.documents = documents
        if os.path.exists(self.index_file):
            print(f'Loading index from {self.index_file}')
            cached_index = np.load(self.index_file)
            if cached_index.ndim == 2 and cached_index.shape[0] == len(documents):
                self.index_l = cached_index
                return
            print(f'Cached index has shape {cached_index.shape} but {len(documents)} '
                  'documents were given; rebuilding it.')

        print('Indexing documents and saving...')
        if len(documents) == 0:
            raise ValueError('Cannot build an index from an empty document collection.')
        encoded_docs = [
            self._encode(d) for d in tqdm.tqdm(documents, desc="Indexing documents")
        ]
        self.index_l = np.concatenate(encoded_docs, axis=0)
        np.save(self.index_file, self.index_l)
        with open(self.docs_file, 'wb') as handle:
            pickle.dump(self.documents, handle, protocol=pickle.HIGHEST_PROTOCOL)
        print(f'Index saved to {self.index_file}')

    def search(self, query, k=3, verbose=True):
        """Return the ``k`` best-scoring documents for ``query``, best first.

        Args:
            query: text to embed and compare against the index.
            k: number of documents to return; capped at the number of indexed
                documents instead of raising ``IndexError``.
            verbose: when true, print rank, score and text of every hit.

        Raises:
            RuntimeError: if no index has been built or loaded yet.
        """
        if self.index_l is None:
            raise RuntimeError('No index available: call index() or load() before search().')
        q_encoded = self._encode(query)
        scores = q_encoded.dot(self.index_l.T)[0]
        args = np.argsort(scores)[::-1]
        # Never ask for more hits than there are rows/documents.
        top_k = max(0, min(k, len(args), len(self.documents)))
        alldocs = []
        for i in range(top_k):
            if verbose:
                print((i + 1), '-', 'Score:', scores[args[i]], 'doc:', self.documents[args[i]])
            alldocs.append(self.documents[args[i]])
        # Return the top documents, best first.
        return alldocs

    def save(self, index_path='new_index.pickle', docs_path='new_docs.pickle'):
        """Pickle the index and the documents (defaults keep the historic paths)."""
        with open(index_path, 'wb') as handle:
            pickle.dump(self.index_l, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(docs_path, 'wb') as handle:
            pickle.dump(self.documents, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load(self, index_path='new_index.pickle', docs_path='new_docs.pickle'):
        """Restore the index and the documents written by ``save``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load files
        that this notebook produced itself.
        """
        with open(index_path, 'rb') as handle:
            self.index_l = pickle.load(handle)
        with open(docs_path, 'rb') as handle:
            self.documents = pickle.load(handle)

In [6]:
# Read the converted corpus and flatten the values of its top-level mapping
# into a single list of texts.
with open('converted_data.json', 'r') as corpus_handle:
    testing_dataset = json.load(corpus_handle)

full_texts = []
for texts in testing_dataset.values():
    full_texts.extend(texts)
print(full_texts[0:100])

['Townspeople who lived in chartered towns were burghers, as opposed to serfs who lived in villages. Towns were often free, in the sense that they were directly protected by the king or emperor, and were not part of a feudal fief. [citation needed]\n\nToday the process for granting charters is determined by the type of government of the state in question. In monarchies, charters are still often a royal charter given by the Crown or the state authorities acting on behalf of the Crown. In federations, the granting of charters may be within the jurisdiction of the lower level of government such as a state or province. [citation needed]\n\nIn Brazil, municipal corporations are called municipios and are created by means of local legislation at the state level, or after passing a referendum vote of the affected population.', 'A municipal corporation is the legal term for a local governing body, including cities, counties, towns, townships, charter townships, villages, and boroughs. Municipal

In [47]:
embedder = NeuralEmbedder("sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco","sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco")

# Stand-alone cache of the per-document embeddings.
# NOTE(review): `embeddings` is not read by the engine below, which keeps its own
# index file - confirm whether this cache is still needed.
if not os.path.exists('new_embeddings.npy'):
    print('Saving embeddings...')
    # Inference only: skip autograd bookkeeping while embedding the corpus.
    with torch.no_grad():
        embeddings = np.asarray(
            [embedder.embed(text) for text in tqdm.tqdm(full_texts, desc="Embedding documents")]
        )
    # Saved as a plain numeric array, so it can be reloaded without pickle.
    np.save('new_embeddings.npy', embeddings)
else:
    print('Loading embeddings...')
    embeddings = np.load('new_embeddings.npy')

engine = NeuralSearchEngine(embedder, index_file='new_document_index.npy', docs_file='new_docs.pickle')
engine.index(full_texts)
# Round-trip the engine state through its pickle files.
engine.save()
engine.load()

Loading embeddings...
Initiated!
Indexing
Loading index from new_document_index.npy


In [59]:
# Smoke test: retrieve the three best passages for one hand-written question.
sample_query = 'What is the role of conversionism in Evangelicalism?'
engine.search(sample_query, k=3)

1 - Score: 103.01952 doc: Conversionism, or belief in the necessity of being born again, has been a constant theme of Evangelicalism since its beginnings. To Evangelicals, the central message of the gospel is justification by faith in Christ and repentance, or turning away, from sin. Conversion differentiates the Christian from the non-Christian, and the change in life it leads to is marked by both a rejection of sin and a corresponding personal holiness of life. A conversion experience can be emotional, including grief and sorrow for sin followed by great relief at receiving forgiveness. The stress on conversion is further differentiated from other forms of Protestantism by the belief that an assurance of salvation will accompany conversion. Among Evangelicals, individuals have testified to both sudden and gradual conversions.
2 - Score: 101.399055 doc: Evangelicalism , Evangelical Christianity, or Evangelical Protestantism[a] is a worldwide, transdenominational movement within Protes

['Conversionism, or belief in the necessity of being born again, has been a constant theme of Evangelicalism since its beginnings. To Evangelicals, the central message of the gospel is justification by faith in Christ and repentance, or turning away, from sin. Conversion differentiates the Christian from the non-Christian, and the change in life it leads to is marked by both a rejection of sin and a corresponding personal holiness of life. A conversion experience can be emotional, including grief and sorrow for sin followed by great relief at receiving forgiveness. The stress on conversion is further differentiated from other forms of Protestantism by the belief that an assurance of salvation will accompany conversion. Among Evangelicals, individuals have testified to both sudden and gradual conversions.',
 'Evangelicalism , Evangelical Christianity, or Evangelical Protestantism[a] is a worldwide, transdenominational movement within Protestant Christianity maintaining that the essence 

In [49]:
# Locations of the tab-separated WikiPassageQA question files.
train_file_path = 'WikiPassageQA/train.txt'
test_file_path = 'WikiPassageQA/test.txt'

train_df, test_df = (
    pd.read_csv(split_path, sep='\t')
    for split_path in (train_file_path, test_file_path)
)

# Passage store used by get_passage_text (indexed by document id, then passage id).
with open('WikiPassageQA/document_passages.json', 'r') as passages_handle:
    full_documents = json.load(passages_handle)

In [50]:
def get_passage_text(document_id, segment_id):
    """Look up one passage in ``full_documents``.

    Both identifiers are converted to ``str`` before being used as keys: first
    the document id, then the segment id within that document's entry.
    """
    document_key = str(document_id)
    segment_key = str(segment_id)
    document_passages = full_documents[document_key]
    return document_passages[segment_key]

In [51]:
# Quick look at the test split: its column names, then the question and
# relevant-passage columns.
print(test_df.columns)
question_column = test_df['Question']
relevant_column = test_df['RelevantPassages']
print(question_column, relevant_column)

Index(['QID', 'Question', 'DocumentID', 'DocumentName', 'RelevantPassages'], dtype='object')
0      What is Iraq's role in political unstabilization?
1      What have been the warmest years of global war...
2                How were the Olympic games broadcasted?
3                What was the election turnout for 2008?
4      How has secularism been misinterpreted in Amer...
                             ...                        
411          What does the Church mean by Fallen Nature?
412                      What is the history of Eurasia?
413                  What are the nicknames of Missouri?
414    How do endoglossic and exoglossic languages co...
415    Why are drainage basins important to the ecology?
Name: Question, Length: 416, dtype: object 0            42,43
1              7,8
2      37,38,39,40
3               68
4            10,11
          ...     
411          24,25
412            1,2
413            4,5
414              1
415              8
Name: RelevantPassages, Leng

In [52]:
testing = test_df[['Question', 'DocumentID', 'RelevantPassages']]

# Print every gold passage of the first few questions as a sanity check on the
# document-id / passage-id lookup.
for _, sample_row in testing.head().iterrows():
    question, doc_id = sample_row['Question'], sample_row['DocumentID']
    print(f"Question: {question}")
    print("Relevant Passages:")
    for passage_id in sample_row['RelevantPassages'].split(','):
        print(f"  - Passage ID: {passage_id}")
        print(get_passage_text(str(doc_id), str(passage_id)))
    print()

Question: What is Iraq's role in political unstabilization?
Relevant Passages:
  - Passage ID: 42
The party has a more consistent anti-sectarian perspective than most of its rivals. The Kurdistan List is dominated by two parties, the Kurdistan Democratic Party led by Masood Barzani and the Patriotic Union of Kurdistan headed by Jalal Talabani. Both parties are secular and enjoy close ties with the West. In 2010, according to the Failed States Index, Iraq was the world's seventh most politically unstable country. The concentration of power in the hands of Prime Minister Nouri al-Maliki and growing pressure on the opposition led to growing concern about the future of political rights in Iraq. Nevertheless, progress was made and the country had risen to 11th place by 2013.
  - Passage ID: 43
In August 2014, al-Maliki's reign came to an end. He announced on 14 August 2014 that he would stand aside so that Haider Al-Abadi, who had been nominated just days earlier by newly installed Presiden

In [53]:
def bigram_similarity(text1, text2):
    """Weighted Jaccard similarity between the word bigrams of two texts.

    Tokenisation reproduces the defaults of scikit-learn's ``CountVectorizer``
    that the previous implementation relied on: the text is lower-cased and
    tokens are runs of two or more word characters. The score is
    ``sum(min(count1, count2)) / sum(max(count1, count2))`` over all bigrams.

    Unlike the vectorizer-based version, which raised ``ValueError`` (empty
    vocabulary) when neither text contained a bigram, that case now returns 0.
    It also avoids fitting a new vectorizer on every call.

    Args:
        text1: first text.
        text2: second text.

    Returns:
        A value in [0, 1]; 0 when the texts share no bigram or have none.
    """
    def _bigram_counts(text):
        # Count consecutive token pairs of one text.
        tokens = re.findall(r"(?u)\b\w\w+\b", text.lower())
        return Counter(zip(tokens, tokens[1:]))

    counts1 = _bigram_counts(text1)
    counts2 = _bigram_counts(text2)
    # Counter `&` keeps the per-bigram minimum, `|` the per-bigram maximum.
    intersection = sum((counts1 & counts2).values())
    union = sum((counts1 | counts2).values())
    return intersection / union if union != 0 else 0

In [67]:
def testingPipeline(engine, testing_file, threshold=0.15, k=20):
    """Evaluate ``engine`` on a question table and return retrieval metrics.

    For every row the question is sent to ``engine.search`` and each returned
    passage counts as relevant when its best ``bigram_similarity`` against the
    row's gold passages reaches ``threshold``.

    Args:
        engine: object exposing ``search(question, k=..., verbose=False)`` that
            returns a ranked list of passage texts.
        testing_file: DataFrame with the columns ``Question``, ``DocumentID``
            and ``RelevantPassages`` (comma-separated passage ids).
        threshold: minimum bigram similarity for a returned passage to count as
            relevant.
        k: number of passages requested per question. The @5/@10/@20 cut-offs
            are only recorded for questions whose result list reaches that rank.

    Returns:
        Tuple ``(accuracy, MAP, MRR, P@5, P@10, R@5, R@10, R@20, nDCG)``.
        * accuracy: share of questions with at least one relevant result.
        * MAP / MRR: averaged over ALL questions; a question without any
          relevant result contributes 0 (it used to be skipped, which inflated
          both scores). Per-question AP is the mean precision at each hit.
        * recall: distinct gold passages matched so far divided by the number
          of gold passages, so it can no longer exceed 1 when several returned
          passages match the same gold passage.
        * nDCG: binary gains; the ideal ranking is the retrieved list re-sorted.
          NOTE(review): a stricter IDCG would use the number of gold passages.
        All values are 0 for an empty ``testing_file``.
    """
    total_tests = len(testing_file)
    if total_tests == 0:
        # Nothing to evaluate: avoid dividing by zero below.
        return 0, 0, 0, 0, 0, 0, 0, 0, 0

    relevant_count = 0
    average_precisions = []
    reciprocal_ranks = []
    precision_at_5 = []
    precision_at_10 = []
    recall_at_5 = []
    recall_at_10 = []
    recall_at_20 = []
    ndcg_scores = []

    def calculate_dcg(scores):
        # Discounted cumulative gain with a log2(rank + 1) discount.
        return np.sum([rel / np.log2(idx + 2) for idx, rel in enumerate(scores)])

    # The context manager closes the bar even when the run is interrupted.
    with tqdm.tqdm(total=total_tests, desc="Testing", position=0, leave=True) as progress_bar:
        for _, row in testing_file.iterrows():
            question = row['Question']
            doc_id = row['DocumentID']
            relevant_passages = [pid.strip() for pid in row['RelevantPassages'].split(',')]
            # Fetch the gold texts once per question, not once per returned passage.
            relevant_texts = [get_passage_text(doc_id, pid) for pid in relevant_passages]
            returned_passages = engine.search(question, k=k, verbose=False)

            relevance_scores = []
            precisions = []
            matched_relevant = set()  # indices of gold passages matched so far
            num_relevant_found = 0
            first_hit_rank = None

            for rank, returned_passage in enumerate(returned_passages, start=1):
                similarities = [bigram_similarity(returned_passage, text) for text in relevant_texts]
                is_relevant = max(similarities) >= threshold
                relevance_scores.append(1 if is_relevant else 0)

                if is_relevant:
                    num_relevant_found += 1
                    matched_relevant.update(
                        idx for idx, sim in enumerate(similarities) if sim >= threshold
                    )
                    precisions.append(num_relevant_found / rank)
                    if first_hit_rank is None:
                        first_hit_rank = rank

                if rank in (5, 10, 20):
                    current_precision = num_relevant_found / rank
                    current_recall = len(matched_relevant) / len(relevant_texts)
                    if rank == 5:
                        precision_at_5.append(current_precision)
                        recall_at_5.append(current_recall)
                    elif rank == 10:
                        precision_at_10.append(current_precision)
                        recall_at_10.append(current_recall)
                    else:
                        recall_at_20.append(current_recall)

            # nDCG: actual ranking versus the same gains sorted best-first.
            dcg = calculate_dcg(relevance_scores)
            idcg = calculate_dcg(sorted(relevance_scores, reverse=True))
            ndcg_scores.append(dcg / idcg if idcg > 0 else 0)

            if num_relevant_found > 0:
                relevant_count += 1

            # Questions without a hit count as 0 instead of being dropped.
            reciprocal_ranks.append(1 / first_hit_rank if first_hit_rank is not None else 0.0)
            average_precisions.append(np.mean(precisions) if precisions else 0.0)

            progress_bar.update(1)
            progress_bar.set_postfix({"Current Score": relevant_count})

    accuracy = relevant_count / total_tests
    map_score = np.mean(average_precisions) if average_precisions else 0
    mrr_score = np.mean(reciprocal_ranks) if reciprocal_ranks else 0
    p_at_5 = np.mean(precision_at_5) if precision_at_5 else 0
    p_at_10 = np.mean(precision_at_10) if precision_at_10 else 0
    r_at_5 = np.mean(recall_at_5) if recall_at_5 else 0
    r_at_10 = np.mean(recall_at_10) if recall_at_10 else 0
    r_at_20 = np.mean(recall_at_20) if recall_at_20 else 0
    mean_ndcg = np.mean(ndcg_scores) if ndcg_scores else 0

    return accuracy, map_score, mrr_score, p_at_5, p_at_10, r_at_5, r_at_10, r_at_20, mean_ndcg

In [71]:
# Run the evaluation on the training split and print one metric per line.
train_metrics = testingPipeline(engine, train_df)
(accuracy, map_score, mrr_score,
 p_at_5, p_at_10,
 r_at_5, r_at_10, r_at_20,
 mean_ndcg) = train_metrics
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"MAP: {map_score:.4f}",
    f"MRR: {mrr_score:.4f}",
    f"P@5: {p_at_5:.4f}",
    f"P@10: {p_at_10:.4f}",
    f"R@5: {r_at_5:.4f}",
    f"R@10: {r_at_10:.4f}",
    f"R@20: {r_at_20:.4f}",
    f"nDCG: {mean_ndcg:.4f}",
]
print("\n".join(report_lines))

Testing:   0%|          | 6/3332 [00:00<05:18, 10.43it/s, Current Score=4]

KeyboardInterrupt: 

In [72]:
# Run the evaluation on the held-out test split and print one metric per line.
test_metrics = testingPipeline(engine, test_df)
(accuracy, map_score, mrr_score,
 p_at_5, p_at_10,
 r_at_5, r_at_10, r_at_20,
 mean_ndcg) = test_metrics
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"MAP: {map_score:.4f}",
    f"MRR: {mrr_score:.4f}",
    f"P@5: {p_at_5:.4f}",
    f"P@10: {p_at_10:.4f}",
    f"R@5: {r_at_5:.4f}",
    f"R@10: {r_at_10:.4f}",
    f"R@20: {r_at_20:.4f}",
    f"nDCG: {mean_ndcg:.4f}",
]
print("\n".join(report_lines))

Testing:   1%|          | 5/416 [00:00<00:44,  9.19it/s, Current Score=4]

KeyboardInterrupt: 

Testing:   0%|          | 6/3332 [00:20<05:18, 10.43it/s, Current Score=4]