In [24]:
!pip install transformers
!pip install tqdm



In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import json

# NOTE: it is unclear whether this WikiPassageQA loading step is still used.
# Locations of the WikiPassageQA train / test splits
train_file_path = 'WikiPassageQA/train.txt'
test_file_path = 'WikiPassageQA/test.txt'

# Both splits are tab-separated text files; read each into its own DataFrame
train_df = pd.read_csv(train_file_path, delimiter='\t')
test_df = pd.read_csv(test_file_path, delimiter='\t')

# Load the full document segments (a dict, indexed by document id and then by
# segment id in get_passage_texts). The encoding is given explicitly so that
# non-ASCII text is not decoded with a platform-dependent default.
with open('WikiPassageQA/document_passages.json', 'r', encoding='utf-8') as file:
    full_documents = json.load(file)

# Combine train and test data for TF-IDF vectorization.
# Train rows come first; the original row labels of each split are kept.
combined_df = pd.concat([train_df, test_df])

# Function to retrieve passage texts given document ID and segments
def get_passage_texts(doc_id, segments, documents=None):
    """Return the text of the requested passages of one document, space-joined.

    Args:
        doc_id: Identifier of the document; converted with ``str`` before lookup.
        segments: Comma-separated passage ids (e.g. ``"3, 4"``). A non-string
            value such as a bare integer is accepted and converted with ``str``,
            which covers columns that pandas parsed as numbers.
        documents: Optional mapping ``{doc_id: {segment_id: text}}``. When
            omitted, the module-level ``full_documents`` is used.

    Returns:
        The passage texts in the order given, joined by single spaces.

    Raises:
        KeyError: If the document id or one of the passage ids is unknown.
    """
    if documents is None:
        documents = full_documents
    doc_passages = documents[str(doc_id)]
    # Strip whitespace around each id and skip empty tokens (e.g. a trailing comma).
    segment_ids = (part.strip() for part in str(segments).split(','))
    return ' '.join(doc_passages[seg_id] for seg_id in segment_ids if seg_id)

# Attach the joined relevant-passage text to every question row
combined_df['PassageText'] = combined_df.apply(
    lambda record: get_passage_texts(record['DocumentID'], record['RelevantPassages']),
    axis=1,
)

# Recover the two splits by position: the first n_train rows came from train_df
n_train = len(train_df)
train_df, test_df = combined_df.iloc[:n_train], combined_df.iloc[n_train:]

In [26]:
# Sanity check: show which Python type the parsed JSON was loaded as
print(type(full_documents))

<class 'dict'>


In [27]:
import torch
import tqdm
from transformers import AutoTokenizer, AutoModel

class NeuralEmbedder():
  """Turns a text into a dense vector with a pretrained transformer.

  The embedding is the hidden state of the first token, taken from the first
  element of the model output (the last hidden state for Hugging Face base
  models).
  """

  def __init__(self, model_name, tokenizer_name):
    """Load the tokenizer and the encoder.

    Args:
      model_name: Identifier or path passed to ``AutoModel.from_pretrained``.
      tokenizer_name: Identifier or path passed to
        ``AutoTokenizer.from_pretrained``.
    """
    # Each component is loaded from its own identifier.
    self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    self.bert_model = AutoModel.from_pretrained(model_name)

  def embed(self, text):
    """Return the embedding of ``text`` as a NumPy array.

    Args:
      text: The text to encode.

    Returns:
      A 1-D array of size ``hidden_dim`` for a single text (the leading batch
      axis of size 1 is squeezed away).
    """
    # Inputs are truncated to 512 tokens, so very long texts lose a little information.
    inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    # Inference only: no autograd graph is needed, and .numpy() refuses tensors
    # that still require grad.
    with torch.no_grad():
      hidden_states = self.bert_model(**inputs)[0]
    return hidden_states[:, 0, :].squeeze(0).detach().cpu().numpy()

In [31]:
import numpy as np
import torch
import pickle

class NeuralSearchEngine():
  """Brute-force dense retrieval: ranks documents by dot product with the query.

  Attributes:
    embedder: Object exposing ``embed(text)`` that returns a NumPy vector.
    documents: The indexed documents, in indexing order.
    doc_embeddings: 2-D array with one row per document, or ``None`` until
      ``index`` or ``load`` has been called.
  """

  def __init__(self, embedder):
    self.embedder = embedder
    self.documents = []
    # Kept under a name different from the ``index`` method so that the method
    # stays callable after the first indexing run.
    self.doc_embeddings = None

  def index(self, documents):
    """Embed every document and store the resulting matrix.

    Args:
      documents: Iterable of texts to index.

    Raises:
      ValueError: If ``documents`` is empty.
    """
    documents = list(documents)
    if not documents:
      raise ValueError("Cannot index an empty document collection")
    encoded_docs = []
    with torch.no_grad():
      for d in tqdm.tqdm(documents, desc="Indexing documents"):
        # atleast_2d turns a 1-D vector into a (1, dim) row without assuming
        # a particular embedding size.
        encoded_docs.append(np.atleast_2d(self.embedder.embed(d)))
    self.documents = documents
    self.doc_embeddings = np.concatenate(encoded_docs, axis=0)

  def search(self, query, top_k=3):
    """Print the best-scoring documents for ``query``.

    Args:
      query: The question or search text.
      top_k: Maximum number of results to print (default 3). Fewer are printed
        when the index holds fewer documents.

    Raises:
      RuntimeError: If nothing has been indexed or loaded yet.
    """
    if self.doc_embeddings is None:
      raise RuntimeError("No index available: call index() or load() first")
    with torch.no_grad():
      q_encoded = np.atleast_2d(self.embedder.embed(query))
    scores = q_encoded.dot(self.doc_embeddings.T)[0]
    # argsort is ascending; reverse it so the highest score comes first
    args = np.argsort(scores)[::-1]
    for i in range(min(top_k, len(args))):
      print((i+1),'-','Score:',scores[args[i]],'doc:',self.documents[args[i]])

  def save(self, index_path='index.pickle', docs_path='docs.pickle'):
    """Pickle the embedding matrix and the documents to two files.

    Args:
      index_path: Destination of the embedding matrix.
      docs_path: Destination of the document list.
    """
    with open(index_path, 'wb') as handle:
      pickle.dump(self.doc_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(docs_path, 'wb') as handle:
      pickle.dump(self.documents, handle, protocol=pickle.HIGHEST_PROTOCOL)

  def load(self, index_path='index.pickle', docs_path='docs.pickle'):
    """Restore the embedding matrix and the documents written by ``save``.

    Args:
      index_path: File holding the pickled embedding matrix.
      docs_path: File holding the pickled document list.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by save() from a trusted source.
    with open(index_path, 'rb') as handle:
      self.doc_embeddings = pickle.load(handle)
    with open(docs_path, 'rb') as handle:
      self.documents = pickle.load(handle)

In [32]:
# Read the dataset stored in data.json
with open('data.json', 'r') as file:
    testing_dataset = json.load(file)

# Flatten the values of the mapping into one list of texts
full_texts = []
for texts in testing_dataset.values():
    full_texts.extend(texts)
print(full_texts[0:100])

['Evangelicalism (), also called evangelical Christianity or evangelical Protestantism, is a worldwide interdenominational movement within Protestant Christianity that emphasizes the centrality of sharing the "good news" of Christianity, being "born again" in which an individual experiences personal conversion, as authoritatively guided by the Bible, God\'s revelation to humanity. The word evangelical comes from the Greek word for \'good news\' (euangelion).The theological nature of evangelicalism was first explored during the Protestant Reformation in 16th century Europe. Martin Luther\'s Ninety-Five Theses in 1517 emphasized that scripture and the preaching of the gospel had ultimate authority over the practices of the Church. The origins of modern evangelicalism are usually traced to 1738, with various theological streams contributing to its foundation, including Pietism and Radical Pietism, Puritanism, Quakerism and Moravianism (in particular its bishop Nicolaus Zinzendorf and his 

In [33]:
# The same checkpoint identifier is used for both the encoder and the tokenizer
checkpoint = "sebastian-hofstaetter/distilbert-dot-tas_b-b256-msmarco"
embedder = NeuralEmbedder(checkpoint, checkpoint)

# Build the engine and embed every text (slow: each document is embedded one at a time)
engine = NeuralSearchEngine(embedder)
engine.index(full_texts)

Indexing documents:   0%|          | 158/54152 [00:36<3:25:04,  4.39it/s]


KeyboardInterrupt: 

In [None]:
# Example query: prints the best-scoring indexed documents for this question
engine.search('What is the role of conversionism in Evangelicalism?')

1 - Score: 102.809235 doc: Conversionism, or belief in the necessity of being "born again", has been a constant theme of Evangelicalism since its beginnings. To Evangelicals, the central message of the gospel is justification by faith in Christ and repentance, or turning away, from sin. Conversion differentiates the Christian from the non-Christian, and the change in life it leads to is marked by both a rejection of sin and a corresponding personal holiness of life. A conversion experience can be emotional, including grief and sorrow for sin followed by great relief at receiving forgiveness. The stress on conversion is further differentiated from other forms of Protestantism by the belief that an assurance of salvation will accompany conversion. Among Evangelicals, individuals have testified to both sudden and gradual conversions.
2 - Score: 101.5385 doc: Evangelicalism , Evangelical Christianity, or Evangelical Protestantism[a] is a worldwide, transdenominational movement within Prote