In [21]:
import json
import pyterrier as pt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\titouan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\titouan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
if not pt.started():
    pt.init()

PyTerrier 0.10.0 has loaded Terrier 5.8 (built by craigm on 2023-11-01 18:05) and terrier-helper 0.0.8



In [5]:
def flatten_documents(data):
    documents = []
    for doc_id, contents in data.items():
        for sub_id, text in contents.items():
            # Construct a unique document ID
            unique_doc_id = f"{doc_id}_{sub_id}"
            documents.append({'docno': unique_doc_id, 'text': text})
    return documents

# Flatten the data
with open('WikiPassageQA/document_passages.json', 'r') as file:
        data = json.load(file)
documents = flatten_documents(data)
print(documents[0])

{'docno': '344_1', 'text': 'Townspeople who lived in chartered towns were burghers, as opposed to serfs who lived in villages. Towns were often "free", in the sense that they were directly protected by the king or emperor, and were not part of a feudal fief. [citation needed]\n\nToday the process for granting charters is determined by the type of government of the state in question. In monarchies, charters are still often a royal charter given by the Crown or the state authorities acting on behalf of the Crown. In federations, the granting of charters may be within the jurisdiction of the lower level of government such as a state or province. [citation needed]\n\nIn Brazil, municipal corporations are called municipios and are created by means of local legislation at the state level, or after passing a referendum vote of the affected population.'}


In [6]:
import pandas as pd

# Assuming 'documents' is your list of dictionaries from the flatten_documents function
documents_df = pd.DataFrame(documents)
print(documents_df)

        docno                                               text
0       344_1  Townspeople who lived in chartered towns were ...
1       344_0  A municipal corporation is the legal term for ...
2       344_3  This standard varies from state to state, acco...
3       344_2  All municipal corporations must also abide by ...
4       344_5  Under the panchayati raj system, it interacts ...
...       ...                                                ...
50607  479_18  All dialects also divided in two main chronolo...
50608  479_31  The official language in Moscow and Novgorod, ...
50609  479_30  It was soon followed by the adoption of Christ...
50610  479_33  ['knam va'kosk@ z@stU'tcit]\n\nThe political u...
50611  479_32  Leo Tolstoy's War and Peace, contain entire pa...

[50612 rows x 2 columns]


In [7]:
# Load English stopwords
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def process_text(text):
    tokens = word_tokenize(text)
    
    filtered_tokens = [stemmer.stem(word.lower()) for word in tokens if word.lower() not in stop_words and word.isalpha()]
    
    return ' '.join(filtered_tokens)

# documents_df['text'] = documents_df['text'].apply(process_text)
# print(documents_df)


In [8]:
pd_indexer = pt.DFIndexer("C:/Users/titouan/Documents/new_indexer")

In [9]:
indexref = pd_indexer.index(documents_df["text"], documents_df["docno"])

In [28]:
index = pt.IndexFactory.of(indexref)
br = pt.BatchRetrieve(index, wmodel="BM25")

# Execute a search query
query = "How is agriculture fundamental to life"
results = br.search(query)

# Display the results
print(results.head())

  qid  docid   docno  rank      score                                   query
0   1  39600  280_49     0  16.327567  How is agriculture fundamental to life
1   1  30400  496_47     1  15.701398  How is agriculture fundamental to life
2   1  34470  251_16     2  15.352186  How is agriculture fundamental to life
3   1  22105   253_1     3  15.101824  How is agriculture fundamental to life
4   1  41382  701_22     4  14.839388  How is agriculture fundamental to life


In [12]:
def loadTestTrain(path):
    data_dict = {}
    with open(path, 'r') as file:
        lines = file.readlines()
        header = lines[0].strip().split('\t')  # Split the header to use as keys for inner dictionaries
        for line in lines[1:]:
            parts = line.strip().split('\t')
            # Create a dictionary for each line, pairing header elements with the corresponding data elements
            entry = {header[i]: parts[i] for i in range(1, len(header))}
            data_dict[parts[0]] = entry  # Use QID as the key for the outer dictionary
    return data_dict

data_dict = loadTestTrain('WikiPassageQA/train.txt')
print(data_dict)
print(len(data_dict))

{'3086': {'Question': 'What is the role of conversionism in Evangelicalism?', 'DocumentID': '672', 'DocumentName': 'Evangelicalism.html', 'RelevantPassages': '4'}, '195': {'Question': 'How did the assault on the Bastille the first year of the Revolution ultimately culminate into the capture & execution of Louis XVI in January 1793?', 'DocumentID': '359', 'DocumentName': 'French_Revolution.html', 'RelevantPassages': '1,2'}, '557': {'Question': 'What is the prehistory of Albania?', 'DocumentID': '285', 'DocumentName': 'Albania.html', 'RelevantPassages': '4'}, '1508': {'Question': 'What significance did Bulgaria have in the ending of World War I?', 'DocumentID': '579', 'DocumentName': 'Central_Powers.html', 'RelevantPassages': '14'}, '956': {'Question': 'What is the rationale of support of the Common Era?', 'DocumentID': '204', 'DocumentName': 'Common_Era.html', 'RelevantPassages': '9'}, '1993': {'Question': "What has characterized Indonesia's foreign relations since the New Order era?", 

In [22]:
def remove_punctuation(query):
    # Create a translation table to map all punctuation characters to None
    translator = str.maketrans('', '', string.punctuation)
    
    # Remove punctuation from the query using the translation table
    query_without_punctuation = query.translate(translator)
    
    return query_without_punctuation

In [23]:
queries_labels = {}

for query_id, data in data_dict.items():
    question = data['Question'][:-1]
    document_id = data['DocumentID']
    relevant_passages = data['RelevantPassages'].split(',')  # Split relevant passages into a list
    
    # Create docno list by concatenating DocumentID and RelevantPassage
    docno_list = [f"{document_id}_{passage}" for passage in relevant_passages]
    
    # Add query and docno list to the modified dictionary
    queries_labels[query_id] = {'query': remove_punctuation(question), 'docno': docno_list}

print(queries_labels)


{'3086': {'query': 'What is the role of conversionism in Evangelicalism', 'docno': ['672_4']}, '195': {'query': 'How did the assault on the Bastille the first year of the Revolution ultimately culminate into the capture  execution of Louis XVI in January 1793', 'docno': ['359_1', '359_2']}, '557': {'query': 'What is the prehistory of Albania', 'docno': ['285_4']}, '1508': {'query': 'What significance did Bulgaria have in the ending of World War I', 'docno': ['579_14']}, '956': {'query': 'What is the rationale of support of the Common Era', 'docno': ['204_9']}, '1993': {'query': 'What has characterized Indonesias foreign relations since the New Order era', 'docno': ['2_16']}, '2260': {'query': 'How violent is the Pacific Ocean', 'docno': ['430_10', '430_11']}, '2678': {'query': 'Why is SubSaharan Africa considered to have a paradoxical birth rate', 'docno': ['341_7', '341_8']}, '2165': {'query': 'What were the effects of the Napoleonic Wars on Britains empire', 'docno': ['420_22', '420_

In [50]:
import time

def search_and_return_results(indexref, queries_dict):
    index = pt.IndexFactory.of(indexref)
    br = pt.BatchRetrieve(index, wmodel="BM25", num_results=10)

    all_results = []

    for query_id, query_data in queries_dict.items():
        query = query_data['query']
        results = br.search(query)
        
        docnos = []
        for _, row in results.iterrows():
            docnos.append(row['docno'])



        saved_values = [query, docnos]
        all_results.append(saved_values)
        
    # Concatenate all results into a single DataFrame
    results_df = pd.DataFrame(all_results, columns=['query', 'docno'])
    return results_df

# Usage
results_df = search_and_return_results(indexref, queries_labels)
print(results_df.head())


  warn("Skipping empty query for qid %s" % qid)
  warn("Skipping empty query for qid %s" % qid)


                                               query  \
0  What is the role of conversionism in Evangelic...   
1  How did the assault on the Bastille the first ...   
2                  What is the prehistory of Albania   
3  What significance did Bulgaria have in the end...   
4  What is the rationale of support of the Common...   

                                               docno  
0  [672_3, 672_4, 672_41, 242_21, 672_10, 443_16,...  
1  [359_2, 359_53, 792_12, 26_17, 649_3, 20_12, 4...  
2  [517_120, 691_59, 285_17, 285_26, 360_81, 285_...  
3  [417_13, 417_12, 417_32, 212_18, 212_15, 579_0...  
4  [204_5, 204_10, 706_2, 204_4, 204_6, 204_0, 20...  


In [59]:
def calculate_map(queries_df):
    total_queries = len(queries_df)
    map_sum = 0

    for i, key in zip(range(total_queries), queries_labels):
        relevant_passages = queries_df.iloc[i]['docno']
        real_relevant_passages = queries_labels[key]['docno']
        

        # Initialize variables
        num_relevant = 0
        precision_sum = 0

        time.sleep(5)
    return map_score

map_score = calculate_map(results_df)
print("MAP:", map_score)

['672_3', '672_4', '672_41', '242_21', '672_10', '443_16', '672_11', '672_12', '672_8', '672_49']
['672_4']


KeyboardInterrupt: 