In [13]:
import json
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\titouan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
def load_data(filepath):
    with open(filepath, 'r') as file:
        data = json.load(file)
    return data

In [15]:
def preprocess(text):
    return word_tokenize(text.lower())

In [16]:
def prepare_bm25(data):
    doc_passages = {}
    corpus = []
    for doc_id, passages in data.items():
        doc_passages[doc_id] = []
        for passage_id, text in passages.items():
            processed_text = preprocess(text)
            corpus.append(processed_text)
            doc_passages[doc_id].append((passage_id, processed_text))
    bm25 = BM25Okapi(corpus)
    return bm25, doc_passages

def get_most_relevant_document(bm25, doc_passages, query):
    tokenized_query = preprocess(query)
    scores = bm25.get_scores(tokenized_query)  # This gives scores for all passages in the corpus

    # Calculate aggregate scores for each document based on its passages
    doc_scores = {}
    start_index = 0
    for doc_id, passages in doc_passages.items():
        doc_scores[doc_id] = np.mean(scores[start_index:start_index + len(passages)])  # Use mean score of passages for the document
        start_index += len(passages)

    most_relevant_doc = max(doc_scores, key=doc_scores.get)
    return most_relevant_doc

def get_top_passages_from_doc(bm25, doc_passages, doc_id, query, top_n=2):
    tokenized_query = preprocess(query)
    passages = doc_passages[doc_id]
    scores = bm25.get_scores(tokenized_query)

    # Filter scores for passages in the specific document
    start_index = sum(len(doc_passages[doc]) for doc in sorted(doc_passages) if doc < doc_id)
    doc_scores = scores[start_index:start_index + len(passages)]

    scored_passages = [(score, passage_id, ' '.join(text)) for score, (passage_id, text) in zip(doc_scores, passages)]
    top_passages = sorted(scored_passages, reverse=True, key=lambda x: x[0])[:top_n]
    return top_passages


In [17]:
def print_results(data, top_docs):
    print("Top Documents and Passages:")
    for doc, score in top_docs:
        print(f"Document ID: {doc[0]}, Passage ID: {doc[1]}, Score: {score:.2f}")
        print(data[doc[0]][doc[1]])
        print()

In [19]:
def main():
    filepath = 'WikiPassageQA/document_passages.json'
    data = load_data(filepath)
    bm25, doc_passages = prepare_bm25(data)
    query = "What is the structure of Australia’s members of parliament?"

    most_relevant_doc = get_most_relevant_document(bm25, doc_passages, query)
    top_passages = get_top_passages_from_doc(bm25, doc_passages, most_relevant_doc, query, top_n=5)

    print(f"Most Relevant Document: {most_relevant_doc}")
    for score, passage_id, text in top_passages:
        print(f"Passage ID: {passage_id}, Score: {score:.2f}")
        print(text)
        print()

main()

Most Relevant Document: 400
Passage ID: 15, Score: 21.36
there are 123 members of parliament in total . they are also alternatively called member of the national assembly . parliamentary elections are traditionally held every five years with no term limits imposed . the 25 provinces of cambodia are represented by the members of parliament in the national assembly . a constituency may have more than one mp , depending on the population . a member of parliament is a member of either of the two chambers of the parliament of the czech republic , although the term members of parliament of the czech republic is commonly referred to deputies of the parliament of the czech republic who are members of the lower house of the parliament , chamber of deputies .

Passage ID: 18, Score: 19.48
the parliament of lebanon is the lebanese national legislature . it is elected to a four-year term by universal adult suffrage in multi-member constituencies , apportioned among lebanon 's diverse christian and