In [38]:
import json
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\titouan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
def load_data(filepath):
    """Read the JSON file at ``filepath`` and return the parsed object.

    Parameters
    ----------
    filepath : str or path-like
        Location of the JSON file to read.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file content.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without ``encoding`` Python falls back to the
    # locale's preferred encoding (often cp1252 on Windows), which corrupts or
    # rejects non-ASCII characters in the passages.
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

In [40]:
def preprocess(text):
    """Lowercase ``text`` and tokenize it with NLTK's ``word_tokenize``.

    Returns the value produced by ``word_tokenize`` for the lowercased text.
    """
    lowered = text.lower()
    tokens = word_tokenize(lowered)
    return tokens

In [41]:
def prepare_bm25(data):
    """Build a ``BM25Okapi`` index over every passage found in ``data``.

    ``data`` is walked as a mapping of document id to a mapping of passage id
    to passage text. Each text is tokenized with ``preprocess``.

    Returns
    -------
    tuple
        ``(bm25, doc_names)`` where ``doc_names[i]`` is the
        ``(doc_id, passage_id)`` pair of the i-th tokenized passage handed to
        ``BM25Okapi``.
    """
    # Flatten the two-level mapping once so names and texts stay aligned.
    flattened = [
        ((doc_id, passage_id), text)
        for doc_id, passages in data.items()
        for passage_id, text in passages.items()
    ]
    doc_names = [name for name, _ in flattened]
    tokenized_corpus = [preprocess(text) for _, text in flattened]
    return BM25Okapi(tokenized_corpus), doc_names

def get_top_documents_bm25(bm25, doc_names, query, top_n=1):
    """Return the ``top_n`` highest-scoring entries of ``doc_names`` for ``query``.

    Parameters
    ----------
    bm25 : object
        Index exposing ``get_scores(tokenized_query)``; the scores are used
        positionally against ``doc_names``.
    doc_names : sequence
        Identifiers aligned by position with the scores from ``bm25``.
    query : str
        Raw query text; it is tokenized with ``preprocess`` before scoring.
    top_n : int or None, default 1
        Maximum number of results. ``None`` returns every entry ranked;
        zero or a negative value returns an empty list.

    Returns
    -------
    list of tuple
        ``(doc_name, score)`` pairs ordered by descending score. Entries with
        equal scores keep their original relative order.
    """
    # A negative slice bound would silently drop the *last* |top_n| results
    # instead of returning nothing, so handle non-positive values explicitly.
    if top_n is not None and top_n <= 0:
        return []

    tokenized_query = preprocess(query)
    doc_scores = np.asarray(bm25.get_scores(tokenized_query))

    # Stable argsort on the negated scores gives descending order while
    # preserving input order among ties, matching sorted(..., reverse=True).
    ranked_indexes = np.argsort(-doc_scores, kind="stable")[:top_n]
    return [(doc_names[i], doc_scores[i]) for i in ranked_indexes]

In [42]:
def print_results(data, top_docs):
    """Print each ranked passage with its identifiers, score and text.

    ``top_docs`` is iterated as ``(key, score)`` pairs where ``key[0]`` is the
    document id and ``key[1]`` the passage id; the passage text is looked up
    as ``data[key[0]][key[1]]``. A blank line follows every passage.
    """
    print("Top Documents and Passages:")
    for key, score in top_docs:
        doc_id, passage_id = key[0], key[1]
        header = f"Document ID: {doc_id}, Passage ID: {passage_id}, Score: {score:.2f}"
        print(header)
        print(data[doc_id][passage_id])
        print()

In [43]:
def main(filepath='WikiPassageQA/document_passages.json',
         query="What is the structure of Australia’s members of parliament?",
         top_n=1):
    """Run one BM25 retrieval end to end and print the ranked passages.

    Parameters
    ----------
    filepath : str, default 'WikiPassageQA/document_passages.json'
        JSON file passed to ``load_data``.
    query : str
        Question to rank the passages against.
    top_n : int, default 1
        Number of passages to retrieve and print.
    """
    # Load data
    data = load_data(filepath)

    # Build the BM25 index over all passages
    bm25, doc_names = prepare_bm25(data)

    # Retrieve the best-scoring passages for the query using BM25
    top_docs_bm25 = get_top_documents_bm25(bm25, doc_names, query, top_n=top_n)

    # Print results
    print("BM25 RESULTATS")
    print_results(data, top_docs_bm25)

main()

BM25 RESULTATS
Top Documents and Passages:
Document ID: 400, Passage ID: 1, Score: 34.39
A member of the upper house of the Commonwealth parliament, the Senate, is known as a "Senator". In the Australian states of New South Wales, Victoria and South Australia, a Member of the Legislative Assembly or "lower house," may also use the post-nominal "MP." Members of the Legislative Council use the post-nominal "MLC." Members of the Jatiyo Sangshad, or National Assembly, are elected every five years and are referred to in English as members of Parliament. The assembly has directly elected 300 seats, and further 50 reserved selected seats for women. The Parliament of Canada consists of the monarch, the Senate , and the House of Commons .

