In [1]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the dataset
dataset_path = 'WikiPassageQA/document_passages.json'
with open(dataset_path, 'r') as file:
    dataset = json.load(file)

In [4]:
documents = [' '.join(passages.values()) for passages in dataset.values()]

In [5]:
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

In [9]:
def retrieve_documents(query, tfidf_matrix, tfidf_vectorizer, top_n=1, passages_per_doc=3):
    # Transform the query into TF-IDF representation
    query_tfidf = tfidf_vectorizer.transform([query])

    # Calculate cosine similarity between the query and all documents
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix)

    # Get the indices of the top_n most similar documents
    top_indices = cosine_similarities.argsort()[0][-top_n:][::-1]

    # Retrieve the relevant documents and passages
    relevant_documents = {}
    for index in top_indices:
        document_id = list(dataset.keys())[index]
        document_passages = list(dataset[str(document_id)].values())
        document_sim = cosine_similarities[0][index]

        # Get the top passages for this document
        top_passages_indices = cosine_similarity(query_tfidf, tfidf_matrix[index])[0].argsort()[-passages_per_doc:][::-1]
        top_passages = [document_passages[idx] for idx in top_passages_indices]

        relevant_documents[document_id] = {
            'similarity': document_sim,
            'passages': top_passages
        }

    return relevant_documents

In [10]:
query = "who lived in chartered towns"
top_documents = retrieve_documents(query, tfidf_matrix, tfidf_vectorizer)

In [11]:
for document_id, details in top_documents.items():
    print(f"Document ID: {document_id}, Similarity: {details['similarity']}")
    for passage in details['passages']:
        print(f"Passage: {passage}")
    print()

Document ID: 250, Similarity: 0.09687355296576257
Passage: Originally a program of the National League of Cities, Sister Cities International became a separate corporation in 1967 due to the growth and popularity of the US program. SCI is now a nonprofit citizen diplomacy network that creates and strengthens partnerships between communities in the US and other countries, organises cultural exchanges, and provides support and funding. Under its administration, more than 2,000 cities, states and counties are partnered in 136 countries. According to the SCI website, these exchanges include "musical performances, art exhibits, construction of peace parks or tea gardens, international cultural festivals, and teacher exchanges". Sister city cultural events include the annual National Cherry Blossom Festival in Washington, D.C., honouring Washington's sister city relationship with Tokyo City. Capitalizing on the growing world economy, many sister city members developed business agreements wit