## Merged Index

In [2]:
import json

def load_index(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
        return {item.split(':')[0].strip(): [int(x) for x in item.split(':')[1].strip().split(',')] for item in data}
        

expedia_index = load_index('output_expedia.json')
airbnb_index = load_index('output_airbnb.json')
lonelyplanet_index = load_index('output_lonelyplanet.json')

final_index = {}

for index in [expedia_index, airbnb_index, lonelyplanet_index]:
    for term, doc_ids in index.items(): 
        if term in final_index:
            final_index[term].extend(doc_ids)  
        else:
            final_index[term] = doc_ids


with open('final_inverted_index.json', 'w') as f:
    json.dump(final_index, f, indent=4)

print("Final inverted index saved successfully!")

Final inverted index saved successfully!


## TF-IDF Calculation 

In [4]:
import math
from collections import defaultdict

with open('final_inverted_index.json', 'r') as f:
    inverted_index = json.load(f)

doc_ids = set()
for term_docs in inverted_index.values():
    doc_ids.update(term_docs)
N = len(doc_ids)

# TF-IDF scores
tf_idf = defaultdict(dict)

for term, docs in inverted_index.items():
    df = len(docs)  
    idf = math.log((N / df) + 1) 

    for doc in docs:
        tf = docs.count(doc) / len(docs)  
        tf_idf[doc][term] = tf * idf 

with open('tf_idf_vectors.json', 'w') as f:
    json.dump(tf_idf, f, indent=4)

print("TF-IDF document vectors saved!")


TF-IDF document vectors saved!


In [6]:
def process_query(query, inverted_index, N):
    query_terms = query.lower().split()
    query_vector = {}

    for term in query_terms:
        if term in inverted_index:
            df = len(inverted_index[term])  
            idf = math.log((N / df) + 1) 
            query_vector[term] = idf  

    return query_vector


## Cosine Similarity

In [8]:
from numpy.linalg import norm
import numpy as np

with open('tf_idf_vectors.json', 'r') as f:
    doc_vectors = json.load(f)

def cosine_similarity(query_vector, doc_vector):
    
    query_vec = np.array([query_vector.get(term, 0) for term in doc_vector.keys()])
    doc_vec = np.array([doc_vector[term] for term in doc_vector.keys()])

    if norm(query_vec) == 0 or norm(doc_vec) == 0:
        return 0  

    return np.dot(query_vec, doc_vec) / (norm(query_vec) * norm(doc_vec))


def rank_documents(query):
    query_vector = process_query(query, inverted_index, N)
    similarities = []

    for doc_id, doc_vector in doc_vectors.items():
        similarity = cosine_similarity(query_vector, doc_vector)
        similarities.append((doc_id, similarity))

    ranked_docs = sorted(similarities, key=lambda x: x[1], reverse=True)

    return ranked_docs

# Example query
query = "best travel destinations"
ranked_results = rank_documents(query)
print("Ranked Documents:", ranked_results)


Ranked Documents: [('12', np.float64(0.00021951006188713817)), ('17', np.float64(0.00013581062605648786)), ('9', np.float64(0.000102494491456348)), ('8', np.float64(9.172958825055138e-05)), ('11', np.float64(8.942504437114426e-05)), ('6', np.float64(6.15341330602662e-05)), ('5', np.float64(6.117056241381001e-05)), ('4', np.float64(5.551387842034257e-05)), ('1', np.float64(5.010037011795599e-05)), ('0', np.float64(4.895969656409375e-05)), ('48', np.float64(4.789109523589405e-05)), ('16', np.float64(4.722463736632753e-05)), ('3', np.float64(4.661408262834144e-05)), ('10', np.float64(3.475076568209261e-05)), ('2', np.float64(3.13313892166242e-05)), ('44', np.float64(2.7441654191507667e-05)), ('25', np.float64(1.735590747269721e-05)), ('26', np.float64(1.735590747269721e-05)), ('27', np.float64(1.735590747269721e-05)), ('30', np.float64(1.735590747269721e-05)), ('32', np.float64(1.735590747269721e-05)), ('34', np.float64(1.735590747269721e-05)), ('35', np.float64(1.735590747269721e-05)), (

In [10]:
import json
from collections import defaultdict
import math

with open('final_inverted_index.json', 'r') as f:
    inverted_index = json.load(f)

doc_term_counts = defaultdict(lambda: defaultdict(int))
doc_lengths = defaultdict(int)

for term, docs in inverted_index.items():
    for doc_id in docs:
        doc_term_counts[doc_id][term] += 1
        doc_lengths[doc_id] += 1

total_terms = sum(doc_lengths.values())
vocab_size = len(inverted_index)

print("Documents processed:", len(doc_term_counts))
print("Total terms in collection:", total_terms)


Documents processed: 89
Total terms in collection: 1907426


## Query Liklihood Model

In [12]:
def compute_query_likelihood(query, mu=2000):
    query_terms = query.lower().split()
    doc_scores = {}

    for doc_id, term_counts in doc_term_counts.items():
        doc_length = doc_lengths[doc_id]
        score = 0

        for term in query_terms:
            
            term_freq = term_counts.get(term, 0)
            p_doc = term_freq / doc_length if doc_length > 0 else 0

            collection_term_freq = sum(inverted_index.get(term, []))
            p_collection = collection_term_freq / total_terms if total_terms > 0 else 1e-10

            # Dirichlet smoothing
            smoothed_prob = (doc_length / (doc_length + mu)) * p_doc + (mu / (doc_length + mu)) * p_collection
            score += math.log(smoothed_prob) if smoothed_prob > 0 else 0

        doc_scores[doc_id] = score

    ranked_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

    return ranked_docs


In [14]:
query = "best travel destinations"
ranked_results = compute_query_likelihood(query)
print("Ranked Documents:", ranked_results[:10])  


Ranked Documents: [(87, -10.912468893166253), (85, -10.91532196069716), (88, -10.91532196069716), (86, -10.917221747114052), (8, -11.08519641582918), (67, -11.277244568709076), (70, -11.282502582982545), (75, -11.28436045585186), (69, -11.295471595541066), (68, -11.302881503842077)]
