In [2]:
import pickle

# NOTE: pickle.load can run arbitrary code while deserializing, so these two
# paths must only ever point at files you trust.
def _read_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-processed corpus; the index-building code iterates it as (doc_id, text) pairs
pre_processed_data = _read_pickle('preprocessed_data.pkl')

# Vocabulary as a list, so a term's position in it can serve as the term's id
vocabulary = list(_read_pickle('final_vocabulary.pkl'))

In [None]:
# Dictionary-based inverted index.
# Layout: term -> [document_frequency, {doc_ids of documents containing the term}]
inverted_index_dict = {}

for doc_id, document in pre_processed_data:
    # dict.fromkeys drops repeated terms within one document while keeping a
    # deterministic (first-occurrence) order, so each term is handled once per doc.
    for term in dict.fromkeys(document.split()):
        entry = inverted_index_dict.get(term)
        if entry is None:
            inverted_index_dict[term] = [1, {doc_id}]
        else:
            # Record the *document* (not the token position) in the postings set
            entry[1].add(doc_id)
            # Derive the count from the set so a repeated doc_id is never double counted
            entry[0] = len(entry[1])

# To list terms by descending document frequency:
# sorted(inverted_index_dict.items(), key=lambda x: x[1][0], reverse=True)


# List-based inverted index.
# Layout: inverted_index_list[term_id] is the list of doc_ids whose document contains
# vocabulary[term_id]; a term's id is its position in `vocabulary`. This is the layout
# the *_list helper functions rely on (they look terms up with vocabulary.index(term)
# and append/remove doc_ids directly on the slot).

# One-off term -> id table so building does not rescan `vocabulary` for every token.
# setdefault keeps the first position of a repeated term, matching vocabulary.index().
term_to_id = {}
for term_id, term in enumerate(vocabulary):
    term_to_id.setdefault(term, term_id)

# One (initially empty) postings list per vocabulary entry
inverted_index_list = [[] for _ in vocabulary]

for doc_id, document in pre_processed_data:
    # dict.fromkeys: each distinct term of the document once, in first-occurrence order
    for term in dict.fromkeys(document.split()):
        term_id = term_to_id.get(term)
        if term_id is None:
            # The term has no id in the vocabulary, so it has no slot in this index
            continue
        postings = inverted_index_list[term_id]
        # postings[-1:] is [] for an empty list; this skips a doc_id that was just
        # added (e.g. the same document appearing twice in a row in the input).
        if doc_id not in postings[-1:]:
            postings.append(doc_id)

# (term, postings) pairs ordered by descending number of documents
sorted_inverted_index_list = sorted(
    zip(vocabulary, inverted_index_list),
    key=lambda x: len(x[1]),
    reverse=True,
)

### Analysis of the two Data Structures

In [1]:
def retrieve_from_dict(term):
    """Look up `term` in the dictionary-based index.

    Returns the value stored under `term` in `inverted_index_dict`, or an
    empty list when the term is not a key of the index.
    """
    try:
        return inverted_index_dict[term]
    except KeyError:
        return []

def retrieve_from_list(term):
    """Look up `term` in the list-based index.

    The term's id is its position in `vocabulary`; the entry stored at that
    position of `inverted_index_list` is returned.

    Returns an empty list when the term is not in the vocabulary or the index
    has no slot for its id, mirroring retrieve_from_dict's result for an
    unknown term (instead of raising ValueError / IndexError).
    """
    try:
        # Linear scan of the vocabulary: this is the lookup cost of the list layout
        term_id = vocabulary.index(term)
    except ValueError:
        return []
    if term_id >= len(inverted_index_list):
        return []
    return inverted_index_list[term_id]

def insert_to_dict(term, doc_id):
    """Add `doc_id` to the postings of `term` in the dictionary-based index.

    Entries of `inverted_index_dict` have the layout [count, postings_set].
    The id is added to the postings set (a no-op if already present) and the
    count is refreshed; an unseen term gets a fresh [1, {doc_id}] entry.
    """
    entry = inverted_index_dict.get(term)
    if entry is None:
        inverted_index_dict[term] = [1, {doc_id}]
        return
    entry[1].add(doc_id)
    # Keep the stored count in step with the postings set
    entry[0] = len(entry[1])

def insert_to_list(term, doc_id):
    """Add `doc_id` to the postings of `term` in the list-based index.

    A term's id is its position in `vocabulary`. A term that is not yet in the
    vocabulary is appended to it (so it receives the next free id) instead of
    raising ValueError, matching insert_to_dict, which accepts new terms.
    `inverted_index_list` is padded with empty slots if it is shorter than the
    id requires. A doc_id already present in the slot is not added again.
    """
    try:
        term_id = vocabulary.index(term)
    except ValueError:
        # New term: its id is the next free position in the vocabulary
        term_id = len(vocabulary)
        vocabulary.append(term)

    # Make sure a postings slot exists for every id up to and including term_id
    while len(inverted_index_list) <= term_id:
        inverted_index_list.append([])

    postings = inverted_index_list[term_id]
    if doc_id not in postings:
        postings.append(doc_id)

def update_in_dict(term, old_doc_id, new_doc_id):
    """Replace `old_doc_id` with `new_doc_id` in the postings of `term`.

    Entries of `inverted_index_dict` have the layout [count, postings_set];
    the replacement happens inside the postings set and the count is refreshed
    afterwards (it shrinks if `new_doc_id` was already present).

    Nothing happens when the term is not indexed or when `old_doc_id` is not in
    its postings -- the same rule update_in_list applies.
    """
    entry = inverted_index_dict.get(term)
    if entry is None:
        return
    postings = entry[1]
    if old_doc_id not in postings:
        return
    postings.discard(old_doc_id)
    postings.add(new_doc_id)
    entry[0] = len(postings)

def update_in_list(term, old_doc_id, new_doc_id):
    """Replace `old_doc_id` with `new_doc_id` in the postings of `term`.

    The term's id is its position in `vocabulary`. Nothing happens when the
    term is not in the vocabulary, when the index has no slot for its id, or
    when `old_doc_id` is not in the slot (previously the first two cases raised
    ValueError / IndexError, unlike update_in_dict which ignores unknown terms).
    `new_doc_id` is not appended a second time if the slot already contains it.
    """
    try:
        term_id = vocabulary.index(term)
    except ValueError:
        return
    if term_id >= len(inverted_index_list):
        return

    postings = inverted_index_list[term_id]
    if old_doc_id in postings:
        postings.remove(old_doc_id)
        if new_doc_id not in postings:
            postings.append(new_doc_id)

def delete_from_dict(term, doc_id):
    """Remove `doc_id` from the postings of `term` in the dictionary-based index.

    Entries of `inverted_index_dict` have the layout [count, postings_set];
    the id is removed from the postings set and the count is refreshed.
    Nothing happens when the term is not indexed or the id is not in its postings.
    """
    entry = inverted_index_dict.get(term)
    if entry is None:
        return
    postings = entry[1]
    if doc_id in postings:
        postings.remove(doc_id)
        entry[0] = len(postings)

def delete_from_list(term, doc_id):
    """Remove `doc_id` from the postings of `term` in the list-based index.

    The term's id is its position in `vocabulary`. Nothing happens when the
    term is not in the vocabulary, when the index has no slot for its id, or
    when the id is not in the slot (previously the first two cases raised
    ValueError / IndexError, unlike delete_from_dict which ignores unknown terms).
    """
    try:
        term_id = vocabulary.index(term)
    except ValueError:
        return
    if term_id >= len(inverted_index_list):
        return

    postings = inverted_index_list[term_id]
    if doc_id in postings:
        postings.remove(doc_id)

In [None]:
# Storage and single-call time cost of the dictionary-based vs. list-based index
import sys
import time


def _deep_getsizeof(obj, _seen=None):
    """Return the size in bytes of `obj` plus everything it contains.

    sys.getsizeof alone only counts the outer container (its table of
    references), not the keys, postings or ids it refers to, so it cannot be
    used to compare the two index layouts. Objects reachable more than once
    are counted once (tracked by id in `_seen`).
    """
    if _seen is None:
        _seen = set()
    if id(obj) in _seen:
        return 0
    _seen.add(id(obj))

    total = sys.getsizeof(obj)
    if isinstance(obj, dict):
        for key, value in obj.items():
            total += _deep_getsizeof(key, _seen) + _deep_getsizeof(value, _seen)
    elif isinstance(obj, (list, tuple, set, frozenset)):
        for item in obj:
            total += _deep_getsizeof(item, _seen)
    return total


def _timed(operation, *args):
    """Call operation(*args) once and return (result, elapsed_seconds).

    perf_counter is used because it is the highest-resolution clock available
    for measuring short durations. These are single-shot measurements, so
    expect noise on calls this short.
    """
    start = time.perf_counter()
    result = operation(*args)
    return result, time.perf_counter() - start


# Storage cost (measured before the operations below modify the indexes)
storage_cost_dict = _deep_getsizeof(inverted_index_dict)
storage_cost_list = _deep_getsizeof(inverted_index_list)

# Retrieval time for a specific term
term_to_retrieve = "oppon"
postings_dict, retrieval_time_dict = _timed(retrieve_from_dict, term_to_retrieve)
postings_list, retrieval_time_list = _timed(retrieve_from_list, term_to_retrieve)

# Insertion time for a specific term and document
term_to_insert = "opponent"
doc_to_insert = 123
insertion_time_dict = _timed(insert_to_dict, term_to_insert, doc_to_insert)[1]
insertion_time_list = _timed(insert_to_list, term_to_insert, doc_to_insert)[1]

# Update time for a specific term, old document, and new document
term_to_update = "oppon"
old_doc_id = 123  # Replace with the actual old document ID
new_doc_id = 456  # Replace with the actual new document ID
updation_time_dict = _timed(update_in_dict, term_to_update, old_doc_id, new_doc_id)[1]
updation_time_list = _timed(update_in_list, term_to_update, old_doc_id, new_doc_id)[1]

# Deletion time for a specific term and document
term_to_delete = "truncat"
doc_to_delete = 789  # Replace with the actual document ID
deletion_time_dict = _timed(delete_from_dict, term_to_delete, doc_to_delete)[1]
deletion_time_list = _timed(delete_from_list, term_to_delete, doc_to_delete)[1]

# Print the results for analysis
print("Storage Cost (Dictionary-Based):", storage_cost_dict, "bytes")
print("Storage Cost (List-Based):", storage_cost_list, "bytes")

print("Retrieval Time (Dictionary-Based):", retrieval_time_dict, "seconds")
print("Retrieval Time (List-Based):", retrieval_time_list, "seconds")

print("Insertion Time (Dictionary-Based):", insertion_time_dict, "seconds")
print("Insertion Time (List-Based):", insertion_time_list, "seconds")

print("Updation Time (Dictionary-Based):", updation_time_dict, "seconds")
print("Updation Time (List-Based):", updation_time_list, "seconds")

print("Deletion Time (Dictionary-Based):", deletion_time_dict, "seconds")
print("Deletion Time (List-Based):", deletion_time_list, "seconds")
