In [70]:
import os
import string
import warnings
import nltk
import re
import pickle
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages this notebook relies on: the stop-word lists
# (read via stopwords.words) and WordNet (used by WordNetLemmatizer).
# 'punkt' is the tokenizer model behind word_tokenize, which is imported above.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
# NOTE(review): this suppresses every warning for the rest of the kernel
# session, including deprecation warnings -- consider narrowing the filter.
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [71]:
def positional(directory_path, num_docs):
    """Build a positional index over ``file1.txt`` .. ``file<num_docs>.txt``.

    Every file found in ``directory_path`` is read in full and split on
    whitespace; the 1-based offset of each term within its file is recorded.
    Files that do not exist are reported on stdout and skipped.

    Args:
        directory_path: Directory that contains the numbered text files.
        num_docs: Highest document number to look for (inclusive).

    Returns:
        A mapping ``term -> {'docs': {doc_id: [position, ...]}}``.
    """
    index = {}

    for doc_id in range(1, num_docs + 1):
        path = os.path.join(directory_path, f"file{doc_id}.txt")

        # Guard clause: report and skip documents that are not on disk.
        if not os.path.exists(path):
            print(f"File file{doc_id}.txt does not exist.")
            continue

        with open(path, 'r') as handle:
            words = handle.read().split()

        position = 0
        for word in words:
            position += 1  # positions are 1-based
            entry = index.setdefault(word, {'docs': {}})
            entry['docs'].setdefault(doc_id, []).append(position)

    return index

# Directory holding the corpus; positional() looks for file<N>.txt inside it.
# NOTE(review): absolute, environment-specific path (presumably a mounted
# Google Drive) -- confirm or make configurable before running elsewhere.
directory_path = "/content/drive/MyDrive/IR/text_files/"
# Highest document number to try: positional() scans file1.txt .. file999.txt.
num_docs = 999

# Build the positional index.
# NOTE(review): the name `dict` shadows the built-in type from here on; other
# cells in this notebook read the index through this name.
dict = positional(directory_path, num_docs)

# Print the whole positional index (every term with all of its postings).
print(dict)





In [72]:
# Serialise the positional index to a pickle file in the working directory;
# another cell in this notebook loads it back from the same path.
file_path = "Q3_pickle.pkl"

with open(file_path, 'wb') as file:
    pickle.dump(dict, file)

print("Positional index saved to", file_path)

Positional index saved to Q3_pickle.pkl


In [73]:

# Number of distinct terms (top-level keys) in the positional index.
print(len(dict))

6471


In [76]:
# Spot-check one entry: the postings stored for the term 'load',
# shaped as {'docs': {doc_id: [positions]}}.
print(dict['load'])


{'docs': {226: [2], 267: [92], 382: [47]}}


In [75]:
# Reload the positional index from the pickle file, replacing whatever the
# name `dict` currently holds.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by this notebook, never ones from an untrusted source.
file_path = "Q3_pickle.pkl"

with open(file_path, 'rb') as file:
    dict = pickle.load(file)

print("Positional index from", file_path)

Positional index from Q3_pickle.pkl


In [77]:
def preprocess(text):
    """Normalise raw text into a space-separated string of lemmas.

    The text is lower-cased, every run of characters other than ``a``-``z``
    and space is replaced by a single space, NLTK English stop words are
    dropped, and each remaining token is lemmatized with WordNet.

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()

    # Lower-case first so the a-z character class below covers every letter.
    cleaned = re.sub('[^a-z ]+', ' ', text.lower())
    candidates = cleaned.split()

    english_stop_words = set(stopwords.words("english"))
    kept = []
    for word in candidates:
        if word in english_stop_words or not word.isalpha():
            continue
        kept.append(word)

    lemmas = list(map(lemmatizer.lemmatize, kept))
    return ' '.join(lemmas)


In [80]:
def preprocess_query(query):
    """Normalise a free-text query into a space-separated string of lemmas.

    Steps: lower-case the query, replace every run of characters other than
    ``a``-``z`` and space with one space, drop a small fixed set of stop
    words, and lemmatize the remaining tokens with WordNet.

    Args:
        query: Raw query string.

    Returns:
        The surviving lemmas joined by single spaces ('' when nothing survives).
    """
    # NOTE(review): this short stop list is not the NLTK English list that
    # preprocess() uses -- confirm it matches how the indexed files were built.
    stop_words = {"a", "an", "the", "and", "in", "on", "at", "to", "of"}
    lem = WordNetLemmatizer()

    # After this substitution only a-z and spaces remain, so split() can yield
    # neither punctuation nor empty tokens. The former `token not in ",.?!-"`
    # check was a substring test that could never reject anything, so it is gone.
    query = re.sub('[^a-z ]+', ' ', query.lower())

    tokens = [lem.lemmatize(token) for token in query.split() if token not in stop_words]

    return ' '.join(tokens)

def retrived_docs(term, dict):
    """Return the sorted ids of every document containing any of the given terms.

    Args:
        term: Iterable of terms; it is walked item by item.
        dict: Positional index mapping term -> {'docs': {doc_id: positions}}.
            Items of ``term`` that are not keys of the index are ignored.

    Returns:
        A sorted list of the distinct document ids found.
    """
    doc_ids = {
        doc_id
        for word in term
        if word in dict
        for doc_id in dict[word]['docs']
    }
    return sorted(doc_ids)

# Longest phrase (counted after preprocessing) that a query may contain.
MAX_QUERY_TERMS = 5


def _phrase_docs(terms, index):
    """Return ids of documents in which ``terms`` occur consecutively, in order.

    Args:
        terms: Non-empty list of preprocessed terms, all present in ``index``.
        index: Positional index mapping term -> {'docs': {doc_id: [positions]}}.

    Returns:
        Matching document ids, in the order they appear in the first term's
        postings.
    """
    # For each candidate document, the positions at which the phrase matched
    # so far ENDS, i.e. positions of the most recently matched term.
    ends = {
        doc_id: set(positions)
        for doc_id, positions in index[terms[0]]['docs'].items()
    }

    for next_term in terms[1:]:
        postings = index[next_term]['docs']
        advanced = {}
        for doc_id, end_positions in ends.items():
            if doc_id not in postings:
                continue
            # Keep the positions of next_term that directly follow a previous
            # end. Carrying these advanced positions forward (instead of the
            # first term's positions) is what lets phrases of three or more
            # terms match at all.
            following = {pos + 1 for pos in end_positions}.intersection(postings[doc_id])
            if following:
                advanced[doc_id] = following
        ends = advanced

    return list(ends)


# Process input queries
n_queries = int(input("Enter the number of queries: "))  # Read the number of queries
queries = [input("Enter query: ").strip() for _ in range(n_queries)]  # Read the queries

# Execute queries and get results.
# `results` holds exactly one entry per query so that result numbering always
# lines up with query numbering; None marks a query that was rejected (its
# explanation is printed at the point of rejection).
results = []

for i, query in enumerate(queries, start=1):
    terms = preprocess_query(query).split()

    if len(terms) > MAX_QUERY_TERMS:
        print(f"Query contains more than {MAX_QUERY_TERMS} words")
        results.append(None)
        continue

    if not terms:
        # Empty query, or one consisting only of stop words / non-letters.
        print(f"Query {i} has no searchable terms!")
        results.append(None)
        continue

    # Every term must be indexed; otherwise the phrase cannot occur anywhere.
    # `dict` is the notebook-level positional index.
    missing = next((word for word in terms if word not in dict), None)
    if missing is not None:
        print(f"Word '{missing}' not in dictionary for query {i}!")
        results.append(None)
        continue

    results.append(_phrase_docs(terms, dict))

# Output results
for i, result in enumerate(results, start=1):
    if result is None:
        # Rejected query: already reported above, nothing to list.
        continue
    print(f"Number of docs retrieve for query {i} using positional index: {len(result)}")
    if result:
        print(f"Names of docs retrieve for query {i} using positional index: {', '.join(map(str, result))}")
    else:
        print("No documents found.")


Enter the number of queries: 2
Enter query: load is loaded
Enter query: great value
Number of docs retrieve for query 1 using positional index: 0
No documents found.
Number of docs retrieve for query 2 using positional index: 10
Names of docs retrieve for query 2 using positional index: 65, 103, 330, 466, 597, 748, 767, 789, 899, 993
