<a href="https://colab.research.google.com/github/eklahari/IR-Information-Retrieval/blob/main/InvertedIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Inverted Index**

In [None]:
import nltk
import string
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
def preprocess_documents(documents):
    """Normalize documents into one flat list of lemmatized tokens.

    Every document is lowercased, tokenized with ``word_tokenize``, stripped
    of punctuation-only tokens and English stopwords, and each surviving
    token is lemmatized with ``WordNetLemmatizer``. No stemming is applied:
    the returned tokens are the lemmatized forms.

    Args:
        documents: Iterable of document strings.

    Returns:
        A single list holding the surviving tokens of all documents, in
        order of appearance. Document boundaries are not preserved.
    """
    # Loop-invariant helpers are built once instead of once per document.
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    refined_tokens = []
    for doc in documents:
        for token in word_tokenize(doc.lower()):
            # The tokenizer can emit multi-character punctuation tokens
            # (e.g. "...", "``", "''"). Testing every character catches
            # those, which a substring test against string.punctuation
            # would let through.
            if all(char in punctuation for char in token):
                continue
            if token in stop_words:
                continue
            refined_tokens.append(lemmatizer.lemmatize(token))

    return refined_tokens

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
def build_inverted_index(documents):
    """Build a term -> posting-list mapping for a sequence of documents.

    Document IDs are the 0-based positions of the documents in
    ``documents``. Each document is run through ``preprocess_documents`` on
    its own, and every distinct resulting term gets the document's ID
    appended to its posting list.

    Args:
        documents: Sequence of document strings.

    Returns:
        A dict mapping each term to the list of IDs of the documents that
        contain it. IDs appear once per term, in increasing order.
    """
    index = {}

    for position, text in enumerate(documents):
        # dict.fromkeys drops repeated terms while keeping first-seen order,
        # so each ID is appended at most once per term.
        for term in dict.fromkeys(preprocess_documents([text])):
            index.setdefault(term, []).append(position)

    return index
def print_inverted_index(inverted_index):
    """Print the index as a header line followed by one line per term.

    Each term line has the form ``term-> [ids]``. Two blank lines are
    printed after the last term.

    Args:
        inverted_index: Mapping of term to its posting list.
    """
    print("terms||postingslist")
    for term in inverted_index:
        # sep joins the term and its postings exactly as "term-> [..]".
        print(term, inverted_index[term], sep="-> ")
    print("\n")

In [51]:
def process_complex_query(query, inverted_index, all_doc_ids=None):
    """Evaluate a Boolean query against an inverted index.

    The query may combine terms with ``and``, ``or``, the unary prefix
    operator ``not``, and parentheses. Precedence is ``not`` > ``and`` >
    ``or``; binary operators of equal precedence associate left to right.
    The query is lowercased and terms are then looked up verbatim in
    ``inverted_index`` (no stemming or lemmatization is applied here). A
    term that is not in the index matches no documents.

    Args:
        query: Query string, e.g. ``"(new and treatment) or not drug"``.
        inverted_index: Mapping of term to its list of document IDs.
        all_doc_ids: Optional iterable with every document ID in the
            collection; it is the universe that ``not`` complements
            against. When omitted, the universe is the union of all
            posting lists in ``inverted_index``.

    Returns:
        A new, sorted list of matching document IDs. An empty or
        whitespace-only query yields ``[]``.

    Raises:
        ValueError: If the query is malformed (unbalanced parentheses, an
            operator without its operand(s), or adjacent operands with no
            operator between them).
    """
    # Higher number binds tighter.
    precedence = {'or': 1, 'and': 2, 'not': 3}

    if all_doc_ids is None:
        universe = sorted(
            {doc_id
             for postings in inverted_index.values()
             for doc_id in postings}
        )
    else:
        universe = sorted(set(all_doc_ids))

    def and_operator(postings1, postings2):
        # Linear merge-intersection of two sorted posting lists.
        p1, p2 = 0, 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                p1 += 1
            else:
                p2 += 1
        return result

    def or_operator(postings1, postings2):
        # Linear merge-union of two sorted posting lists.
        p1, p2 = 0, 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
            else:
                result.append(postings2[p2])
                p2 += 1
        # At most one of the two lists still has elements left.
        result.extend(postings1[p1:])
        result.extend(postings2[p2:])
        return result

    def not_operator(postings):
        # Complement against the universe; iterating the sorted universe
        # keeps the result sorted so it can feed the merge routines.
        excluded = set(postings)
        return [doc_id for doc_id in universe if doc_id not in excluded]

    operators = []
    operands = []

    def apply_top_operator():
        # Pop one operator and replace its operand(s) with the result.
        operator = operators.pop()
        arity = 1 if operator == 'not' else 2
        if len(operands) < arity:
            raise ValueError("Invalid query")
        if operator == 'not':
            operands.append(not_operator(operands.pop()))
            return
        right = operands.pop()
        left = operands.pop()
        if operator == 'and':
            operands.append(and_operator(left, right))
        else:
            operands.append(or_operator(left, right))

    tokens = query.lower().replace("(", " ( ").replace(")", " ) ").split()
    if not tokens:
        return []

    for token in tokens:
        if token == '(':
            operators.append(token)
        elif token == ')':
            while operators and operators[-1] != '(':
                apply_top_operator()
            if not operators:
                # A ')' with no matching '('.
                raise ValueError("Invalid query")
            operators.pop()  # Remove the '('
        elif token in precedence:
            # 'not' is a prefix operator: its operand has not been read
            # yet, so nothing already on the stack may be reduced for it.
            if token != 'not':
                while (operators and operators[-1] != '('
                       and precedence[operators[-1]] >= precedence[token]):
                    apply_top_operator()
            operators.append(token)
        else:
            # Copy (and sort) the postings so the caller can never mutate
            # the index through the returned list; unknown terms match
            # nothing.
            operands.append(sorted(inverted_index.get(token, ())))

    # Process remaining operators and operands
    while operators:
        if operators[-1] == '(':
            raise ValueError("Invalid query")
        apply_top_operator()

    if len(operands) != 1:
        raise ValueError("Invalid query")
    return operands[0]


In [52]:
# Sample corpus; each document's ID is its position in this list.
documents = [
    "breakthrough drug for schizophrenia",
    "new schizophrenia drug",
    "new approach for treatment of schizophrenia",
    "new hopes for schizophrenia patients",
]

# Index the corpus and show the resulting term -> postings table.
inverted_index = build_inverted_index(documents)
print_inverted_index(inverted_index)

# Boolean queries to run against the index.
queries = [
    "(new and treatment) or (new and approach)",
    "(breakthrough or approach) and schizophrenia",
    "schizophrenia",
]
for query in queries:
    result = process_complex_query(query, inverted_index)
    print(f"Query: {query}")
    print(f"Result: {result}")

terms||postingslist
breakthrough-> [0]
drug-> [0, 1]
schizophrenia-> [0, 1, 2, 3]
new-> [1, 2, 3]
approach-> [2]
treatment-> [2]
hope-> [3]
patient-> [3]


Query: (new and treatment) or (new and approach)
Result: [2]
Query: (breakthrough or approach) and schizophrenia
Result: [0, 2]
Query: schizophrenia
Result: [0, 1, 2, 3]
