<a href="https://colab.research.google.com/github/eklahari/IR-Information-Retrieval/blob/main/positionalindex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Positional Index**

In [3]:
import string

def build_positional_index(documents):
    """Build a positional inverted index over *documents*.

    Each document is lower-cased, stripped of every character listed in
    ``string.punctuation`` and split on whitespace. Document IDs follow the
    iteration order of *documents* starting at 1, and term positions within
    a document also start at 1.

    Args:
        documents: Iterable of document strings.

    Returns:
        A dict mapping each term to ``{doc_id: [positions, ...]}``, with the
        positions listed in increasing order.
    """
    # Deletion table: every punctuation character maps to "removed".
    drop_punctuation = str.maketrans('', '', string.punctuation)

    index = {}
    for doc_number, text in enumerate(documents, start=1):
        tokens = text.lower().translate(drop_punctuation).split()
        for offset, token in enumerate(tokens, start=1):
            # Create the per-term and per-document entries on first sight.
            index.setdefault(token, {}).setdefault(doc_number, []).append(offset)
    return index

def process_phrase_query(phrase, positional_index):
    """Return the IDs of documents that contain *phrase* as consecutive terms.

    The phrase is normalised the same way indexed text is: lower-cased,
    characters in ``string.punctuation`` removed, then split on whitespace.
    A document matches when, for some position ``p`` of the first term,
    the i-th following term occurs at position ``p + i`` in that document.

    Args:
        phrase: Query text.
        positional_index: Mapping ``term -> {doc_id: [positions, ...]}``.

    Returns:
        A sorted list of matching document IDs. The list is empty when the
        phrase has no terms, when any term is missing from the index, or
        when no document contains the terms consecutively.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)
    terms = phrase.lower().translate(drop_punctuation).split()
    if not terms:
        # Empty / punctuation-only query: nothing to look up.
        return []

    # A phrase can only occur if every one of its terms is indexed.
    # ``.get`` avoids a KeyError (and avoids inserting into a defaultdict).
    postings = [positional_index.get(term) for term in terms]
    if any(entry is None for entry in postings):
        return []

    matches = []
    for doc_id, starts in postings[0].items():
        # Position sets of the 2nd..nth terms within this document; a term
        # absent from the document yields an empty set and so never matches.
        following = [set(entry.get(doc_id, ())) for entry in postings[1:]]
        for start in starts:
            if all(
                start + offset in positions
                for offset, positions in enumerate(following, start=1)
            ):
                matches.append(doc_id)
                break  # One occurrence is enough for this document.

    # Sorted so the result order is deterministic.
    return sorted(matches)

# Example corpus: one plain-text string per document (no explicit numbering).
documents = [
    "this is a sample document. it contains sample text for testing.",
    "sample text for testing.",
    "another sample document for demonstration.",
]

# Index the corpus, then run a single example phrase query against it.
positional_index = build_positional_index(documents)
query = "sample text for testing"
results = process_phrase_query(query, positional_index)

# Report the matching document IDs, or say that nothing matched.
print(
    f"Documents containing the phrase '{query}': {results}"
    if results
    else f"No documents found for the phrase '{query}'."
)

# Dump the full index for inspection.
print(positional_index)

Documents containing the phrase 'sample text for testing': [1, 2]
{'this': {1: [1]}, 'is': {1: [2]}, 'a': {1: [3]}, 'sample': {1: [4, 8], 2: [1], 3: [2]}, 'document': {1: [5], 3: [3]}, 'it': {1: [6]}, 'contains': {1: [7]}, 'text': {1: [9], 2: [2]}, 'for': {1: [10], 2: [3], 3: [4]}, 'testing': {1: [11], 2: [4]}, 'another': {3: [1]}, 'demonstration': {3: [5]}}
