<a href="https://colab.research.google.com/github/eklahari/IR-Information-Retrieval/blob/main/BiwordIndex.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Biword-Index**

In [43]:
import string

def build_term_index(documents):
    """Build a positional index over *documents*.

    Each document is lower-cased, stripped of every character found in
    ``string.punctuation`` and split on whitespace. Document ids and term
    positions are both 1-based.

    Returns:
        A dict mapping each term to a list of ``(doc_id, position)`` tuples,
        in the order the occurrences were encountered.
    """
    # One translation table, reused for every document, deletes punctuation.
    drop_punctuation = str.maketrans("", "", string.punctuation)

    postings_by_term = {}
    for doc_number, raw_text in enumerate(documents, start=1):
        tokens = raw_text.lower().translate(drop_punctuation).split()
        for offset, token in enumerate(tokens, start=1):
            postings_by_term.setdefault(token, []).append((doc_number, offset))
    return postings_by_term

def process_complex_phrase_query(query, term_index, max_gap=0):
    """Return the ids of documents that contain *query* as an ordered phrase.

    The query is normalised the same way indexed text is (lower-cased,
    ``string.punctuation`` removed, split on whitespace). A document matches
    when every query term occurs in query order and each term follows the
    previously matched term with at most ``max_gap`` other words in between
    (``max_gap=0`` means the terms must be directly adjacent).

    Args:
        query: The phrase to look for.
        term_index: Mapping of term -> list of ``(doc_id, position)`` tuples.
        max_gap: Maximum number of intervening words allowed between two
            consecutive query terms.

    Returns:
        A list of matching document ids, in the order the documents first
        appear in the postings of the first query term. Empty when the query
        has no terms, when any term is not indexed, or when nothing matches.
    """
    normalised = ''.join(
        char for char in query.lower() if char not in string.punctuation
    )
    query_terms = normalised.split()
    if not query_terms:
        return []

    # Group every term's positions by document up front; a term that is not
    # indexed at all means the phrase cannot occur anywhere.
    positions_per_term = []
    for term in query_terms:
        postings = term_index.get(term)
        if not postings:
            return []
        by_doc = {}
        for doc_id, position in postings:
            by_doc.setdefault(doc_id, []).append(position)
        positions_per_term.append(by_doc)

    # chain_ends[doc_id] holds the positions of the most recently processed
    # query term that terminate a valid partial match in that document.
    # Tracking positions (not just doc ids) keeps multi-term phrases and
    # repeated terms correct.
    chain_ends = {
        doc_id: set(positions)
        for doc_id, positions in positions_per_term[0].items()
    }

    for by_doc in positions_per_term[1:]:
        extended = {}
        for doc_id, prev_positions in chain_ends.items():
            # pos - prev - 1 is the number of words between the two terms;
            # requiring it to be >= 0 enforces the query's word order.
            reachable = {
                pos
                for pos in by_doc.get(doc_id, ())
                if any(0 <= pos - prev - 1 <= max_gap for prev in prev_positions)
            }
            if reachable:
                extended[doc_id] = reachable
        chain_ends = extended
        if not chain_ends:
            return []

    return list(chain_ends)

# Small demo corpus; the index assigns document ids 1..3 in list order.
documents = [
    "this is a sample document. it contains sample text for testing.",
    "sample text for testing.",
    "another sample document for demonstration."
]

# Index the corpus once, then run a single phrase query against it.
term_index = build_term_index(documents)

query = "contains sample "
results = process_complex_phrase_query(query, term_index, max_gap=2)

# Report the matching document ids, or say that nothing matched.
print(
    f"Documents containing the complex phrase '{query}': {results}"
    if results
    else f"No documents found for the complex phrase '{query}'."
)


Documents containing the complex phrase 'contains sample ': [1]
