# Boolean Retrieval Model Implementation
This notebook implements the Boolean Retrieval Model with inverted index and query processing.

In [15]:
import os
import nltk
from collections import defaultdict
from nltk.stem import PorterStemmer

nltk.download('punkt')

# Function to load stopwords
def load_stopwords(file_path):
    with open(file_path, 'r') as f:
        return set(f.read().splitlines())

# Function to preprocess text
def preprocess_text(text, stopwords):
    ps = PorterStemmer()
    tokens = nltk.word_tokenize(text.lower())
    return [ps.stem(word) for word in tokens if word.isalpha() and word not in stopwords]

# Function to build inverted and positional indexes
def build_indexes(abstracts_folder, stopwords):
    inverted_index = defaultdict(set)  
    positional_index = defaultdict(lambda: defaultdict(list))  

    for doc_id, filename in enumerate(sorted(os.listdir(abstracts_folder))):
        file_path = os.path.join(abstracts_folder, filename)

        try:
            with open(file_path, 'r', encoding='utf-8', errors="ignore") as file:  # Ignore decoding errors
                text = file.read()
        except Exception as e:
            print(f"❌ Error reading {file_path}: {e}")
            continue  # Skip problematic files

        words = preprocess_text(text, stopwords)

        for pos, word in enumerate(words):
            inverted_index[word].add(doc_id)
            positional_index[word][doc_id].append(pos)

    return inverted_index, positional_index


# Function to process Boolean queries
from nltk.stem import PorterStemmer

ps = PorterStemmer()  # Define stemmer globally

def boolean_query_processing(query, inverted_index, total_docs):
    query = query.lower().split()
    result_docs = set(range(total_docs))
    operator = None
    current_set = set()

    for term in query:
        if term in {"and", "or", "not"}:
            operator = term
        else:
            stemmed_term = ps.stem(term)  # Convert query term to its stemmed form
            term_docs = inverted_index.get(stemmed_term, set())

            if operator == "not":
                term_docs = result_docs - term_docs
            elif operator == "or":
                current_set |= term_docs
            elif operator == "and":
                current_set &= term_docs
            else:
                current_set = term_docs

    return sorted(current_set)


# Function to handle proximity queries (word1 word2 / k)
def proximity_query(word1, word2, k, positional_index):
    result_docs = set()
    
    if word1 in positional_index and word2 in positional_index:
        docs1, docs2 = positional_index[word1], positional_index[word2]

        for doc in docs1.keys() & docs2.keys():  # Find common documents
            positions1 = docs1[doc]
            positions2 = docs2[doc]

            for p1 in positions1:
                if any(abs(p1 - p2) <= k for p2 in positions2):
                    result_docs.add(doc)
                    break

    return sorted(result_docs)

# Load stopwords
stopwords = load_stopwords(r"C:\Users\ehtis\Downloads\Boolean_Retrieval_Model_Complete\data\Stopword-List.txt")

# Define dataset folder (Abstracts)
abstracts_folder = r"C:\Users\ehtis\Downloads\Boolean_Retrieval_Model_Complete\data\Abstracts"

# Build indexes
inverted_index, positional_index = build_indexes(abstracts_folder, stopwords)

# Testing with a Boolean Query
query = "deep AND learning"
boolean_results = boolean_query_processing(query, inverted_index, len(os.listdir(abstracts_folder)))

# Testing with a Proximity Query
proximity_results = proximity_query("neural", "information", 2, positional_index)

print(f"Boolean Query Results ({query}): {boolean_results}")
print(f"Proximity Query Results (neural information / 2): {proximity_results}")

print("Setup Complete!")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ehtis\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Boolean Query Results (deep AND learning): [83, 84, 85, 86, 127, 145, 156, 162, 164, 168, 172, 176, 186, 192, 193, 198, 199, 202, 251, 260, 273, 274, 275, 276, 281, 286, 287, 290, 292, 302, 304, 305, 306, 312, 313, 314, 329, 330, 336, 339, 340, 351, 358, 370, 383]
Proximity Query Results (neural information / 2): []
Setup Complete!


In [16]:
query = "deep AND learning"
boolean_results = boolean_query_processing(query, inverted_index, len(os.listdir(abstracts_folder)))

print(f"Boolean Query Results ({query}): {boolean_results}")


Boolean Query Results (deep AND learning): [83, 84, 85, 86, 127, 145, 156, 162, 164, 168, 172, 176, 186, 192, 193, 198, 199, 202, 251, 260, 273, 274, 275, 276, 281, 286, 287, 290, 292, 302, 304, 305, 306, 312, 313, 314, 329, 330, 336, 339, 340, 351, 358, 370, 383]


In [17]:
query = "machine OR learning AND NOT vision"
boolean_results = boolean_query_processing(query, inverted_index, len(os.listdir(abstracts_folder)))

print(f"Boolean Query Results ({query}): {boolean_results}")


Boolean Query Results (machine OR learning AND NOT vision): [0, 1, 2, 4, 5, 6, 7, 9, 11, 12, 13, 15, 16, 19, 21, 23, 26, 28, 29, 30, 33, 34, 35, 37, 38, 39, 40, 41, 42, 43, 44, 47, 48, 49, 50, 51, 52, 53, 54, 56, 58, 59, 60, 61, 62, 64, 65, 67, 69, 73, 74, 75, 76, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 97, 98, 100, 101, 107, 111, 112, 113, 114, 118, 119, 120, 121, 122, 124, 126, 127, 128, 134, 140, 141, 143, 144, 145, 148, 151, 152, 153, 155, 156, 157, 158, 159, 160, 162, 164, 165, 167, 168, 169, 170, 172, 173, 174, 176, 177, 179, 180, 181, 182, 183, 184, 186, 188, 189, 190, 191, 192, 193, 195, 198, 199, 200, 201, 202, 204, 205, 206, 207, 208, 209, 210, 211, 213, 214, 216, 218, 221, 223, 225, 226, 229, 231, 235, 236, 237, 238, 239, 244, 245, 247, 248, 249, 250, 251, 252, 253, 254, 255, 257, 258, 259, 260, 261, 262, 264, 265, 266, 267, 272, 273, 274, 275, 276, 281, 282, 283, 284, 285, 286, 287, 288, 290, 291, 292, 293, 296, 297, 298, 299, 300, 302, 303, 304, 305, 306, 308, 

In [10]:
print("Sample of Inverted Index (first 10 terms):")
for term, doc_list in list(inverted_index.items())[:10]:  # Show first 10 words
    print(f"{term}: {doc_list}")


Sample of Inverted Index (first 10 terms):
ensembl: {0, 7, 396, 144, 24, 281, 174, 436, 181, 309, 187, 318, 205, 80, 222, 362, 235, 236, 109, 111, 245, 253}
statist: {0, 384, 386, 387, 4, 263, 15, 143, 271, 18, 19, 405, 25, 284, 32, 417, 418, 298, 45, 173, 301, 51, 53, 56, 440, 188, 317, 63, 65, 326, 204, 78, 79, 340, 345, 356, 101, 104, 105, 366, 367, 368, 115, 244, 117, 246, 376, 121}
and: {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 1

In [18]:
print("Words in Index:", list(inverted_index.keys())[:50])  # Show first 50 words


Words in Index: ['ensembl', 'statist', 'and', 'heurist', 'model', 'unsupervis', 'word', 'align', 'learn', 'need', 'larg', 'amount', 'of', 'train', 'data', 'while', 'they', 'weak', 'corpora', 'thi', 'paper', 'propos', 'new', 'approach', 'hybrid', 'techniqu', 'use', 'method', 'algorithm', 'three', 'base', 'sever', 'round', 'gener', 'weigh', 'scheme', 'resampl', 'vote', 'score', 'consid', 'aggreg', 'underli', 'studi', 'includ', 'ibm', 'dice', 'measur', 'our', 'experiment', 'result']


In [20]:
search_word = "learn"  
if search_word in inverted_index:
    print(f"✅ '{search_word}' exists in the index:", inverted_index[search_word])
else:
    print(f"❌ '{search_word}' NOT found in the index.")


✅ 'learn' exists in the index: {0, 1, 2, 5, 6, 7, 11, 13, 15, 16, 19, 21, 23, 26, 28, 33, 34, 35, 37, 38, 39, 41, 43, 44, 47, 48, 49, 50, 51, 52, 53, 54, 56, 58, 59, 60, 61, 62, 64, 65, 67, 69, 74, 75, 76, 78, 79, 80, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 97, 98, 100, 101, 107, 111, 112, 113, 114, 118, 119, 120, 121, 122, 124, 126, 127, 128, 134, 140, 141, 143, 144, 145, 148, 151, 152, 153, 155, 156, 157, 158, 159, 160, 162, 164, 165, 167, 168, 169, 170, 172, 173, 174, 176, 177, 179, 180, 181, 182, 184, 186, 188, 189, 190, 191, 192, 193, 195, 198, 199, 200, 202, 204, 205, 206, 207, 208, 209, 210, 211, 213, 214, 216, 218, 221, 223, 225, 226, 229, 231, 235, 236, 237, 238, 239, 244, 245, 247, 248, 249, 250, 251, 252, 253, 254, 255, 257, 258, 259, 260, 261, 262, 264, 265, 266, 267, 273, 274, 275, 276, 281, 282, 283, 284, 285, 286, 287, 288, 290, 291, 292, 293, 296, 297, 298, 299, 300, 302, 303, 304, 305, 306, 308, 310, 311, 312, 313, 314, 315, 316, 317, 319, 320, 321, 322, 323, 324, 325,

In [21]:
query = "deep"
boolean_results = boolean_query_processing(query, inverted_index, len(os.listdir(abstracts_folder)))

print(f"Boolean Query Results ({query}): {boolean_results}")


Boolean Query Results (deep): [83, 84, 85, 86, 127, 145, 156, 162, 164, 168, 172, 176, 186, 192, 193, 198, 199, 202, 251, 260, 273, 274, 275, 276, 281, 286, 287, 290, 292, 302, 304, 305, 306, 312, 313, 314, 329, 330, 336, 339, 340, 351, 358, 370, 383]
