In [1]:
import os

In [5]:
def import_documents(directory):
    """Read UTF-8 ``.txt`` documents from a single file or from a directory.

    Parameters
    ----------
    directory : str or os.PathLike
        Path to one ``.txt`` file, or to a directory whose immediate
        ``.txt`` entries should be read (sub-directories are not descended).

    Returns
    -------
    list of str
        The full text of each document. In directory mode the documents are
        ordered by file name, so the list position of a document (used
        downstream as its document ID) is stable between runs. A path that
        does not exist, or a file without the ``.txt`` suffix, yields ``[]``.
    """
    # Accept pathlib.Path as well as plain strings; str input is unchanged.
    directory = os.fspath(directory)

    if os.path.isfile(directory):
        paths = [directory] if directory.endswith(".txt") else []
    elif os.path.isdir(directory):
        # os.listdir returns entries in arbitrary order; sort so document
        # positions are reproducible.
        candidates = (
            os.path.join(directory, filename)
            for filename in sorted(os.listdir(directory))
            if filename.endswith(".txt")
        )
        # Skip entries that merely look like text files (e.g. a folder named
        # "x.txt"), which would otherwise make open() raise.
        paths = [path for path in candidates if os.path.isfile(path)]
    else:
        paths = []

    documents = []
    for path in paths:
        with open(path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
    return documents

# Load the corpus. The path ends in ".txt", so import_documents treats it as a
# single file and returns at most one document (an empty list if the file is
# missing).
documents = import_documents("/content/sample.txt")
print(f"Imported {len(documents)} documents.")

Imported 1 documents.


In [6]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict

# Fetch the Punkt tokenizer data used by word_tokenize.
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource
# instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')
# Single stemmer instance shared by preprocess_document below.
stemmer = PorterStemmer()

def preprocess_document(doc):
    """Turn raw text into a list of normalized index terms.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric (per ``str.isalnum``) are dropped, and each
    remaining token is reduced to its stem by the module-level ``stemmer``.

    Returns the stems as a list, in the order the tokens appeared.
    """
    stems = []
    for token in word_tokenize(doc.lower()):
        if not token.isalnum():
            # Punctuation and mixed-symbol tokens are not indexed.
            continue
        stems.append(stemmer.stem(token))
    return stems


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [10]:
def create_dictionary(documents):
    """Collect the vocabulary of a corpus.

    Each document is run through ``preprocess_document`` and the resulting
    terms are merged into one set.

    Returns a ``set`` of the distinct terms across all documents (an empty
    set when ``documents`` is empty).
    """
    vocabulary = set()
    for text in documents:
        vocabulary |= set(preprocess_document(text))
    return vocabulary
# Build the vocabulary from the loaded documents and print the whole set.
# The printed set grows with the corpus, so the output can be very long.
dictionary = create_dictionary(documents)
print("Dictionary:", dictionary)


Dictionary: {'wa', 'it', 'everyth', 'anker', 'our', 'wish', 'amazon', 'travel', 'even', '031', 'import', 'them', 'drain', 'click', 'depart', 'all', '1h', 'song', 'think', 'offlicens', 'my', '105853', '105860', 'suck', 'properli', 'doe', 'bye', 'thing', 'okay', 'onc', 'speed', 'same', '105851', 'as', 'anyth', 'free', 'number', 'know', 'great', 'livechat', 'companion', 'drop', 'again', 'suddenli', 'printhead', 'pleas', 'be', 'laptop', 'addit', 'nudg', 'custom', 'both', '105843', 'cheer', 'system', '76099', 'reschedul', 'respond', 'zip', 'direct', 'love', 'come', 'regard', 'load', 'welcom', 'get', 'beg', 'by', 'just', 'colleagu', 'half', 'distanc', 'let', '105848', 'paus', 'shout', 'type', 'like', 'sever', 'full', 'new', 'page', 'cooki', 'mayb', 'shortli', 'make', 'updat', 'tri', 'some', '25', 'agoura', 'restart', 'ago', 'macbook', 'befor', 'i', 'cumbersom', 'tomorrow', 'unlik', 'poor', 'sierra', 'sale', 'refund', 'off', 'basket', 'wo', 'life', 'a', 'is', 'look', 'paralys', 'hit', 'person

In [8]:
def create_inverted_index(documents, dictionary):
    """Map each vocabulary term to the documents that contain it.

    Parameters
    ----------
    documents : iterable of str
        Raw documents; a document's ID is its position in this iterable.
    dictionary : container of str
        Terms allowed into the index; terms not in it are ignored.

    Returns
    -------
    collections.defaultdict
        ``term -> list of document IDs``. Each ID appears at most once per
        term, and IDs are appended in increasing document order.
    """
    postings = defaultdict(list)
    for doc_id, text in enumerate(documents):
        # De-duplicate so a document is recorded once per term, however
        # often the term occurs in it.
        distinct_terms = set(preprocess_document(text))
        for term in distinct_terms:
            if term not in dictionary:
                continue
            postings[term].append(doc_id)
    return postings
# Build the inverted index over the loaded documents, restricted to the
# vocabulary computed above, and print it as a plain dict.
inverted_index = create_inverted_index(documents, dictionary)
print("Inverted Index:", dict(inverted_index))


Inverted Index: {'wa': [0], 'it': [0], 'everyth': [0], 'anker': [0], 'our': [0], 'wish': [0], 'amazon': [0], 'travel': [0], 'even': [0], '031': [0], 'import': [0], 'them': [0], 'drain': [0], 'click': [0], 'depart': [0], 'all': [0], '1h': [0], 'song': [0], 'think': [0], 'offlicens': [0], 'my': [0], '105853': [0], '105860': [0], 'suck': [0], 'properli': [0], 'doe': [0], 'bye': [0], 'thing': [0], 'okay': [0], 'onc': [0], 'speed': [0], 'same': [0], '105851': [0], 'as': [0], 'anyth': [0], 'free': [0], 'number': [0], 'know': [0], 'great': [0], 'livechat': [0], 'companion': [0], 'drop': [0], 'again': [0], 'suddenli': [0], 'printhead': [0], 'pleas': [0], 'be': [0], 'laptop': [0], 'addit': [0], 'nudg': [0], 'custom': [0], 'both': [0], '105843': [0], 'cheer': [0], 'system': [0], '76099': [0], 'reschedul': [0], 'respond': [0], 'zip': [0], 'direct': [0], 'love': [0], 'come': [0], 'regard': [0], 'load': [0], 'welcom': [0], 'get': [0], 'beg': [0], 'by': [0], 'just': [0], 'colleagu': [0], 'half': [0]

In [9]:
def boolean_and(term1, term2, inverted_index):
    """Return the IDs of documents that contain both terms.

    Terms are looked up verbatim in ``inverted_index`` (no stemming or
    lower-casing is applied here); a term absent from the index matches no
    documents.

    Returns a list of document IDs in ascending order, so the result is
    deterministic rather than dependent on set iteration order.
    """
    first = set(inverted_index.get(term1, []))
    second = set(inverted_index.get(term2, []))
    return sorted(first & second)

def boolean_or(term1, term2, inverted_index):
    """Return the IDs of documents that contain at least one of the terms.

    Terms are looked up verbatim in ``inverted_index`` (no stemming or
    lower-casing is applied here); a term absent from the index contributes
    no documents.

    Returns a list of document IDs in ascending order, so the result is
    deterministic rather than dependent on set iteration order.
    """
    first = set(inverted_index.get(term1, []))
    second = set(inverted_index.get(term2, []))
    return sorted(first | second)

def boolean_not(term, inverted_index, total_docs):
    """Return the IDs of documents that do not contain ``term``.

    The universe of documents is ``range(total_docs)``. The term is looked up
    verbatim in ``inverted_index``; a term absent from the index excludes
    nothing, so every document ID is returned.

    Returns a list of document IDs in ascending order, so the result is
    deterministic rather than dependent on set iteration order.
    """
    excluded = set(inverted_index.get(term, []))
    # Walking range() keeps the complement in ascending order.
    return [doc_id for doc_id in range(total_docs) if doc_id not in excluded]
# Example queries. The boolean_* helpers look terms up in the index exactly as
# given, so the query terms must already be in indexed form.
# NOTE(review): 'inform' / 'retriev' look like Porter stems of
# "information" / "retrieval" — confirm that is the intent.
print("AND Query ('inform', 'retriev'):", boolean_and('inform', 'retriev', inverted_index))
print("OR Query ('inform', 'data'):", boolean_or('inform', 'data', inverted_index))
print("NOT Query ('retriev'):", boolean_not('retriev', inverted_index, len(documents)))


AND Query ('inform', 'retriev'): []
OR Query ('inform', 'data'): [0]
NOT Query ('retriev'): [0]
