In [1]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# through the regular file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import argparse
import pickle
import sys
import glob
import numpy as np
from nltk.corpus import stopwords

# Project folder on the mounted Drive; the query cell loads the pickled
# inverted index from the `data` sub-folder of this directory.
dir_path = '/content/drive/My Drive/Colab Notebooks/boolean retrieval inverted-file'

In [None]:
def assert_dir(path):
    """Terminate the program unless ``path`` is an existing directory.

    Prints an error message and calls ``sys.exit(1)`` when ``path`` does not
    exist, or exists but is not a directory. Returns ``None`` otherwise.
    """
    if os.path.exists(path):
        # Existing path: the only remaining failure is "not a directory".
        problem = None if os.path.isdir(path) else 'ERROR: {} is not a directory'
    else:
        problem = 'ERROR: {} does not exists'

    if problem is not None:
        print(problem.format(path))
        sys.exit(1)

def preprocess_text(text):
    """Normalise raw text into lowercase words separated by single spaces.

    The text is lowercased, curly quotes are mapped to their ASCII forms, and
    every run of characters other than ASCII letters and apostrophes is
    replaced by one space. Runs at the start or end are replaced as well, so
    the result may begin or end with a space.
    """
    ascii_quotes = str.maketrans({"’": "'", "“": '"', "”": '"'})
    lowered = text.lower().translate(ascii_quotes)
    return re.sub(r"[^A-Za-z']+", ' ', lowered)

def get_text_from_file(filename):
    """Return the complete contents of ``filename``, decoded as UTF-8 text."""
    with open(filename, 'r', encoding='utf-8') as handle:
        return handle.read()


def get_words_from_text(text):
    """Return the set of distinct non-stopword words found in ``text``.

    The text is normalised with ``preprocess_text`` and split on whitespace;
    words present in the NLTK ``'vie'`` stopword list are discarded.
    """
    excluded = set(stopwords.words('vie'))
    tokens = preprocess_text(text).split()
    return set(tokens) - excluded


def build_inverted_index(docs_path):
    """Build an inverted index over the files directly inside ``docs_path``.

    Args:
        docs_path: Directory whose files are the documents to index.

    Returns:
        A dict mapping each indexed word to the set of document paths
        (``docs_path`` joined with the file name) whose text contains it.

    Entries of ``docs_path`` that are not regular files (for example a
    sub-directory such as ``.ipynb_checkpoints``) are skipped, because
    trying to open them as documents would raise.
    """
    inverted_index = {}
    for doc_file in os.listdir(docs_path):
        filename = os.path.join(docs_path, doc_file)
        if not os.path.isfile(filename):
            # os.listdir also returns directories; only files are documents.
            continue
        text = get_text_from_file(filename)
        for word in get_words_from_text(text):
            # Create the posting set on first sight of the word, then add.
            inverted_index.setdefault(word, set()).add(filename)
    return inverted_index


def index(docs_path, data_path):
    """Index the documents in ``docs_path`` and save the result in ``data_path``.

    Two files are written into ``data_path``:

    * ``dictionary.txt`` - one indexed word per line;
    * ``inverted_index.pickle`` - the pickled dict returned by
      ``build_inverted_index``.

    Args:
        docs_path: Directory containing the documents to index.
        data_path: Existing directory that receives the two output files.
            The query cell loads ``inverted_index.pickle`` from the ``data``
            folder under ``dir_path``, so pass that folder here to query the
            freshly built index.
    """
    inverted_index = build_inverted_index(docs_path)

    # Join bare file names onto data_path. Joining an absolute path instead
    # would make os.path.join discard data_path and ignore the argument.
    dic_file = os.path.join(data_path, 'dictionary.txt')
    inverted_index_file = os.path.join(data_path, 'inverted_index.pickle')

    with open(dic_file, mode='w', encoding='utf-8') as f:
        f.writelines(word + '\n' for word in inverted_index)
    with open(inverted_index_file, mode='wb') as f:
        pickle.dump(inverted_index, f)

In [None]:
# Load the pickled inverted index: a dict mapping each word to the set of
# document paths that contain it.
# dir_path is absolute, so it has to be the first component passed to
# os.path.join (anything placed before an absolute component is discarded).
inverted_index_file = os.path.join(dir_path, 'data', 'inverted_index.pickle')

# NOTE: pickle.load can execute arbitrary code while loading; only open index
# files that were produced by this notebook.
with open(inverted_index_file, mode='rb') as f:
    inverted_index = pickle.load(f)

# View of the indexed vocabulary, used to filter the query words.
dictionary = inverted_index.keys()

# Query tokenizer pattern: everything except ASCII letters, apostrophes and
# '?' counts as a separator.
non_words = re.compile(r"[^A-Za-z'?]+")
stop_words = set(stopwords.words('vie'))

# Obtain the query string.
# Inside a Jupyter/Colab kernel, sys.argv holds the kernel's own flags
# (e.g. ``-f <connection file>``), which this parser would reject, so the
# query is read from a prompt there. When run as a plain script the
# command-line interface is unchanged.
parser = argparse.ArgumentParser(description='Boolean query')
parser.add_argument('query', help='words seperated by space')
if 'ipykernel' in sys.modules:
    args = argparse.Namespace(query=input('Query: '))
else:
    args = parser.parse_args()

# Preprocess query: lowercase it and map the curly apostrophe to ASCII, as
# preprocess_text does for documents, so a query such as "don’t" can match
# the indexed word "don't". Then replace every separator run with a space.
query = args.query
query = query.lower().replace("’", "'")
query = re.sub(non_words, ' ', query)

# Keep each distinct query word that is not a stopword and occurs in the
# index; words unknown to the index are dropped from the query.
words = {
    word for word in query.split()
    if word not in stop_words and word in dictionary}

# AND-combine the posting sets of all query words.
# set.intersection returns a new set, so the sets stored in inverted_index
# are left untouched; intersecting in place on a set taken from the index
# would shrink the index entry itself.
# With no usable query words there is nothing to intersect and the result
# stays None.
postings = [inverted_index[word] for word in words]
result = set.intersection(*postings) if postings else None

print(result)