In [2]:
import csv
import os
import nltk
import gen_id_path_map

# Location of the id -> path CSV, relative to the notebook's working directory.
id_path_map = os.path.join("..", "output", "id_path_map.csv")
# Mapping loaded by the project helper; the query cell indexes it with string
# ids (id_path_dict[str(x)]) to obtain a document path.
id_path_dict = gen_id_path_map.get_id_path_map(id_path_map)

# Universe used by op_not: NOT x is computed as universe_set - x.
# NOTE(review): 517401 is presumably the number of indexed documents
# (ids 0..517400) -- confirm it still matches the contents of id_path_map.csv.
universe_set = set(range(517401))
# English Snowball stemmer; get_indices stems each query term with it before
# building the inverted-index file name.
stemmer = nltk.stem.SnowballStemmer("english")


def op_and(op1, op2):
    """Boolean AND: keep only the elements found in both operands."""
    common = op1 & op2
    return common


def op_or(op1, op2):
    """Boolean OR: keep every element found in either operand."""
    combined = op1 | op2
    return combined


def op_not(op):
    """Boolean NOT: everything in the module-level ``universe_set`` not in ``op``."""
    complement = universe_set - op
    return complement


def get_indices(word):
    """Return the set of integers indexed under a single query term.

    The term is stemmed with the module-level ``stemmer`` and the stem names
    the file ``../output/inverted_index_table/<stem>.csv``.  The first column
    of every non-empty row of that file is parsed as an integer.

    Args:
        word: A single query term.

    Returns:
        set[int]: The first-column values of the term's CSV file, or an empty
        set when no file exists for the stemmed term.

    Raises:
        ValueError: If a first-column value is not a valid integer.
    """
    token = stemmer.stem(word)
    path = os.path.join("..", "output", "inverted_index_table", token + ".csv")
    # Keep the existence check (rather than catching FileNotFoundError): a
    # stem that is not a legal file name must still yield "no matches".
    if not os.path.exists(path):
        return set()
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires for file objects passed to csv.reader.
    with open(path, newline="") as fp:
        # csv.reader yields [] for blank lines (e.g. a trailing empty line);
        # skip those instead of failing on row[0].
        return {int(row[0]) for row in csv.reader(fp) if row}


# Precedence table consulted by bool_query: an incoming operator is pushed
# only when the operator on top of the stack has a strictly lower level.
operators_level = {
    "$": -1,  # end-of-query sentinel
    ")": 0,
    "|": 1,
    "&": 2,
    "!": 3,
    "(": 4,
}
# Maps each operator symbol to the function that evaluates it.
operator_func = {
    "&": op_and,
    "|": op_or,
    "!": op_not,
}


def bool_query(origin_query_str):
    """Evaluate a boolean query string and return the matching set.

    The query is lower-cased and split on whitespace; parentheses may touch
    their neighbours.  The stand-alone words ``and``, ``or`` and ``not`` (or
    the symbols ``&``, ``|`` and ``!`` as separate tokens) are operators.
    Every other token is a search term resolved through ``get_indices``.
    Binding strength comes from ``operators_level``: ``!`` binds tighter than
    ``&``, which binds tighter than ``|``.

    Keywords are recognised only as whole tokens, so terms that merely
    contain them ("factor", "android", "notebook") are searched unchanged.

    Args:
        origin_query_str: The query text, e.g. ``"a and (b or not c)"``.

    Returns:
        set: The result of the expression; an empty set for an empty query.

    Raises:
        IndexError: If the query is malformed (an operator without its
            operand, or an unclosed parenthesis).
    """
    keyword_symbols = {"and": "&", "or": "|", "not": "!"}
    spaced = origin_query_str.lower().replace("(", " ( ").replace(")", " ) ")
    # Translate keywords token by token; a substring replace would corrupt
    # ordinary terms such as "factor" -> "fact|".
    query_exp = [keyword_symbols.get(token, token) for token in spaced.split()]
    query_exp.append("$")  # sentinel that forces every pending operator to reduce

    operand_stack = []
    operator_stack = ["$"]
    i = 0
    while True:
        element = query_exp[i]
        if element not in operators_level:
            # Plain search term: push the set stored for it.
            operand_stack.append(get_indices(element))
            i += 1
            continue

        operator = operator_stack.pop()
        # Shift when the incoming operator binds tighter than the stack top.
        # "!" is a prefix operator, so a second "!" must also be shifted:
        # the first one has no operand to work on yet ("not not x").
        shift = operators_level[operator] < operators_level[element] or (
            operator == "!" and element == "!"
        )
        if shift:
            operator_stack.append(operator)
            operator_stack.append(element)
            i += 1
        elif operator == "$":
            # Only the sentinel can meet the sentinel: evaluation is complete.
            break
        elif operator == "(":
            # ")" closes the group (both are dropped); anything else is
            # stacked on top of the still-open parenthesis.
            if element != ")":
                operator_stack.append(operator)
                operator_stack.append(element)
            i += 1
        elif operator == "!":
            # Reduce unary NOT; `i` is not advanced so the same incoming
            # element is compared against the new stack top.
            operand_stack.append(operator_func[operator](operand_stack.pop()))
        else:
            # Reduce a binary operator; `i` is not advanced here either.
            operand1 = operand_stack.pop()
            operand2 = operand_stack.pop()
            operand_stack.append(operator_func[operator](operand1, operand2))
    # A query without any term matches nothing.
    return operand_stack.pop() if operand_stack else set()

In [4]:
# Check what the stemmer returns for the term "factor".
stemmer.stem("factor")

'factor'

In [13]:
# Run a sample boolean query and pair every hit, in ascending id order,
# with its entry in id_path_dict.
query_str = "compute and build and fact or fact"
result = sorted(bool_query(query_str))
[(doc_id, id_path_dict[str(doc_id)]) for doc_id in result]

[(42, 'allen-p\\all_documents\\140_'),
 (217, 'allen-p\\all_documents\\2_'),
 (382, 'allen-p\\all_documents\\44_'),
 (389, 'allen-p\\all_documents\\456_'),
 (408, 'allen-p\\all_documents\\473_'),
 (447, 'allen-p\\all_documents\\508_'),
 (633, 'allen-p\\deleted_items\\103_'),
 (756, 'allen-p\\deleted_items\\21_'),
 (767, 'allen-p\\deleted_items\\233_'),
 (772, 'allen-p\\deleted_items\\238_'),
 (783, 'allen-p\\deleted_items\\249_'),
 (785, 'allen-p\\deleted_items\\250_'),
 (810, 'allen-p\\deleted_items\\31_'),
 (935, 'allen-p\\deleted_items\\46_'),
 (944, 'allen-p\\deleted_items\\54_'),
 (964, 'allen-p\\deleted_items\\74_'),
 (968, 'allen-p\\deleted_items\\79_'),
 (986, 'allen-p\\deleted_items\\95_'),
 (987, 'allen-p\\deleted_items\\96_'),
 (1022, 'allen-p\\discussion_threads\\129_'),
 (1094, 'allen-p\\discussion_threads\\194_'),
 (1134, 'allen-p\\discussion_threads\\232_'),
 (1234, 'allen-p\\discussion_threads\\448_'),
 (1262, 'allen-p\\discussion_threads\\473_'),
 (1488, 'allen-p\\note