In [20]:
import yaml
import os
import nltk

# Size of the integer ID space that boolean NOT complements against.
# NOTE(review): presumably the total number of indexed documents - confirm.
UNIVERSE_SIZE = 517401

# Every ID from 0 to UNIVERSE_SIZE - 1; op_not subtracts a posting set from this.
universe_set = set(range(UNIVERSE_SIZE))

# English Snowball stemmer used to normalise query terms before index lookup.
stemmer = nltk.stem.SnowballStemmer("english")


def op_and(op1, op2):
    """Boolean AND: return the elements common to both operands.

    Args:
        op1: Left operand (a set).
        op2: Right operand (a set).

    Returns:
        The intersection ``op1 & op2``.
    """
    common = op1 & op2
    return common


def op_or(op1, op2):
    """Boolean OR: return the elements found in either operand.

    Args:
        op1: Left operand (a set).
        op2: Right operand (a set).

    Returns:
        The union ``op1 | op2``.
    """
    combined = op1 | op2
    return combined


def op_not(op, universe=None):
    """Boolean NOT: return every element of the universe that is not in ``op``.

    Args:
        op: The set to complement.
        universe: Optional set to complement against. When omitted, the
            module-level ``universe_set`` is used, which keeps existing
            one-argument callers working unchanged.

    Returns:
        The set difference ``universe - op``.
    """
    if universe is None:
        # Resolved at call time so the default always tracks the module global.
        universe = universe_set
    return universe - op


def get_indices(word):
    """Look up the posting set for ``word`` in the on-disk inverted index.

    The word is stemmed with the module-level ``stemmer`` and the stem is used
    as a file name under ``../output/inverted_index_table`` (relative to the
    current working directory).

    Args:
        word: A single query term.

    Returns:
        set: The entries stored in the stem's index file, or an empty set when
        no such file exists or the file is empty.
    """
    token = stemmer.stem(word)
    path = os.path.join("..", "output", "inverted_index_table", token)
    # isfile rather than exists: a stem that resolves to a directory is a miss,
    # not something to hand to open().
    if not os.path.isfile(path):
        return set()
    with open(path) as fp:
        # safe_load: yaml.load() without an explicit Loader is rejected by
        # recent PyYAML releases and can construct arbitrary Python objects.
        postings = yaml.safe_load(fp)
    # An empty YAML document parses to None; treat it as "no postings".
    return set(postings or ())


# Precedence levels consulted by cal_bool. An incoming operator is pushed when
# the operator on top of the stack has a strictly lower level; otherwise the
# top of the stack is resolved first.
operators_level = {
    "$": -1,  # end-of-expression sentinel
    ")": 0,
    "|": 1,  # OR
    "&": 2,  # AND
    "!": 3,  # NOT
    "(": 4,
}

# Set operation implementing each boolean operator symbol.
operator_func = {
    "&": op_and,
    "|": op_or,
    "!": op_not,
}


def cal_bool(origin_query_str):
    """Evaluate a boolean search query and return the matching set.

    The query is case-insensitive. Terms are combined with the keywords
    ``and``, ``or`` and ``not`` (or the symbols ``&``, ``|``, ``!`` written as
    separate tokens) and may be grouped with parentheses. Precedence, from
    tightest to loosest, is ``not``, ``and``, ``or``. Each term is resolved
    with ``get_indices``.

    Args:
        origin_query_str: The raw query, e.g. ``"compute and (build or not run)"``.

    Returns:
        set: The result of the expression; an empty set for an empty or
        whitespace-only query.

    Notes:
        Malformed expressions (such as unbalanced parentheses or a missing
        operand) are not validated and typically surface as ``IndexError``.
        Two adjacent terms with no operator between them are not rejected;
        only the last term's result is returned.
    """
    # Keywords are mapped token-by-token. A plain str.replace would also
    # rewrite substrings inside ordinary words ("for" -> "f|", "android" ->
    # "&roid"), making those terms unsearchable.
    keyword_symbols = {"and": "&", "or": "|", "not": "!"}
    spaced = origin_query_str.lower().replace("(", " ( ").replace(")", " ) ")
    query_exp = [keyword_symbols.get(token, token) for token in spaced.split()]
    if not query_exp:
        # Nothing to evaluate; avoid popping from an empty operand stack.
        return set()
    query_exp.append("$")  # sentinel that forces every pending operator to resolve

    operand_stack = []
    operator_stack = ["$"]
    i = 0
    while True:
        element = query_exp[i]
        if element not in operators_level:
            # A search term: push its posting set.
            operand_stack.append(get_indices(element))
            i += 1
            continue

        operator = operator_stack.pop()
        # "!" is a prefix operator, so it is always pushed: its operand has not
        # been read yet, even when another "!" is on top of the stack.
        if element == "!" or operators_level[operator] < operators_level[element]:
            operator_stack.append(operator)
            operator_stack.append(element)
            i += 1
        elif operator == "$":
            # Sentinel met sentinel: the whole expression has been reduced.
            break
        elif operator == "(":
            # ")" closes the group (both are discarded); anything else is
            # stacked on top of the still-open "(".
            if element != ")":
                operator_stack.append(operator)
                operator_stack.append(element)
            i += 1
        elif operator == "!":
            operand_stack.append(operator_func[operator](operand_stack.pop()))
        else:
            # Binary operator: the right operand was pushed last.
            right = operand_stack.pop()
            left = operand_stack.pop()
            operand_stack.append(operator_func[operator](left, right))
    return operand_stack.pop()


In [33]:
# Mixed case and repeated spaces exercise the lower-casing and whitespace
# splitting that cal_bool applies before parsing.
origin_query_str = "Compute       And    Build"
result = sorted(cal_bool(origin_query_str))
result

[50,
 643,
 807,
 818,
 1018,
 1523,
 2332,
 2495,
 3112,
 3628,
 3655,
 3656,
 3821,
 3932,
 4120,
 4121,
 4284,
 4312,
 4375,
 4476,
 4512,
 4547,
 4556,
 4617,
 4714,
 4745,
 4760,
 4811,
 4813,
 5085,
 5190,
 5191,
 5215,
 5299,
 5370,
 5387,
 5483,
 5507,
 5508,
 6131,
 7702,
 8187,
 8313,
 8603,
 8610,
 8622,
 8810,
 8962,
 9039,
 9052,
 9058,
 9180,
 9262,
 9265,
 9454,
 9945,
 9954,
 12578,
 13080,
 14700,
 15358,
 17181,
 17821,
 18263,
 18281,
 18316,
 18337,
 18338,
 18451,
 18479,
 18490,
 18520,
 18535,
 18673,
 19378,
 19445,
 19533,
 19534,
 19666,
 20490,
 20516,
 21086,
 22467,
 23444,
 24295,
 25096,
 25754,
 26579,
 28075,
 28145,
 28477,
 28511,
 28515,
 28938,
 29481,
 30231,
 30885,
 30892,
 31981,
 32385,
 32417,
 32476,
 32673,
 32703,
 32758,
 32842,
 33006,
 33019,
 33051,
 33136,
 33730,
 33805,
 33825,
 35953,
 36538,
 36737,
 36750,
 36871,
 36992,
 37204,
 38409,
 38703,
 38914,
 39016,
 39101,
 39124,
 39552,
 40177,
 40549,
 40684,
 40719,
 40771,
 41567