In [None]:
import os
from collections import defaultdict
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

# Shared stemmer used when building the index.
ps = PorterStemmer()

# Fetch only the resources this notebook uses. Calling nltk.download() with no
# arguments opens the interactive downloader, which blocks "Restart & Run All".
# 'punkt_tab' is what newer NLTK releases look up for sent_tokenize/word_tokenize;
# on older releases that request just reports an unknown package and moves on.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
def produce_rotations(word):
    """Return every rotation of ``'$' + word`` (the permuterm vocabulary of ``word``).

    The first entry is ``'$' + word``; each following entry is the previous one
    rotated right by one character, ending with ``word + '$'``.

    Args:
        word: The surface word to rotate.

    Returns:
        A list of ``len(word) + 1`` strings.
    """
    term = "$" + word
    # A string of length n has n rotations. The previous loop stopped one short
    # and never produced word + "$", so a query such as "*word" could not match.
    return [term[len(term) - shift:] + term[:len(term) - shift]
            for shift in range(len(term))]

In [None]:
def rotate(wildcard):
    """Rotate a wildcard pattern so that the text after the first ``*`` comes first.

    ``'$'`` is prepended to the pattern, the first ``*`` is removed, and the
    part that followed it is moved in front of the part that preceded it
    (``'mid*er'`` -> ``'er$mid'``).

    Returns:
        ``(rotated, True)`` when the pattern contains ``*``; otherwise
        ``(wildcard, False)`` with the input returned unchanged.
    """
    term = '$' + wildcard
    star = term.find('*')
    if star == -1:
        return wildcard, False
    return term[star + 1:] + term[:star], True

In [None]:
def union(p1, p2):
    """Merge two posting lists into one list, emitting shared items once.

    The two-pointer walk assumes both inputs are sorted ascending.
    """
    merged = []
    a = b = 0
    n1, n2 = len(p1), len(p2)

    while a < n1 and b < n2:
        left, right = p1[a], p2[b]
        if left == right:
            merged.append(left)
            a, b = a + 1, b + 1
        elif left < right:
            merged.append(left)
            a += 1
        elif left > right:
            merged.append(right)
            b += 1

    # Only one of the inputs can still have unread items at this point.
    merged.extend(p1[a:] if a < n1 else p2[b:])
    return merged

In [None]:
def inverse(p1, total):
    """Return the items of ``total`` that are not in ``p1``, in ``total``'s order.

    Args:
        p1: Posting list to complement (items must be hashable).
        total: The full universe of document ids.
    """
    # Set membership is O(1); testing against the list made this O(len(total) * len(p1)).
    excluded = set(p1)
    return [doc for doc in total if doc not in excluded]

In [None]:
def intersection(p1, p2):
    """Return the items found in both posting lists.

    The two-pointer walk assumes both inputs are sorted ascending.
    """
    common = []
    a = b = 0
    n1, n2 = len(p1), len(p2)

    while a < n1 and b < n2:
        left, right = p1[a], p2[b]
        if left == right:
            common.append(left)
            a, b = a + 1, b + 1
        elif left < right:
            a += 1
        elif left > right:
            b += 1

    return common

In [None]:
def and_not(p1, p2):
    """Return the items of ``p1`` that do not occur in ``p2``.

    The two-pointer walk assumes both inputs are sorted ascending.
    """
    kept = []
    a = b = 0

    while a < len(p1) and b < len(p2):
        left, right = p1[a], p2[b]
        if left == right:
            a, b = a + 1, b + 1
        elif left < right:
            kept.append(left)
            a += 1
        elif left > right:
            b += 1

    # Whatever is left of p1 cannot be in p2 (p2 is exhausted, or p1 is and the slice is empty).
    kept.extend(p1[a:])
    return kept

In [None]:
def or_not(p1, p2, total):
    """Return ``p1 OR (NOT p2)``, where NOT is taken relative to ``total``."""
    complement = inverse(p2, total)
    return union(p1, complement)

In [None]:
# Intermediate query results: evaluate_expr stores each sub-expression's posting
# list here under an '@<n>' key, and match() resolves '@...' terms from it.
symbols = {}

In [None]:
def match(term, ii):
    """Return the posting list for a term, a wildcard pattern, or a stored symbol.

    Args:
        term: One of
            * ``'@<n>'`` - a symbol previously stored in ``symbols``;
            * a pattern containing ``*`` - matched against the stored
              rotations of every index entry;
            * anything else - looked up as an exact index key.
        ii: Mapping from index key to a dict with at least the
            ``'rotations'`` and ``'postings'`` entries.

    Returns:
        A list of document ids; empty when nothing matches.

    Raises:
        KeyError: If ``term`` starts with ``'@'`` but is not in ``symbols``.
    """
    # startswith() instead of term[0]: an empty term must not raise IndexError.
    if term.startswith('@'):
        return symbols[term]

    rotated, is_wild = rotate(term)

    if not is_wild:
        # Direct lookup instead of scanning every key. Membership is tested
        # first so a defaultdict index is not grown by a miss.
        if rotated in ii:
            return list(ii[rotated]['postings'])
        return []

    res = []
    for entry in ii.values():
        # No length pre-filter on the key: rotations come from the surface
        # words, which can be longer than the stemmed key, so comparing the
        # key's length to the pattern's used to skip valid matches. The prefix
        # test already rejects rotations that are too short.
        if any(r.startswith(rotated) for r in entry['rotations']):
            res = union(res, entry['postings'])
    return res

In [None]:
def levenshtein_distance(word1, word2):
    """Return the Levenshtein (edit) distance between two strings.

    Insertions, deletions and substitutions each cost 1.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        The distance as a NumPy float (the DP table is a float array).
    """
    rows, cols = len(word1) + 1, len(word2) + 1
    # Row/column 0 stand for the empty prefix, so the table needs one more row
    # and column than there are characters. Without them the first characters
    # were never compared and empty inputs indexed out of range.
    m = np.zeros((rows, cols))
    m[:, 0] = np.arange(rows)
    m[0, :] = np.arange(cols)

    for i in range(1, rows):
        for j in range(1, cols):
            # m[i, j] covers word1[:i] and word2[:j]; their last chars are at i-1 / j-1.
            substitution = 0 if word1[i - 1] == word2[j - 1] else 1
            m[i, j] = min(m[i - 1, j] + 1,
                          m[i, j - 1] + 1,
                          m[i - 1, j - 1] + substitution)
    return m[rows - 1, cols - 1]

In [None]:
# Build the inverted index: stemmed term -> surface words seen and ids of the
# documents containing them. 'count' and 'rotations' are filled in afterwards.
directory = 'Datasets/Shakespeare'
ii = defaultdict(lambda: {'count': [], 'words': [], 'rotations': [], 'postings': []})

# Build the stop-word set once; calling stopwords.words() for every token
# rebuilt the list each time and then searched it linearly.
stop_words = set(stopwords.words('english'))

id_to_txt = {}
# sorted(): os.listdir() returns entries in arbitrary order, which made the
# document ids differ between runs and machines.
for i, filename in enumerate(sorted(os.listdir(directory))):
    id_to_txt[i] = filename
    with open(os.path.join(directory, filename), 'rt') as original:
        sents = sent_tokenize(original.read())

    for s in sents:
        for w in word_tokenize(s):
            stemmed = ps.stem(w).lower()
            # Test the surface form as well as the stem: many stop words no
            # longer look like stop words once stemmed (e.g. "was" -> "wa").
            if stemmed in stop_words or w.lower() in stop_words:
                continue

            entry = ii[stemmed]
            # Documents are processed in increasing id order, so the current id
            # can only be the last posting; no need to scan the whole list.
            if not entry['postings'] or entry['postings'][-1] != i:
                entry['postings'].append(i)
            if w not in entry['words']:
                entry['words'].append(w)

    print(i)

In [None]:
# Fill in the derived fields of every index entry.
for entry in ii.values():
    # Document frequency of the term.
    entry['count'] = len(entry['postings'])

    # Rebuild the rotation list from scratch rather than appending with +=,
    # so re-running this cell does not duplicate every rotation.
    entry['rotations'] = [rotation
                          for surface in entry['words']
                          for rotation in produce_rotations(surface)]
        


In [None]:
# Persist the index. dict(ii) drops the defaultdict wrapper (its lambda factory
# cannot be pickled); np.save appends '.npy', so this writes 'ii.npy'.
np.save('ii', np.array(dict(ii)))

In [None]:
# Reload the saved index; .item() unwraps the 0-d object array back into a plain dict.
# NOTE(review): allow_pickle=True can execute code embedded in the file - only
# load an ii.npy that you produced yourself.
ii = np.load('ii.npy', allow_pickle=True).item()

In [None]:
# Wildcard demo: postings of entries having a word of the form mids...m.
match('mids*m', ii)

In [None]:
# Wildcard demo: postings of entries having a word of the form mid...er.
match('mid*er', ii)

In [None]:
def evaluate_expr(expr, i, n_docs=42):
    """Evaluate one flat boolean expression and store its posting list.

    Supported forms (``var`` is anything ``match`` accepts: an index term,
    a ``*`` wildcard pattern, or a previously stored ``@<n>`` symbol):

        var
        not var
        var and var
        var and not var
        var or var
        var or not var

    Args:
        expr: The expression text; tokens are separated by whitespace.
        i: Number used to name the result symbol (``'@' + str(i)``).
        n_docs: Size of the document-id universe ``0 .. n_docs - 1`` against
            which ``not`` is complemented. Defaults to 42, the value that was
            previously hard-coded.

    Returns:
        The name of the symbol under which the result was stored in ``symbols``.

    Raises:
        ValueError: If ``expr`` is empty or is not one of the supported forms.
    """
    print("evaluating " + expr + " and storing as @" + str(i))

    # split() without an argument collapses runs of whitespace; split(" ")
    # produced empty tokens for leading/trailing/double spaces.
    tokens = expr.split()
    new_symbol = '@' + str(i)
    total = list(range(n_docs))

    if not tokens:
        raise ValueError("empty expression")

    if tokens[0] == "not":
        if len(tokens) != 2:
            raise ValueError("expected 'not <term>', got: " + expr)
        result = inverse(match(tokens[1], ii), total)

    elif len(tokens) == 1:
        result = match(tokens[0], ii)

    else:
        operator = tokens[1]
        negated = len(tokens) > 2 and tokens[2] == 'not'
        expected_len = 4 if negated else 3
        # Reject anything that is not exactly '<term> and|or [not] <term>'.
        # Unknown operators used to be treated as 'or' and trailing tokens
        # were ignored, which produced wrong results without any error.
        if operator not in ('and', 'or') or len(tokens) != expected_len:
            raise ValueError("unsupported expression: " + expr)

        left = match(tokens[0], ii)
        right = match(tokens[-1], ii)
        if operator == 'and':
            result = and_not(left, right) if negated else intersection(left, right)
        else:
            result = or_not(left, right, total) if negated else union(left, right)

    symbols[new_symbol] = result
    return new_symbol

In [None]:
def perform_query(query):
    """Evaluate a parenthesised boolean query and return its posting list.

    The query is scanned character by character. Each time a ``)`` is met,
    the text back to the matching ``(`` is passed to ``evaluate_expr`` and
    replaced on the stack by the symbol name it returns. Whatever remains at
    the end is evaluated as the outermost expression.

    Args:
        query: Query text, e.g. ``"(a and (b or not c)) or d"``.

    Returns:
        The posting list stored for the last evaluated expression.

    Raises:
        ValueError: If the parentheses are unbalanced or the query is empty.
    """
    stack = []
    i = 0
    for c in query:
        if c != ')':
            stack.append(c)
            continue

        # Pop back to the matching '('; characters come off in reverse order.
        chars = []
        while stack:
            char = stack.pop()
            if char == '(':
                break
            chars.append(char)
        else:
            # Stack ran out without meeting '(' - previously the popped text
            # was silently thrown away.
            raise ValueError("unbalanced ')' in query: " + query)

        stack += list(evaluate_expr("".join(reversed(chars)), i))
        i += 1

    if '(' in stack:
        # An unclosed '(' would otherwise be evaluated as part of a term.
        raise ValueError("unbalanced '(' in query: " + query)

    remaining = "".join(stack).strip()
    if remaining:
        evaluate_expr(remaining, i)
        i += 1

    if i == 0:
        raise ValueError("empty query")
    return symbols['@' + str(i - 1)]

In [None]:
# Sample queries for perform_query: q1 nests parenthesised sub-expressions,
# q2 is a single negation. B, C and D look like placeholder terms - index keys
# are lower-cased, so presumably they match nothing as written.
q1 = "(mid*er and (B or not C)) or D"
q2 = "not mid*er"

In [None]:
# Run the nested query; the resulting posting list is shown as the cell output.
perform_query(q1)

In [None]:
# Inspect the intermediate '@<n>' results stored while evaluating the query.
symbols

In [None]:
# NOTE(review): this renders the entire index as cell output; consider showing
# a small sample of entries instead.
ii

In [None]:
# Bigram index for spelling correction: every 2-character substring of a
# surface word maps to the set of index keys (stems) that word belongs to.
two_gram_index = defaultdict(set)
for stem, entry in ii.items():
    for surface in entry['words']:
        for start in range(len(surface) - 1):
            two_gram_index[surface[start:start + 2]].add(stem)

In [None]:
# NOTE(review): displays the whole bigram index; output can be very large.
two_gram_index

In [None]:
# Query term for the spelling-correction demo (presumably a misspelling of "midsummer").
word = "midsumer"

In [None]:
# Exact lookup of the word as typed; the result is empty when it is not an index key.
match(word, ii)

In [None]:
# Gather candidate index keys: one occurrence per bigram of `word` that the key shares.
res = []
for i in range(len(word) - 1):
    # .get() rather than indexing: two_gram_index is a defaultdict, so indexing
    # an unseen bigram would insert an empty set into the index as a side effect.
    res += two_gram_index.get(word[i:i+2], ())

In [None]:
# Raw candidate list (a key appears once per bigram it shares with `word`).
res

In [None]:
import collections

In [None]:
# Number of times each candidate key occurs in `res`, i.e. how many of the
# query word's bigrams it shares.
freqs = dict(collections.Counter(res))

In [None]:
# Re-order the candidates by shared-bigram count, highest first. The ascending
# sort is reversed as a whole, so candidates with equal counts end up in the
# reverse of their previous relative order.
freqs = dict(sorted(freqs.items(), key=lambda item: item[1])[::-1])
freqs

In [None]:
# Group candidates by their shared-bigram count: count -> list of candidates,
# with groups inserted in the iteration order of `freqs`.
ff = defaultdict(lambda: [])
for candidate, shared in freqs.items():
    ff[shared].append(candidate)

In [None]:
# Candidates grouped by shared-bigram count.
ff

In [None]:
# Bucket the candidates from the first three count groups of `ff` by their
# edit distance to `word`: distance -> list of candidates.
ed = defaultdict(lambda: [])
for f in list(ff)[:3]:
    for w in ff[f]:
        dist = levenshtein_distance(word, w)
        print("edit dist between " + w + " and " + word + " is " + str(dist))
        ed[dist].append(w)

In [None]:
# Suggested correction: among the candidates at the smallest edit distance,
# take the one with the highest 'count', breaking ties by the larger string.
max(ed[min(ed)], key=lambda x: (ii[x]['count'], x))