In [None]:
import os
import re
import collections
from collections import defaultdict
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

# Shared Porter stemmer used by both the index builder and the query handler.
ps = PorterStemmer()

# A bare nltk.download() opens the interactive downloader and blocks
# "Restart & Run All". Fetch only what this notebook uses instead:
# the Punkt models for sent_tokenize/word_tokenize and the stopword corpus.
# ('punkt_tab' is the resource name newer NLTK releases look for; on older
# releases that id is simply reported as unavailable.)
for _resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_resource, quiet=True)

In [None]:
# Prints the literal string 'e'; appears to be a leftover scratch/debug cell.
print('e')

In [None]:
class InvertedIndex:
    """Stemmed inverted index with permuterm rotations and a two-gram index.

    Attributes built by the constructor:
        id_to_file: doc id (int) -> filename.
        index:  stem -> {'count': number of docs containing any word of the
                stem, 'words': set of surface words sharing that stem}.
        windex: word -> {'count': number of docs containing the word,
                'postings': set of doc ids, 'rotations': permuterm rotations}.
        tgi:    two-character gram -> set of stems having a word with that gram.

    Args:
        directory: folder whose regular files are indexed.
        stopwords: iterable of words to exclude from the index.
        save: when true, the stemmed index is written with ``np.save``.
        name: file name handed to ``np.save``.
    """

    def __init__(self, directory, stopwords, save=True, name='ii'):
        self.directory = directory
        self.stopwords = stopwords
        self.save = save
        self.name = name
        self.id_to_file = {}
        self.index = defaultdict(lambda: {'count': [], 'words': set()})  # stemmed index
        self.windex = defaultdict(lambda: {'count': [], 'postings': set(), 'rotations': set()})  # word index
        self.tgi = defaultdict(set)
        self.construct()
        self.construct_tgi()

    def produce_rotations(self, word):
        """Return every rotation of ``'$' + word`` (``len(word) + 1`` strings).

        The list starts with ``'$' + word`` and ends with ``word + '$'``.
        The latter is required so that a leading-wildcard query such as
        ``*abc`` (rotated to ``abc$``) can match the exact word ``abc``.
        """
        term = "$" + word
        rotations = [term]
        for _ in range(len(word)):
            term = term[-1] + term[:-1]  # rotate right by one character
            rotations.append(term)
        return rotations

    def construct(self):
        """Tokenize every file in ``self.directory`` and fill ``index``/``windex``."""
        # Set membership is O(1); the stopword list is consulted once per token.
        stopword_set = set(self.stopwords)

        # Sort so doc ids are reproducible across runs, and skip directories
        # (e.g. .ipynb_checkpoints) that open() cannot read.
        filenames = sorted(
            entry for entry in os.listdir(self.directory)
            if os.path.isfile(os.path.join(self.directory, entry))
        )

        for doc_id, filename in enumerate(tqdm(filenames)):
            self.id_to_file[doc_id] = filename
            with open(os.path.join(self.directory, filename), 'rt') as original:
                text = original.read()
            for sentence in sent_tokenize(text):
                for token in word_tokenize(sentence):
                    if not re.match("^[-'a-zA-Z]+$", token):  # not a proper term
                        continue
                    word = token.lower()
                    # Check the surface form too: Porter stems of stopwords
                    # ('was' -> 'wa', 'this' -> 'thi') no longer match the list.
                    if word in stopword_set:
                        continue
                    stemmed = ps.stem(word)
                    if stemmed in stopword_set:
                        continue
                    self.index[stemmed]['words'].add(word)
                    self.windex[word]['postings'].add(doc_id)

        # Second pass: document frequencies and permuterm rotations.
        for stem in self.index.keys():
            postings = set()
            for word in self.index[stem]['words']:
                entry = self.windex[word]
                entry['count'] = len(entry['postings'])
                postings |= set(entry['postings'])
                entry['rotations'] = set(self.produce_rotations(word))
            self.index[stem]['count'] = len(postings)

        if self.save:
            # Stored as a 0-d object array wrapping a plain dict.
            np.save(self.name, np.array(dict(self.index)))

    def construct_tgi(self):
        """Map each two-character gram of every indexed word to the word's stem."""
        for stem in self.index.keys():
            for word in self.index[stem]['words']:
                for k in range(len(word) - 1):
                    self.tgi[word[k:k+2]].add(stem)  # storing stems

In [None]:
# Build the index over the files in 'Datasets/Shakespeare' using NLTK's English
# stopword list. The constructor runs construct() and construct_tgi(); with the
# default save=True it also writes the stemmed index via np.save (name 'ii').
ii = InvertedIndex(directory = 'Datasets/Shakespeare', stopwords = stopwords.words('english'))

In [None]:
# Inspect the stemmed-index entry ('count' and 'words') for the stem 'whore'.
# index is a defaultdict, so looking up an absent key inserts an empty entry.
ii.index['whore']

In [None]:
# Inspect the word-index entry ('count', 'postings', 'rotations') for 'whoring'.
# windex is a defaultdict, so looking up an absent key inserts an empty entry.
ii.windex['whoring']

In [None]:
class QueryHandler:
    """Evaluates boolean and single-wildcard queries against an index object.

    The index object (``ii``) is expected to expose ``index``, ``windex`` and
    ``tgi`` mappings shaped like those built by ``InvertedIndex``.
    Intermediate results of parenthesised sub-expressions are stored in
    ``self.symbols`` under names of the form ``'@<n>'``.
    """

    def __init__(self):
        self.symbols = {}

    def rotate(self, wildcard):
        """Rotate ``'$' + wildcard`` so the text after the first ``*`` comes first.

        Returns:
            ``(rotated, True)`` when the term contains ``*`` (the ``*`` itself
            is dropped), otherwise ``(wildcard, False)``. Only the first ``*``
            is handled; any further ``*`` stays in the result literally.
        """
        term = '$' + wildcard
        star = term.find('*')
        if star == -1:
            return wildcard, False
        return term[star + 1:] + term[:star], True

    def union(self, p1, p2):
        """Return the sorted list of ids present in ``p1`` or ``p2``."""
        return sorted(set(p1) | set(p2))

    def inverse(self, p1, total):
        """Return the ids of ``total`` (in order) that are not in ``p1``."""
        excluded = set(p1)  # O(1) membership instead of scanning a list
        return [doc for doc in total if doc not in excluded]

    def intersection(self, p1, p2):
        """Return the sorted list of ids present in both ``p1`` and ``p2``."""
        return sorted(set(p1) & set(p2))

    def and_not(self, p1, p2):
        """Return the ids of ``p1`` (in order) that are not in ``p2``.

        Implemented as a set difference: the posting lists produced by
        ``match``/``union`` originate from sets, so a sorted-merge would be
        incorrect whenever the inputs are not in ascending order.
        """
        excluded = set(p2)
        return [doc for doc in p1 if doc not in excluded]

    def or_not(self, p1, p2, total):
        """Return ``p1`` united with the complement of ``p2`` within ``total``."""
        return self.union(p1, self.inverse(p2, total))

    def levenshtein_distance(self, word1, word2):
        """Return the edit distance (insert/delete/substitute, unit cost).

        The result is a NumPy float taken from the DP table.
        """
        rows, cols = len(word1) + 1, len(word2) + 1
        m = np.zeros((rows, cols))
        # First column: delete i characters of word1; first row: insert j
        # characters of word2. Rows follow word1 and columns follow word2.
        m[:, 0] = np.arange(rows)
        m[0, :] = np.arange(cols)

        for i in range(1, rows):
            for j in range(1, cols):
                if word1[i-1] == word2[j-1]:
                    m[i, j] = m[i-1, j-1]
                else:
                    m[i, j] = 1 + min(m[i-1, j], m[i, j-1], m[i-1, j-1])
        return m[rows - 1, cols - 1]

    def spell_correct(self, misspelled, ii, max_length_gap=4, max_distance=5):
        """Suggest the indexed stem closest to ``misspelled``.

        Candidates are the stems sharing the highest number of two-grams with
        ``misspelled``. Among their words within ``max_length_gap`` characters
        in length and ``max_distance`` edits, the stems at minimum edit
        distance are kept and the one with the largest ``(count, stem)`` wins.

        Returns:
            The chosen stem, or ``""`` when no candidate qualifies.
        """
        gram_hits = collections.Counter()  # stem -> number of matching two-grams
        for k in range(len(misspelled) - 1):
            # .get avoids inserting empty entries into the defaultdict index.
            gram_hits.update(ii.tgi.get(misspelled[k:k+2], ()))
        if not gram_hits:
            return ""

        best_overlap = max(gram_hits.values())
        candidates = [stem for stem, hits in gram_hits.items() if hits == best_overlap]

        by_distance = defaultdict(set)  # edit distance -> stems
        for stem in candidates:
            for word in ii.index[stem]['words']:
                if abs(len(word) - len(misspelled)) <= max_length_gap:
                    distance = self.levenshtein_distance(misspelled, word)
                    if distance <= max_distance:
                        by_distance[distance].add(stem)

        if not by_distance:
            return ""
        closest = by_distance[min(by_distance)]
        return max((ii.index[stem]['count'], stem) for stem in closest)[1]

    def _postings_for_stem(self, stem, ii):
        """Return the union of postings of every word filed under ``stem``."""
        if stem not in ii.index:  # membership test does not insert a default
            return set()
        postings = set()
        for word in ii.index[stem]['words']:
            postings |= set(ii.windex[word]['postings'])
        return postings

    def match(self, term, ii):
        """Return the list of doc ids matching a single query term.

        ``term`` may be a stored symbol (``'@n'``), a wildcard containing
        ``*``, or a plain word. Plain words are stemmed; when nothing matches,
        a spelling correction is attempted and announced via ``print``.
        """
        if not term:
            return []
        if term[0] == '@':
            return self.symbols[term]

        rotated, is_wild = self.rotate(term)
        if is_wild:
            postings = set()
            for entry in list(ii.index.values()):
                for word in entry['words']:
                    if len(word) < len(term) - 1:
                        continue
                    word_entry = ii.windex[word]
                    if any(r.startswith(rotated) for r in word_entry['rotations']):
                        postings |= set(word_entry['postings'])
            return sorted(postings)

        postings = self._postings_for_stem(ps.stem(term), ii)
        if not postings:  # possibly a misspelled word
            corrected = self.spell_correct(term, ii)
            if corrected:
                print(term + " is corrected to " + corrected)
                # corrected is already an index key (a stem); look it up
                # directly instead of stemming it a second time.
                postings = self._postings_for_stem(corrected, ii)
        return sorted(postings)

    def evaluate_expr(self, expr, i, ii, total):
        """Evaluate one flat expression and store it as symbol ``'@<i>'``.

        Supported shapes (tokens separated by whitespace):
            ``x`` | ``not x`` | ``x and y`` | ``x and not y`` |
            ``x or y`` | ``x or not y``
        Any operator other than ``and`` in second position is treated as
        ``or``. Returns the new symbol name.
        """
        print("evaluating " + expr + " and storing as @" + str(i))
        tokens = expr.split()  # tolerant of repeated/leading/trailing spaces
        new_symbol = '@' + str(i)

        if not tokens:
            result = []
        elif tokens[0] == "not":
            result = self.inverse(self.match(tokens[1], ii), total)
        elif len(tokens) == 1:
            result = self.match(tokens[0], ii)
        else:
            negated = tokens[2] == 'not'
            right_term = tokens[3] if negated else tokens[2]
            left = self.match(tokens[0], ii)
            right = self.match(right_term, ii)
            if tokens[1] == 'and':
                result = self.and_not(left, right) if negated else self.intersection(left, right)
            else:
                result = self.or_not(left, right, total) if negated else self.union(left, right)

        self.symbols[new_symbol] = result
        return new_symbol

    def compute(self, query, ii, total):
        """Evaluate ``query`` and return the matching doc ids.

        Parenthesised groups are evaluated innermost-first: on each ``)`` the
        characters back to the matching ``(`` are evaluated and replaced on
        the stack by the resulting symbol name. ``total`` is the universe of
        doc ids used for negation. A blank query yields ``[]``.
        """
        self.symbols = {}
        if not query.strip():
            return []

        stack = []
        i = 0
        for c in query:
            if c != ')':
                stack.append(c)
                continue
            expr = ""
            while stack:
                char = stack.pop()
                if char == '(':
                    # expr was collected back-to-front while popping.
                    stack += list(self.evaluate_expr(expr[::-1], i, ii, total))
                    i += 1
                    break
                expr += char
        if stack:
            self.evaluate_expr("".join(stack), i, ii, total)
            i += 1
        if i == 0:  # nothing was evaluated (e.g. only a stray ')')
            return []
        return self.symbols['@' + str(i - 1)]

In [None]:
# Display the entire stemmed index (stem -> {'count', 'words'}); the output may be very large.
ii.index

In [None]:
# Run a trailing-wildcard query; the third argument is the universe of doc ids
# that compute() passes on for evaluating "not".
query = "man*"
qh = QueryHandler()
qh.compute(query, ii, list(ii.id_to_file.keys()))

In [None]:
# Spot-check the edit distance between two five-letter strings.
qh.levenshtein_distance('whorr', 'short')

In [None]:
# Inspect the word-index entry ('count', 'postings', 'rotations') for 'whore'.
ii.windex["whore"]

In [None]:
def ld(word1, word2):
    """Return the Levenshtein distance between ``word1`` and ``word2``.

    Insertions, deletions and substitutions each cost 1. The result is a
    NumPy float read from the bottom-right cell of the DP table, whose rows
    follow ``word1`` and whose columns follow ``word2``.
    """
    m = np.zeros((len(word1)+1, len(word2)+1))

    for i in range(len(word1)+1):
        for j in range(len(word2)+1):
            if i == 0:
                # Empty prefix of word1: insert the first j characters of word2.
                m[i][j] = j
            elif j == 0:
                # Empty prefix of word2: delete the first i characters of word1.
                m[i][j] = i
            elif word1[i-1] == word2[j-1]:
                m[i, j] = m[i-1, j-1]
            else:
                m[i, j] = 1 + min(m[i-1, j], m[i, j-1], m[i-1, j-1])
    # The elif chain keeps the recurrence away from row 0, where i-1 would
    # wrap around to the last row and last character.
    return m[len(word1), len(word2)]

In [None]:
# Compute the distance with the standalone ld() helper and display it.
n = ld('whorr', 'short')
n