In [1]:
import os
from collections import defaultdict
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
def produce_rotations(word):
    term = "$" + word
    res = [term]
    for i in range(len(word) - 1):
        term = term[-1] + term[:-1]
        res.append(term)
    return res

In [3]:
def rotate(wildcard):
    term = '$' + wildcard
    for i, l in enumerate(term, 1):
        if l == "*":
            return term[i:] + term[:i-1], True
    else:
        return wildcard, False

In [4]:
def union(p1, p2):
    i = j = 0
    res = []
    
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            res.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            res.append(p1[i])
            i += 1
        elif p1[i] > p2[j]:
            res.append(p2[j])
            j += 1
    if i < len(p1):
        res += p1[i:]
    else:
        res += p2[j:]
        
    return res

In [15]:
def inverse(p1, total):
    return [i for i in total if i not in p1]

In [16]:
def intersection(p1, p2):
    i = j = 0
    res = []
    
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            res.append(p1[i])
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            i += 1
        elif p1[i] > p2[j]:
            j += 1
        
    return res

In [17]:
def and_not(p1, p2):
    i = j = 0
    res = []
    
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            i += 1
            j += 1
        elif p1[i] < p2[j]:
            res.append(p1[i])
            i += 1
        elif p1[i] > p2[j]:
            j += 1
    if i < len(p1):
        res += p1[i:]
        
    return res

In [24]:
def or_not(p1, p2, total):    
    return union(p1, inverse(p2, total))

In [159]:
symbols = {}

In [160]:
def match(term, ii):
    if term[0] == '@':
        return symbols[term]
    res = []
    rotated, is_wild = rotate(term)
    if is_wild:
        for w in ii.keys():
            if len(w) >= len(term)-1:
                for r in ii[w]['rotations']:
                    if r[:len(rotated)] == rotated:
                        res = union(res, ii[w]['postings'])
                        break
    else:
        for w in ii.keys():
            if w == rotated:
                res = union(res, ii[w]['postings'])
                break
    return res

In [208]:
def levenshtein_distance(word1, word2):
    l = max(len(word1), len(word2))
    m = np.zeros((l, l))
    for i in range(len(word1)):
        m[i, 0] = i
        
    for j in range(len(word2)):
        m[0, j] = j
    
    for i in range(1, len(word1)):
        for j in range(1, len(word2)):
            if word1[i] == word2[j]:
                m[i, j] = min(m[i-1, j] + 1, min(m[i, j-1] + 1, m[i-1, j-1]))
            else:
                m[i, j] = min(m[i-1, j] + 1, min(m[i, j-1] + 1, m[i-1, j-1] + 1))
    return m[len(word1)-1, len(word2)-1]

In [8]:
directory = 'Datasets/Shakespeare'
ii = defaultdict(lambda: {'count': [], 'words': [], 'rotations': [], 'postings': []})

id_to_txt = {}
for i, filename in enumerate(os.listdir(directory)):
    id_to_txt[i] = filename
    with open(os.path.join(directory, filename), 'rt') as original:
        sents = sent_tokenize(original.read())
        for s in sents:
            for w in word_tokenize(s):
                stemmed = ps.stem(w).lower()
                if stemmed not in stopwords.words('english'):
                    if i not in ii[stemmed]['postings']:
                        ii[stemmed]['postings'].append(i)
                    if w not in ii[stemmed]['words']:
                        ii[stemmed]['words'].append(w)
                
    print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41


In [9]:
for t in ii.keys():
    ii[t]['count'] = len(ii[t]['postings'])
    
    for w in ii[t]['words']:
        ii[t]['rotations'] += produce_rotations(w)
        


In [11]:
np.save('ii', np.array(dict(ii)))

In [12]:
ii = np.load('ii.npy', allow_pickle=True).item()

In [13]:
match('mids*m', ii)

[]

In [14]:
match('mid*er', ii)

[0, 3, 7, 40]

In [171]:
def evaluate_expr(expr, i):
    print("evaluating " + expr + " and storing as @" + str(i))
    # var or not var
    # var or var
    # var and not var
    # var and var
    # var
    # not var
    total = [i for i in range(42)]
    
    keywords = ["and", "or", "not"]
    expr = expr.split(" ")
    new_symbol = '@' + str(i)
    
    if expr[0] == "not":
        symbols[new_symbol] = inverse(match(expr[1], ii), total)
        return new_symbol
        
    else:
        if len(expr) == 1:
            symbols[new_symbol] = match(expr[0], ii)
            return new_symbol
        
        if expr[1] == 'and':
            if expr[2] == 'not':
                symbols[new_symbol] = and_not(match(expr[0], ii), match(expr[3], ii))
                return new_symbol
            
            else:
                symbols[new_symbol] = intersection(match(expr[0], ii), match(expr[2], ii))
                return new_symbol
        
        else:
            if expr[2] == 'not':
                symbols[new_symbol] = or_not(match(expr[0], ii), match(expr[3], ii), total)
                return new_symbol
            
            else:
                symbols[new_symbol] = union(match(expr[0], ii), match(expr[2], ii))
                return new_symbol

In [172]:
def perform_query(query):
    stack = []
    i = 0
    for c in query:
        if c != ')':
            stack.append(c)
        else:
            expr = ""
            while stack:
                char = stack.pop()
                if char != '(':
                    expr += char
                else:
                    stack += list(evaluate_expr(expr[::-1], i))
                    i += 1
                    break
    if stack:
        evaluate_expr("".join(stack), i)
        i += 1
    return symbols['@' + str(i - 1)]

In [173]:
q1 = "(mid*er and (B or not C)) or D"
q2 = "not mid*er"

In [174]:
perform_query(q1)

evaluating B or not C and storing as @0
evaluating mid*er and @0 and storing as @1
evaluating @1 or D and storing as @2


[0, 3, 7, 40]

In [175]:
symbols

{'@0': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41],
 '@1': [0, 3, 7, 40],
 '@2': [0, 3, 7, 40]}

In [177]:
ii

{'midsumm': {'count': 4,
  'words': ['Midsummer', 'midsummer'],
  'rotations': ['$Midsummer',
   'r$Midsumme',
   'er$Midsumm',
   'mer$Midsum',
   'mmer$Midsu',
   'ummer$Mids',
   'summer$Mid',
   'dsummer$Mi',
   'idsummer$M',
   '$midsummer',
   'r$midsumme',
   'er$midsumm',
   'mer$midsum',
   'mmer$midsu',
   'ummer$mids',
   'summer$mid',
   'dsummer$mi',
   'idsummer$m'],
  'postings': [0, 3, 7, 40]},
 'night': {'count': 41,
  'words': ['Night', 'night', 'nights', 'nighted'],
  'rotations': ['$Night',
   't$Nigh',
   'ht$Nig',
   'ght$Ni',
   'ight$N',
   '$night',
   't$nigh',
   'ht$nig',
   'ght$ni',
   'ight$n',
   '$nights',
   's$night',
   'ts$nigh',
   'hts$nig',
   'ghts$ni',
   'ights$n',
   '$nighted',
   'd$nighte',
   'ed$night',
   'ted$nigh',
   'hted$nig',
   'ghted$ni',
   'ighted$n'],
  'postings': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,


In [183]:
two_gram_index = defaultdict(lambda: set())
for i in ii.keys():
    for j in ii[i]['words']:
        for k in range(len(j) - 1):
            two_gram_index[j[k:k+2]].add(i)

In [184]:
two_gram_index

defaultdict(<function __main__.<lambda>()>,
            {'Mi': {'mi',
              'michael',
              'michaelma',
              'mida',
              'middl',
              'midnight',
              'midsumm',
              'midway',
              'might',
              "might'st",
              'mighti',
              'milan',
              'milan.',
              'mildli',
              'mile',
              'milford',
              'milk-liv',
              'miller',
              'million',
              'milo',
              'mind',
              'mine',
              'minerva',
              'mingl',
              'minim',
              'minion',
              'mino',
              'minola',
              'minotaur',
              'miranda',
              'miranda.',
              'mirth',
              'misanthropo',
              'miscarri',
              'mischanc',
              'mischief',
              'misconst',
              'miscreant',
              'misena',
 

In [186]:
word = "midsumer"

In [188]:
match(word, ii)

[]

In [213]:
res = []
for i in range(len(word) - 1):
    res += two_gram_index[word[i:i+2]]

In [214]:
res

['minute-whil',
 'minimu',
 'abomin',
 'demi-paradis',
 'midday',
 'high-mind',
 'alchemist',
 'mistook',
 'life-harm',
 'miscarri',
 'gremio.',
 'miracl',
 'fumitori',
 'swim',
 'armipot',
 'miretur',
 'demi-puppet',
 'enemi',
 'premis',
 'misde',
 'admir',
 'ransom',
 "ptolemies'",
 'demi-devil',
 'demigod',
 'minx',
 'demis',
 'calmi',
 'bloom',
 'mindless',
 'simil',
 'unmix',
 'smilet',
 'emilia',
 'presurmis',
 'besmirch',
 'smith',
 'admitted.',
 'mamilliu',
 'commixtur',
 'qualmish',
 'mild',
 'mice',
 'intermix',
 'mitig',
 'animi',
 'illumin',
 'misti',
 'submiss',
 'hum',
 'sap-consum',
 'beseem',
 'misheard',
 'mimic',
 'atomi',
 'promiseth',
 'misbecomingli',
 'omit',
 'gemini',
 "mistak'st",
 'motley-mind',
 'frame',
 'minikin',
 'tragical-comical-historical-pastor',
 'militari',
 'midsumm',
 'illumineth',
 'diminut',
 "hermits'",
 'skirmish',
 'gremio',
 'mind.',
 'milksop',
 'misericord',
 'camomil',
 'undermin',
 'blood-consum',
 'milch-kin',
 'misbehav',
 'pigrogromit

In [193]:
import collections

In [196]:
freqs = dict(collections.Counter(res))

In [201]:
freqs = {k: v for k, v in reversed(sorted(freqs.items(), key=lambda item: item[1]))}
freqs

{'//shakespeare.folger.edu/shakespeares-works/a-midsummer-nights-dream/': 7,
 'midsumm': 7,
 'summer-seem': 5,
 'consum': 5,
 'summer-swel': 4,
 'summer': 4,
 "summer'": 4,
 'mermaid': 4,
 'assum': 4,
 'presum': 4,
 'innumer': 3,
 'drummer': 3,
 'blumer': 3,
 'mummer': 3,
 'perfum': 3,
 'aumerl': 3,
 'over-measur': 3,
 'submerg': 3,
 'sumpter': 3,
 'sum': 3,
 'suppertim': 3,
 'insult': 3,
 'summon': 3,
 'presume.': 3,
 '//shakespeare.folger.edu/shakespeares-works/measure-for-measure/': 3,
 'subdument': 3,
 'resum': 3,
 'adsum': 3,
 'herdsmen': 3,
 'command': 3,
 '//shakespeare.folger.edu/shakespeares-works/the-merry-wives-of-windsor/': 3,
 'bloodsuck': 3,
 'northumberland': 3,
 'coffer-lid': 3,
 'chambermaid': 3,
 'meridian': 3,
 'mermaid-lik': 3,
 'redeem': 3,
 'epidamium': 3,
 'misgovern': 3,
 'midst': 3,
 'confirm': 3,
 "'midst": 3,
 'humid': 3,
 'misterm': 3,
 'fumit': 3,
 'overwhelm': 3,
 'perform': 3,
 'astronom': 3,
 "o'erwhelm": 3,
 'amidst': 3,
 'pyramid': 3,
 'fume': 3,
 'sum

In [204]:
ff = defaultdict(lambda: [])
for k, v in freqs.items():
    ff[v].append(k)

In [205]:
ff

defaultdict(<function __main__.<lambda>()>,
            {7: ['//shakespeare.folger.edu/shakespeares-works/a-midsummer-nights-dream/',
              'midsumm'],
             5: ['summer-seem', 'consum'],
             4: ['summer-swel',
              'summer',
              "summer'",
              'mermaid',
              'assum',
              'presum'],
             3: ['innumer',
              'drummer',
              'blumer',
              'mummer',
              'perfum',
              'aumerl',
              'over-measur',
              'submerg',
              'sumpter',
              'sum',
              'suppertim',
              'insult',
              'summon',
              'presume.',
              '//shakespeare.folger.edu/shakespeares-works/measure-for-measure/',
              'subdument',
              'resum',
              'adsum',
              'herdsmen',
              'command',
              '//shakespeare.folger.edu/shakespeares-works/the-merry-wives-of-windsor/'

In [210]:
ed = defaultdict(lambda: [])
for f in list(ff.keys())[:3]:
    for w in ff[f]:
        print("edit dist between " + w + " and " + word + " is " + str(levenshtein_distance(word, w)))
        ed[levenshtein_distance(word, w)].append(w)

edit dist between //shakespeare.folger.edu/shakespeares-works/a-midsummer-nights-dream/ and midsumer is 61.0
edit dist between midsumm and midsumer is 2.0
edit dist between summer-seem and midsumer is 9.0
edit dist between consum and midsumer is 4.0
edit dist between summer-swel and midsumer is 9.0
edit dist between summer and midsumer is 4.0
edit dist between summer' and midsumer is 5.0
edit dist between mermaid and midsumer is 7.0
edit dist between assum and midsumer is 4.0
edit dist between presum and midsumer is 4.0


In [215]:
max([(ii[x]['count'], x) for x in ed[min(list(ed.keys()))]])[1]

'midsumm'