In [1]:
import util
import os
from nltk.corpus import stopwords
import pandas as pd


### Spell Correction
Mispelled word: agie  
Insertion: age  
Subtition: adie  
Subtition: amie  
Deletion: agile  
Deletion: augie  

In [2]:
topdir = 'resource/iula'
all_content = []
for dirpath, dirnames, filename in os.walk(topdir) :
    for name in filename :
        if name.endswith('plain.txt'):
            with open(os.path.join(dirpath, name)) as f :
                all_content.append(f.read())


all_content = pd.Series(all_content)

In [3]:
stem_cache = util.create_stem_cache(all_content)
stop_dict = set(stopwords.words('English'))
my_custom_processor = util.create_custom_preprocessor(stop_dict, stem_cache)

In [4]:
processed_content = [my_custom_processor(s) for s in all_content]

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(processed_content)
freq_iula = vectorizer.transform(processed_content)
freq_iula = pd.DataFrame(freq_iula.todense(), columns=vectorizer.get_feature_names_out()).sum()
freq_iula

00            13
000          234
0000           2
00000000       2
000030         3
            ... 
única          2
únicament      1
úniqu          1
útil           2
überbau        1
Length: 25710, dtype: int64

In [6]:
query = ['age', 'adie', 'amie', 'agile', 'augie']
transformed_query = [vectorizer.inverse_transform(vectorizer.transform([q])) for q in query]
query_freq = pd.Series([freq_iula.T.loc[tq[0]].values[0] if len(tq[0]) > 0 else 0 for tq in transformed_query], index= query)
query_freq

age      90
adie      0
amie      0
agile     0
augie     0
dtype: int64

In [7]:
# IULA Corpus
IULA = pd.DataFrame(query_freq, columns=['frequency'])
IULA_pop = len(processed_content)
IULA['P(w)'] = IULA['frequency']/IULA_pop
IULA['rank'] = IULA['frequency'].rank(ascending=False).astype(int)
IULA

Unnamed: 0,frequency,P(w),rank
age,90,0.703125,1
adie,0,0.0,3
amie,0,0.0,3
agile,0,0.0,3
augie,0,0.0,3


In [8]:
# COCA Corpus
COCA = pd.DataFrame([ ['age',202733],['adie',135], ['amie',236],['agile',2446] , ['augie',566]], columns=['word','frequency'])
COCA_pop = 1001610938
COCA['P(w)'] = COCA['frequency']/COCA_pop
COCA['rank'] = COCA['frequency'].rank(ascending=False).astype(int)
COCA

Unnamed: 0,word,frequency,P(w),rank
0,age,202733,0.0002024069,1
1,adie,135,1.347829e-07,5
2,amie,236,2.356204e-07,4
3,agile,2446,2.442066e-06,2
4,augie,566,5.650897e-07,3


# Adding P(X|W) from norvig

In [9]:
norvig_orig = pd.read_csv('https://norvig.com/ngrams/count_big.txt',sep='\t' , encoding="ISO-8859-1", header=None)
norvig_orig = norvig_orig.dropna()
norvig_orig.columns = ['term', 'freq']
norvig_orig.head()

Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3


In [10]:
norvig = pd.read_csv('https://norvig.com/ngrams/count_1edit.txt',sep='\t' ,encoding="ISO-8859-1", header=None)
norvig.columns= ['term', 'edit']
norvig = norvig.set_index('term')
norvig.head()

Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559


In [11]:
def get_count(c, norvig_orig):
    return norvig_orig.apply(lambda x: x.term.count(c) * x.freq, axis=1).sum()

In [12]:
import itertools
from string import ascii_lowercase

character_set = list(map(''.join, itertools.product(ascii_lowercase, repeat=1))) + list(map(''.join, itertools.product(ascii_lowercase, repeat=2)))

In [13]:
from multiprocessing.pool import ThreadPool

pool = ThreadPool(8)
freq_list = pool.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))

freq_df = pd.DataFrame([character_set, freq_list], index=['char', 'freq']).T
freq_df = freq_df.set_index('char')
freq_df.head()

Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999


In [14]:
IULA['P(x|w)'] = [
    (0 / freq_df.loc['i'].values)[0],
    (norvig.loc['g|d'].values / freq_df.loc['d'].values)[0],
    (norvig.loc['g|m'].values / freq_df.loc['m'].values)[0],
    (norvig.loc['i|il'].values / freq_df.loc['il'].values)[0],
    (norvig.loc['a|au'].values / freq_df.loc['au'].values)[0]
]
IULA['10^9 * P(x|w)P(w)'] = 1e9 * IULA['P(w)'] * IULA['P(x|w)']
IULA

Unnamed: 0,frequency,P(w),rank,P(x|w),10^9 * P(x|w)P(w)
age,90,0.703125,1,0.0,0.0
adie,0,0.0,3,0.000111,0.0
amie,0,0.0,3,3.9e-05,0.0
agile,0,0.0,3,0.003682,0.0
augie,0,0.0,3,0.020867,0.0


In [15]:
COCA['P(x|w)'] = [
    (0 / freq_df.loc['i'].values)[0],
    (norvig.loc['g|d'].values / freq_df.loc['d'].values)[0],
    (norvig.loc['g|m'].values / freq_df.loc['m'].values)[0],
    (norvig.loc['i|il'].values / freq_df.loc['il'].values)[0],
    (norvig.loc['a|au'].values / freq_df.loc['au'].values)[0]
]

COCA['10^9 * P(x|w)P(w)'] = 1e9 * COCA['P(w)'] * COCA['P(x|w)']
COCA

Unnamed: 0,word,frequency,P(w),rank,P(x|w),10^9 * P(x|w)P(w)
0,age,202733,0.0002024069,1,0.0,0.0
1,adie,135,1.347829e-07,5,0.000111,0.014997
2,amie,236,2.356204e-07,4,3.9e-05,0.009272
3,agile,2446,2.442066e-06,2,0.003682,8.992818
4,augie,566,5.650897e-07,3,0.020867,11.791809
