In [29]:
import util
import os
from nltk.corpus import stopwords
import pandas as pd


In [30]:
topdir = 'resource/iula'
all_content = []
for dirpath, dirnames, filename in os.walk(topdir) :
    for name in filename :
        if name.endswith('plain.txt'):
            with open(os.path.join(dirpath, name)) as f :
                all_content.append(f.read())


all_content = pd.Series(all_content)
# processed_content = [util.preprocess(s , stop_dict, stem_cache) for s in all_content]

In [31]:
stem_cache = util.create_stem_cache(all_content)
stop_dict = set(stopwords.words('English'))
my_custom_processor = util.create_custom_preprocessor(stop_dict, stem_cache)

In [32]:
processed_content = [my_custom_processor(s) for s in all_content]

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(processed_content)
freq_iula = vectorizer.transform(processed_content)
freq_iula = pd.DataFrame(freq_iula.todense(), columns=vectorizer.get_feature_names_out()).sum()
freq_iula

aaa             1
aaaaaa          1
aalborg         2
aarhu           1
aaron           3
               ..
zuckerkandl     1
zurich          4
zvi             1
zygos           1
zygot          10
Length: 22529, dtype: int64

In [34]:
query = ['deet', 'deft', 'defer', 'defect', 'defeat']
transformed_query = [vectorizer.inverse_transform(vectorizer.transform([q])) for q in query]
query_freq = pd.Series([freq_iula.T.loc[tq[0]].values[0] if len(tq[0]) > 0 else 0 for tq in transformed_query], index= query)
query_freq

deet       0
deft       0
defer      5
defect    79
defeat     9
dtype: int64

In [35]:
IULA = pd.DataFrame(query_freq, columns=['frequency'])
IULA_pop = len(processed_content)
IULA['P(w)'] = IULA['frequency']/IULA_pop
IULA['rank'] = IULA['frequency'].rank(ascending=False).astype(int)

# Adding P(X|W) from norvig

In [36]:
norvig_orig = pd.read_csv('https://norvig.com/ngrams/count_big.txt',sep='\t' , encoding="ISO-8859-1", header=None)
norvig_orig = norvig_orig.dropna()
norvig_orig.columns = ['term', 'freq']
norvig_orig.head()

Unnamed: 0,term,freq
0,a,21160
1,aah,1
2,aaron,5
3,ab,2
4,aback,3


In [37]:
norvig = pd.read_csv('https://norvig.com/ngrams/count_1edit.txt',sep='\t' ,encoding="ISO-8859-1", header=None)
norvig.columns= ['term', 'edit']
norvig = norvig.set_index('term')
norvig.head()

Unnamed: 0_level_0,edit
term,Unnamed: 1_level_1
e|i,917
a|e,856
i|e,771
e|a,749
a|i,559


In [38]:
def get_count(c, norvig_orig):
    return norvig_orig.apply(lambda x: x.term.count(c) * x.freq, axis=1).sum()

In [39]:
import itertools
from string import ascii_lowercase

character_set = list(map(''.join, itertools.product(ascii_lowercase, repeat=1))) + list(map(''.join, itertools.product(ascii_lowercase, repeat=2)))

In [40]:
from multiprocessing.pool import ThreadPool

pool = ThreadPool(8)
freq_list = pool.starmap(get_count, zip(character_set, itertools.repeat(norvig_orig)))

freq_df = pd.DataFrame([character_set, freq_list], index=['char', 'freq']).T
freq_df = freq_df.set_index('char')
freq_df

Unnamed: 0_level_0,freq
char,Unnamed: 1_level_1
a,407349
b,73161
c,144964
d,215698
e,632999
...,...
zv,1
zw,1
zx,0
zy,32


In [41]:
COCA = IULA
COCA['P(x|w)'] = [
    (0 / freq_df.loc['f'].values)[0],
    (norvig.loc['e| '].values / freq_df.loc['e'].values)[0],
    (norvig.loc['t|r'].values / freq_df.loc['r'].values)[0],
    (norvig.loc['e|ea'].values / freq_df.loc['ea'].values)[0],
    (norvig.loc['e|ec'].values / freq_df.loc['ec'].values)[0],
]

COCA['10^9 * P(x|w)P(w)'] = 1e9 * COCA['P(w)'] * COCA['P(x|w)']
COCA

Unnamed: 0,frequency,P(w),rank,P(x|w),10^9 * P(x|w)P(w)
deet,0,0.0,4,0.0,0.0
deft,0,0.0,4,3e-06,0.0
defer,5,0.039062,3,3.6e-05,1388.126
defect,79,0.617188,1,0.012834,7920979.0
defeat,9,0.070312,2,0.003167,222672.8
