In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from IPython.display import display, HTML

# Inject a CSS rule that widens the notebook container to 95 % of the window.
wide_container_css = "<style>.container { width: 95% !important; }</style>"
display(HTML(wide_container_css))

In [None]:
from collections import Counter
from tqdm.autonotebook import tqdm
import sys
sys.path.append('..')
import logging

In [None]:
# Route log records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

# Named logger used by this notebook.
logger = logging.getLogger('wm2')

In [None]:
import pandas as pd
# pd.set_option('display.max_colwidth', None)
from symspellpy import SymSpell, Verbosity
from uralicNLP import uralicApi
from lib import dbutil

In [None]:
# Uncomment this if you get errors from uralicNLP
# uralicApi.download('fin')

### Orthographic neighbourhood calculation

Calculate the orthographic neighbourhood with a spelling dictionary.
 - symspellpy: https://github.com/mammothb/symspellpy

Distance metrics:
 - Hamming distance of 1
   - one letter substitution
   - https://en.wikipedia.org/wiki/Hamming_distance
 - Levenshtein distance of 1
   - one letter change: deletion, addition or substitution

Only the neighbourhood size based on the Hamming distance is stored in the database.

In [None]:
# Database file that the queries in this notebook read from.
db_path = '../data/ns24_freq2.db'
dbc = dbutil.DatabaseConnection(db_path)

In [None]:
# Fetch every row of the `forms` table (todf=True: result as a DataFrame).
connection = dbc.get_connection()
sdf = dbutil.adhoc_query(connection, "select * from forms", todf=True)
sdf

In [None]:
# Spelling dictionary restricted to suggestions within edit distance 1.
max_edit_distance = 1
sym_spell = SymSpell(max_dictionary_edit_distance=max_edit_distance)

In [None]:
# Register every form together with its frequency in the spelling dictionary.
form_frequency_pairs = zip(sdf.form, sdf.frequency)
for word, freq in tqdm(form_frequency_pairs, total=len(sdf)):
    sym_spell.create_dictionary_entry(word, freq)

#### Neighbourhood calculation algorithm

The algorithm uses a combination of the spelling dictionary and the uralicNLP morphological analyzer for Finnish.
 - First, the speller suggestions for a form are fetched from the dictionary.
 - Frequency cutoffs:
   - If the frequency of the suggestion is higher than some preset value, the suggestion is accepted.
   - If the frequency of the suggestion is lower than some preset value, the suggestion is rejected.
 - Otherwise the suggestion is checked by the morphological analyzer.
   - If the morphological analyzer accepts the suggestion (= finds an analysis for it), the suggestion is accepted.
 - The neighbourhood size is the number of accepted suggestions.

Currently the frequency cutoffs are fixed, so the neighbourhood values are not commensurate (directly comparable) across databases of different sizes.

In [None]:
# Frequency cutoffs applied to each speller suggestion.
autofreq = 10000  # count >= autofreq: accepted without asking the analyzer
minfreq = 100     # count <  minfreq: rejected without asking the analyzer

# Analyzer verdict per suggestion term. The same term is suggested for many
# different forms, so remembering the verdict avoids repeating the analyzer
# call for every form it neighbours.
_analysis_cache = {}


def _has_analysis(term):
    """Return True if uralicApi returns at least one Finnish analysis for `term`."""
    if term not in _analysis_cache:
        _analysis_cache[term] = len(uralicApi.analyze(term, "fin")) > 0
    return _analysis_cache[term]


# form -> list of (suggestion term, suggestion count) for every accepted suggestion
finals = {}
for form in tqdm(sdf.form, total=len(sdf)):
    formfinals = []
    for suggestion in sym_spell.lookup(form, Verbosity.ALL):
        if suggestion.distance == 0:
            # Distance 0 is the looked-up form itself, not a neighbour.
            continue
        if suggestion.count >= autofreq:
            accepted = True
        elif suggestion.count < minfreq:
            accepted = False
        else:
            # Mid-frequency suggestions must be recognised by the analyzer.
            accepted = _has_analysis(suggestion.term)
        if accepted:
            formfinals.append((suggestion.term, suggestion.count))
    finals[form] = formfinals


In [None]:
def _is_substitution(candidate, target):
    """Return True if `candidate` differs from `target` in exactly one position.

    Strings of different length are never a substitution of each other.
    """
    if len(candidate) != len(target):
        return False
    return sum(a != b for a, b in zip(candidate, target)) == 1


levdict = Counter()  # form -> number of Levenshtein-distance-1 neighbours
hamdict = Counter()  # form -> number of Hamming-distance-1 neighbours
for key, analysis in tqdm(finals.items()):
    # Equal length alone does not imply a substitution: symspellpy's default
    # metric is Damerau-Levenshtein, so an adjacent transposition ("ab" -> "ba")
    # is also returned at distance 1 although it differs in two positions.
    hams = [w for w, _ in analysis if _is_substitution(w, key)]
    # The suggestions were collected at edit distance 1, so a length difference
    # means an insertion or a deletion; same-length suggestions only count when
    # they are true substitutions (transpositions are not Levenshtein-1).
    levs = [w for w, _ in analysis
            if len(w) != len(key) or _is_substitution(w, key)]
    levdict[key] = len(levs)
    hamdict[key] = len(hams)

#### Verify that the neighbourhood calculation produces the same results as those stored in the database

In [None]:
# Take the first ten rows and attach the freshly computed Hamming neighbourhood
# size next to the columns read from the database, for a side-by-side look.
sdf2 = sdf.iloc[:10].copy()
nuhoods = [hamdict[sample_form] for sample_form in sdf2['form']]
sdf2['nuhood'] = nuhoods
sdf2