In [1]:
import sys
sys.path.append('../')

In [2]:
import math

from pandas import DataFrame
from pandas import read_csv
from pandas import concat
from tqdm.auto import tqdm

from typing import List
from typing import Union

from api.ea_jsoncontent_2_jsontabelid.kirjavigastaja import KIRJAVIGASTAJA

## I. Using local Pyhton class to introduce typing errors to wordforms 

In [3]:
def generate_misspellings(input_table, min_lenght:int = 4):
    """
    Generates lemma-misspelling pairs form lemma-wordform pairs

    The input dataframe has columns lemma, wordform. The output dataframe has columns lemma, misspelling.
    The number of rows corresponding to a single lemma varies as duplicated misspellings are omitted.
    All misspellings that are shorter than min_length are also omitted.
    As a result no misspellings are created or kept for some lemmas. 
    """

    assert set(input_table.columns) == {'lemma', 'wordform'}, 'Wrong input shape'
    wordform_count = len(input_table)
    error_generator = KIRJAVIGASTAJA(verbose=False, analyser='https://smart-search.tartunlp.ai/api/analyser/process')
    tbl = DataFrame({'lemma': [None] * wordform_count, 'misspelling': [None] * wordform_count})
    for i, row in input_table.iterrows():
        misspellings = error_generator.kirjavigur(row['wordform'])
        tbl.loc[i, 'lemma'] = row['lemma']
        tbl.loc[i, 'misspelling'] = set(map(lambda x: x[0], misspellings))

    return (tbl
            .groupby('lemma', as_index=False).agg(misspelling=('misspelling', lambda x: list(set.union(*x))))
            .explode('misspelling')
            .pipe(lambda df: df[~df['misspelling'].isna()])
            .pipe(lambda df: df[df['misspelling'].str.len() >= 4])
           )
    
display(generate_misspellings(DataFrame({'lemma': ['kala', 'kala'], 'wordform': ['kala', 'kalaga']})).head(10))

Unnamed: 0,lemma,misspelling
0,kala,klaaga
0,kala,kalaka
0,kala,kkala
0,kala,kalaaga
0,kala,akla
0,kala,kaaga
0,kala,aklaga
0,kala,kaalga
0,kala,kaalaga
0,kala,kalgaa


### Generation of all misspellings

In [4]:
BLOCK_SIZE = 100
wordforms = read_csv('results/caption_index/state_laws_all_wordforms.csv')
block_count = math.floor(len(wordforms)/BLOCK_SIZE)
block_count = 10


In [5]:
result = [None] * (block_count + 1)
for i in tqdm(range(block_count + 1), total=block_count):
    result[i] = generate_misspellings(wordforms.loc[BLOCK_SIZE * i: BLOCK_SIZE * (i + 1)])
    
result = concat(result, axis=0).reset_index(drop=True).sort_values(['lemma', 'misspelling'])
result = result.drop_duplicates()

assert not any(result.duplicated()), 'Unexpected duplications'
display(result.head(10))

  0%|          | 0/10 [00:00<?, ?it/s]

Unnamed: 0,lemma,misspelling
5,AKV-EL,AAKV-EL
9,AKV-EL,AK-VEL
14,AKV-EL,AKKV-EL
3,AKV-EL,AKV--EL
2,AKV-EL,AKV-E
6,AKV-EL,AKV-EEL
13,AKV-EL,AKV-ELL
10,AKV-EL,AKV-L
8,AKV-EL,AKV-LE
15,AKV-EL,AKVE-L


### Conflicts analysis for misspellings

A misspelling can be generated by worforms corresponding to different lemmas. This is problematic if the number of lemmas is high.
For such cases, it is better to give up with corrections instead of confusing the user with too many unrelated results.
We fix the maximal number of lemmas to three.

In [6]:
MAX_LEMMAS = 3

misspelling_dictionary = (result
                          .groupby('misspelling', as_index=False)
                          .agg(lemmas=('lemma', lambda x: tuple(set(x))))
                          .assign(lemma_count=lambda df: df['lemmas'].map(lambda x: len(x))))

conflicts = misspelling_dictionary[misspelling_dictionary['lemma_count'] > 1]

In [9]:
print('Example of conflicts')
display(conflicts[conflicts['lemma_count'] > MAX_LEMMAS].head(10))

print('Lemma complects that cause conflicts')
display(conflicts[['lemmas', 'lemma_count']].drop_duplicates().sort_values('lemma_count', ascending=False).reset_index(drop=True))

Example of conflicts


Unnamed: 0,misspelling,lemmas,lemma_count
6530,BBeltides,"(Belti, Beltides, Belt, Beltide)",4
6570,BBeltidesse,"(Belti, Beltides, Belt, Beltide)",4
6571,BBeltidest,"(Belti, Beltides, Belt, Beltide)",4
8903,Beeltides,"(Belti, Beltides, Belt, Beltide)",4
8943,Beeltidesse,"(Belti, Beltides, Belt, Beltide)",4
8944,Beeltidest,"(Belti, Beltides, Belt, Beltide)",4
9126,Beldides,"(Belti, Beltides, Belt, Beltide)",4
9166,Beldidesse,"(Belti, Beltides, Belt, Beltide)",4
9167,Beldidest,"(Belti, Beltides, Belt, Beltide)",4
10152,Belides,"(Belti, Beltides, Belt, Beltide)",4


Lemma complects that cause conflicts


Unnamed: 0,lemmas,lemma_count
0,"(Belti, Beltides, Belt, Beltide)",4
1,"(Belti, Belt, Beltide)",3
2,"(Belti, Belgia, Belt)",3
3,"(Belti, Belt)",2
4,"(Beltide, Beltides)",2
5,"(Baltica, Baltic)",2
6,"(Belti, Beltide)",2
7,"(Cartagena, Cartage)",2
8,"(Credit, Credi)",2


### Final dictionary for wordforms and misspellings

Now we can combine dictionary of wordforms and misspelling into a single normalisation dictionary.
Note that there can be more one row for particular input string

In [8]:
normalisation_table = concat([
    misspelling_dictionary
    .loc[misspelling_dictionary['lemma_count'] <= MAX_LEMMAS]
    .explode('lemmas')
    .rename(columns={'lemmas': 'lemma', 'misspelling': 'search_string'})
    .assign(misspelling=True)
    [['misspelling', 'search_string', 'lemma']],
    
    wordforms
    .rename(columns={'wordform': 'search_string'})
    .assign(misspelling=False)[['misspelling', 'search_string', 'lemma']]
    ], axis=0).sort_values(['lemma', 'search_string'])
 
display(normalisation_table.sample(10))

normalisation_table.to_csv('results/caption_index/state_laws_normalisation_dictionary.csv', header=True, index=False)

Unnamed: 0,misspelling,search_string,lemma
27264,False,keskkonnaregistreid,keskkonnaregister
31104,True,eBltidesidelt,Beltides
10124,True,Belias,Belgia
40156,False,linnakohtuteta,linnakohus
41846,False,lähetatutele,lähetatud
45382,False,mereliiklusisse,mereliiklus
4106,True,Antartkikaile,Antarktika
90126,False,väärikuga,väärik
16626,True,Cartagekks,Cartage
9473,False,augusteis,august
