In [1]:
import re
import math
import time
import requests
import datetime
import urllib.parse

from datetime import date
from pandas import DataFrame 
from pandas import read_csv
from pandas import concat
from tqdm.auto import tqdm

## I. Using vabamorph generator directly to generate all wordforms from lemmas

* The previous step yielded a list of lemmas that are in documents and the user could search them
* Now we need to generate all wordforms of these lemmas as the does not have to specify the term in the expected form
* The result is also an input to typing error generator which mimics common mistakes users tend to do.

In [2]:
def generate_all_worldforms(lemmas: str):
    """
    Uses web service to generate all wordforms from the list of lemmas separated by spaces

    Returns a two column table with columns wordform and lemma.
    The number of rows corresponding to a single lemma varies as duplicated wordforms are omitted.
    """
    assert lemmas.find('+') == -1, 'Input cannot contain + sign. It corrupts the output'

    GENERATOR_QUERY = "https://smart-search.tartunlp.ai/api/generator/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'type': 'text', 'content': lemmas}

    response = requests.post(GENERATOR_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    response = response.json()
    assert response['response']['type'] == 'texts', "Unexpected response type"

    token_count = len(response['response']['texts'])
    tbl = DataFrame({'lemma': [None] * token_count, 'wordform': [None] * token_count})
    for i, token in enumerate(response['response']['texts']):
        generated_ = token['features']
        tbl.loc[i, 'lemma'] = token['content']
        tbl.loc[i, 'wordform'] = list(set(map(lambda x: x['token'].replace('+', ''), token['features']['generated_forms'])))

    return tbl.explode('wordform').reset_index(drop=True)

display(generate_all_worldforms('Tere').head())
display(generate_all_worldforms('ÜRO'))
display(generate_all_worldforms('ujuma').head())


Unnamed: 0,lemma,wordform
0,Tere,terega
1,Tere,teredelt
2,Tere,teret
3,Tere,teresid
4,Tere,tere


Unnamed: 0,lemma,wordform
0,ÜRO,ÜRO


Unnamed: 0,lemma,wordform
0,ujuma,ujuvad
1,ujuma,ujuksid
2,ujuma,ujudes
3,ujuma,ujunuksin
4,ujuma,ujuksite


### Generation of all wordforms 

In [3]:
BLOCK_SIZE = 100
lemma_counts = read_csv('results/caption_index/state_laws.csv', header=0)

block_count = math.floor(len(lemma_counts)/BLOCK_SIZE)
result = [None] * (block_count + 1)
for i in tqdm(range(block_count + 1), total=block_count):
    lemma_string = ' '.join(lemma_counts.loc[BLOCK_SIZE * i: BLOCK_SIZE * (i + 1), 'lemma'])
    result[i] = generate_all_worldforms(lemma_string)

result = concat(result, axis=0).reset_index(drop=True).sort_values(['lemma', 'wordform'])
display(result.head(10))

  0%|          | 0/28 [00:00<?, ?it/s]

Unnamed: 0,lemma,wordform
31476,-,-
39461,AB,AB
54156,AKV-EL,AKV-EL
54158,Aafrika,Aafrika
54167,Aafrika,Aafrikaga
54170,Aafrika,Aafrikaks
54163,Aafrika,Aafrikal
54168,Aafrika,Aafrikale
54161,Aafrika,Aafrikalt
54165,Aafrika,Aafrikana


### Validate and clean the result

Lets just check that the first letter of the lemma and a wordform coincides. 
There is a substantial amount of case changes which can be ignored at this stage. 
But there are other abnormalities that require deeper analysis. 
In particular note that for some lemma candidates the form generation completely fails. 

**Resulution:** We just remove these non-productive lemmas from the result

There are also these wonderful words that completely change their form which is bizarre but completely correct.

In [4]:
idx = result['lemma'].str[0] != result['wordform'].str[0]
display(result[idx].head(5))

potential_errors = result['lemma'].str[0].str.lower() != result['wordform'].str[0].str.lower()
display(result[potential_errors].head(5))
display(result[result['wordform'].isna()].head(5))

result = result[~result['wordform'].isna()]

potential_errors = result['lemma'].str[0].str.lower() != result['wordform'].str[0].str.lower()
display(result[potential_errors].groupby('lemma').agg(example = ('wordform', lambda x: x.iloc[0])).reset_index())

Unnamed: 0,lemma,wordform
54160,Aafrika,aafrika
26170,Albaania,albaania
13078,Ameerika,ameerika
39462,Araabia,araabia
54309,Armeenia,armeenia


Unnamed: 0,lemma,wordform
88278,SOS-laste,
57657,antu,
58900,avatu,
61848,iga,ea
61850,iga,ead


Unnamed: 0,lemma,wordform
88278,SOS-laste,
57657,antu,
58900,avatu,
63712,kasutatu,
10998,mitme,


Unnamed: 0,lemma,example
0,iga,ea
1,ise,end
2,see,need
3,tema,nad
4,too,noid


### Export results to separate CSV file

In [5]:
result.to_csv('results/caption_index/state_laws_all_wordforms.csv', header=True, index=False)