In [2]:
import math
import time
import requests
import datetime
import urllib.parse

from datetime import date
from pandas import DataFrame 
from pandas import read_csv
from pandas import concat
from tqdm.auto import tqdm

## I. Using vabamorph analyser directly to extract lemmas from texts

The most basic way to index a document is to analyse it word by word:
* This keeps all possible interpretations of each word in the outcome.
* This creates many spurious analysis results (sadama --> sada, sadam).

In [121]:
def analyze_document_caption(caption: str, timeout=None):
    """
    Uses web service to extract words and sub-words from document captions

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word as each word is analysed separately.
    All rows with the same index correspond to the same word.
    Wordform columns is added to facilitate tokenisation debugging.

    The lemma column holds the lemma with the '_' separators removed, while
    sublemmas holds the list of parts obtained by splitting the lemma on '_'.
    Alternative lemmas of the same word are listed in sorted order so that the
    output does not change between runs.

    Parameters
    ----------
    caption:
        Text of the document caption that is sent to the web service.
    timeout:
        Passed on to requests.post unchanged. The default None keeps the
        previous behaviour of waiting for the response without a time limit.

    Raises
    ------
    AssertionError
        If the web service response is not OK.
    """
    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/analyser/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'params': {"vmetajson": ["--guess"]}, 'content': caption}

    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS, timeout=timeout)
    # Raised explicitly (instead of an assert statement) so that the check is
    # not stripped when Python runs with optimisations enabled.
    if not response.ok:
        raise AssertionError("Webservice failed")
    tokens = response.json()['annotations']['tokens']

    # One record per token; duplicates among the analyses are collapsed and the
    # remaining lemmas are sorted to make the row order deterministic.
    records = [
        {
            'wordform': token['features']['token'],
            'lemma': sorted({analysis['lemma'] for analysis in token['features']['mrf']}),
        }
        for token in tokens
    ]

    # reset_index stores the token position in the 'index' column before
    # explode turns every alternative lemma into a row of its own.
    tbl = DataFrame(records, columns=['wordform', 'lemma']).reset_index().explode('lemma')
    # Split first: the sub-lemmas are defined by the '_' separators that are
    # removed from the lemma column on the next line.
    tbl['sublemmas'] = tbl['lemma'].str.split('_', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('_', '', regex=False)
    return tbl

# Show the analysis table for a single example caption
analyze_document_caption(caption='Presidendi ametiraha seadus')

Unnamed: 0,index,wordform,lemma,sublemmas
0,0,Presidendi,president,[president]
1,1,ametiraha,ametiraha,"[ameti, raha]"
2,2,seadus,seadu,[seadu]
2,2,seadus,seadus,[seadus]


### Example analysis of all captions

To build the input file needed to query the normalisation service, we need to analyse all the document captions and get occurrence counts for lemmas. In this analysis we ignore the fact that some wordforms have multiple lemmas and treat each row with the same weight.  

In [146]:
# Load the table of documents; the first row of the file holds the column names
sources = read_csv('results/state_laws.csv', header=0)

# Analyse every caption separately and tag its rows with the position of the
# document in the source table.
result = []
for i, caption in enumerate(tqdm(sources['document_title'], total=len(sources))):
    caption_table = analyze_document_caption(caption)
    result.append(caption_table.assign(doc_id=i))

# Stack the per-document tables into one long table
result = concat(result, axis=0)
display(result)

  0%|          | 0/4712 [00:00<?, ?it/s]

Now we can form a table of lemma counts and filter it to get rid of spurious tokens that are not lemmas,
or to find problems with the preprocessing of document captions.

In [162]:
# Per lemma: number of rows it occurs in and number of distinct documents,
# ordered from the most frequent lemma to the least frequent one.
lemma_counts = (
    result
    .groupby('lemma')
    .agg(
        occurence_count=('lemma', len),
        document_count=('doc_id', lambda doc_ids: len(set(doc_ids))),
    )
    .sort_values(['occurence_count', 'document_count'], ascending=False)
)

# Show both ends of the ranking
for excerpt in (lemma_counts.head(15), lemma_counts.tail(15)):
    display(excerpt)

Unnamed: 0_level_0,occurence_count,document_count
lemma,Unnamed: 1_level_1,Unnamed: 2_level_1
seadus,3004,2988
seadu,2988,2988
ja,1178,918
vabariik,584,338
ratifitseerimine,472,469
Eesti,370,330
eesti,363,324
seadustik,274,271
konventsioon,243,229
valitsus,229,121


Unnamed: 0_level_0,occurence_count,document_count
lemma,Unnamed: 1_level_1,Unnamed: 2_level_1
ühisõppus,1,1
ühtekuuluvus,1,1
ülalpidamine,1,1
üldine,1,1
üldleping,1,1
üleliigne,1,1
üleminekusäte,1,1
ülevõtmine,1,1
ümberkujundamine,1,1
ümberpaigutamine,1,1


**Observation:** There are clearly non-words in the lemma list. Let's diagnose what has happened.