In [19]:
import re
import math
import time
import requests
import datetime
import urllib.parse

from io import StringIO
from datetime import date
from pandas import DataFrame 
from pandas import read_csv
from pandas import concat
from tqdm.auto import tqdm

## I. Using vabamorph analyser directly to extract lemmas from texts

The most basic way to idex a document is to analyse it word by word:
* This keeps all possible interpretation of the word in the outcome.
* This creates many spurious analysis results (sadama --> sada, sadam).

In [29]:
def index_document(text: str):
    """
    Uses web service to extract words and sub-words form document captions

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word as each word is analysed separately.
    All rows with the same index correspond to the same word.
    Wordform columns is added to facilitate tokenisation debugging.
    """
    

    content = "Jahimehed jahikoertega."
    
    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/lemmade-indekseerija/csv"
    HEADERS = {"Content-Type": "application/json"}
    POST_DATA_TEMPLATE = {"sources": {"DOC_1":{"content": content}}}
    
    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    tbl = read_csv(StringIO(response.text), delimiter='\t', header=None)

    return tbl

In [30]:
tbl = index_document('aas')

In [31]:
tbl

Unnamed: 0,0,1,2,3,4,5
0,jahikoer,False,jahikoertega,DOC_1,10,22
1,jahimees,False,Jahimehed,DOC_1,0,9
2,jaht,True,Jahimehed,DOC_1,0,9
3,jaht,True,jahikoertega,DOC_1,10,22
4,jahtima,True,Jahimehed,DOC_1,0,9
5,jahtima,True,jahikoertega,DOC_1,10,22
6,koer,True,jahikoertega,DOC_1,10,22
7,mees,True,Jahimehed,DOC_1,0,9


In [27]:
tbl.groupby('wordform').size()

KeyError: 'wordform'

In [17]:
response.text

'jahikoer\tFalse\tjahikoertega\tDOC_1\t10\t22\njahimees\tFalse\tJahimehed\tDOC_1\t0\t9\njaht\tTrue\tJahimehed\tDOC_1\t0\t9\njaht\tTrue\tjahikoertega\tDOC_1\t10\t22\njahtima\tTrue\tJahimehed\tDOC_1\t0\t9\njahtima\tTrue\tjahikoertega\tDOC_1\t10\t22\nkoer\tTrue\tjahikoertega\tDOC_1\t10\t22\nmees\tTrue\tJahimehed\tDOC_1\t0\t9\n'

Unnamed: 0,0,1,2,3,4,5
0,jahikoer,False,jahikoertega,DOC_1,10,22
1,jahimees,False,Jahimehed,DOC_1,0,9
2,jaht,True,Jahimehed,DOC_1,0,9
3,jaht,True,jahikoertega,DOC_1,10,22
4,jahtima,True,Jahimehed,DOC_1,0,9
5,jahtima,True,jahikoertega,DOC_1,10,22
6,koer,True,jahikoertega,DOC_1,10,22
7,mees,True,Jahimehed,DOC_1,0,9


In [None]:
curl --silent --request POST --header "Content-Type: application/json" \
  --data '{"sources": {"DOC_1":{"content":"Jahimehed jahikoertega."},"DOC_2":{"content":"Daam sülekoeraga ja mees jahikoeraga."}}}' \
  https://smart-search.tartunlp.ai/api/lemmade-indekseerija/csv

In [10]:
def analyze_document_caption(caption: str):
    """
    Uses web service to extract words and sub-words form document captions

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word as each word is analysed separately.
    All rows with the same index correspond to the same word.
    Wordform columns is added to facilitate tokenisation debugging.
    """
    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/analyser/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'params': {"vmetajson": ["--guess"]}, 'content': caption}

    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    response = response.json()

    token_count = len(response['annotations']['tokens'])
    tbl = DataFrame({'wordform': [None] * token_count, 'lemma': [None] * token_count})
    for i, token in enumerate(response['annotations']['tokens']):
        features = token['features']
        tbl.loc[i, 'wordform'] = features['token']
        tbl.loc[i, 'lemma'] = list(set(map(lambda x: x['lemma'], features['mrf'])))

    tbl =  tbl.reset_index().explode('lemma')

    # Post-correction for Vabamorph output. Remove special symbols 
    tbl['lemma'] = tbl['lemma'].str.replace('=', '', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('+', '', regex=False)

    # Post-correction for sublemmas
    tbl['sublemmas'] = tbl['lemma'].str.split('_', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('_', '', regex=False)
    return tbl

# Example output
analyze_document_caption('Presidendi ametiraha seadus')

Unnamed: 0,index,wordform,lemma,sublemmas
0,0,Presidendi,president,[president]
1,1,ametiraha,ametiraha,"[ameti, raha]"
2,2,seadus,seadu,[seadu]
2,2,seadus,seadus,[seadus]


### I.A Initial analysis of all captions

To buid a necessary input file to query normalisation service, we need to analyse all the document captions and get occurence counts for lemmas. In this analysis we ignore the fact that some wordforms have multiple lemmas and treat each row with the same weight.  

In [11]:
sources = read_csv('results/state_laws.csv', header=0)

result = [None] *  len(sources)
for i, caption in tqdm(enumerate(sources['document_title']), total=len(sources)):
    result[i] = analyze_document_caption(caption).assign(doc_id = i)

result = concat(result, axis=0)
display(result)

  0%|          | 0/4712 [00:00<?, ?it/s]

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
0,0,Ajutise,ajutine,[ajutine],0
1,1,sisseveo,sissevedu,"[sisse, vedu]",0
2,2,konventsiooniga,konventsioon,[konventsioon],0
3,3,ühinemise,ühinemine,[ühinemine],0
4,4,seadus,seadu,[seadu],0
...,...,...,...,...,...
0,0,Väljasõidukohustuse,väljasõidukohustus,"[välja, sõidu, kohustus]",4711
1,1,ja,ja,[ja],4711
2,2,sissesõidukeelu,sissesõidukeeld,"[sisse, sõidu, keeld]",4711
3,3,seadus,seadu,[seadu],4711


Now we can form a table of lemma counts and filter it to get rid of spurious tokens that are not lemmas
or find out problems with preprocessing of document captions

In [12]:
lemma_counts = result.groupby('lemma').agg(occurence_count=('lemma', len), document_count = ('doc_id', lambda x: len(set(x))))
lemma_counts = lemma_counts.sort_values(['occurence_count', 'document_count'], ascending=False).reset_index()
display(lemma_counts.head(15))
display(lemma_counts.tail(15))

Unnamed: 0,lemma,occurence_count,document_count
0,seadus,3004,2988
1,seadu,2988,2988
2,ja,1178,918
3,vabariik,584,338
4,ratifitseerimine,472,469
5,Eesti,370,330
6,eesti,363,324
7,seadustik,274,271
8,konventsioon,243,229
9,valitsus,229,121


Unnamed: 0,lemma,occurence_count,document_count
2287,ühisõppus,1,1
2288,ühtekuuluvus,1,1
2289,ülalpidamine,1,1
2290,üldine,1,1
2291,üldleping,1,1
2292,üleliigne,1,1
2293,üleminekusäte,1,1
2294,ülevõtmine,1,1
2295,ümberkujundamine,1,1
2296,ümberpaigutamine,1,1


**Observation:** There are clearly non-words inside the lemma list. Lets diagnose what has happened.

In [13]:
incorrect = ~lemma_counts['lemma'].str.contains('^[a-z|öäõü|\-|žš]+$', case=False)
display(lemma_counts[incorrect])

Unnamed: 0,lemma,occurence_count,document_count
330,1979.,7,7
383,1,6,6
384,2,6,6
468,1992.,5,5
469,2009.,5,5
...,...,...,...
2262,Äriseadustik1,1,1
2298,ˮEuroopa,1,1
2299,ˮinimõigus,1,1
2300,ˮsisekokkulepe,1,1


In [14]:
numbers = lemma_counts['lemma'].str.contains('^[0-9]+(?:\.|/|-)?[0-9]*$', case=False)
display(lemma_counts[numbers])

Unnamed: 0,lemma,occurence_count,document_count
330,1979.,7,7
383,1,6,6
384,2,6,6
468,1992.,5,5
469,2009.,5,5
...,...,...,...
1260,7,1,1
1261,8,1,1
1263,81,1,1
1264,9,1,1


In [15]:
years = lemma_counts['lemma'].str.contains('^(?:[0-9]+\.\s+aasta)$', case=False)
display(lemma_counts[years])

Unnamed: 0,lemma,occurence_count,document_count
836,1996. aasta,2,2
1223,1972. aasta,1,1
1237,2004. aasta,1,1
1242,2013. aasta,1,1
1244,2016. aasta,1,1


In [16]:
lemma_counts[~numbers & incorrect & ~years]

Unnamed: 0,lemma,occurence_count,document_count
662,K.3,3,3
1195,/,1,1
1200,11. juuni,1,1
1201,11. veebruar,1,1
1209,15.-17.,1,1
1262,8. lisa,1,1
1265,9. lisa,1,1
1267,A/S,1,1
1277,B.4,1,1
1278,B.7,1,1


**Judgement:** 
* Tokenisation failed for some cases as it contained non-breakable space instead of normal space
* There is a problem with quotation marks that touch words.  

### I.B Updated analysis of all captions

Lets correct most of the discovered errors by preprocessing the caption string.

In [29]:
def analyze_document_caption(caption: str):
    """
    Uses web service to extract words and sub-words form document captions

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word as each word is analysed separately.
    All rows with the same index correspond to the same word.
    Wordform columns is added to facilitate tokenisation debugging.
    """
    corrected_caption = re.sub('\s+', ' ', re.sub('ˮ', ' ˮ ', re.sub('\s+', ' ', caption))) 
    
    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/analyser/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'params': {"vmetajson": ["--guess"]}, 'content': corrected_caption}

    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    response = response.json()

    token_count = len(response['annotations']['tokens'])
    tbl = DataFrame({'wordform': [None] * token_count, 'lemma': [None] * token_count})
    for i, token in enumerate(response['annotations']['tokens']):
        features = token['features']
        tbl.loc[i, 'wordform'] = features['token']
        tbl.loc[i, 'lemma'] = list(set(map(lambda x: x['lemma'], features['mrf'])))

    tbl =  tbl.reset_index().explode('lemma')

    # Post-correction for Vabamorph output. Remove special symbols 
    tbl['lemma'] = tbl['lemma'].str.replace('=', '', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('+', '', regex=False)

    # Post-correction for sublemmas
    tbl['sublemmas'] = tbl['lemma'].str.split('_', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('_', '', regex=False)
    return tbl

# Example output
analyze_document_caption('Presidendi ametiraha seadus')

Unnamed: 0,index,wordform,lemma,sublemmas
0,0,Presidendi,president,[president]
1,1,ametiraha,ametiraha,"[ameti, raha]"
2,2,seadus,seadu,[seadu]
2,2,seadus,seadus,[seadus]


In [30]:
sources = read_csv('results/state_laws.csv', header=0)

result = [None] *  len(sources)
for i, caption in tqdm(enumerate(sources['document_title']), total=len(sources)):
    result[i] = analyze_document_caption(caption).assign(doc_id = i)

result = concat(result, axis=0)
display(result)

  0%|          | 0/4712 [00:00<?, ?it/s]

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
0,0,Ajutise,ajutine,[ajutine],0
1,1,sisseveo,sissevedu,"[sisse, vedu]",0
2,2,konventsiooniga,konventsioon,[konventsioon],0
3,3,ühinemise,ühinemine,[ühinemine],0
4,4,seadus,seadu,[seadu],0
...,...,...,...,...,...
0,0,Väljasõidukohustuse,väljasõidukohustus,"[välja, sõidu, kohustus]",4711
1,1,ja,ja,[ja],4711
2,2,sissesõidukeelu,sissesõidukeeld,"[sisse, sõidu, keeld]",4711
3,3,seadus,seadu,[seadu],4711


#### Validation of the lemma table

In [31]:
lemma_counts = result.groupby('lemma').agg(occurence_count=('lemma', len), document_count = ('doc_id', lambda x: len(set(x))))
lemma_counts = lemma_counts.sort_values(['occurence_count', 'document_count'], ascending=False).reset_index()
display(lemma_counts.head(15))
display(lemma_counts.tail(15))

Unnamed: 0,lemma,occurence_count,document_count
0,seadus,3005,2989
1,seadu,2989,2989
2,ja,1178,918
3,vabariik,584,338
4,ratifitseerimine,472,469
5,Eesti,370,330
6,eesti,363,324
7,seadustik,274,271
8,konventsioon,243,229
9,valitsus,229,121


Unnamed: 0,lemma,occurence_count,document_count
2274,ühista,1,1
2275,ühistatu,1,1
2276,ühistatud,1,1
2277,ühistugi,1,1
2278,ühisõppus,1,1
2279,ühtekuuluvus,1,1
2280,ülalpidamine,1,1
2281,üldine,1,1
2282,üldleping,1,1
2283,üleliigne,1,1


In [32]:
incorrect = ~lemma_counts['lemma'].str.contains('^[a-z|öäõü|\-|žš]+$', case=False)
display(lemma_counts[incorrect])

Unnamed: 0,lemma,occurence_count,document_count
330,ˮ,8,4
331,1979.,7,7
384,1,6,6
385,2,6,6
470,1992.,5,5
...,...,...,...
1276,B.8,1,1
1330,III/1,1,1
1372,P.L.C,1,1
1390,U.K,1,1


In [33]:
numbers = lemma_counts['lemma'].str.contains('^[0-9]+(?:\.|/|-)?[0-9]*$', case=False)
display(lemma_counts[numbers])

Unnamed: 0,lemma,occurence_count,document_count
331,1979.,7,7
384,1,6,6
385,2,6,6
470,1992.,5,5
471,1996.,5,5
...,...,...,...
1259,8,1,1
1260,81,1,1
1261,9,1,1
1262,9.,1,1


In [34]:
display(lemma_counts[incorrect & ~numbers])

Unnamed: 0,lemma,occurence_count,document_count
330,ˮ,8,4
668,K.3,3,3
1199,/,1,1
1212,15.-17.,1,1
1264,A/S,1,1
1274,B.4,1,1
1275,B.7,1,1
1276,B.8,1,1
1330,III/1,1,1
1372,P.L.C,1,1


**Judgement:** There are still some errors in the analysis but we can run with it. Lets collect the correct lemma list. 

In [37]:
correct_lemmas = lemma_counts[~incorrect]
display(correct_lemmas) 

Unnamed: 0,lemma,occurence_count,document_count
0,seadus,3005,2989
1,seadu,2989,2989
2,ja,1178,918
3,vabariik,584,338
4,ratifitseerimine,472,469
...,...,...,...
2284,üleminekusäte,1,1
2285,ülevõtmine,1,1
2286,ümberkujundamine,1,1
2287,ümberpaigutamine,1,1


### Extraction of meaningful sub-lemmas

In [65]:
multi_root_words = result['sublemmas'].map(lambda x: len(x) !=1)
sublemmas = result.loc[multi_root_words, ['sublemmas', 'doc_id']].explode('sublemmas')

# Remove sublemmas that are not sublemmas
uncovered_by_lemmas = ~sublemmas['sublemmas'].isin(correct_lemmas['lemma'])
sublemmas = sublemmas[uncovered_by_lemmas].rename(columns={'sublemmas':'sublemma'})
display(sublemmas)

Unnamed: 0,sublemma,doc_id
0,rahvus,1
1,elekter,1
5,era,1
5,korraline,1
6,täie,1
...,...,...
0,välis,4710
0,maalane,4710
1,rahvus,4710
0,sõidu,4711


In [75]:
sublemma_count = sublemmas.groupby('sublemma').agg(occurence_count=('sublemma', len), document_count = ('doc_id', lambda x: len(set(x))))
sublemma_count = sublemma_count.sort_values(['occurence_count', 'document_count'], ascending=False).reset_index()
display(sublemma_count.head(15))
display(sublemma_count.tail(15))

incorrect = ~sublemma_count['sublemma'].str.contains('^[a-z|öäõü|\-|žš]+$', case=False)
assert not any(incorrect), 'There should be no incorrect sublemmas'

Unnamed: 0,sublemma,occurence_count,document_count
0,vaba,616,338
1,maksu,374,373
2,riigi,299,293
3,kindlustus,217,217
4,lõivu,161,161
5,rahvus,148,144
6,elektri,143,143
7,aktsiis,112,112
8,kriminaal,106,106
9,turu,99,99


Unnamed: 0,sublemma,occurence_count,document_count
672,võetav,1,1
673,võte,1,1
674,võtja,1,1
675,võtt,1,1
676,võttev,1,1
677,õiguse,1,1
678,õigusta,1,1
679,õigustatu,1,1
680,õigustatud,1,1
681,õppus,1,1


### Final lemma count list for further processing

In [86]:
outcome = concat([
    correct_lemmas.assign(sublemma=False),
    sublemma_count.rename(columns={'sublemma': 'lemma'}).assign(sublemma=True)], axis=0)

outcome = (outcome
           .sort_values(['occurence_count', 'document_count'], ascending=False)
           [['sublemma', 'lemma', 'occurence_count', 'document_count']])

outcome.to_csv('results/caption_index/state_laws.csv', header=True, index=False)
display(outcome)

Unnamed: 0,sublemma,lemma,occurence_count,document_count
0,False,seadus,3005,2989
1,False,seadu,2989,2989
2,False,ja,1178,918
0,True,vaba,616,338
3,False,vabariik,584,338
...,...,...,...,...
682,True,öö,1,1
683,True,ühik,1,1
684,True,ühiskond,1,1
685,True,ühte,1,1


Unnamed: 0,sublemma,lemma,occurence_count,document_count
0,False,seadus,3005,2989
1,False,seadu,2989,2989
2,False,ja,1178,918
0,True,vaba,616,338
3,False,vabariik,584,338
...,...,...,...,...
682,True,öö,1,1
683,True,ühik,1,1
684,True,ühiskond,1,1
685,True,ühte,1,1
