In [1]:
import re
import math
import time
import requests
import datetime
import urllib.parse

from datetime import date
from pandas import DataFrame 
from pandas import read_csv
from pandas import concat
from tqdm.auto import tqdm

## I. Using vabamorph analyser directly to extract wordform-lemma pairs from text

* This part of the code is exactly the same as for lemma extraction (05A_find_existing_lemmas_in_document_captions_with_webservice.ipynb).
* We use the updated version of the script here.
* Updated the treatment of quotation marks.
* Updated the treatment of brackets.
* Updated the treatment of commas.

In [2]:
def analyze_document_caption(caption: str, timeout: float = 60.0) -> DataFrame:
    """
    Uses the web service to extract words and sub-words from a document caption.

    Before the request is sent the caption is normalised: whitespace runs are
    collapsed, every quotation mark variant is replaced by a single
    space-separated symbol, and brackets, commas, full stops, semicolons and
    colons are detached from the neighbouring words with spaces.

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word, one row per distinct lemma returned
    by the analyser. All rows with the same index correspond to the same word.
    The wordform column is added to facilitate tokenisation debugging.

    Parameters
    ----------
    caption:
        Raw document caption.
    timeout:
        Seconds to wait for the web service before giving up. Without a timeout
        a single stalled request would block the caller indefinitely.

    Raises
    ------
    requests.HTTPError
        If the web service answers with an error status code.
    """
    # (pattern, replacement) pairs applied strictly in this order. Raw strings keep
    # the regex escapes intact without triggering invalid-escape warnings.
    NORMALISATION_STEPS = (
        (r'\s+', ' '),
        ('ˮ|"|„|”|“|«|»', ' ˮ '),
        (r'\s+', ' '),
        (r'\(', '( '),
        (r'\)', ' )'),
        (r'\s+', ' '),
        (r',(?:\s|$)', ' , '),
        (r'\.(?:\s|$)', ' . '),
        (r'\s+', ' '),
        (r';(?:\s|$)', ' ; '),
        (r':(?:\s|$)', ' : '),
        (r'\s+', ' '),
    )
    corrected_caption = caption
    for pattern, replacement in NORMALISATION_STEPS:
        corrected_caption = re.sub(pattern, replacement, corrected_caption)

    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/analyser/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'params': {"vmetajson": ["--guess"]}, 'content': corrected_caption}

    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS, timeout=timeout)
    # An explicit check instead of `assert`: asserts are stripped under `python -O`
    # and the raised error carries the HTTP status of the failure.
    response.raise_for_status()
    tokens = response.json()['annotations']['tokens']

    # One row per token; the lemma cell holds the distinct lemmas of the token in the
    # order the analyser returned them, so repeated runs yield identical tables.
    tbl = DataFrame({
        'wordform': [token['features']['token'] for token in tokens],
        'lemma': [
            list(dict.fromkeys(analysis['lemma'] for analysis in token['features']['mrf']))
            for token in tokens
        ],
    })

    # reset_index() materialises the token position as the 'index' column,
    # explode() then creates a separate row for every lemma of a token.
    tbl = tbl.reset_index().explode('lemma')

    # Post-correction for Vabamorph output. Remove special symbols
    tbl['lemma'] = tbl['lemma'].str.replace('=', '', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('+', '', regex=False)

    # Post-correction for sublemmas: '_' separates the parts of a lemma
    tbl['sublemmas'] = tbl['lemma'].str.split('_', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('_', '', regex=False)
    return tbl

# Sample tokens with attached quotation marks, brackets and commas
# (presumably the cases that motivated the caption normalisation -- TODO confirm):
#   „Talleks”
#   '"Balti'
#   '“Lepingu'
#   (EAEC)
#   Ühendust,
#   «Õigusabi
# Example output
analyze_document_caption('Presidendi ametiraha seadus')

Unnamed: 0,index,wordform,lemma,sublemmas
0,0,Presidendi,president,[president]
1,1,ametiraha,ametiraha,"[ameti, raha]"
2,2,seadus,seadu,[seadu]
2,2,seadus,seadus,[seadus]


### I.A Initial analysis of all captions

* We need to analyse all the document captions and get occurrence counts for wordforms.
* In this analysis we ignore the fact that some wordforms have multiple lemmas and treat each row with the same weight.

In [3]:
# Load the table of state laws; the first line of the file holds the column names
sources = read_csv('results/state_laws.csv', header=0)

# Analyse the captions one by one and tag every row with the position of its document
result = []
for i, caption in tqdm(enumerate(sources['document_title']), total=len(sources)):
    caption_table = analyze_document_caption(caption)
    result.append(caption_table.assign(doc_id=i))

# Stack the per-document tables into a single table
result = concat(result, axis=0)
display(result)

  0%|          | 0/4712 [00:00<?, ?it/s]

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
0,0,Ajutise,ajutine,[ajutine],0
1,1,sisseveo,sissevedu,"[sisse, vedu]",0
2,2,konventsiooniga,konventsioon,[konventsioon],0
3,3,ühinemise,ühinemine,[ühinemine],0
4,4,seadus,seadu,[seadu],0
...,...,...,...,...,...
0,0,Väljasõidukohustuse,väljasõidukohustus,"[välja, sõidu, kohustus]",4711
1,1,ja,ja,[ja],4711
2,2,sissesõidukeelu,sissesõidukeeld,"[sisse, sõidu, keeld]",4711
3,3,seadus,seadu,[seadu],4711


#### Validation of the wordform table

As the first sanity check, let us search for wordforms that contain characters outside the Estonian alphabet.
This can reveal special words and abbreviations, or errors in previous analysis steps.
To simplify the analysis, we first separate all numerical tokens and punctuation marks.


In [4]:
# Regular expressions for numeric tokens. Raw strings are used so that regex escapes
# such as `\.` are not treated as (invalid) Python string escapes.
NUMBER = r'[0-9]+'                                   # plain number, e.g. 12
NUMBER_RANGE = rf'(?:{NUMBER})-(?:{NUMBER})'         # two numbers joined by a hyphen, e.g. 3-5
ROMAN_NUMBER = r'[MCLXVI]+'                          # sequence of Roman numeral letters
SECTION_NUMBER = rf'[A-Z]\.{NUMBER}|(?:{ROMAN_NUMBER}/{NUMBER})'  # e.g. A.12 or XI/3
# Alternation of all numeric patterns. NOTE: it is applied with `str.match`, which only
# anchors the pattern at the beginning of the token.
NUMBER_EXPRESSION = '|'.join([NUMBER, NUMBER_RANGE, SECTION_NUMBER])

In [5]:
# Rows whose wordform starts with a numeric expression go to one table, the rest to another
numeric_expressions = result.loc[lambda tbl: tbl['wordform'].str.match(NUMBER_EXPRESSION)]
wordforms = result.loc[lambda tbl: ~tbl['wordform'].str.match(NUMBER_EXPRESSION)]

# Show the remaining wordforms that still contain a digit somewhere
display(wordforms.loc[lambda tbl: tbl['wordform'].str.contains('[0-9]')])

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
0,0,Äriseadustik1,Äriseadustik1,[Äriseadustik1],1664


**Observation:** Wordforms indeed do not contain numbers, except for a single tokenisation error.

In [6]:
# Character class of punctuation marks. A raw string keeps the regex escapes
# (`\.`, `\(`, `\)`) from being read as invalid Python string escapes.
PUNCTUATION_MARKS = r'[,\.ˮ;:/\(\)–-]'

# Move tokens that start with a punctuation mark out of the wordform table
punctuation = wordforms.loc[lambda tbl: tbl['wordform'].str.match(PUNCTUATION_MARKS)]
wordforms = wordforms.loc[lambda tbl: ~tbl['wordform'].str.match(PUNCTUATION_MARKS)]

# Frequency table of the wordforms that are exactly one character long
display(wordforms.loc[wordforms['wordform'].map(len).eq(1), 'wordform'].value_counts().reset_index())

Unnamed: 0,wordform,count
0,I,1
1,a,1
2,c,1
3,E,1


**Observation:** There are some one-letter tokens inside wordforms. These are again parts of numeric codes but these do not have to be excluded from the wordforms.

In [7]:
# Lowercase Latin letters, Estonian umlauted letters, ž/š and the hyphen.
# Inside a character class `|` is a literal pipe rather than an alternation, so it must
# not be used as a separator there -- otherwise tokens containing '|' would be accepted.
ESTONIAN_LETTER = r'[a-zöäõüžš\-]'
# Whole token consists solely of the characters above (use case-insensitive matching
# to accept capital letters as well).
ESTONIAN_ONLY_WORD = f'^{ESTONIAN_LETTER}+$'

In [8]:
# True for wordforms made up solely of Estonian letters and hyphens (case-insensitive)
idx = wordforms['wordform'].str.match(ESTONIAN_ONLY_WORD, case=False)

# List every wordform that fails the check
display(wordforms.loc[~idx])

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
2,2,Strasbourg'i,Strasbourg,[Strasbourg],185
5,5,International'i,International,[International],259
3,3,A/S,A/S,[A/S],570
4,4,Eksportfinans'i,Eksportfinans,[Eksportfinans],570
5,5,Exportkredit'i,Exportkredit,[Exportkredit],571
4,4,Paribas',Paribas,[Paribas],573
13,13,U.K,U.K,[U.K],573
15,15,P.L.C,P.L.C,[P.L.C],573
0,0,Äriseadustik1,Äriseadustik1,[Äriseadustik1],1664


**Observation:** Foreign names can contain an apostrophe and can have a grammatical form that is not generated by Vabamorph.
We need to separate these wordforms for further processing.

In [9]:
# Wordforms containing an apostrophe are set aside as international names
international_names = wordforms.loc[lambda tbl: tbl['wordform'].str.contains("'", regex=False)]
wordforms = wordforms.loc[lambda tbl: ~tbl['wordform'].str.contains("'", regex=False)]

display(international_names)

# Repeat the alphabet check on what is left
idx = wordforms['wordform'].str.match(ESTONIAN_ONLY_WORD, case=False)
display(wordforms.loc[~idx])

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
2,2,Strasbourg'i,Strasbourg,[Strasbourg],185
5,5,International'i,International,[International],259
4,4,Eksportfinans'i,Eksportfinans,[Eksportfinans],570
5,5,Exportkredit'i,Exportkredit,[Exportkredit],571
4,4,Paribas',Paribas,[Paribas],573


Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
3,3,A/S,A/S,[A/S],570
13,13,U.K,U.K,[U.K],573
15,15,P.L.C,P.L.C,[P.L.C],573
0,0,Äriseadustik1,Äriseadustik1,[Äriseadustik1],1664


**Observation:** We should also remove common abbreviations from the list of normal wordforms, as they are inflected differently from normal words.

In [10]:
# Exact-match pattern for the abbreviations found in the captions. A raw string keeps
# the escaped dots (`\.`) from being read as invalid Python string escapes.
COMMON_ABBREVATIONS = r'^(?:A/S|U\.K|P\.L\.C)$'

In [11]:
# Wordforms that are exactly one of the listed abbreviations are set aside
abbrevations = wordforms.loc[
    lambda tbl: tbl['wordform'].str.contains(COMMON_ABBREVATIONS, case=True, regex=True)
]
wordforms = wordforms.loc[
    lambda tbl: ~tbl['wordform'].str.contains(COMMON_ABBREVATIONS, case=True, regex=True)
]

display(abbrevations)

# Repeat the alphabet check on what is left
idx = wordforms['wordform'].str.match(ESTONIAN_ONLY_WORD, case=False)
display(wordforms.loc[~idx])

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
3,3,A/S,A/S,[A/S],570
13,13,U.K,U.K,[U.K],573
15,15,P.L.C,P.L.C,[P.L.C],573


Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
0,0,Äriseadustik1,Äriseadustik1,[Äriseadustik1],1664


## II Create list of existing wordforms 

We need a table with the columns `wordform`, `occurence_count` and `document_count`.
The lemma-wordform information was already extracted in 5A.

In [12]:
# Per wordform: number of table rows and number of distinct documents,
# ordered from the most to the least frequent wordform
outcome = (
    wordforms
    .groupby('wordform', as_index=False)
    .agg(
        occurence_count=('index', 'size'),
        document_count=('doc_id', 'nunique'),
    )
    .sort_values(by=['occurence_count', 'document_count'], ascending=False)
)

outcome.to_csv('results/caption_index/state_laws_existing_wordforms.csv', index=False)

In [13]:
# Per international name: number of table rows and number of distinct documents,
# ordered from the most to the least frequent name
outcome = (
    international_names
    .groupby('wordform', as_index=False)
    .agg(
        occurence_count=('index', 'size'),
        document_count=('doc_id', 'nunique'),
    )
    .sort_values(by=['occurence_count', 'document_count'], ascending=False)
)

outcome.to_csv('results/caption_index/state_laws_existing_international_names.csv', index=False)

In [14]:
# Per abbreviation: number of table rows and number of distinct documents,
# ordered from the most to the least frequent abbreviation
outcome = (
    abbrevations
    .groupby('wordform', as_index=False)
    .agg(
        occurence_count=('index', 'size'),
        document_count=('doc_id', 'nunique'),
    )
    .sort_values(by=['occurence_count', 'document_count'], ascending=False)
)

outcome.to_csv('results/caption_index/state_laws_existing_abbrevations.csv', index=False)