In [1]:
import re
import math
import time
import requests
import datetime
import urllib.parse

from datetime import date
from pandas import DataFrame 
from pandas import read_csv
from pandas import concat
from tqdm.auto import tqdm

## I. Using Vabamorf analyser directly to extract wordform-lemma pairs from text

This part of the code is exactly the same as for lemma extraction (05A_find_existing_lemmas_in_document_captions_with_webservice.ipynb).
* We use the updated version of the script here.
* Updated the treatment of quotation marks.
* Updated the treatment of brackets.
* Updated the treatment of commas.

In [3]:
def analyze_document_caption(caption: str):
    """
    Uses web service to extract words and sub-words from document captions

    Before the request is sent the caption is normalised:
    * runs of whitespace are collapsed into a single space;
    * every quotation mark variant is replaced by ' ˮ ' (surrounded by spaces);
    * a space is inserted after every '(' and before every ')';
    * commas, full stops, semicolons and colons that are followed by whitespace
      or that end the caption are surrounded by spaces.

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word as each word is analysed separately.
    All rows with the same index correspond to the same word.
    Wordform columns is added to facilitate tokenisation debugging.
    The lemma column has the symbols '=', '+' and '_' removed, sublemmas holds
    the lemma split at '_' (after '=' and '+' were removed).

    Raises AssertionError when the web service does not answer with a success status.
    """
    # Raw strings are used for all patterns with backslashes so that regex escapes
    # such as \s, \( and \. are not parsed as (invalid) Python string escapes.
    corrected_caption = re.sub(r'\s+', ' ', caption)
    corrected_caption = re.sub(r'\s+', ' ', re.sub('ˮ|"|„|”|“|«|»', ' ˮ ', corrected_caption))

    corrected_caption = re.sub(r'\(', '( ', corrected_caption)
    corrected_caption = re.sub(r'\s+', ' ', re.sub(r'\)', ' )', corrected_caption))

    # Punctuation is detached only when followed by whitespace or the end of the
    # caption, so e.g. a full stop inside a token is left untouched.
    corrected_caption = re.sub(r',(?:\s|$)', ' , ', corrected_caption)
    corrected_caption = re.sub(r'\s+', ' ', re.sub(r'\.(?:\s|$)', ' . ', corrected_caption))

    corrected_caption = re.sub(r';(?:\s|$)', ' ; ', corrected_caption)
    corrected_caption = re.sub(r'\s+', ' ', re.sub(r':(?:\s|$)', ' : ', corrected_caption))

    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/analyser/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'params': {"vmetajson": ["--guess"]}, 'content': corrected_caption}

    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    tokens = response.json()['annotations']['tokens']

    # Build both columns in one go instead of writing list values cell by cell.
    # dict.fromkeys de-duplicates the lemmas while keeping the order in which the
    # analyser returned them, so the row order is the same on every run
    # (the iteration order of a set of strings is not).
    tbl = DataFrame(
        {
            'wordform': [token['features']['token'] for token in tokens],
            'lemma': [
                list(dict.fromkeys(analysis['lemma'] for analysis in token['features']['mrf']))
                for token in tokens
            ],
        },
        dtype=object,
    )

    # One row per (token, lemma) pair; the former row number survives as column 'index'.
    tbl = tbl.reset_index().explode('lemma')

    # Post-correction for Vabamorf output. Remove special symbols
    tbl['lemma'] = tbl['lemma'].str.replace('=', '', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('+', '', regex=False)

    # Post-correction for sublemmas: '_' separates the parts of the lemma,
    # so split on it first and only then strip it from the lemma itself.
    tbl['sublemmas'] = tbl['lemma'].str.split('_', regex=False)
    tbl['lemma'] = tbl['lemma'].str.replace('_', '', regex=False)
    return tbl

# Wordforms with attached quotation marks, brackets or commas.
# NOTE(review): presumably these are the tokenisation cases that motivated the
# punctuation handling in analyze_document_caption — confirm.
# „Talleks”
# '"Balti'
# '“Lepingu'
# (EAEC)
# Ühendust,
# «Õigusabi
#
# Example output (sends one request to the analyser web service)
analyze_document_caption('Presidendi ametiraha seadus')

Unnamed: 0,index,wordform,lemma,sublemmas
0,0,Presidendi,president,[president]
1,1,ametiraha,ametiraha,"[ameti, raha]"
2,2,seadus,seadus,[seadus]
2,2,seadus,seadu,[seadu]


## I.A Initial analysis of all captions

* We need to analyse all the document captions and get occurrence counts for wordforms.
* In this analysis we ignore the fact that some wordforms have multiple lemmas and treat each row with the same weight.

In [4]:
# Read the table of laws; the captions are taken from its 'document_title' column.
sources = read_csv('results/state_laws.csv', header=0)

# Analyse the captions one by one (one web service request per caption) and tag
# the rows of every per-caption table with the caption's position as doc_id.
per_caption_tables = [
    analyze_document_caption(title).assign(doc_id=position)
    for position, title in tqdm(enumerate(sources['document_title']), total=len(sources))
]

# Stack the per-caption tables and renumber the rows from 0.
result = concat(per_caption_tables, axis=0).reset_index(drop=True)
display(result)

  0%|          | 0/4712 [00:00<?, ?it/s]

Unnamed: 0,index,wordform,lemma,sublemmas,doc_id
0,0,Ajutise,ajutine,[ajutine],0
1,1,sisseveo,sissevedu,"[sisse, vedu]",0
2,2,konventsiooniga,konventsioon,[konventsioon],0
3,3,ühinemise,ühinemine,[ühinemine],0
4,4,seadus,seadus,[seadus],0
...,...,...,...,...,...
0,0,Väljasõidukohustuse,väljasõidukohustus,"[välja, sõidu, kohustus]",4711
1,1,ja,ja,[ja],4711
2,2,sissesõidukeelu,sissesõidukeeld,"[sisse, sõidu, keeld]",4711
3,3,seadus,seadus,[seadus],4711


In [80]:
# Renumber the rows of the combined table from 0; the previous index is discarded (drop=True).
result = result.reset_index(drop=True)

### Explicit decomposition into subwords

Vabamorf analyser returns lemma of a compound word together with its decomposition into subwords. 
All components of a compound word except for the last one are in the form they occur in the original wordform.
Thus it is straightforward, although tedious, to extract all subwords.

In [81]:
# Rows whose lemma does not consist of exactly one part are treated as compounds.
has_several_parts = result['sublemmas'].map(lambda parts: len(parts) != 1)
compound_words = result[has_several_parts].copy()

# The prefix is every part but the last one, glued back together.
compound_words['prefix'] = compound_words['sublemmas'].map(lambda parts: ''.join(parts[:-1]))

In [82]:
def extract_suffic(word: str, prefix: str):
    """
    Returns the part of word that follows the first occurrence of prefix.

    The search ignores letter case (both strings are lower-cased for the lookup)
    and the prefix does not have to start at the beginning of the word.
    Returns None when prefix does not occur in word.
    """
    start = word.lower().find(prefix.lower())
    if start == -1:
        return None
    return word[start + len(prefix):]

assert extract_suffic('subword', 'xxx') is None
assert extract_suffic('subword', 'sub') == 'word'
assert extract_suffic('-subword', 'sub') == 'word'

In [90]:
# Keep all leading sublemmas as they are and replace the last one by the tail of
# the wordform that follows the prefix (None when the prefix is not found in it).
compound_words['subwordforms'] = compound_words.apply(
    lambda entry: [*entry['sublemmas'][:-1], extract_suffic(entry['wordform'], entry['prefix'])],
    axis=1,
)

### Validation of compound word table

In [114]:
# Characters accepted in a compound wordform: a-z, the Estonian letters
# ö ä õ ü ž š and the hyphen; upper-case variants are admitted by case=False below.
# Inside [...] a '|' is an ordinary character and not alternation, so it must not
# be listed, otherwise wordforms containing '|' would pass the validation.
# The raw string keeps '\-' from being parsed as an invalid Python string escape.
ESTONIAN_LETTER = r'[a-zöäõüžš\-]'
ESTONIAN_ONLY_WORD = f'^{ESTONIAN_LETTER}+$'

# Every compound wordform must consist of the accepted characters only.
idx = compound_words['wordform'].str.match(ESTONIAN_ONLY_WORD, case=False)
assert all(idx), 'Selected compound words do not contain anything unexpected'

### Export results 

In [102]:
# One row per subword: keep the token position and document id, then unfold the
# list of subword forms and give the unfolded column its final name.
subword_lists = compound_words[['index', 'doc_id', 'subwordforms']]
subwords = subword_lists.explode('subwordforms').rename(columns={'subwordforms': 'subword'})

In [115]:
# Per subword: number of rows it occurs in and number of distinct documents,
# most frequent subwords first.
subword_counts = subwords.groupby('subword', as_index=False).agg(
    occurence_count=('index', 'size'),
    document_count=('doc_id', 'nunique'),
)
output = subword_counts.sort_values(['occurence_count', 'document_count'], ascending=False)

output.to_csv('results/caption_index/state_laws_existing_subwords.csv', index=False)