In [1]:
import re
import math
import time
import requests
import datetime
import urllib.parse

from datetime import date
from pandas import DataFrame 
from pandas import read_csv
from pandas import concat
from pandas import merge
from tqdm.auto import tqdm

In [2]:
def generate_all_worldforms(lemmas: str):
    """
    Uses web service to generate all wordforms from the list of lemmas separated by spaces

    Returns a two column table with columns wordform and lemma.
    The number of rows corresponding to a single lemma varies as duplicated wordforms are omitted.
    """
    assert lemmas.find('+') == -1, 'Input cannot contain + sign. It corrupts the output'

    GENERATOR_QUERY = "https://smart-search.tartunlp.ai/api/generator/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'type': 'text', 'content': lemmas}

    response = requests.post(GENERATOR_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    response = response.json()
    assert response['response']['type'] == 'texts', "Unexpected response type"

    token_count = len(response['response']['texts'])
    tbl = DataFrame({'lemma': [None] * token_count, 'wordform': [None] * token_count})
    for i, token in enumerate(response['response']['texts']):
        generated_ = token['features']
        tbl.loc[i, 'lemma'] = token['content']
        tbl.loc[i, 'wordform'] = list(set(map(lambda x: x['token'].replace('+', ''), token['features']['generated_forms'])))

    return tbl.explode('wordform').reset_index(drop=True)

display(generate_all_worldforms('Tere').head())

Unnamed: 0,lemma,wordform
0,Tere,tered
1,Tere,teredena
2,Tere,teredes
3,Tere,teredest
4,Tere,terena


In [7]:
def extend_query(input_string: str):
    
    SEARCH_LEMMATISATION_QUERY = 'https://smart-search.tartunlp.ai/api/ea_paring/json' 
    HEADERS = {'Content-Type': 'application/json'}

    # Find lemmas corresponding to the search string
    inputs = re.sub('\s+', ' ', input_string).split(' ')
    tbl = DataFrame({'input': [None] * len(inputs), 'lemmas': [None] * len(inputs), 'suggestions': [None] * len(inputs)})
    for i, input in enumerate(inputs):
        POST_DATA_TEMPLATE ={"content": input}
        
        response = requests.post(SEARCH_LEMMATISATION_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
        assert response.ok, "Webservice failed"
        response = response.json()

        assert 'annotations' in response, "Invalid response"
        assert 'query' in response['annotations'], "Invalid response"
        assert 'typos' in response['annotations'], "Invalid response"

        query = response['annotations']['query']
        typos = response['annotations']['typos'].get(input)



        tbl.loc[i, 'input'] = input
        tbl.loc[i, 'lemmas'] = query[0] if len(query) != 0 else None
        tbl.loc[i, 'suggestions'] = [(element['suggestion'], element['weight']) for element in typos] if typos is not None else None 

    # Reformat the table to get it into right shape
    tbl = tbl.reset_index(names='location')
    tbl1 = tbl[['location', 'input', 'lemmas']]
    tbl1 = (tbl1[~tbl1['lemmas'].isna()]
            .explode('lemmas')
            .assign(type='original')
            .rename(columns = {'lemmas': 'lemma'})
            .assign(confidence=1.0))

    tbl2 = tbl[['location', 'input', 'suggestions']]
    tbl2 = (tbl2[~tbl2['suggestions'].isna()]
            .explode('suggestions')
            .assign(type='suggestion', lemma=None, confidence=None))
    if len(tbl2) != 0:
        tbl2[['lemma', 'confidence']] = tbl2['suggestions'].tolist()
        tbl2['confidence'] = tbl2['confidence'].astype(float)
    
    tbl3 = (tbl.loc[tbl['lemmas'].isna() & tbl['suggestions'].isna(), ['location', 'input']]
            .assign(lemma = None, type='original', confidence=0.0))

    tbl_list = [tbl1, tbl2[['location', 'input', 'lemma', 'type', 'confidence']], tbl3]
    tbl = (concat([tbl for tbl in tbl_list if len(tbl) != 0], axis=0)
           .sort_values(['location', 'input', 'confidence'])
           .reset_index(drop=True))

    # Extend lemmas to all wordforms. 
    # Currently for all wordforms as right web service is missing
    lemmas = tbl.loc[~tbl['lemma'].isna(), 'lemma']
    assert not(any(lemmas.str.contains('\s', regex=True))), 'Unexpected whitespaces inside words'
    wordforms = generate_all_worldforms(' '.join(lemmas))
    tbl = merge(tbl, wordforms, on='lemma', how='left') 

    # Copy input strings as wordforms if lemmatisation fails
    idx = tbl['wordform'].isna()
    tbl.loc[idx, 'wordform'] = tbl.loc[idx, 'input']
    return tbl

display(extend_query('xxx bresident president sadama satama'))

Unnamed: 0,location,input,lemma,type,confidence,wordform
0,0,xxx,,original,0.0,xxx
1,1,bresident,president,suggestion,0.0,presidendi
2,1,bresident,president,suggestion,0.0,presidentideks
3,1,bresident,president,suggestion,0.0,presidendilt
4,1,bresident,president,suggestion,0.0,presidentidesse
...,...,...,...,...,...,...
486,4,satama,seatama,suggestion,0.0,seatamad
487,4,satama,seatama,suggestion,0.0,seatamadeta
488,4,satama,seatama,suggestion,0.0,seatamas
489,4,satama,seatama,suggestion,0.0,seatamadega


## Query documets form RT webservice

In [15]:
import math
import time
import requests
import datetime
import urllib.parse

from datetime import date
from pandas import DataFrame 
from pandas import concat
from tqdm.auto import tqdm

In [24]:
BASE_URL = 'https://www.riigiteataja.ee/api/oigusakt_otsing/1'

ARG_STRUCTURE = \
{
    'leht': (int, 1),
    'limiit': (int, 500),
    'kehtiv': (date, None),
    'tulemused': (bool, True),
    'kehtivKehtetus': (bool, False),
    'mitteJoustunud': (bool, False),
    'kov': (bool, False),
    'dokument': (str, 'seadus'),
    'pealkiri': (str, None)
}

def get_search_query(**kwargs):
    result = {}
    for key, value in kwargs.items():
        arg_type, default_value = ARG_STRUCTURE.get(key, (None, None))
        if arg_type is None:
            raise ValueError(f'Unknown argument: {key}')
        elif not isinstance(value, arg_type):
            raise ValueError(f'Argument {key} must be of type {arg_type}')
            
        if arg_type is bool:
            result[key] = str(value).lower()
        else:
            result[key] = urllib.parse.quote(str(value))
    return f"{BASE_URL}/otsi?{'&'.join(f'{key}={value}' for key, value in result.items())}"

In [136]:
def search_caption(wordform:str, **kwargs):
    SEARCH_QUERY = 'https://www.riigiteataja.ee/api/oigusakt_otsing/1/otsi' 
    
    current_date =  date.today()
    payload = {'leht':1,
               'kehtiv':current_date, 
               'dokument':'seadus', 
               'limiit':500, 
               'pealkiri': wordform, 
               'pealkiriOtsinguTyyp': 'koik_sonad', 
               'filter':True, 
               'grupeeri':False}
    
    response = requests.get(SEARCH_QUERY, params=payload)
    assert response.status_code== 200, 'GET request failed'
    response = response.json()

    # Get the number of responce pages
    assert 'aktid' in response, 'Missing payload'
    assert 'metaandmed' in response, 'Missing meta field'
    assert 'kokku' in response['metaandmed'], 'Missing meta field'
    assert 'limiit' in response['metaandmed'], 'Missing meta field'

    total_count = response['metaandmed']['kokku']
    document_limit = response['metaandmed']['limiit']
    max_page = math.ceil(total_count/document_limit)

    if total_count == 0:
        return None 
    # Iterate over responce sheets
    query_results = [None] * max_page 
    for page in range(max_page):

        payload['leht'] = page + 1
        response = requests.get(SEARCH_QUERY, params=payload)
        assert response.status_code== 200, 'GET request failed'
        response = response.json()

        document_count = len(response['aktid'])
        query_results[page] = DataFrame({
            'global_id': [None] * document_count, 
            'document_title': [None] * document_count, 
            'document_type': [None] * document_count, 
            'commencement_date': [None] * document_count,
            'repeal_date': [None] * document_count,
            'xml_source': [None] * document_count})

        for i, document in enumerate(response['aktid']):
            query_results[page].loc[i, 'global_id'] = document['globaalID']
            query_results[page].loc[i, 'document_title'] = document['pealkiri']
            query_results[page].loc[i, 'document_type'] = document['liik']
            query_results[page].loc[i, 'commencement_date'] = document['kehtivus'].get('algus')
            query_results[page].loc[i, 'repeal_date'] = document['kehtivus'].get('lopp')
            query_results[page].loc[i, 'xml_source'] = f"https://www.riigiteataja.ee{document['url']}"   
        
    return concat(query_results, axis=0)

search_caption('presidendi')

Unnamed: 0,global_id,document_title,document_type,commencement_date,repeal_date,xml_source
0,128122010026,Vabariigi Presidendi ametihüve seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/128122010026.xml
1,113122014059,Vabariigi Presidendi ametihüve seadus,seadus,2016-07-01,,https://www.riigiteataja.ee/akt/113122014059.xml
2,121032011032,Vabariigi Presidendi töökorra seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/121032011032.xml
3,129122011152,Vabariigi Presidendi ametihüve seadus,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122011152.xml
4,122062016013,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,,https://www.riigiteataja.ee/akt/122062016013.xml
5,110072012032,Vabariigi Presidendi valimise seadus,seadus,2013-04-01,,https://www.riigiteataja.ee/akt/110072012032.xml
6,127062017010,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,2023-12-31,https://www.riigiteataja.ee/akt/127062017010.xml
7,127062017011,Vabariigi Presidendi töökorra seadus,seadus,2017-07-07,,https://www.riigiteataja.ee/akt/127062017011.xml
8,128122011068,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2013-01-01,,https://www.riigiteataja.ee/akt/128122011068.xml
9,129122012010,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122012010.xml


### Let put it together

In [147]:
def final_search_query(search_input: str):
    extended_search_input = extend_query(search_input)

    summary_result = []
    for wordform in extended_search_input['wordform']:
        result = search_caption(wordform)
        if result is not None:
            summary_result.append(result)

    if len(summary_result) == 0:
        return None

    return concat(summary_result, axis=0).sort_values(['document_title', 'commencement_date'])
    

In [148]:
final_search_query('president')

Unnamed: 0,global_id,document_title,document_type,commencement_date,repeal_date,xml_source
10,13262582,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/13262582.xml
8,128122011068,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2013-01-01,,https://www.riigiteataja.ee/akt/128122011068.xml
9,129122012010,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122012010.xml
0,128122010026,Vabariigi Presidendi ametihüve seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/128122010026.xml
3,129122011152,Vabariigi Presidendi ametihüve seadus,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122011152.xml
1,113122014059,Vabariigi Presidendi ametihüve seadus,seadus,2016-07-01,,https://www.riigiteataja.ee/akt/113122014059.xml
4,122062016013,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,,https://www.riigiteataja.ee/akt/122062016013.xml
6,127062017010,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,2023-12-31,https://www.riigiteataja.ee/akt/127062017010.xml
11,769953,Vabariigi Presidendi ja Vabariigi Valitsuse li...,seadus,2004-07-01,,https://www.riigiteataja.ee/akt/769953.xml
2,121032011032,Vabariigi Presidendi töökorra seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/121032011032.xml


In [149]:
final_search_query('resident')

In [151]:
final_search_query('tuumarelvakatsetus')

Unnamed: 0,global_id,document_title,document_type,commencement_date,repeal_date,xml_source
0,26008,Tuumarelvakatsetuste üldise keelustamise lepin...,seadus,2002-06-01,,https://www.riigiteataja.ee/akt/26008.xml


In [137]:
search_caption('president')

In [139]:
for wordform in extended_query['wordform']:
    result = search_caption(wordform)
    if result is not None:
        display(result)
    else:
        print(wordform)

Unnamed: 0,global_id,document_title,document_type,commencement_date,repeal_date,xml_source
0,128122010026,Vabariigi Presidendi ametihüve seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/128122010026.xml
1,113122014059,Vabariigi Presidendi ametihüve seadus,seadus,2016-07-01,,https://www.riigiteataja.ee/akt/113122014059.xml
2,121032011032,Vabariigi Presidendi töökorra seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/121032011032.xml
3,129122011152,Vabariigi Presidendi ametihüve seadus,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122011152.xml
4,122062016013,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,,https://www.riigiteataja.ee/akt/122062016013.xml
5,110072012032,Vabariigi Presidendi valimise seadus,seadus,2013-04-01,,https://www.riigiteataja.ee/akt/110072012032.xml
6,127062017010,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,2023-12-31,https://www.riigiteataja.ee/akt/127062017010.xml
7,127062017011,Vabariigi Presidendi töökorra seadus,seadus,2017-07-07,,https://www.riigiteataja.ee/akt/127062017011.xml
8,128122011068,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2013-01-01,,https://www.riigiteataja.ee/akt/128122011068.xml
9,129122012010,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122012010.xml


presidentideks
presidendilt
presidentidesse
presidendita
presidendele
presidendini
presidendil
presidendiga
presidendina
presidendiks
presidendelt
presidendel
presidentideta
presidendes
presidentidelt
presidendile
presidentidena
presidente
presidendist
presidenti
presidentisid
presidentides
presidentidel
president
presidendesse
presidentidest
presidentide
presidendeks
presidendis
presidendisse
presidentidega
presidendid
presidendest
presidentidele
presidentideni


In [124]:
search_caption('presidendi')

Unnamed: 0,global_id,document_title,document_type,commencement_date,repeal_date,xml_source
0,128122010026,Vabariigi Presidendi ametihüve seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/128122010026.xml
1,113122014059,Vabariigi Presidendi ametihüve seadus,seadus,2016-07-01,,https://www.riigiteataja.ee/akt/113122014059.xml
2,121032011032,Vabariigi Presidendi töökorra seadus,seadus,2012-01-01,,https://www.riigiteataja.ee/akt/121032011032.xml
3,129122011152,Vabariigi Presidendi ametihüve seadus,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122011152.xml
4,122062016013,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,,https://www.riigiteataja.ee/akt/122062016013.xml
5,110072012032,Vabariigi Presidendi valimise seadus,seadus,2013-04-01,,https://www.riigiteataja.ee/akt/110072012032.xml
6,127062017010,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,2023-12-31,https://www.riigiteataja.ee/akt/127062017010.xml
7,127062017011,Vabariigi Presidendi töökorra seadus,seadus,2017-07-07,,https://www.riigiteataja.ee/akt/127062017011.xml
8,128122011068,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2013-01-01,,https://www.riigiteataja.ee/akt/128122011068.xml
9,129122012010,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2014-01-01,,https://www.riigiteataja.ee/akt/129122012010.xml


In [118]:
response

{'staatus': 'OK',
 'paring': '/otsi',
 'filter': {'pealkiri': 'presidendi',
  'pealkiriOtsinguTyyp': 'KOIK_SONAD',
  'kehtiv': '2023-10-10',
  'dokument': 'seadus'},
 'metaandmed': {'kokku': 12, 'leht': 1, 'limiit': 500},
 'aktid': [{'globaalID': 128122010026,
   'terviktekstID': 1002107,
   'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
   'lyhend': None,
   'kehtivus': {'algus': '2012-01-01', 'lopp': None},
   'staatus': 'avaldatud',
   'tekst': 'terviktekst',
   'liik': 'seadus',
   'valjaandja': 'Riigikogu',
   'mitteJoustunud': False,
   'kehtivKehtetus': False,
   'muudetud': 1453907541078,
   'url': '/akt/128122010026.xml'},
  {'globaalID': 113122014059,
   'terviktekstID': 1032179,
   'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
   'lyhend': None,
   'kehtivus': {'algus': '2016-07-01', 'lopp': None},
   'staatus': 'avaldatud',
   'tekst': 'terviktekst',
   'liik': 'seadus',
   'valjaandja': 'Riigikogu',
   'mitteJoustunud': False,
   'kehtivKehtetus': False,
   'm

In [114]:
n = len(response['aktid'])

In [109]:
result = DataFrame({'global_id': [None] * n, 'document_title': [None] * n, 
                    'document_type': [None] * n, 
                    'commencement_date': [None] * n,
                    'repeal_date': [None] * n,
                    'altered': [None] * n,
                    'xml_source': [None] * n})

In [110]:
for i, document in enumerate(response['aktid']):
    result.loc[i, 'global_id'] = document['globaalID']
    result.loc[i, 'document_title'] = document['pealkiri']
    result.loc[i, 'document_type'] = document['liik']
    result.loc[i, 'commencement_date'] = document['kehtivus'].get('algus')
    result.loc[i, 'repeal_date'] = document['kehtivus'].get('lopp')
    result.loc[i, 'altered'] = str(document['muudetud'])
    result.loc[i, 'xml_source'] = f"https://www.riigiteataja.ee{document['url']}"   

In [111]:
result.sort_values(['document_title', 'commencement_date', 'commencement_date'], ascending=False)

Unnamed: 0,global_id,document_title,document_type,commencement_date,repeal_date,altered,in_force,xml_source
5,110072012032,Vabariigi Presidendi valimise seadus,seadus,2013-04-01,,1620902047527,False,https://www.riigiteataja.ee/akt/110072012032.xml
7,127062017011,Vabariigi Presidendi töökorra seadus,seadus,2017-07-07,,1690799412070,False,https://www.riigiteataja.ee/akt/127062017011.xml
2,121032011032,Vabariigi Presidendi töökorra seadus,seadus,2012-01-01,,1453907437520,False,https://www.riigiteataja.ee/akt/121032011032.xml
11,769953,Vabariigi Presidendi ja Vabariigi Valitsuse li...,seadus,2004-07-01,,1630667215076,False,https://www.riigiteataja.ee/akt/769953.xml
4,122062016013,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,,1498485407289,False,https://www.riigiteataja.ee/akt/122062016013.xml
6,127062017010,Vabariigi Presidendi ametihüve seadus,seadus,2018-01-01,2023-12-31,1688563699099,False,https://www.riigiteataja.ee/akt/127062017010.xml
1,113122014059,Vabariigi Presidendi ametihüve seadus,seadus,2016-07-01,,1453908766894,False,https://www.riigiteataja.ee/akt/113122014059.xml
3,129122011152,Vabariigi Presidendi ametihüve seadus,seadus,2014-01-01,,1453907541078,False,https://www.riigiteataja.ee/akt/129122011152.xml
0,128122010026,Vabariigi Presidendi ametihüve seadus,seadus,2012-01-01,,1453907541078,False,https://www.riigiteataja.ee/akt/128122010026.xml
9,129122012010,Riigikogu ja Vabariigi Presidendi poolt nimeta...,seadus,2014-01-01,,1453907753635,False,https://www.riigiteataja.ee/akt/129122012010.xml


In [87]:
response['aktid'][0]

{'globaalID': 128122010026,
 'terviktekstID': 1002107,
 'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
 'lyhend': None,
 'kehtivus': {'algus': '2012-01-01', 'lopp': None},
 'staatus': 'avaldatud',
 'tekst': 'terviktekst',
 'liik': 'seadus',
 'valjaandja': 'Riigikogu',
 'mitteJoustunud': False,
 'kehtivKehtetus': False,
 'muudetud': 1453907541078,
 'url': '/akt/128122010026.xml'}

In [95]:
response['aktid'][1]

{'globaalID': 113122014059,
 'terviktekstID': 1032179,
 'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
 'lyhend': None,
 'kehtivus': {'algus': '2016-07-01', 'lopp': None},
 'staatus': 'avaldatud',
 'tekst': 'terviktekst',
 'liik': 'seadus',
 'valjaandja': 'Riigikogu',
 'mitteJoustunud': False,
 'kehtivKehtetus': False,
 'muudetud': 1453908766894,
 'url': '/akt/113122014059.xml'}

In [28]:
query=get_search_query(leht=1, kehtiv=current_date, dokument='seadus', limiit=500, pealkiri='presidendi')

In [52]:
payload = {'leht':1,'kehtiv':current_date, 'dokument':'seadus', 'limiit':500, 'pealkiri':'ametihüve presidendi', 'pealkiriOtsinguTyyp': 'koik_sonad'}
response = requests.get(f'{BASE_URL}/otsi', params=payload)

print(response.status_code)
assert response.status_code== 200, 'GET request failed'
response = response.json()

200


In [53]:
response

{'staatus': 'OK',
 'paring': '/otsi',
 'metaandmed': {'kokku': 5, 'leht': 1, 'limiit': 500},
 'aktid': [{'globaalID': 128122010026,
   'terviktekstID': 1002107,
   'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
   'lyhend': None,
   'kehtivus': {'algus': '2012-01-01', 'lopp': None},
   'staatus': 'avaldatud',
   'tekst': 'terviktekst',
   'liik': 'seadus',
   'valjaandja': 'Riigikogu',
   'mitteJoustunud': False,
   'kehtivKehtetus': False,
   'muudetud': 1453907541078,
   'url': '/akt/128122010026.xml'},
  {'globaalID': 113122014059,
   'terviktekstID': 1032179,
   'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
   'lyhend': None,
   'kehtivus': {'algus': '2016-07-01', 'lopp': None},
   'staatus': 'avaldatud',
   'tekst': 'terviktekst',
   'liik': 'seadus',
   'valjaandja': 'Riigikogu',
   'mitteJoustunud': False,
   'kehtivKehtetus': False,
   'muudetud': 1453908766894,
   'url': '/akt/113122014059.xml'},
  {'globaalID': 129122011152,
   'terviktekstID': 1009146,
   'pealk

In [31]:
?requests.get

[0;31mSignature:[0m [0mrequests[0m[0;34m.[0m[0mget[0m[0;34m([0m[0murl[0m[0;34m,[0m [0mparams[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Sends a GET request.

:param url: URL for the new :class:`Request` object.
:param params: (optional) Dictionary, list of tuples or bytes to send
    in the query string for the :class:`Request`.
:param \*\*kwargs: Optional arguments that ``request`` takes.
:return: :class:`Response <Response>` object
:rtype: requests.Response
[0;31mFile:[0m      ~/Library/miniforge3/envs/rt-web-crawler/lib/python3.9/site-packages/requests/api.py
[0;31mType:[0m      function

In [29]:
response = requests.get(query)
assert response.status_code == 200, 'GET request failed'
response = response.json()

In [30]:
response

{'staatus': 'OK',
 'paring': '/otsi',
 'metaandmed': {'kokku': 12, 'leht': 1, 'limiit': 500},
 'aktid': [{'globaalID': 128122010026,
   'terviktekstID': 1002107,
   'pealkiri': 'Vabariigi Presidendi ametihüve seadus',
   'lyhend': None,
   'kehtivus': {'algus': '2012-01-01', 'lopp': None},
   'staatus': 'avaldatud',
   'tekst': 'terviktekst',
   'liik': 'seadus',
   'valjaandja': 'Riigikogu',
   'mitteJoustunud': False,
   'kehtivKehtetus': False,
   'muudetud': 1453907541078,
   'url': '/akt/128122010026.xml'},
  {'globaalID': 128122011068,
   'terviktekstID': 1009136,
   'pealkiri': 'Riigikogu ja Vabariigi Presidendi poolt nimetatavate riigiametnike ametipalkade seadus',
   'lyhend': None,
   'kehtivus': {'algus': '2013-01-01', 'lopp': None},
   'staatus': 'avaldatud',
   'tekst': 'terviktekst',
   'liik': 'seadus',
   'valjaandja': 'Riigikogu',
   'mitteJoustunud': False,
   'kehtivKehtetus': False,
   'muudetud': 1453907541078,
   'url': '/akt/128122011068.xml'},
  {'globaalID': 12

In [201]:
wordforms = generate_all_worldforms(' '.join(result['lemma']))

In [12]:
result = extend_query('president bresident xxx').sample(10)

In [13]:
result

Unnamed: 0,location,input,lemma,type,confidence,wordform
120,1,bresident,president,suggestion,0.0,presidendel
97,1,bresident,president,suggestion,0.0,presidendesse
7,0,president,president,original,1.0,presidendil
75,1,bresident,president,suggestion,0.0,presidentidesse
50,0,president,president,original,1.0,presidendes
5,0,president,president,original,1.0,presidendele
103,1,bresident,president,suggestion,0.0,presidentidega
81,1,bresident,president,suggestion,0.0,presidendina
26,0,president,president,original,1.0,presidentidest
86,1,bresident,president,suggestion,0.0,presidendes


In [203]:
?pd.merge

[0;31mSignature:[0m
[0mpd[0m[0;34m.[0m[0mmerge[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mleft[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright[0m[0;34m:[0m [0;34m'DataFrame | Series'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mhow[0m[0;34m:[0m [0;34m'MergeHow'[0m [0;34m=[0m [0;34m'inner'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mon[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_on[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_on[0m[0;34m:[0m [0;34m'IndexLabel | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleft_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mright_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0

In [206]:
pd.merge(result, wordforms, on='lemma', how='left') 

Unnamed: 0,location,input,lemma,type,confidence,wordform
0,0,sadama,sadam,original,1.0,sadam
1,0,sadama,sadam,original,1.0,sadamaid
2,0,sadama,sadam,original,1.0,sadamaiks
3,0,sadama,sadam,original,1.0,sadamale
4,0,sadama,sadam,original,1.0,sadamais
...,...,...,...,...,...,...
431,3,satama,seatama,suggestion,0.0,seatamata
432,3,satama,seatama,suggestion,0.0,seatamadeta
433,3,satama,seatama,suggestion,0.0,seatamani
434,3,satama,seatama,suggestion,0.0,seatamana


In [193]:
    GENERATOR_QUERY = "https://smart-search.tartunlp.ai/api/generator/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'type': 'text', 'content': lemmas}


In [194]:
    response = requests.post(GENERATOR_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    response = response.json()

In [195]:
response

{'response': {'type': 'texts',
  'texts': [{'content': 'sadam',
    'features': {'hint': '',
     'pos': '*',
     'features': '*',
     'kigi': '',
     'generated_forms': [{'token': 'sadam', 'pos': 'S', 'features': 'sg n'},
      {'token': 'sadama', 'pos': 'S', 'features': 'sg g'},
      {'token': 'sadama+d', 'pos': 'S', 'features': 'pl n'},
      {'token': 'sadama+ga', 'pos': 'S', 'features': 'sg kom'},
      {'token': 'sadama+id', 'pos': 'S', 'features': 'pl p'},
      {'token': 'sadama+iks', 'pos': 'S', 'features': 'pl tr'},
      {'token': 'sadama+il', 'pos': 'S', 'features': 'pl ad'},
      {'token': 'sadama+ile', 'pos': 'S', 'features': 'pl all'},
      {'token': 'sadama+ilt', 'pos': 'S', 'features': 'pl abl'},
      {'token': 'sadama+ina', 'pos': 'S', 'features': 'pl es'},
      {'token': 'sadama+ini', 'pos': 'S', 'features': 'pl ter'},
      {'token': 'sadama+is', 'pos': 'S', 'features': 'pl in'},
      {'token': 'sadama+isse', 'pos': 'S', 'features': 'pl ill'},
      {'token

Unnamed: 0,lemma,wordform
0,Tere,teresse
1,Tere,teredele
2,Tere,teredesse
3,Tere,teredeks
4,Tere,teredelt


Unnamed: 0,lemma,wordform
0,ÜRO,ÜRO


Unnamed: 0,lemma,wordform
0,ujuma,ujutagu
1,ujuma,ujub
2,ujuma,ujuksime
3,ujuma,ujutud
4,ujuma,ujunuksite


In [182]:
result['lemma'][0]

0     sadam
0    sadama
Name: lemma, dtype: object

In [163]:
result = extend_query('sadama maks presitent satama')

KeyError: "['location', 'lemmas'] not in index"

In [157]:
result

Unnamed: 0,input,lemma,type,confidence
0,sadama,sadam,original,1.0
0,sadama,sadama,original,1.0
1,maks,maks,original,1.0


In [149]:
result.reset_index(names='ahaa')

Unnamed: 0,ahaa,index,input,lemmas,suggestions
0,0,0,sadama,"[sadam, sadama]",
1,1,1,maks,[maks],
2,2,2,presitent,,"[(president, 0)]"
3,3,3,satama,,"[(osatama, 0), (saatma, 0), (sadama, 0), (seat..."


In [126]:
tbl = result[['input', 'lemmas']]
tbl[~tbl['lemmas'].isna()].explode('lemmas').assign(type='original').rename(columns = {'lemmas': 'lemma'}).assign(confidence=1.0)

Unnamed: 0,input,lemma,type,confidence
0,sadama,sadam,original,1.0
0,sadama,sadama,original,1.0
1,maks,maks,original,1.0


In [133]:
tbl = result[['input', 'suggestions']]
tbl = tbl[~tbl['suggestions'].isna()].explode('suggestions').assign(type='suggestion')
tbl[['lemma', 'confidence']] = tbl['suggestions'].tolist()
tbl['confidence'] = tbl['confidence'].astype(float)
tbl = tbl[['input', 'lemma', 'type', 'confidence']]

#.assign(confidence=lambda df: df['suggestions'].map(lambda x: x[1])).assign(suggestions=lambda df: df['suggestions'].map(lambda x: x[0]))

In [134]:
tbl

Unnamed: 0,input,lemma,type,confidence
2,presitent,president,suggestion,0.0
3,satama,osatama,suggestion,0.0
3,satama,saatma,suggestion,0.0
3,satama,sadama,suggestion,0.0
3,satama,seatama,suggestion,0.0


In [111]:
import pandas as pd

In [119]:
tbl[['lemma', 'confidence']] = tbl['suggestions'].tolist()##apply(pd.Series)

In [91]:
result['suggestions']

0                None
1                None
2    [(president, 0)]
Name: suggestions, dtype: object

In [26]:
response = response.json()

In [28]:
response['annotations']['query']

[['president']]

In [1]:
def analyze_document_caption(caption: str):
    """
    Uses web service to extract words and sub-words form document captions

    Returns a four column table with columns index, wordform, lemma, sublemmas.
    There can be several rows for each word as each word is analysed separately.
    All rows with the same index correspond to the same word.
    Wordform columns is added to facilitate tokenisation debugging.
    """
    corrected_caption = re.sub('\s+', ' ', re.sub('ˮ', ' ˮ ', re.sub('\s+', ' ', caption))) 
    
    ANALYZER_QUERY = "https://smart-search.tartunlp.ai/api/analyser/process"
    HEADERS = {"Content-Type": "application/json; charset=utf-8"}
    POST_DATA_TEMPLATE = {'params': {"vmetajson": ["--guess"]}, 'content': corrected_caption}

    response = requests.post(ANALYZER_QUERY, json=POST_DATA_TEMPLATE, headers=HEADERS)
    assert response.ok, "Webservice failed"
    response = response.json()

2