In [1]:
import sys
# Extend the module search path two directory levels up so that the
# project-local packages imported in the next cell can be resolved.
# NOTE(review): the path is relative, so this presumably relies on the kernel's
# working directory being the notebook's own folder — confirm.
sys.path.append('../../')

In [2]:
from estnltk import Text
from pandas import concat 
from pandas import DataFrame

from fast_indexing.fast_indexing import extract_lemma_index
from estnltk_patches.rt_text_analyzer import RTTextAnalyzer 

In [3]:
# Column layout shared by the expected and the extracted index tables below.
columns = [
    'wordform', 'doc_id', 'start', 'end',
    'weight', 'is_subword',
]

# One analyzer built with default arguments and one built with propername=True.
RT_ANALYZER = RTTextAnalyzer()
RTP_ANALYZER = RTTextAnalyzer(propername=True)
def are_equal(result: DataFrame, target: DataFrame):
    """
    Tells whether two data frames contain the same rows, ignoring row order and index.

    Rows are compared as multisets: every distinct row must occur equally many
    times in both frames. Counting duplicates of the concatenated frames is not
    enough for that, because rows repeated inside a single frame are counted as
    well (e.g. [a, a] versus [a, b] yields two duplicates and used to pass).

    Columns are matched by name, so a different column order is tolerated, while
    a different set of column names makes the frames unequal. Missing values
    (NaN, None, NaT) are considered equal to each other.

    Parameters:
        result: data frame produced by the code under test.
        target: data frame with the expected rows.

    Returns:
        True if both frames hold exactly the same rows with the same
        multiplicities, False otherwise.
    """
    from collections import Counter
    from pandas import isna

    if len(result) != len(target):
        return False
    if set(result.columns) != set(target.columns):
        return False
    # Align columns by name so that rows can be compared positionally.
    if list(target.columns) != list(result.columns):
        target = target[list(result.columns)]

    def row_counts(frame: DataFrame) -> Counter:
        # NaN never compares equal to itself, so every missing scalar is mapped
        # to None. `is True` keeps non-scalar cells untouched, for which isna
        # returns an array instead of a bool.
        return Counter(
            tuple(None if isna(cell) is True else cell for cell in row)
            for row in frame.itertuples(index=False, name=None)
        )

    return row_counts(result) == row_counts(target)

In [4]:
# Expected index for the single word 'kala': one row with weight 1.0.
target = DataFrame(
    data=[['kala', '42', 0, 4, 1.0, False]],
    columns=columns,
)

text = Text('kala')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,kala,42,0,4,1.0,False


In [5]:
# Expected index for 'sadama': two alternative rows over the same span,
# each carrying weight 0.5.
target = DataFrame(
    data=[
        ['sadam', '42', 0, 6, 0.5, False],
        ['sadama', '42', 0, 6, 0.5, False],
    ],
    columns=columns,
)

text = Text('sadama')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,sadama,42,0,6,0.5,False
1,sadam,42,0,6,0.5,False


In [6]:
# Expected index for the compound 'kalamees': the full word plus rows flagged
# as subwords, all covering the span of the whole token.
target = DataFrame(
    data=[
        ['mees', '42', 0, 8, 0.5, True],
        ['mesi', '42', 0, 8, 0.5, True],
        ['kalamees', '42', 0, 8, 1.0, False],
        ['kala', '42', 0, 8, 1.0, True],
    ],
    columns=columns,
)

text = Text('kalamees')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,kalamees,42,0,8,1.0,False
1,kala,42,0,8,1.0,True
2,mees,42,0,8,0.5,True
3,mesi,42,0,8,0.5,True


In [7]:
# Expected index for a two-word sentence: rows for the first token (span 0-11)
# and one row for the second token (span 12-19).
# NOTE(review): the output stored with this cell lists ten rows and no 'pisar'
# row, whereas this target has eleven — confirm the cell passes on a fresh run.
target = DataFrame(
    data=[
        ['punameri', '42', 0, 11, 1.0, True],
        ['meri', '42', 0, 11, 1.0, True],
        ['punama', '42', 0, 11, 0.5, True],
        ['magu', '42', 0, 11, 0.5, True],
        ['meremagu', '42', 0, 11, 0.5, True],
        ['puna', '42', 0, 11, 0.5, True],
        ['madu', '42', 0, 11, 0.5, True],
        ['punameremagu', '42', 0, 11, 0.5, False],
        ['pisar', '42', 12, 19, 1.0, False],
        ['punameremadu', '42', 0, 11, 0.5, False],
        ['meremadu', '42', 0, 11, 0.5, True],
    ],
    columns=columns,
)

text = Text('Punameremao pisarad.')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,punameremadu,42,0,11,0.5,False
1,punameremagu,42,0,11,0.5,False
2,puna,42,0,11,0.5,True
3,punama,42,0,11,0.5,True
4,meri,42,0,11,1.0,True
5,meremagu,42,0,11,0.5,True
6,meremadu,42,0,11,0.5,True
7,punameri,42,0,11,1.0,True
8,madu,42,0,11,0.5,True
9,magu,42,0,11,0.5,True


In [8]:
# Expected index for 'Töökorraseadus' analysed with the default analyzer:
# a single lower-cased full-word row plus the subword rows.
target = DataFrame(
    data=[
        ['seaduma', '42', 0, 14, 0.5, True],
        ['korraseadus', '42', 0, 14, 1.0, True],
        ['töökorraseadus', '42', 0, 14, 1.0, False],
        ['töö', '42', 0, 14, 1.0, True],
        ['seadus', '42', 0, 14, 0.5, True],
        ['kord', '42', 0, 14, 1.0, True],
        ['töökord', '42', 0, 14, 1.0, True],
    ],
    columns=columns,
)

text = Text('Töökorraseadus')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,töökorraseadus,42,0,14,1.0,False
1,seadus,42,0,14,0.5,True
2,seaduma,42,0,14,0.5,True
3,korraseadus,42,0,14,1.0,True
4,kord,42,0,14,1.0,True
5,töö,42,0,14,1.0,True
6,töökord,42,0,14,1.0,True


In [9]:
# Same word analysed with the propername=True analyzer: the target holds two
# full-word rows ('Töökorraseadus' and 'töökorraseadus') with weight 0.5 each,
# while the subword rows are the same as with the default analyzer.
text = Text('Töökorraseadus')
RTP_ANALYZER(text)
target = DataFrame(
    [['Töökorraseadus', '42', 0, 14, 0.5, False],
     ['töökorraseadus', '42', 0, 14, 0.5, False],
     ['seadus', '42', 0, 14, 0.5, True],
     ['seaduma', '42', 0, 14, 0.5, True],
     ['korraseadus', '42', 0, 14, 1.0, True],
     ['kord', '42', 0, 14, 1.0, True],
     ['töö', '42', 0, 14, 1.0, True],
     ['töökord', '42', 0, 14, 1.0, True]], columns=columns
)
# Use the shared column list, like every other cell, instead of repeating it,
# so that result and target cannot drift apart.
result = DataFrame(extract_lemma_index('42', text), columns=columns)
display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,Töökorraseadus,42,0,14,0.5,False
1,töökorraseadus,42,0,14,0.5,False
2,seadus,42,0,14,0.5,True
3,seaduma,42,0,14,0.5,True
4,korraseadus,42,0,14,1.0,True
5,kord,42,0,14,1.0,True
6,töö,42,0,14,1.0,True
7,töökord,42,0,14,1.0,True


In [10]:
# With the propername=True analyzer and ignore_pos=['H'] the target again has
# a single lower-cased full-word row with weight 1.0.
target = DataFrame(
    data=[
        ['töökorraseadus', '42', 0, 14, 1.0, False],
        ['seadus', '42', 0, 14, 0.5, True],
        ['seaduma', '42', 0, 14, 0.5, True],
        ['korraseadus', '42', 0, 14, 1.0, True],
        ['kord', '42', 0, 14, 1.0, True],
        ['töö', '42', 0, 14, 1.0, True],
        ['töökord', '42', 0, 14, 1.0, True],
    ],
    columns=columns,
)

text = Text('Töökorraseadus')
RTP_ANALYZER(text)
result = DataFrame(
    data=extract_lemma_index('42', text, ignore_pos=['H']),
    columns=columns,
)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,töökorraseadus,42,0,14,1.0,False
1,seadus,42,0,14,0.5,True
2,seaduma,42,0,14,0.5,True
3,korraseadus,42,0,14,1.0,True
4,kord,42,0,14,1.0,True
5,töö,42,0,14,1.0,True
6,töökord,42,0,14,1.0,True


In [11]:
# Expected index for a name with initials: one full row and two subword rows,
# all spanning the entire input (0-15).
target = DataFrame(
    data=[
        ['Wiedemann', '42', 0, 15, 1.0, True],
        ['F.J. Wiedemann', '42', 0, 15, 1.0, False],
        ['F.J', '42', 0, 15, 1.0, True],
    ],
    columns=columns,
)

text = Text('F.J. Wiedemanni')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,F.J. Wiedemann,42,0,15,1.0,False
1,Wiedemann,42,0,15,1.0,True
2,F.J,42,0,15,1.0,True


In [12]:
# Same name written with spaces around the dots: the target keeps those spaces
# in the wordforms and every row spans the entire input (0-18).
target = DataFrame(
    data=[
        ['F . J . Wiedemann', '42', 0, 18, 1.0, False],
        ['Wiedemann', '42', 0, 18, 1.0, True],
        ['F . J', '42', 0, 18, 1.0, True],
    ],
    columns=columns,
)

text = Text('F . J . Wiedemanni')
RT_ANALYZER(text)
result = DataFrame(data=extract_lemma_index('42', text), columns=columns)

display(result)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,F . J . Wiedemann,42,0,18,1.0,False
1,Wiedemann,42,0,18,1.0,True
2,F . J,42,0,18,1.0,True


**Known bugs**

In [13]:
# Known bug: the extracted index does not match the target below yet.
text = Text('kaheteistkümne')
RT_ANALYZER(text)
target = DataFrame(
    [['kaksteist', '42', 0, 14, 1.0, True],
     ['teist', '42', 0, 14, 0.25, True],
     ['sina', '42', 0, 14, 0.25, True],
     ['kaksteist', '42', 0, 14, 1.0, False],
     ['kümme', '42', 0, 14, 1.0, True],
     ['kaks', '42', 0, 14, 1.0, True],
     ['tee', '42', 0, 14, 0.25, True],
     ['teistkümnema', '42', 0, 14, 0.5, True],
     ['teistkümne', '42', 0, 14, 0.5, True],
     ['teine', '42', 0, 14, 0.25, True]], columns=columns
)
result = DataFrame(extract_lemma_index('42', text, ignore_pos=['H']), columns=columns)
display(result)
# Expected failure instead of a commented-out assertion: the check passes while
# the bug is present and trips as soon as the result starts matching the
# target, signalling that this case can become a regular test.
assert not are_equal(result, target), (
    "Known bug seems to be fixed: result matches target; "
    "replace this check with `assert are_equal(result, target)`"
)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,kaksteist,42,0,14,1.0,False
1,kaks,42,0,14,1.0,True
2,teistkümnema,42,0,14,0.5,True
3,teistkümne,42,0,14,0.5,True


In [14]:
# Known bug: the extracted index does not match the target below yet.
text = Text('Tallinn-Rannamõisa-Kloogaranna riigimaantee')
RT_ANALYZER(text)
target = DataFrame(
    [['mõisa-Kloogaranna', '42', 0, 30, 1.0, True],
     ['riigimaan', '42', 31, 43, 1.0, True],
     ['maantee', '42', 31, 43, 1.0, True],
     ['rand', '42', 0, 30, 1.0, True],
     ['Tallinn-Rand', '42', 0, 30, 1.0, True],
     ['mõisa-Klooga', '42', 0, 30, 1.0, True],
     ['riik', '42', 31, 43, 1.0, True],
     ['Tallinn-Rannamõisa-Kloogaranna', '42', 0, 30, 1.0, False],
     ['maan', '42', 31, 43, 1.0, True],
     ['riigimaantee', '42', 31, 43, 1.0, False],
     ['Tallinn-Rannamõisa-Klooga', '42', 0, 30, 1.0, True],
     ['tegema', '42', 31, 43, 0.5, True],
     ['tee', '42', 31, 43, 0.5, True]], columns=columns
)
result = DataFrame(extract_lemma_index('42', text), columns=columns)
display(result)
# Expected failure instead of a commented-out assertion: the check passes while
# the bug is present and trips as soon as the result starts matching the
# target, signalling that this case can become a regular test.
assert not are_equal(result, target), (
    "Known bug seems to be fixed: result matches target; "
    "replace this check with `assert are_equal(result, target)`"
)

Unnamed: 0,wordform,doc_id,start,end,weight,is_subword
0,Tallinn-Rannamõisa-Kloogaranna,42,0,30,1.0,False
1,Tallinn,42,0,30,1.0,True
2,amõi,42,0,30,0.5,True
3,amõis,42,0,30,0.5,True
4,a-Kloo,42,0,30,1.0,True
5,rannamõis,42,0,30,1.0,True
6,amõisa-Kloo,42,0,30,1.0,True
7,Rannamõisa-Kloo,42,0,30,1.0,True
8,amõisa-Kloogaranna,42,0,30,1.0,True
9,Tallinn-Rann,42,0,30,1.0,True
