In [11]:
import sys
sys.path.append('../../')

In [12]:
from estnltk import Text
from pandas import concat
from pandas import DataFrame

from fast_indexing.fast_indexing import extract_wordform_index
from estnltk_patches.rt_text_analyzer import RTTextAnalyzer 

In [13]:
# Shared fixtures for the test cells below.

# Analyzer built with default settings; it is called on a Text object before indexing.
RT_ANALYZER = RTTextAnalyzer()
# Analyzer built with propername=True.
# NOTE(review): for 'Töökorraseadus' the displayed index of this variant additionally
# contains capitalised wordforms ('Töökorraseadus', 'Töö', 'Töökorra'); confirm the
# exact meaning of the flag against RTTextAnalyzer.
RTP_ANALYZER = RTTextAnalyzer(propername=True)
# Column names given to every DataFrame built from index rows or expected rows.
columns = ['wordform', 'doc_id', 'start', 'end', 'is_subword']
def are_equal(result: DataFrame, target: DataFrame) -> bool:
    """Tell whether two frames hold the same rows, ignoring row order and index.

    Rows are compared as a multiset: every distinct row must occur the same
    number of times in both frames. The earlier implementation counted
    duplicated rows in the concatenation of both frames, which reported a
    false match when one frame repeated a row (e.g. rows [a, a] vs [a, b]).

    Columns are matched by name, so a different column order is tolerated,
    but differing column sets never compare equal. Missing cells (NaN/None)
    are treated as equal to each other, as ``DataFrame.duplicated`` does.

    Args:
        result: Frame produced by the code under test.
        target: Frame holding the expected rows.

    Returns:
        True if both frames contain exactly the same rows with the same
        multiplicities, False otherwise.

    Raises:
        TypeError: If a cell value is unhashable (e.g. a list).
    """
    # Local import keeps the cell self-contained; the notebook's import cell
    # does not bring in collections.
    from collections import Counter

    if len(result) != len(target):
        return False
    if set(result.columns) != set(target.columns):
        return False

    column_order = list(result.columns)
    # Unique stand-in for missing cells: NaN != NaN would otherwise make
    # rows with missing values unmatchable.
    missing_marker = object()

    def row_counts(frame: DataFrame) -> Counter:
        """Count occurrences of each row, with columns in `column_order`."""
        ordered = frame[column_order]
        rows = ordered.itertuples(index=False, name=None)
        missing_flags = ordered.isna().itertuples(index=False, name=None)
        return Counter(
            tuple(
                missing_marker if is_missing else cell
                for cell, is_missing in zip(row, flags)
            )
            for row, flags in zip(rows, missing_flags)
        )

    return row_counts(result) == row_counts(target)

In [4]:
# Case: sentence with an all-caps word, a hyphen-terminated token and a
# capitalised compound. 'P', 'Z' and 'J' part-of-speech tags are ignored.
text = Text('Ta oli SUUR pulli- ja Kalamees')
RT_ANALYZER(text)

# Actual index rows for document '42'.
result = DataFrame(
    data=extract_wordform_index('42', text, ignore_pos=['P', 'Z', 'J']),
    columns=columns,
)
display(result)

# Expected rows: wordforms are lower-cased, 'Ta' and 'ja' have no rows, the span
# of 'pulli' (12-18) includes the trailing hyphen, and the subwords 'kala' and
# 'mees' carry the span of the whole compound (22-30).
target = DataFrame(
    data=[
        ['mees', '42', 22, 30, True],
        ['oli', '42', 3, 6, False],
        ['pulli', '42', 12, 18, False],
        ['suur', '42', 7, 11, False],
        ['kalamees', '42', 22, 30, False],
        ['kala', '42', 22, 30, True],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,oli,42,3,6,False
1,suur,42,7,11,False
2,pulli,42,12,18,False
3,kalamees,42,22,30,False
4,kala,42,22,30,True
5,mees,42,22,30,True


In [5]:
# Case: three-part compound followed by a plain word and a full stop.
text = Text('Punameremao pisarad.')
RT_ANALYZER(text)

# Actual index rows for document '42' (no part-of-speech filter).
result = DataFrame(
    data=extract_wordform_index('42', text),
    columns=columns,
)
display(result)

# Expected rows: the full compound plus every contiguous combination of its
# parts ('puna', 'mere', 'mao', 'punamere', 'meremao'), all with the compound's
# span (0-11). 'pisarad' ends at 19, i.e. before the full stop, and the full
# stop itself has no row.
target = DataFrame(
    data=[
        ['mere', '42', 0, 11, True],
        ['meremao', '42', 0, 11, True],
        ['mao', '42', 0, 11, True],
        ['pisarad', '42', 12, 19, False],
        ['puna', '42', 0, 11, True],
        ['punameremao', '42', 0, 11, False],
        ['punamere', '42', 0, 11, True],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,punameremao,42,0,11,False
1,puna,42,0,11,True
2,mere,42,0,11,True
3,meremao,42,0,11,True
4,punamere,42,0,11,True
5,mao,42,0,11,True
6,pisarad,42,12,19,False


In [6]:
# Case: two words joined by a hyphen, the second one being a compound.
text = Text('maa-algilmeline')
RT_ANALYZER(text)

# Actual index rows for document '42' (no part-of-speech filter).
result = DataFrame(
    data=extract_wordform_index('42', text),
    columns=columns,
)
display(result)

# Expected rows: 'maa' (0-3) is a word of its own rather than a subword;
# 'algilmeline' (4-15) is a separate word with the subwords 'alg' and 'ilmeline'.
target = DataFrame(
    data=[
        ['maa', '42', 0, 3, False],
        ['ilmeline', '42', 4, 15, True],
        ['algilmeline', '42', 4, 15, False],
        ['alg', '42', 4, 15, True],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,maa,42,0,3,False
1,algilmeline,42,4,15,False
2,alg,42,4,15,True
3,ilmeline,42,4,15,True


In [7]:
# Case: capitalised three-part compound analysed with the default analyzer.
text = Text('Töökorraseadus')
RT_ANALYZER(text)

# Actual index rows for document '42' (no part-of-speech filter).
result = DataFrame(
    data=extract_wordform_index('42', text),
    columns=columns,
)
display(result)

# Expected rows: only lower-cased forms; the whole word plus all contiguous
# combinations of its parts, each with the span of the whole word (0-14).
target = DataFrame(
    data=[
        ['töökorraseadus', '42', 0, 14, False],
        ['korraseadus', '42', 0, 14, True],
        ['töökorra', '42', 0, 14, True],
        ['korra', '42', 0, 14, True],
        ['seadus', '42', 0, 14, True],
        ['töö', '42', 0, 14, True],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,töökorraseadus,42,0,14,False
1,seadus,42,0,14,True
2,korraseadus,42,0,14,True
3,korra,42,0,14,True
4,töö,42,0,14,True
5,töökorra,42,0,14,True


In [8]:
# Case: the same capitalised compound, analysed with the propername=True analyzer.
text = Text('Töökorraseadus')
RTP_ANALYZER(text)

# Actual index rows for document '42' (no part-of-speech filter).
result = DataFrame(
    data=extract_wordform_index('42', text),
    columns=columns,
)
display(result)

# Expected rows: the lower-cased forms plus capitalised variants of the whole
# word and of the parts that start it ('Töökorraseadus', 'Töö', 'Töökorra').
target = DataFrame(
    data=[
        ['töökorraseadus', '42', 0, 14, False],
        ['Töökorraseadus', '42', 0, 14, False],
        ['seadus', '42', 0, 14, True],
        ['korraseadus', '42', 0, 14, True],
        ['korra', '42', 0, 14, True],
        ['Töö', '42', 0, 14, True],
        ['töö', '42', 0, 14, True],
        ['töökorra', '42', 0, 14, True],
        ['Töökorra', '42', 0, 14, True],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,töökorraseadus,42,0,14,False
1,Töökorraseadus,42,0,14,False
2,seadus,42,0,14,True
3,korraseadus,42,0,14,True
4,korra,42,0,14,True
5,töö,42,0,14,True
6,Töö,42,0,14,True
7,töökorra,42,0,14,True
8,Töökorra,42,0,14,True


In [9]:
# Case: capitalised two-part compound analysed with the default analyzer.
text = Text('Lapimaa')
RT_ANALYZER(text)

# Actual index rows for document '42' (no part-of-speech filter).
result = DataFrame(
    data=extract_wordform_index('42', text),
    columns=columns,
)
display(result)

# Expected rows: both capitalised and lower-cased forms of the whole word and
# of its first part ('Lapi'/'lapi'); the second part 'maa' is lower-case only.
target = DataFrame(
    data=[
        ['Lapi', '42', 0, 7, True],
        ['Lapimaa', '42', 0, 7, False],
        ['lapimaa', '42', 0, 7, False],
        ['lapi', '42', 0, 7, True],
        ['maa', '42', 0, 7, True],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,Lapimaa,42,0,7,False
1,lapimaa,42,0,7,False
2,lapi,42,0,7,True
3,Lapi,42,0,7,True
4,maa,42,0,7,True


In [10]:
# Case: acronym with a case ending, an all-caps compound and a hyphenated
# proper name. 'P', 'J' and 'Z' part-of-speech tags are ignored.
text = Text('NATOga TIPPKOHTUMINE on Põhja-Euroopas')
RT_ANALYZER(text)

# Actual index rows for document '42'.
result = DataFrame(
    data=extract_wordform_index('42', text, ignore_pos=['P', 'J', 'Z']),
    columns=columns,
)
display(result)

# Expected rows: 'NATOga' and 'Euroopas' keep their case, 'TIPPKOHTUMINE' is
# lower-cased and split into 'tipp' and 'kohtumine', and 'põhja' spans 24-29,
# i.e. without the hyphen that follows it.
target = DataFrame(
    data=[
        ['Euroopas', '42', 30, 38, False],
        ['põhja', '42', 24, 29, False],
        ['tipp', '42', 7, 20, True],
        ['kohtumine', '42', 7, 20, True],
        ['NATOga', '42', 0, 6, False],
        ['tippkohtumine', '42', 7, 20, False],
        ['on', '42', 21, 23, False],
    ],
    columns=columns,
)
assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,NATOga,42,0,6,False
1,tippkohtumine,42,7,20,False
2,tipp,42,7,20,True
3,kohtumine,42,7,20,True
4,on,42,21,23,False
5,põhja,42,24,29,False
6,Euroopas,42,30,38,False


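**Known bugs**

The cell below records a case where the extracted index does not yet match the expected rows, so its final assertion is disabled.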

In [17]:
# Known bug: three-part numeral compound. 'P', 'J' and 'Z' part-of-speech tags
# are ignored.
text = Text('kaheteistkümne')
RT_ANALYZER(text)

# Actual index rows for document '42'.
result = DataFrame(
    data=extract_wordform_index('42', text, ignore_pos=['P', 'J', 'Z']),
    columns=columns,
)
display(result)

# Expected rows: the whole word plus all contiguous combinations of its parts.
target = DataFrame(
    data=[
        ['kümne', '42', 0, 14, True],
        ['kaheteistkümne', '42', 0, 14, False],
        ['teist', '42', 0, 14, True],
        ['teistkümne', '42', 0, 14, True],
        ['kaheteist', '42', 0, 14, True],
        ['kahe', '42', 0, 14, True],
    ],
    columns=columns,
)
# Disabled on purpose: the displayed index lacks 'kümne', 'teist' and
# 'kaheteist'. Re-enable once the extraction produces all six rows.
#assert are_equal(result, target)

Unnamed: 0,wordform,doc_id,start,end,is_subword
0,kaheteistkümne,42,0,14,False
1,kahe,42,0,14,True
2,teistkümne,42,0,14,True
