In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell is executed, so edits to modules under development are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# Quieten TensorFlow's native logging: level '2' filters out INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Switch off TensorFlow's oneDNN custom operations.
# NOTE(review): both variables presumably have to be set before TensorFlow is
# first imported, which would explain why this cell precedes the estnltk
# imports — confirm.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [3]:
import os
from estnltk.converters.conll.conll_importer import conll_to_text
from estnltk_core.layer_operations import split_by_sentences
from estnltk_patches.phrase_extractor import PhraseExtractor
from estnltk_patches.consistency_decorator import ConsistencyDecorator
from tqdm import tqdm

## Read files

In [4]:
# Directories holding the CoNLL files of the treebank's Train and Dev splits.
# The paths are relative, so they are resolved against the notebook's working directory.
root1 = r"UDpuupank/UD2_11_udreposse/Train"
root2 = r"UDpuupank/UD2_11_udreposse/Dev"

In [5]:
def _top_level_files(root):
    """Return the sorted names of the files located directly inside `root`.

    Only the first entry produced by ``os.walk`` (the one for `root` itself) is
    used. Assigning the file list on every iteration of the walk would leave the
    names from the *last* visited subdirectory (for example a hidden
    ``.ipynb_checkpoints`` folder), and those names would later be joined to
    `root`, producing paths that do not exist.

    The names are sorted because the order in which the operating system lists
    a directory is arbitrary; sorting keeps the processing order stable between
    runs.

    Returns an empty list when `root` does not exist or cannot be read, which
    matches what walking such a directory yields.
    """
    _, _, names = next(os.walk(root), (root, [], []))
    return sorted(names)


files1 = _top_level_files(root1)
files2 = _top_level_files(root2)
    


In [6]:
# Name of the layer the phrase tagger writes; split_and_tag_texts reads it back.
output_layer = "obl_phrases"
# Judging by the path, the directory with the Stanza syntax model resources.
# NOTE(review): the leading "..." looks like a placeholder for the local
# environment prefix — confirm and point it at the real installation.
model_path = r".../lib/python3.9/site-packages/estnltk_neural/taggers/syntax/stanza_tagger/stanza_resources/"

# NOTE(review): the arguments are positional; their meaning is defined by
# ConsistencyDecorator, which is not visible in this notebook.
decorator = ConsistencyDecorator(
    "sentences",
    model_path,
    "obl_phrases2",
    "stanza_syntax",
    "sentences",
)

# Extractor for "obl" phrases; the listed attributes are the ones read from
# each phrase span further down in the notebook.
phrase_tagger = PhraseExtractor(
    deprel="obl",
    decorator=decorator,
    input_type="stanza_syntax",
    syntax_layer="stanza_syntax",
    output_layer=output_layer,
    morph_layer="words",
    output_attributes=[
        'syntax_conservation_score',
        "unlabelled_attachment_score",
        "label_accuracy",
        'root_id',
        'root',
    ],
)


In [7]:
def split_and_tag_texts(files, root):
    """Load CoNLL files, split them into sentences and tag phrases in each one.

    Relies on the module-level ``phrase_tagger`` and ``output_layer``.

    Args:
        files: names of the files to process, relative to `root`.
        root: directory that contains `files`.

    Returns:
        A pair ``(texts, file_sent_info)``:
        * ``texts`` - the sentence-level text objects whose ``output_layer``
          layer exists and holds at least one span;
        * ``file_sent_info`` - one backslash-separated string per phrase span
          with the file name, the sentence text, the phrase words joined by
          spaces, and the three score attributes of the span.
    """
    kept_sentences = []
    info_rows = []

    for file in tqdm(files):
        conll_path = os.path.join(root, file)
        document = conll_to_text(conll_path, syntax_layer='stanza_syntax')
        sentences = split_by_sentences(
            text=document,
            layers_to_keep=list(document.layers),
            trim_overlapping=True,
        )

        # Tag every sentence first; filtering happens afterwards.
        for sentence in sentences:
            phrase_tagger.tag(sentence)

        # Keep only the sentences in which the tagger produced at least one phrase.
        with_phrases = [
            sentence
            for sentence in sentences
            if output_layer in sentence.layers and len(sentence[output_layer]) > 0
        ]

        # One info row per phrase span.
        for sentence in with_phrases:
            for phrase in sentence[output_layer]:
                fields = [
                    file,
                    sentence.text,
                    " ".join(phrase.text),
                    str(phrase.syntax_conservation_score),
                    str(phrase.unlabelled_attachment_score),
                    str(phrase.label_accuracy),
                ]
                info_rows.append("\\".join(fields))

        kept_sentences.extend(with_phrases)

    return kept_sentences, info_rows
    

In [15]:
# Tag the Train split. Slow: the recorded run took about two hours for 60 files.
files1_txt, fileinfo1 = split_and_tag_texts(files1, root1)

100%|████████████████████████████████████████| 60/60 [2:01:27<00:00, 121.46s/it]


In [16]:
# Train split: number of sentences kept vs. number of phrase rows collected.
print(len(files1_txt), len(fileinfo1))

16588 31308


In [8]:
# Tag the Dev split. The recorded run took about 17 minutes for 9 files.
files2_txt, fileinfo2 = split_and_tag_texts(files2, root2)

100%|████████████████████████████████████████████| 9/9 [17:10<00:00, 114.49s/it]


In [9]:
# Dev split: number of sentences kept vs. number of phrase rows collected.
print(len(files2_txt), len(fileinfo2))

2114 3915


In [24]:
# Phrase rows of both splits (Train first, then Dev) for a single export file.
fileinfo = fileinfo1 + fileinfo2

In [25]:
# Dump the phrase rows as backslash-separated text, preceded by a header line.
# The rows are joined with newlines, so the file does not end with a newline.
with open("ls_puupank_obl_export_big_v2_fileinfo.txt", "w", encoding="utf-8") as f:
    header = "file\\text\\removed\\conservation_score\\ual\\la\n"
    f.write(header)
    body = '\n'.join(str(row) for row in fileinfo)
    f.write(body)

In [26]:
import random

In [27]:
# All tagged sentences from both splits in one list (Train first, then Dev).
texts = files1_txt + files2_txt

In [28]:
# Shuffle with a fixed seed so that the chunk assignment made further down can
# be reproduced (given the same input order). The list is rebuilt from the
# tagged splits first, so re-running this cell does not reshuffle an already
# shuffled list.
SHUFFLE_SEED = 42
texts = files1_txt + files2_txt
random.Random(SHUFFLE_SEED).shuffle(texts)

In [29]:
# Total number of sentences available for export.
len(texts)

18702

In [30]:
# Number of sentences that go into the first export chunk; everything after
# that goes into the second chunk.
LS_CHUNK_SIZE = 10000
texts1 = texts[:LS_CHUNK_SIZE]
texts2 = texts[LS_CHUNK_SIZE:]
# A third chunk (texts[2 * LS_CHUNK_SIZE:]) would be empty for the recorded
# run, which produced fewer than 20000 sentences.

In [32]:
# Spot check: display the obl_phrases layer of one sentence from the first chunk.
texts1[1].obl_phrases

layer name,attributes,parent,enveloping,ambiguous,span count
obl_phrases,"syntax_conservation_score, unlabelled_attachment_score, label_accuracy, root_id, root",,stanza_syntax,False,2

text,syntax_conservation_score,unlabelled_attachment_score,label_accuracy,root_id,root
['endast'],100.0,100.0,100.0,4,"Span('endast', [{'id': 4, 'lemma': 'ise', 'upostag': 'PRON', 'xpostag': 'P', 'fe ..., type: <class 'estnltk_core.layer.span.Span'>"
"['järsu', 'liigutusega']",100.0,100.0,100.0,6,"Span('liigutusega', [{'id': 6, 'lemma': 'liigutus', 'upostag': 'NOUN', 'xpostag' ..., type: <class 'estnltk_core.layer.span.Span'>"


## Files for Label Studio

In [33]:
# Dependency relation passed to the export helpers below. The same value is
# hard-coded in the PhraseExtractor call earlier in the notebook.
deprel = "obl"

In [34]:
from collection_to_ls import collection_to_labelstudio, conf_gen

In [35]:
# Each chunk of sentences is exported to its own JSON file.
# To export a third chunk, add a (texts3, "ls_puupank_obl_export_big_v2_3.json") pair.
export_jobs = (
    (texts1, "ls_puupank_obl_export_big_v2_1.json"),
    (texts2, "ls_puupank_obl_export_big_v2_2.json"),
)
for chunk, res_path in export_jobs:
    collection_to_labelstudio(chunk, deprel, regular_layers=["obl_phrases"], filename=res_path)

In [36]:
# Store the configuration text generated by conf_gen next to the exported JSON files.
with open("ls_puupank_obl_export_big_v2_conf.txt", "w", encoding="utf-8") as f:
    label_config = conf_gen(deprel, classes=["obl_phrases"])
    f.write(label_config)