In [1]:
# Enable autoreload so that edits to imported modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
# TensorFlow runtime switches, set before the estnltk imports in the next cell.
# NOTE(review): presumably '2' hides TF info/warning logs and '0' turns off the
# oneDNN custom ops -- confirm against the TensorFlow documentation.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [3]:
import os
from estnltk.converters.conll.conll_importer import conll_to_text
from estnltk_core.layer_operations import split_by_sentences
from estnltk_patches.phrase_extractor import PhraseExtractor
from estnltk_patches.consistency_decorator import ConsistencyDecorator
from tqdm import tqdm

## Read files

In [4]:
# Directories whose files are read with conll_to_text further down.
# Both are relative paths: root1 is the Train folder, root2 the Dev folder.
root1 = r"UDpuupank/UD2_11_udreposse/Train"
root2 = r"UDpuupank/UD2_11_udreposse/Dev"

In [5]:
# List the file names that sit directly inside each corpus directory.
# os.walk yields the top directory first, and only that first entry is used:
# files in nested subdirectories are ignored because their bare names would not
# resolve when joined with the root later on.
# A missing directory makes os.walk yield nothing, so the default tuple keeps
# the list empty instead of raising.
# Names are sorted so the processing order is the same on every run.
files1 = sorted(next(os.walk(root1), (root1, [], []))[2])
files2 = sorted(next(os.walk(root2), (root2, [], []))[2])
    


In [6]:
# Dependency relation handed to the phrase tagger; also part of every export file name.
deprel = "det"
# Name of the layer the phrase tagger writes its phrases to.
output_layer = "det_phrases"
# Directory of the stanza resources, passed to the consistency decorator.
# NOTE(review): the leading "..." looks like a placeholder -- point it at the local environment.
model_path = r".../lib/python3.9/site-packages/estnltk_neural/taggers/syntax/stanza_tagger/stanza_resources/"

In [7]:

# The positional arguments are passed through unchanged.
# NOTE(review): their meaning is defined by ConsistencyDecorator -- confirm the order there.
decorator = ConsistencyDecorator(
    "sentences",
    model_path,
    "temp_phrases",
    "stanza_syntax",
    "sentences",
)

# Tagger for the configured dependency relation; it reads the 'stanza_syntax'
# layer and writes to output_layer with the attributes listed below.
phrase_tagger = PhraseExtractor(
    deprel=deprel,
    decorator=decorator,
    input_type="stanza_syntax",
    syntax_layer="stanza_syntax",
    output_layer=output_layer,
    morph_layer="words",
    output_attributes=[
        'syntax_conservation_score',
        "unlabelled_attachment_score",
        "label_accuracy",
        'root_id',
        'root',
    ],
)


In [8]:
def split_and_tag_texts(files, root):
    """Tag every sentence of the given files and keep the ones that contain phrases.

    Each file is read with its syntax stored in the 'stanza_syntax' layer, split
    into one text per sentence, and run through the module-level
    ``phrase_tagger``. Only sentences whose ``output_layer`` exists and is
    non-empty are kept.

    Args:
        files: file names located inside ``root``.
        root: directory the file names are joined with.

    Returns:
        A pair ``(texts, file_sent_info)``. ``texts`` holds the kept sentence
        texts. ``file_sent_info`` holds one backslash-separated string per
        extracted phrase: file name, sentence text, the phrase's text items
        joined by spaces, and the three score attributes.
    """
    kept_sentences = []
    phrase_rows = []

    for file in tqdm(files):
        document = conll_to_text(os.path.join(root, file), syntax_layer='stanza_syntax')
        sentences = split_by_sentences(
            text=document,
            layers_to_keep=list(document.layers),
            trim_overlapping=True,
        )

        # Tag all sentences of the file first; the steps below only read the results.
        for sentence in sentences:
            phrase_tagger.tag(sentence)

        with_phrases = [
            sentence
            for sentence in sentences
            if output_layer in sentence.layers and len(sentence[output_layer]) > 0
        ]

        # One row per phrase, so a sentence with several phrases yields several rows.
        for sentence in with_phrases:
            for phrase in sentence[output_layer]:
                fields = (
                    file,
                    sentence.text,
                    " ".join(phrase.text),
                    str(phrase.syntax_conservation_score),
                    str(phrase.unlabelled_attachment_score),
                    str(phrase.label_accuracy),
                )
                phrase_rows.append("\\".join(fields))

        kept_sentences.extend(with_phrases)

    return kept_sentences, phrase_rows
    

In [9]:
# Process the Train files. Slow: the recorded run below took about 29 minutes for 60 files.
files1_txt, fileinfo1 = split_and_tag_texts(files1, root1)

100%|███████████████████████████████████████████| 60/60 [28:53<00:00, 28.90s/it]


In [10]:
# Kept sentences vs. phrase rows; the second count can be larger because
# a sentence with several phrases contributes one row per phrase.
print(len(files1_txt), len(fileinfo1))

4668 5389


In [11]:
# Process the Dev files. The recorded run below took a bit over 4 minutes for 9 files.
files2_txt, fileinfo2 = split_and_tag_texts(files2, root2)

100%|█████████████████████████████████████████████| 9/9 [04:20<00:00, 28.89s/it]


In [12]:
# Kept sentences vs. phrase rows for the Dev files; a sentence with several
# phrases contributes one row per phrase.
print(len(files2_txt), len(fileinfo2))

577 672


In [13]:
# Phrase rows of both splits in one list: Train rows first, Dev rows after.
fileinfo = fileinfo1 + fileinfo2

In [14]:
# Export the phrase rows as a backslash-separated table:
# the header line first, then one row per line (no newline after the last row).
with open(f"ls_puupank_{deprel}_export_big_v1_fileinfo.txt", "w", encoding="utf-8") as out:
    out.write("file\\text\\removed\\conservation_score\\ual\\la\n")
    out.write("\n".join(map(str, fileinfo)))

In [15]:
import random

In [16]:
# Peek at the phrase layer of the first kept Dev sentence.
# The layer is looked up through output_layer rather than a hard-coded attribute,
# so this cell keeps working when the configured layer name changes.
files2_txt[0][output_layer]

layer name,attributes,parent,enveloping,ambiguous,span count
det_phrases,"syntax_conservation_score, unlabelled_attachment_score, label_accuracy, root_id, root",,stanza_syntax,False,1

text,syntax_conservation_score,unlabelled_attachment_score,label_accuracy,root_id,root
['üks'],56.2,62.5,68.8,9,"Span('üks', [{'id': 9, 'lemma': 'üks', 'upostag': 'DET', 'xpostag': 'P', 'feats' ..., type: <class 'estnltk_core.layer.span.Span'>"


In [17]:
# Kept sentences of both splits in one list: Train first, Dev after.
texts = files1_txt + files2_txt

In [18]:
# Shuffle in place with a dedicated fixed-seed generator: for the same input
# order a fresh run of the notebook yields the same export order, and the
# global random state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(texts)

In [19]:
# Number of sentences that go into the export (Train and Dev combined).
len(texts)

5245

## Files for Label Studio

In [20]:
from collection_to_ls import collection_to_labelstudio, conf_gen

In [21]:
# Hand the shuffled sentences and their phrase layer to the exporter, targeting res_path.
res_path = f"ls_puupank_{deprel}_export_big_v1.json"
collection_to_labelstudio(
    texts,
    deprel,
    regular_layers=[output_layer],
    filename=res_path,
)


In [22]:
# Store the configuration text generated for this deprel and phrase layer next to the export.
with open(f"ls_puupank_{deprel}_export_big_v1_conf.txt", "w", encoding="utf-8") as conf_file:
    conf_text = conf_gen(deprel, classes=[output_layer])
    conf_file.write(conf_text)