## Õpikukorpuse vrt faili lausete süntaksianalüüs

Aluseks on keeleoppija_sonaveeb_2022v1_0.vrt fail.

In [52]:
import estnltk
from estnltk import Text, Layer
from estnltk.taggers import PretokenizedTextCompoundTokensTagger, WordTagger, WhiteSpaceTokensTagger
from estnltk_neural.taggers import StanzaSyntaxTagger
import os
import re
from tqdm import tqdm
from estnltk.converters import text_to_json, json_to_text
import json

In [2]:
# Path of the VRT corpus export that the rest of the notebook analyses.
infile = "keeleoppija_sonaveeb_2022v1_0.vrt"

# Load the entire file into memory as one UTF-8 decoded string.
with open(infile, encoding="utf-8") as f:
    data = f.read()

In [3]:
# Break the raw file contents into lines on "\n"; the bare expression below
# makes the notebook display how many lines were obtained.
data_lines = data.split("\n")
len(data_lines)

623219

In [4]:
# Collect all documents: every document becomes a list of its lines, starting
# with the opening "<document ...>" line and excluding the closing "</document>".

documents = []
doc_lines = []

# True while the current line lies inside a <document> ... </document> span.
document_start = False

for line in data_lines:
    if line.startswith("</document>"):
        # End of a document: store what has been gathered and start afresh.
        documents.append(doc_lines)
        doc_lines = []
        document_start = False
        continue

    if line.startswith("<document"):
        # Start of a document: the opening tag line itself is kept as well.
        document_start = True

    if document_start:
        doc_lines.append(line)
            

#### WhiteSpaceTokensTagger + custom words layer

In [30]:
# Structural tags: a line containing any of these is not a token row.
not_word_lines = ["<document", "<sentence", "<clause", "</clause>", "</sentence>", "</document>"]
# Opening tags whose key="value" attributes are collected as metadata.
meta_lines = ["<document", "<sentence",]

# Heuristic for numbers written with spaces inside them (e.g. "641 1709").
# NOTE(review): it is applied with search(), so it matches ANY token that
# contains a digit, not only purely numeric tokens -- confirm this is intended.
p2 = re.compile(r"""\d+(\s*\d*)*""")

# key="value" attribute pairs of a structural tag; compiled once, outside the loop.
attr_pattern = re.compile(r'(\w+)="([^"]*)"')

# VRT marker row meaning "no space before the following token".
glue_marker = "<!--g/-->"

texts_list = []

tokens_tagger = WhiteSpaceTokensTagger()

stanza_tagger = StanzaSyntaxTagger(input_type='morph_extended', input_morph_layer='morph_extended')

for doc in tqdm(documents):
    word_lines = []
    meta = {}
    for line in doc:
        # Rows that carry the words of the sentence.
        if not any(ext in line for ext in not_word_lines):
            word_lines.append(line)

        # Gather metadata from the <document ...> and <sentence ...> tags.
        # Later tags overwrite earlier values stored under the same key.
        elif any(ext in line for ext in meta_lines):
            line = line.replace("<document", "").replace("<sentence", "").strip()[:-1]
            meta.update(attr_pattern.findall(line))

    # Keep only the first tab-separated column, i.e. the word form.
    sent_words = [row.split("\t")[0] for row in word_lines]

    raw_words = []  # for the words layer: tokens exactly as they occur in the text
    normalized_words = []  # for the words layer: e.g. spaces removed from inside numbers
    for_txt_words = []  # for the Text object: glued tokens are joined to the previous one

    multiword_expressions = []  # tokens that contain a space themselves

    # Set when a glue marker row was seen and the next token must be attached
    # to the previous one without a space.
    g_tag = False

    for raw_token in sent_words:
        if raw_token == glue_marker:
            g_tag = True
            continue

        # Remove the spaces from number-like tokens in the normalized form.
        # This is done for every token, including glued ones, so that the
        # same token always gets the same normalized form.
        if " " in raw_token and p2.search(raw_token) is not None:
            normalized_words.append(raw_token.replace(" ", ""))
        else:
            normalized_words.append(raw_token)
        raw_words.append(raw_token)

        if g_tag and for_txt_words:
            # Attach to the previous token: no space in between in the text.
            for_txt_words[-1] += raw_token
        else:
            # Regular token. A glue marker that precedes the very first token
            # has nothing to attach to, so that token also lands here instead
            # of failing on an empty list.
            for_txt_words.append(raw_token)
            # Everything that contains a space, numbers included. As before,
            # glued tokens are not registered as multiword units.
            if ' ' in raw_token:
                multiword_expressions.append(raw_token)
        g_tag = False

    text_str = ' '.join(for_txt_words)
    text = Text(text_str)

    tokens_tagger.tag(text)

    multiword_expressions = [mw.split() for mw in multiword_expressions]
    compound_tokens_tagger = PretokenizedTextCompoundTokensTagger( multiword_units = multiword_expressions )
    compound_tokens_tagger.tag(text)

    # Custom 'words' layer built from the corpus tokens.
    my_words = Layer(
        name='words',
        attributes=('normalized_form',),
        text_object=text,
        ambiguous=True
    )

    # Add the spans by walking through the text and locating every raw token
    # at the current offset, so that start and end match the text exactly.
    idx = 0
    text_str = text.text
    for raw, norm in zip(raw_words, normalized_words):

        # Skip the whitespace that separates tokens.
        while idx < len(text_str) and text_str[idx].isspace():
            idx += 1

        # The token has to start exactly at idx; otherwise the token lists
        # and the text have diverged and the offsets would be wrong.
        if not text_str.startswith(raw, idx):
            context = text_str[idx:idx+len(raw)+3]
            raise ValueError(
                f"Token alignment failed.\n"
                f"Expected: '{raw}' at {idx} in context: '{context}'"
            )

        start = idx
        end = start + len(raw)

        my_words.add_annotation(
            (start, end),
            normalized_form=norm
        )

        idx = end

    # Attach the layer to the text.
    text.add_layer(my_words)

    # The 'words' layer is the custom one added above, so the default words
    # tagging is not run; the remaining layers are tagged on top of it.
    text.tag_layer('sentences')
    text.tag_layer('morph_extended')
    stanza_tagger.tag( text )

    # Store the collected metadata on the Text object as well.
    text.meta = meta

    texts_list.append(text)


100%|█████████████████████████████████████| 35680/35680 [13:11<00:00, 45.06it/s]


## salvestada kõik tekstiobjektid ühte faili

In [53]:
# Convert every analysed Text object to its JSON representation, keeping the
# order of texts_list.
file_texts = [text_to_json(txt) for txt in texts_list]

In [55]:
with open("keeleoppija_sonaveeb_2022v1_0_vrt.json", "w") as f:
    json.dump(file_texts, f, ensure_ascii = False)

#### näide sisselugemisest

In [56]:
with open("keeleoppija_sonaveeb_2022v1_0_vrt.json", "r") as f:
    indata = json.load(f)

In [60]:
# Rebuild one Text object (the entry at index 112) from the loaded JSON data
# and display its 'stanza_syntax' layer.
json_to_text(indata[112]).stanza_syntax

layer name,attributes,parent,enveloping,ambiguous,span count
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_extended,,False,5

text,id,lemma,upostag,xpostag,feats,head,deprel,deps,misc
Helista,1,helistama,V,V,"{'mod': 'mod', 'imper': 'imper', 'pres': 'pres', 'ps2': 'ps2', 'sg': 'sg', 'ps': ..., type: <class 'dict'>, length: 7",0,root,_,_
kohe,2,kohe,D,D,{},1,advmod,_,_
tel,3,tel,Y,Y,{'nominal': 'nominal'},1,obl,_,_
641 1709,4,6411709,N,N,"{'card': 'card', '<?>': '<?>', 'digit': 'digit'}",3,nummod,_,_
.,5,.,Z,Z,{},1,punct,_,_


## salvestada iga tekstiobjekt eraldi faili

In [61]:
#for i, txt in enumerate(texts_list):
#    text_to_json(txt, file=f"keeleoppija_sonaveeb_2022v1_0_vrt/text_{i}.json")

#### näide sisselugemisest

In [63]:
#text_import = json_to_text(file="keeleoppija_sonaveeb_2022v1_0_vrt/text_112.json")
#text_import.stanza_syntax

layer name,attributes,parent,enveloping,ambiguous,span count
stanza_syntax,"id, lemma, upostag, xpostag, feats, head, deprel, deps, misc",morph_extended,,False,5

text,id,lemma,upostag,xpostag,feats,head,deprel,deps,misc
Helista,1,helistama,V,V,"{'mod': 'mod', 'imper': 'imper', 'pres': 'pres', 'ps2': 'ps2', 'sg': 'sg', 'ps': ..., type: <class 'dict'>, length: 7",0,root,_,_
kohe,2,kohe,D,D,{},1,advmod,_,_
tel,3,tel,Y,Y,{'nominal': 'nominal'},1,obl,_,_
641 1709,4,6411709,N,N,"{'card': 'card', '<?>': '<?>', 'digit': 'digit'}",3,nummod,_,_
.,5,.,Z,Z,{},1,punct,_,_
