## Convert Universal Dependencies' corpus to Vabamorf format 

_Prerequisite_: download version 2.5 of the Estonian UD corpus from https://github.com/UniversalDependencies/UD_Estonian-EDT and unpack it into the current directory, so that the directory `UD_Estonian-EDT-r2.5` exists there.

In [1]:
import os, os.path
# Directory (relative to the current working directory) holding the unpacked UD corpus.
ud_corpus_dir = "UD_Estonian-EDT-r2.5"
# Fail early, and say which directory was expected and where it was looked for.
assert os.path.isdir( ud_corpus_dir ), \
    "UD corpus directory {!r} not found in {!r}".format( ud_corpus_dir, os.getcwd() )

from estnltk import Text
from estnltk.converters import text_to_json
from est_ud_utils import load_ud_file_texts_with_corrections
from est_ud_morph_conv import convert_ud_layer_to_reduced_morph_layer

In [2]:
# Directory that receives the converted files.
output_dir = 'UD_converted'
# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate isdir() test.
os.makedirs( output_dir, exist_ok=True )
assert os.path.isdir( output_dir )

In [3]:
# Load UD corpus' files as EstNLTK Text objects
loaded_texts  = []
ud_layer_name = 'ud_syntax'
# os.listdir() returns entries in arbitrary order; sort them so that the order
# of loaded_texts (and the running index later derived from it) is the same on
# every run and on every platform.
for fname in sorted( os.listdir( ud_corpus_dir ) ):
    # Uncomment any of the following to leave out a subset of the corpus:
    #if 'train' in fname:
    #    continue
    #if 'dev' in fname:
    #    continue
    #if 'test' in fname:
    #    continue
    if fname.endswith('.conllu'):
        fpath = os.path.join( ud_corpus_dir, fname )
        texts = load_ud_file_texts_with_corrections( fpath, ud_layer_name )
        for text in texts:
            # Remember which corpus file the text came from
            text.meta['file'] = fname
            loaded_texts.append( text )

In [4]:
# Convert UD's morphosyntactic annotations to Vabamorf-like annotations
# and save every text into a separate JSON file in output_dir
for tid, text in enumerate(loaded_texts):
    # Use ud_layer_name (the layer name the texts were loaded with) instead of
    # repeating the literal, so the two cells cannot drift apart.
    convert_ud_layer_to_reduced_morph_layer( text, ud_layer_name, 'ud_morph_reduced', add_layer=True )
    # Output name: <source file name without its extension>_<3-digit index>.json
    # splitext() strips only the final extension, whereas str.replace() would
    # rewrite every '.conllu' occurring anywhere in the name.
    fname = os.path.splitext( text.meta['file'] )[0] + '_{:03d}.json'.format(tid)
    fpath = os.path.join(output_dir, fname)
    text_to_json(text, file=fpath)