In [1]:
import os
import conllu
from random import Random, seed, choice, shuffle
from collections import Counter, OrderedDict
from estnltk.converters.conll_importer import conll_to_text

from src.syntax_sketch import clean_clause
from src.syntax_sketch import syntax_sketch
from src.clause_export import export_cleaned_clause

### I. Clean clauses and store them in separate files

In [2]:
# Locations of the source UD splits and of the per-split clause files
# written by this cell.
input_path = '../source_data/ud_splits/extended/'
output_path = './experiments/clauses/'
file_template = 'et_edt-ud-{}-morph_extended_noorphan.conllu'

dev = input_path + file_template.format('dev')
test = input_path + file_template.format('test')
train = input_path + file_template.format('train')

clause_dev = output_path + file_template.format('dev')
clause_test = output_path + file_template.format('test')
clause_train = output_path + file_template.format('train')

# Make sure the target directory exists so the cell also works on a fresh
# checkout (open() does not create missing parent directories).
os.makedirs(output_path, exist_ok=True)

# NOTE: the loop variables are deliberately not called `input`/`output`:
# module-level names in a notebook survive the cell, and `input` would
# shadow the builtin for every later cell.
for input_file, clause_file in zip([test, train, dev], [clause_test, clause_train, clause_dev]):
    print(input_file)
    text = conll_to_text(input_file, 'ud_syntax').tag_layer('clauses')
    expected_layers = {
        'clauses', 'compound_tokens', 'morph_analysis',
        'sentences', 'tokens', 'ud_syntax', 'words'
    }
    # Fail early if importing/tagging produced a different set of layers.
    assert text.layers == expected_layers, 'Unexpected layers'

    valid_clauses = 0
    invalid_clauses = 0
    # The context manager closes the file even if cleaning or exporting a
    # clause raises part-way through the split.
    with open(clause_file, 'wt', encoding='utf-8') as output_file:
        for clause in text.clauses:
            cleaned_clause = clean_clause(clause)

            # Only clauses with exactly one entry in 'root_loc' are exported
            # (presumably: exactly one syntactic root -- confirm in
            # src.syntax_sketch.clean_clause); the rest are counted and skipped.
            if len(cleaned_clause['root_loc']) != 1:
                invalid_clauses += 1
                continue

            # Exported clauses are separated by '\n\n'; no separator is
            # written before the first or after the last one.
            if valid_clauses > 0:
                output_file.write('\n\n')

            output_file.write(export_cleaned_clause(cleaned_clause))
            valid_clauses += 1

    print('Valid clauses:   {}'.format(valid_clauses))
    print('Invalid clauses: {}'.format(invalid_clauses))

../source_data/ud_splits/extended/et_edt-ud-test-morph_extended_noorphan.conllu
Valid clauses:   6036
Invalid clauses: 126
../source_data/ud_splits/extended/et_edt-ud-train-morph_extended_noorphan.conllu
Valid clauses:   6036
Invalid clauses: 126
../source_data/ud_splits/extended/et_edt-ud-dev-morph_extended_noorphan.conllu
Valid clauses:   6036
Invalid clauses: 126
