## Convert from CoNLL-U to JSON

In [1]:
from conllu import parse
import pprint
import json
import glob
import random

### 1. Function `conllu_to_dict` parses a CoNLL-U file using the `conllu` library and returns it as a JSON-serializable dict

In [2]:
def conllu_to_dict(filename, num_sents=999999):
    """Parse a CoNLL-U file and return it as a JSON-serializable document dict.

    Args:
        filename: Path of the CoNLL-U file to read (opened as UTF-8). The
            document id is ``filename[10:-7]``: the first 10 and the last 7
            characters are dropped, which matches paths of the form
            ``treebanks/<...>.conllu``.
        num_sents: Upper bound on the number of sentences to convert. The
            default is large enough to convert every sentence in practice.

    Returns:
        A dict with the keys ``"_id"`` (document id), ``"dir"`` (the id split
        on backslashes) and ``"sentences"`` (a list with one dict per
        sentence, each holding ``"metadata"`` and ``"token"``).
    """
    with open(filename, encoding="utf-8") as f:
        conllu = f.read()

    # Metadata of treebank documents.
    # Derived from the `filename` argument (not from a global variable), so the
    # function gives the right id no matter what the caller names its variables.
    doc_id = filename[10:-7]
    document = {
        "_id": doc_id,
        "dir": doc_id.split('\\')
    }
    parsed = parse(conllu)  # Use conllu library to parse conllu file

    sentences = []
    sent_size = min(len(parsed), num_sents)  # replace with len(parsed) to store all sentences

    for i in range(sent_size):
        sent = {
            "metadata": {
                **dict(parsed[i].metadata),
                # Random placeholder flag: randrange(0, 3, 1) yields 0, 1 or 2.
                # Replace this call as needed.
                "flag": random.randrange(0, 3, 1)
            },
            # Round-trip through JSON so the tokens end up as plain JSON types.
            "token": json.loads(json.dumps(parsed[i]))
        }
        sentences.append(sent)
        print("sentences parsed", i+1, end="\r")

    document["sentences"] = sentences

    return document

### 2. Get the list of CoNLL-U files and parse them into JSON
`doc = conllu_to_dict(file, 20)` converts at most the first 20 sentences from each document
<br>
`doc = conllu_to_dict(file)` converts all sentences from each document

In [3]:
# Collect every .conllu file below treebanks/ (searched recursively).
files = glob.glob("treebanks/**/*.conllu", recursive=True)
docs = []
for i, file in enumerate(files):
    # Progress indicator; "\r" makes each update overwrite the previous one.
    print(i, "/" , len(files), end="\r")

    doc = conllu_to_dict(file, 300) # Omitting second parameter will convert all sentences to JSON

    # Write the JSON next to the source file: file[:-7] drops the 7-character
    # ".conllu" suffix. The with-block guarantees the file is flushed and
    # closed even if serialization fails part-way through.
    with open(file[:-7] + '.json', 'w', encoding="utf-8") as json_file:
        json.dump(doc, json_file, indent=4, separators=(',', ': '))
    print("Parsed document: ",doc["_id"], "\n Number of sentences parsed: ", len(doc["sentences"]))
    
    

Parsed document:  UD_Afrikaans-AfriBooms\af_afribooms-ud-dev 
 Number of sentences parsed:  194
Parsed document:  UD_Afrikaans-AfriBooms\af_afribooms-ud-test 
 Number of sentences parsed:  300
Parsed document:  UD_Afrikaans-AfriBooms\af_afribooms-ud-train 
 Number of sentences parsed:  300
Parsed document:  UD_English-GUM\en_gum-ud-dev 
 Number of sentences parsed:  149
Parsed document:  UD_English-GUM\en_gum-ud-test 
 Number of sentences parsed:  300
Parsed document:  UD_English-GUM\en_gum-ud-train 
 Number of sentences parsed:  300
Parsed document:  UD_Hindi-HDTB\hi_hdtb-ud-dev 
 Number of sentences parsed:  69
Parsed document:  UD_Hindi-HDTB\hi_hdtb-ud-test 
 Number of sentences parsed:  300
Parsed document:  UD_Hindi-HDTB\hi_hdtb-ud-train 
 Number of sentences parsed:  300
Parsed document:  UD_Tamil-TTB\ta_ttb-ud-dev 
 Number of sentences parsed:  80
Parsed document:  UD_Tamil-TTB\ta_ttb-ud-test 
 Number of sentences parsed:  120
Parsed document:  UD_Tamil-TTB\ta_ttb-ud-train 
 Num