In [None]:

from pathlib import Path
import os
import sentencepiece as spm
import json
import time
import shutil
import ast
import pandas as pd
import seaborn as sns
import re
import random

In [None]:
class TokBuilder:
    """
    builds tokenizers and freq dict for lang pair. can use unigram, bpe, model_type
    """
    def __init__(self, pair, model_type, data_path):
        self.pair = pair
        self.langs = pair.split("_")
        self.src_lang = self.langs[0]
        self.tgt_lang = self.langs[1]
        self.model_type = model_type
        self.data_path = data_path

    def count_chars(self, lang):
        """
        returns number of unique chars in file for char vocab_size
        """

        file_path = f'{self.data_path}/{self.src_lang}_{self.tgt_lang}/train.{lang}'

        with open(file_path, 'r') as file:   
            unique = []

            for line in file.readlines():
                for char in line:
                    if char not in unique:
                        unique.append(char)

        return int(len(unique))

    
    def make_batches(self, lang):
        """
        Makes batches of 5_000 lines from bigger txt file for the selectet lang
        """
        
        file_path = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train.{lang}'
        file = open(file_path, 'r')
        data = file.readlines()
        file.close()

        text_data = []
        file_count = 0

        for sample in data:
            sample = sample.replace('\n', '')
            text_data.append(sample)
            
            save_path = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/text/train_{file_count}.{lang}'
            
            if len(text_data) == 5_000:
                # once we hit the 5K mark, save to file
                with open(save_path, 'w+', encoding='utf-8') as fp:
                    fp.write('\n'.join(text_data))
                text_data = []
                file_count += 1

        with open(save_path, 'w+', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
    
    def gather_files(self, lang):
        """
        Returns the paths to the training batches for the selected lang
        """
        
        self.make_batches(lang)
        paths = [str(x) for x in Path(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/text').glob(f'**/*.{lang}')]
        return paths

    def train_tokenizer(self, lang, vocab_size):
        """
        Trains a SentencePiece tokenizer for the selected lang and vocab_size
        
        if hft, must pretokenize beforehand
        """
        
        print(f'Training tokenizer for {lang} with vocab_size of {vocab_size}')
        
        tokenizer_name = f'{lang}_{self.model_type}_{vocab_size/1000}k'
              
        paths = self.gather_files(lang)
        
        tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer_name}'
        
        if not os.path.isdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}'):
            os.mkdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}')
        
        if not os.path.isdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}'):
            os.mkdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}')
        
        if os.path.isdir(tokenizer_path):
            shutil.rmtree(tokenizer_path)        
        
        os.mkdir(tokenizer_path)
        
        if self.model_type == 'hft':
            
            #cmd0 = f'./pretokenize ./data/{self.pair}/train.{lang} > ./data/{self.pair}/train/tokenized/hft_pretokenized.{lang}' 
            cmd1 = f'./hftoks.py learn {self.data_path}/{self.pair}/train/tokenized/train_hft_pretokenized.{lang} {tokenizer_path}/{tokenizer_name}.vocab {vocab_size} 100'
            start = time.time()
            #os.system(cmd0)
            os.system(cmd1)
            end = time.time()
            print(f'Training time: {end-start}')
            return (end-start)
        
        else:
            manychars = ['ja']
            
            charcover = 1.0
            
            if lang in manychars:
                charcover = 0.98
            sp_model = spm.SentencePieceProcessor()
            start = time.time()
            
            spm.SentencePieceTrainer.train(
                input=paths,
                model_prefix=f'{tokenizer_path}/{tokenizer_name}',
                vocab_size=vocab_size,
                unk_id=2,
                bos_id=-1,
                eos_id=1,
                pad_id=0,
                model_type=self.model_type,
                character_coverage=charcover,
                train_extremely_large_corpus=False,
                minloglevel=100
            )

            end = time.time()
        
            print(f'Training time: {end-start}')
            return (end-start)
   
    def tokenize(self, lang, tokenizer, prefix):
        """
        Tokenize train for lang with tokenizer (prefix is used to tokenize "train" or "dev")
        """
        
        if self.model_type == 'hft':    
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.vocab'
        
            train_path = f'{self.data_path}/{self.pair}/train/tokenized/{prefix}_hft_pretokenized.{lang}'
            
            dev_path = f'{self.data_path}/{self.pair}/train/tokenized/{prefix}_hft_pretokenized.{lang}'
            
            start = time.time()
            
            out = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/{prefix}_toks_{tokenizer}.{lang}'
            cmd = f'python3 hftoks.py tokenize {tokenizer_path} <{train_path} > {out}'
            os.system(cmd)

            end = time.time()
            print(f'{lang} text tokenized in {end-start} with {tokenizer}')
            return (end-start)
        
        else:
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.model'

            sp = spm.SentencePieceProcessor()
            sp.load(f'{tokenizer_path}')
            
            if os.path.isfile(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/{prefix}_toks_{tokenizer}.{lang}'):
                    os.remove(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/{prefix}_toks_{tokenizer}.{lang}')
            
            with open(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/{prefix}.{lang}', 'r') as text:
                
                start = time.time()               
                for line in text:
                    line = line.rstrip()
                    toks = sp.encode_as_pieces(line)
                    with open(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/{prefix}_toks_{tokenizer}.{lang}', 'a+') as out:
                            print(toks, file=out)
                end = time.time()   
            print(f'{lang} text tokenized in {end-start} with {tokenizer}')
            return(end-start)

    def make_freqs(self, lang, tokenizer):
        """
        Makes frequency files for the selected lang and tokenizer
        """
        if self.model_type == 'hft':    
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.vocab'
            
            start = time.time()
            
            freqs_file = open(tokenizer_path, 'r')
            freqs = {}
            for line in freqs_file.readlines():
                line = line.split('\t')
                freqs[line[0].strip(' ')] = int(line[1].strip('\n'))
            
            freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
            with open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.freq', 'w+') as out:
                print(freqs, file=out)
            
            end=time.time()
            print(f"Made freqs for {tokenizer} in {end-start}")
            
        else:
            #correct for prefix
            start=time.time()
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.model'
            tokenized_path = f'{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/train_toks_{tokenizer}.{lang}'
        
        
            sp = spm.SentencePieceProcessor()
            sp.load(f'{tokenizer_path}')
            
            toks = open(tokenized_path, 'r').readlines()
            vocabs = [sp.id_to_piece(id) for id in range(sp.get_piece_size())]
            
            freqs = {}
            
            for line in toks:
                line = ast.literal_eval(line)
                for tok in line:
                    #print (tok)
                    if tok in vocabs:
                        if tok in freqs.keys():
                            freqs[tok] = freqs[tok]+1
                        else:
                            freqs[tok] = 1
            
            freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
            with open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.freq', 'w+') as out:
                print(freqs, file=out)
            
            end=time.time()
            print(f"Made freqs for {tokenizer} in {end-start}")

    def run(self, langs=None, vocab_sizes=None, train=True, tokenize=True, dev=False, freqs=True, save_run=True):
        """
        Runs the other functions, returns train time and tokenization time
        """
        
        if not langs:
            langs = [self.src_lang, self.tgt_lang]
        
        df = pd.DataFrame(columns=['dataset', 'lang', 'tokenizer', 'vocab_size', 'train', 'token'])
        
        for lang in langs:
           
            if not vocab_sizes:
                # char model is not useful
                if self.model_type=='char':
                    vocab_sizes = [self.count_chars(lang)]
                elif self.model_type=='bpe': #merge operations
                    vocab_sizes = [
                                   #500,
                                   1000,
                                   2000,
                                   4000,
                                   8000, 
                                   16000,
                                   32000, 
                                   #48000, too big for en-ga
                                   #64000 too big for en-mr
                                   ]
                elif self.model_type=='unigram': #final vocabulary size
                    vocab_sizes = [
                                   500,
                                   750,
                                   1500,
                                   3000,
                                   4000,
                                   6000,
                                   8000
                                   ]
                elif self.model_type=='hft': #final vocabulary size
                    vocab_sizes = [500,
                                   750,
                                   1500,
                                   3000,
                                   4000,
                                   6000,
                                   8000
                                   ]
            
            
            for size in vocab_sizes:
                tokenizer_name = f'{lang}_{self.model_type}_{size/1000}k'
                train_time = 0
                token_time = 0
                if train:
                    train_time = self.train_tokenizer(lang, size)
                if tokenize:
                    if dev == True:
                        token_time = self.tokenize(lang, tokenizer_name, 'dev')
                    else:
                        token_time = self.tokenize(lang, tokenizer_name, 'train')
                if freqs:
                        self.make_freqs(lang, tokenizer_name) 
                if save_run:
                    row = {'dataset':self.pair, 'lang':lang, 'tokenizer':tokenizer_name, 'vocab_size':size, 'train':train_time, 'token':token_time}
                    df = df.append(row, ignore_index=True)
        if save_run:        
            df.to_csv(f'./stats/{self.pair}_time.csv', sep='\t')
                

In [16]:
"""
Cannot run all together bacause jupyter saves outputs checkpoints, stalling the console. Run one dataset at a time,
one model at a time, or pass one lang.
"""

datasets = [
            #'en_mr',
            #'en_ga',
            #"en_hi",
            #"en_lt",
            #"ja_my",
            'cs_sv',
            #'gbi_chr',
            #'syr_zu'
            ]
model_types = [
              #'char',
              'unigram',
              #'bpe',
              #'hft'
              ]
               
for dataset in datasets:
    for model_type in model_types:
        print(dataset, model_type)
        model = TokBuilder(dataset, model_type=model_type, data_path='./data')
        model.run(langs=[lang for lang in dataset.split('_')],
                  #train=False,
                  #tokenize=True,
                  #dev=True,
                  #freqs=False,
                  #save_run=False
                 )
print('Done')

cs_sv unigram
Training tokenizer for cs with vocab_size of 500
Training time: 3.8944690227508545
cs text tokenized in 1.9679884910583496 with cs_unigram_0.5k
Made freqs for cs_unigram_0.5k in 2.7274913787841797
Training tokenizer for cs with vocab_size of 750


  df = df.append(row, ignore_index=True)


Training time: 3.505624294281006
cs text tokenized in 1.997018814086914 with cs_unigram_0.75k
Made freqs for cs_unigram_0.75k in 2.8336851596832275
Training tokenizer for cs with vocab_size of 1500


  df = df.append(row, ignore_index=True)


Training time: 3.46278977394104
cs text tokenized in 1.853463888168335 with cs_unigram_1.5k
Made freqs for cs_unigram_1.5k in 3.190277576446533
Training tokenizer for cs with vocab_size of 3000


  df = df.append(row, ignore_index=True)


Training time: 3.196333646774292
cs text tokenized in 1.7011232376098633 with cs_unigram_3.0k
Made freqs for cs_unigram_3.0k in 4.088312149047852
Training tokenizer for cs with vocab_size of 4000


  df = df.append(row, ignore_index=True)


Training time: 3.115480661392212
cs text tokenized in 1.6884493827819824 with cs_unigram_4.0k
Made freqs for cs_unigram_4.0k in 4.579796075820923
Training tokenizer for cs with vocab_size of 6000


  df = df.append(row, ignore_index=True)


Training time: 2.9848227500915527
cs text tokenized in 1.6737074851989746 with cs_unigram_6.0k
Made freqs for cs_unigram_6.0k in 5.559765577316284
Training tokenizer for cs with vocab_size of 8000


  df = df.append(row, ignore_index=True)


Training time: 2.8677000999450684
cs text tokenized in 1.6866333484649658 with cs_unigram_8.0k
Made freqs for cs_unigram_8.0k in 6.444267749786377
Training tokenizer for sv with vocab_size of 500


  df = df.append(row, ignore_index=True)


Training time: 2.6655242443084717
sv text tokenized in 1.9195024967193604 with sv_unigram_0.5k
Made freqs for sv_unigram_0.5k in 2.821676254272461
Training tokenizer for sv with vocab_size of 750


  df = df.append(row, ignore_index=True)


Training time: 2.6190483570098877
sv text tokenized in 2.026533365249634 with sv_unigram_0.75k
Made freqs for sv_unigram_0.75k in 2.8850386142730713
Training tokenizer for sv with vocab_size of 1500
Training time: 2.882889986038208


  df = df.append(row, ignore_index=True)


sv text tokenized in 1.8612146377563477 with sv_unigram_1.5k
Made freqs for sv_unigram_1.5k in 3.3034181594848633
Training tokenizer for sv with vocab_size of 3000
Training time: 2.457979679107666


  df = df.append(row, ignore_index=True)


sv text tokenized in 1.7699251174926758 with sv_unigram_3.0k
Made freqs for sv_unigram_3.0k in 3.8328843116760254
Training tokenizer for sv with vocab_size of 4000
Training time: 2.3484950065612793


  df = df.append(row, ignore_index=True)


sv text tokenized in 1.7398505210876465 with sv_unigram_4.0k
Made freqs for sv_unigram_4.0k in 4.246295213699341
Training tokenizer for sv with vocab_size of 6000


  df = df.append(row, ignore_index=True)


Training time: 2.20053768157959
sv text tokenized in 1.7809720039367676 with sv_unigram_6.0k
Made freqs for sv_unigram_6.0k in 4.867952585220337
Training tokenizer for sv with vocab_size of 8000
Training time: 2.0801753997802734


  df = df.append(row, ignore_index=True)


sv text tokenized in 1.9306962490081787 with sv_unigram_8.0k
Made freqs for sv_unigram_8.0k in 5.845574855804443
Done


  df = df.append(row, ignore_index=True)


In [None]:
class Plotter:
    """
    Called Plotter for previous versions, DOES NOT PLOT (use other notebook). Generates .csv with statistics
    """
    
    def __init__(self, dataset, dataset_dir, just_tgt=False):
        self.dataset = dataset
        self.pair = self.dataset.split('_')
        self.dataset_dir = dataset_dir
        self.tokenizers_dir = f'./tokenizers/{dataset}'
        self.just_tgt = just_tgt
        
    def collect_paths(self):
        """
        Create a dictionary of paths of relevant files
        """
        
        langs = self.pair
        if self.just_tgt:
            langs = [langs[1]]
       
        paths = {} #lang : {}
          
        for lang in langs:
            tokenizers = {} # tokenizer : (freqs, train, tokenized)
            tokenizers_paths = [path for path in os.listdir(f'{self.tokenizers_dir}/{lang}')]
            
            for path in tokenizers_paths:
                
                    tokenizer_name = os.path.basename(path)
                    freqs = f'{self.tokenizers_dir}/{lang}/{tokenizer_name}/{tokenizer_name}.freq'
                    
                    if 'hft' in path:
                        train = f'{self.dataset_dir}/{self.dataset}/train/tokenized/train_hft_pretokenized.{lang}'
                    else:
                        train = f'{self.dataset_dir}/{self.dataset}/train.{lang}'
                    
                    tokenized = f'{self.dataset_dir}/{self.dataset}/train/tokenized/train_toks_{tokenizer_name}.{lang}'

                    tokenizers[path] = (freqs, train, tokenized)
            
            paths[lang] = tokenizers
        
        return (paths)
    
    
    def collect_percs(self):
        """
        This should return the amount of tokens with equal or less freq than 0, 10, 100. Not fully implemented yet, I think
        """
        
        
        paths = self.collect_paths()
        
        df = pd.DataFrame(columns=['dataset', 'lang', 'tokenizer', 'vocab_size', '0', '10', '100'])
        
        last_index = len(df)
        for lang in paths.keys():
            for tokenizer in paths[lang].keys():
                vocab = open(f'{self.tokenizers_dir}/{lang}/{tokenizer}/{tokenizer}.vocab', 'r').readlines()
                freqs_path = f'{self.tokenizers_dir}/{lang}/{tokenizer}/{tokenizer}.freq'
                freqs = ast.literal_eval(open(freqs_path, 'r').read())
                """print(tokenizer)
                print('freqs', len(freqs.values()))
                print('vocab', len(vocab))"""
                
                vocab_size = float(re.sub(r'[^\d.]+',"", tokenizer))*1000
                
                zeros = len(vocab)-len(freqs.values())
                tens = 0
                hundr = 0
                
                for value in freqs.values():
                    if int(value) <= 10:
                        tens += 1
                    if int(value) <= 100:
                        hundr += 1
                
                zeros = (zeros/vocab_size)*100
                tens = (tens/vocab_size)*100
                hundr = (hundr/vocab_size)*100
                        
                row = {"dataset" : self.dataset,
                   "lang" : lang,                  
                   "tokenizer" : tokenizer,
                   "vocab_size" : vocab_size,
                   "0" : zeros,
                   "10" : tens,
                    "100" : hundr}
                df = df.append(row, ignore_index=True)
        
        df = df.sort_values(by="vocab_size", axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
        print(df.head())
        
        with open(f'./{self.dataset}_percs.csv', 'w+') as out:
            df.to_csv(out, sep='\t')
    
    def collect_stats(self):
        """
        Collects the stats and generates a .csv
        """
        
        paths = self.collect_paths()
        
        df = pd.DataFrame(columns=['dataset', 'lang', 'tokenizer', 'vocab_size', 'freq@95%', 'avg_len'])
        
        last_index = len(df)
        for lang in paths.keys():
            for tokenizer in paths[lang].keys():
                    
                freqs_path = paths[lang][tokenizer][0]
                tokenized_path = paths[lang][tokenizer][2]

                tokenized_text = open(tokenized_path, 'r')
   
                freqs = ast.literal_eval(open(freqs_path).read())
                freqs = list(sorted(freqs.items(), key=lambda item: int(item[1]), reverse=True))
                
                freq_at_95 = freqs[int((len(freqs)/100)*95)][1]

                lines = tokenized_text.readlines()

                if 'hft' in tokenizer:
                    avg_len = 0

                    for line in lines:
                        line = line.split(' ')
                        avg_len += len(line)
                        
                    avg_len = avg_len/len(lines)
                
                else:
                    avg_len = 0

                    for line in lines:
                        line = line.split(',')
                        avg_len += len(line)

                    avg_len = avg_len/len(lines)

                vocab_size = float(re.sub(r'[^\d.]+',"", tokenizer))*1000

                if "unigram" in tokenizer:
                    tokenizer_type = "unigram"
                elif "bpe" in tokenizer:
                    tokenizer_type = "bpe"
                elif "char" in tokenizer:
                    tokenizer_type = "char" #char has just 1 value, add to another type?
                elif "hft" in tokenizer:
                    tokenizer_type = "hft"
                    
                row = {"dataset" : self.dataset,
                       "lang" : lang,                  
                       "tokenizer" : tokenizer_type,
                       "vocab_size" : vocab_size,
                       "freq@95%" : freq_at_95,
                       "avg_len" : avg_len}
                df = df.append(row, ignore_index=True)
        
        df = df.sort_values(by="tokenizer", axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
        with open(f'./stats/{self.dataset}.csv', 'w+') as out:
            df.to_csv(out, sep='\t')
        return(df)
    
    def run(self, percs=True, stats=True):
        """
        runs other functions
        """
        
        if percs:
            self.collect_percs()
        if stats:
            self.collect_stats()

In [None]:
datasets = [
            #'en_mr',
            #'en_ga',
            #"en_hi",
            #"en_lt",
            "ja_my"
            ]

for dataset in datasets:
    Plotter(dataset, './data').run(percs=False)

In [None]:
"""generate env var and run from server screen ctrl+a d, and to reconnect screen -r

or redirect all the outputs on a file and run the process with nohup and & (running in bg)"""

In [None]:
class BleuTester:
    """
    TBA, future work
    
    trains nmt from tokenized with tokenizers,
    translates,
    computes bleu scores and plots results
    """
    
    def __init__(self, pair, tokenizers):
        self.pair = pair.split('-')
        self.src_lang = self.pair[0]
        self.tgt_lang = self.pair[1]
        
    def tokenize(self, ):
        """
        loads tokenizer, 
        tokenizes train.lang,
        returns tokenized, speed
        """
    
    def generate_env_var(self, ):
        """
        generate env_vars for current run
        """
        
        env_vars = 'export DATA_PATH= ../data

        export VOCAB_SOURCE=${DATA_PATH}/vocab.bpe.32000
        export VOCAB_TARGET=${DATA_PATH}/vocab.bpe.32000
        export TRAIN_SOURCES=${DATA_PATH}/toks_0.5k.en
        export TRAIN_TARGETS=${DATA_PATH}/toks_0.5k.mr
        export DEV_SOURCES=${DATA_PATH}/newstest2013.tok.bpe.32000.en
        export DEV_TARGETS=${DATA_PATH}/newstest2013.tok.bpe.32000.de

        export DEV_TARGETS_REF=${DATA_PATH}/newstest2013.tok.de
        export TRAIN_STEPS=1000000'
    
    def train_nmt(self,):
        """
        loads tokenized,
        trains model
        """
        
    def translate(self, ):
        """
        loads model,
        loads dev or test,
        translates
        returns translation
        """
    
    def compute_bleu(self, ):
        """
        loads translation,
        computes bleu,
        returns list of bleu scores
        """
    
    def plot(self, ):
        """
        plots results
        """
    def run(self):
        """
        runs the whole thing
        """class BleuTester:
    def __init__(self,):
        
    def train_nmt(self,)
    
    def compute_bleu(self,)
    
    def run(self):

In [None]:
#utils

def big_to_small(pair, size):
    """
    generates a sample of a bigger file according to language pair and size
    """
    random.seed(20220713)
    
    pair = pair.split('_')

    big_path1 = f'./data_big/{pair[0]}_{pair[1]}/train.{pair[0]}'
    big_file1 = open(big_path1, 'r').readlines()
    small_path1 = f'./data/{pair[0]}_{pair[1]}/train.{pair[0]}'
    
    sample = random.sample(range(len(big_file1)), size)
    
    big_path2 = f'./data_big/{pair[0]}_{pair[1]}/train.{pair[1]}'
    big_file2 = open(big_path2, 'r').readlines()
    small_path2 = f'./data/{pair[0]}_{pair[1]}/train.{pair[1]}'
    
    #os.mkdir(f'./data/{pair[0]}_{pair[1]}')

    with open(small_path1, 'w+') as small_file1:
        for i in sample:
            print(big_file1[i].strip('\n'), file=small_file1)
    
    with open(small_path2, 'w+') as small_file2:
        for i in sample:
            print(big_file2[i].strip('\n'), file=small_file2) 

def xml_to_raw(in_file, out_file):
    """
    generates a raw txt from a xml
    """
    
    import xml.etree.ElementTree as ET
    
    tree = ET.parse(in_file)
    root = tree.getroot()
    
    with open(out_file, 'w+') as out:
        for child in root.iter('s'):
            
            print(child.text, file=out)


In [None]:
xml_to_raw('./data/syr_zu/Zulu-NT.xml', './data/syr_zu/train.zu')