In [1]:
from pathlib import Path
import os
import sentencepiece as spm
import json
import time
import shutil
import ast
import pandas as pd
import seaborn as sns
import re
import random
import math
from IPython.display import clear_output

In [22]:
class TokBuilder:
    """
    Builds tokenizers and token-frequency dictionaries for one language.

    Supported ``model_type`` values are ``'unigram'`` and ``'bpe'`` (trained
    with SentencePiece) and ``'hft'`` (trained by shelling out to the
    ``pretokenize`` / ``hftoks.py`` scripts expected in the working directory).

    All paths are relative to the current working directory:

    * input corpus:    ``./data/<lang>/train/train.<lang>``
    * tokenizers:      ``./data/<lang>/tokenizers/<name>/``
    * tokenized text:  ``./data/<lang>/tokenized/train_toks_<name>.<lang>``
    * timing stats:    ``./stats/<lang>_time.csv``
    """

    # Vocabulary sizes used by run() when the caller does not pass any.
    DEFAULT_VOCAB_SIZES = (500, 750, 1500, 3000, 4000, 6000, 8000)

    # Per-language overrides of DEFAULT_VOCAB_SIZES.
    VOCAB_SIZES_BY_LANG = {
        'ja': (750, 1500, 3000, 4000, 6000, 8000),
        'quc': (500, 750, 1500, 3000, 4000),
    }

    # Languages trained with character_coverage=0.98 instead of 1.0.
    MANY_CHARS_LANGS = ('ja',)

    def __init__(self, lang, model_type):
        """
        :param lang: language code; selects the ``./data/<lang>/`` directory.
        :param model_type: ``'unigram'``, ``'bpe'`` or ``'hft'``.
        """
        self.lang = lang
        self.model_type = model_type

    def _tokenizer_name(self, vocab_size):
        """Return the tokenizer name, e.g. ``en_bpe_0.5k`` for a vocab of 500."""
        return f'{self.lang}_{self.model_type}_{vocab_size/1000}k'

    def make_batches(self, batch_size=5_000):
        """
        Splits the training corpus into files of ``batch_size`` lines each.

        Batches are written next to the corpus as ``train_<n>.<lang>``.

        :param batch_size: maximum number of lines per batch file.
        :return: list of the batch file paths written, in order.
        :raises ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError('batch_size must be at least 1')

        file_path = f'./data/{self.lang}/train/train.{self.lang}'
        with open(file_path, 'r', encoding='utf-8') as file:
            data = [sample.replace('\n', '') for sample in file]

        written = []
        # Slicing writes each batch exactly once. The previous implementation
        # re-wrote the last path after the loop, which replaced the final full
        # batch with an empty file whenever the corpus length was an exact
        # multiple of the batch size.
        for first in range(0, len(data), batch_size):
            save_path = f'./data/{self.lang}/train/train_{len(written)}.{self.lang}'
            with open(save_path, 'w', encoding='utf-8') as fp:
                fp.write('\n'.join(data[first:first + batch_size]))
            written.append(save_path)

        return written

    def gather_batches(self):
        """
        Regenerates the training batches and returns their paths.

        Only the batch files written by this call are returned. A recursive
        glob over the train directory would also pick up the unsplit corpus
        and the hft-pretokenized copy, duplicating the training data.
        """
        return self.make_batches()

    def train_tokenizer(self, vocab_size):
        """
        Trains a tokenizer for the selected lang and vocab_size.

        The tokenizer directory is wiped and recreated on every call. For
        ``'hft'`` the corpus is pretokenized and learned through external
        commands; otherwise SentencePiece is trained on the corpus batches.

        :param vocab_size: target vocabulary size.
        :return: training time in seconds.
        """
        print(f'Training {self.model_type} tokenizer for {self.lang} with vocab_size of {vocab_size}')

        tokenizer_name = self._tokenizer_name(vocab_size)
        tokenizer_path = f'./data/{self.lang}/tokenizers/{tokenizer_name}'

        if os.path.isdir(tokenizer_path):
            shutil.rmtree(tokenizer_path)

        # makedirs also creates ./data/<lang>/tokenizers/ on the first run.
        os.makedirs(tokenizer_path)

        if self.model_type == 'hft':
            cmd0 = f'./pretokenize ./data/{self.lang}/train/train.{self.lang} > ./data/{self.lang}/train/train_hft_pretokenized.{self.lang}' 
            cmd1 = f'./hftoks.py learn ./data/{self.lang}/train/train_hft_pretokenized.{self.lang} {tokenizer_path}/{tokenizer_name}.vocab {vocab_size} 100'
            start = time.time()
            os.system(cmd0)
            os.system(cmd1)
            end = time.time()
            return (end-start)

        # Batches are only needed by SentencePiece, so they are built here
        # rather than before the hft branch.
        paths = self.gather_batches()

        charcover = 0.98 if self.lang in self.MANY_CHARS_LANGS else 1.0

        start = time.time()
        spm.SentencePieceTrainer.train(
            input=paths,
            model_prefix=f'{tokenizer_path}/{tokenizer_name}',
            vocab_size=vocab_size,
            unk_id=2,
            bos_id=-1,
            eos_id=1,
            pad_id=0,
            model_type=self.model_type,
            character_coverage=charcover,
            train_extremely_large_corpus=False,
            minloglevel=100
        )
        end = time.time()

        return (end-start)

    def tokenize(self, tokenizer):
        """
        Tokenizes the training corpus with the named tokenizer.

        SentencePiece output is written as one Python list literal per line;
        hft output is whatever ``hftoks.py tokenize`` prints.

        :param tokenizer: tokenizer name as produced by train_tokenizer.
        :return: tokenization time in seconds.
        """
        out = f'./data/{self.lang}/tokenized/train_toks_{tokenizer}.{self.lang}'
        os.makedirs(os.path.dirname(out), exist_ok=True)

        if self.model_type == 'hft':
            tokenizer_path = f'./data/{self.lang}/tokenizers//{tokenizer}/{tokenizer}.vocab'
            train_path = f'./data/{self.lang}/train/train_hft_pretokenized.{self.lang}'

            start = time.time()
            cmd = f'python3 hftoks.py tokenize {tokenizer_path} <{train_path} > {out}'
            os.system(cmd)
            end = time.time()
            return (end-start)

        tokenizer_path = f'./data/{self.lang}/tokenizers/{tokenizer}/{tokenizer}.model'

        sp = spm.SentencePieceProcessor()
        sp.load(f'{tokenizer_path}')

        # The output is opened once in 'w' mode (truncating any previous run)
        # instead of being reopened in append mode for every line, so the
        # measured time is not dominated by file open/close overhead.
        with open(f'./data/{self.lang}/train/train.{self.lang}', 'r', encoding='utf-8') as text, \
                open(out, 'w', encoding='utf-8') as outtoks:
            start = time.time()
            for line in text:
                print(sp.encode_as_pieces(line.rstrip()), file=outtoks)
            end = time.time()

        return (end-start)

    def make_freqs(self, tokenizer):
        """
        Counts how often each vocabulary token occurs in the tokenized corpus.

        The counts are written, sorted by descending frequency, as a dict
        literal to ``<tokenizer>.freq`` in the tokenizer directory.

        :param tokenizer: tokenizer name as produced by train_tokenizer.
        :return: elapsed time in seconds.
        """
        start = time.time()

        # A set gives constant-time membership tests in the counting loop.
        if self.model_type == 'hft':
            tokenizer_path = f'./data/{self.lang}/tokenizers/{tokenizer}/{tokenizer}.vocab'
            with open(tokenizer_path, 'r', encoding='utf-8') as vocab:
                vocabs = {line.rstrip('\n').split("\t")[0] for line in vocab}
            vocabs.discard('')
        else:
            tokenizer_path = f'./data/{self.lang}/tokenizers/{tokenizer}/{tokenizer}.model'
            sp = spm.SentencePieceProcessor()
            sp.load(f'{tokenizer_path}')
            vocabs = {sp.id_to_piece(piece_id) for piece_id in range(sp.get_piece_size())}

        tokenized_path = f'./data/{self.lang}/tokenized/train_toks_{tokenizer}.{self.lang}'

        freqs = {}
        with open(tokenized_path, 'r', encoding='utf-8') as toks:
            for line in toks:
                # self.model_type, not a notebook-level global of the same name.
                if self.model_type == 'hft':
                    # Strip the newline so the last token of the line is counted.
                    pieces = line.rstrip('\n').split(' ')
                else:
                    pieces = ast.literal_eval(line)
                for tok in pieces:
                    if tok in vocabs:
                        freqs[tok] = freqs.get(tok, 0) + 1

        # Sort and write once, after all lines have been counted.
        freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
        with open(f'./data/{self.lang}/tokenizers/{tokenizer}/{tokenizer}.freq', 'w', encoding='utf-8') as out:
            print(freqs, file=out)

        end = time.time()
        return (end-start)

    def run(self, vocab_sizes=None, train=True, tokenize=True, freqs=True, save_run=True):
        """
        Runs training, tokenization and frequency collection per vocab size.

        :param vocab_sizes: sizes to build; defaults depend on the language.
        :param train: train the tokenizer for each size.
        :param tokenize: tokenize the corpus with each tokenizer.
        :param freqs: write the frequency file for each tokenizer.
        :param save_run: write the timings to ``./stats/<lang>_time.csv``.
        :return: DataFrame with one row per size (lang, tokenizer, vocab_size,
            train time, tokenization time); skipped steps are reported as 0.
        """
        if not vocab_sizes:
            vocab_sizes = self.VOCAB_SIZES_BY_LANG.get(self.lang, self.DEFAULT_VOCAB_SIZES)

        records = []
        for size in vocab_sizes:
            tokenizer_name = self._tokenizer_name(size)
            train_time = 0
            token_time = 0
            if train:
                train_time = self.train_tokenizer(size)
            if tokenize:
                token_time = self.tokenize(tokenizer_name)
            if freqs:
                self.make_freqs(tokenizer_name)
            records.append([self.lang, tokenizer_name, size, train_time, token_time])

        df = pd.DataFrame(records, columns=['lang', 'tokenizer', 'vocab_size', 'train', 'token'])

        if save_run:
            os.makedirs('./stats', exist_ok=True)
            df.to_csv(f'./stats/{self.lang}_time.csv', sep='\t')

        return df


In [26]:
"""
Cannot run all together bacause jupyter saves outputs checkpoints, stalling the console. Run one dataset at a time,
one model at a time, or pass one lang.
"""

# Languages to process in this run. Entries are switched on and off by
# (un)commenting them; as written, only 'jak' is active.
# The trailing "V" on some entries presumably marks languages that were
# already processed — confirm with the author.
datasets = [
            #'am', V
            #'ar', V
            #'chr', V
            #'cs', V
            #'en',V
            #'fi', V
            #'ga',V
            #'hi',V
            #'it',V
            #'ja',
            'jak',
            #'lt',V
            #'mr',V
            #'my',
            #'ojb',
            #'sv',
            #'syr',
            #'zu',
            
            ]
# Tokenizer families built for every selected language.
model_types = [
              'unigram',
              'bpe',
              'hft'
              ]
               
# Build one TokBuilder per (language, model type) pair and run it.
# NOTE(review): `dataset` and `model_type` are notebook-level globals once this
# loop has run; check for code that reads them implicitly before renaming them.
for dataset in datasets:
    for model_type in model_types:
        print(dataset, model_type)
        model = TokBuilder(dataset, model_type=model_type)
        # run() is called with its defaults; uncomment an argument to override it.
        model.run(
                  #vocab_sizes = [3000],
                  #train=False,
                  #tokenize=False,
                  #freqs=False,
                  #save_run=False
                 )
        # Drop the output printed so far; wait=True defers the clear until
        # new output is available.
        clear_output(wait=True)
print('Done')

Done


In [30]:
class Plotter:
    """
    Collects per-tokenizer statistics for one language and writes them to CSV.

    The name is kept from earlier versions: this class DOES NOT PLOT (plots
    live in another notebook). It reads the ``.freq`` files and tokenized
    corpora under ``./data/<lang>/`` and writes ``./stats/<lang>.csv``.
    """

    def __init__(self, lang):
        """
        :param lang: language code; selects the ``./data/<lang>/`` directory.
        """
        self.lang = lang
        self.tokenizers_dir = f'./data/{self.lang}/tokenizers/'

    def collect_paths(self):
        """
        Creates a dictionary of the files belonging to each tokenizer.

        :return: ``{lang: {entry: (freqs_path, train_path, tokenized_path)}}``
            with one item per entry of the tokenizers directory, in sorted
            order. Entries are not validated here.
        """
        tokenizers = {}  # tokenizer : (freqs, train, tokenized)

        # listdir order is arbitrary; sorting keeps the result reproducible.
        for path in sorted(os.listdir(self.tokenizers_dir)):
            tokenizer_name = os.path.basename(path)
            freqs = f'./data/{self.lang}/tokenizers/{tokenizer_name}/{tokenizer_name}.freq'

            # Training inputs live directly under train/ (that is where the
            # tokenizer builder in this notebook reads and writes them), not
            # under train/tokenized/.
            if 'hft' in path:
                train = f'./data/{self.lang}/train/train_hft_pretokenized.{self.lang}'
            else:
                train = f'./data/{self.lang}/train/train.{self.lang}'

            tokenized = f'./data/{self.lang}/tokenized/train_toks_{tokenizer_name}.{self.lang}'

            tokenizers[path] = (freqs, train, tokenized)

        return {self.lang: tokenizers}

    @staticmethod
    def _rank_weighted_frequency(freqs):
        """
        Rank-weighted mean frequency over (roughly) the first 95% of entries.

        Entries are taken in the dictionary's own order; entry ``i`` (1-based)
        contributes ``frequency * i``.

        NOTE(review): the sum includes rank ``cutoff + 1`` while the
        normaliser only sums ranks up to ``cutoff``. This reproduces the
        established numbers; confirm which side is intended before changing.

        :return: the metric, or NaN when the normaliser is zero (0 or 1 entry).
        """
        cutoff = int((len(freqs)/100)*95)

        weighted_sum = 0
        rank = 1
        for count in freqs.values():
            weighted_sum += int(count)*rank
            if rank > cutoff:
                break
            rank += 1

        normaliser = sum(range(rank))
        if normaliser == 0:
            return float('nan')
        return weighted_sum/normaliser

    def metric(self, lang, size, model_type):
        """
        Computes the rank-weighted frequency metric for one tokenizer.

        :param lang: unused; the instance's language is always used.
        :param size: vocabulary size (e.g. ``500`` selects the ``0.5k`` files).
        :param model_type: ``'unigram'``, ``'bpe'`` or ``'hft'``.
        :return: see _rank_weighted_frequency.
        """
        size = size/1000

        freq_path = f'./data/{self.lang}/tokenizers/{self.lang}_{model_type}_{size}k/{self.lang}_{model_type}_{size}k.freq'
        with open(freq_path, 'r', encoding='utf-8') as freq_file:
            f_m = ast.literal_eval(freq_file.read())

        return self._rank_weighted_frequency(f_m)

    def collect_stats(self):
        """
        Collects the stats and generates ``./stats/<lang>.csv``.

        Columns: lang, tokenizer (type), vocab_size, freq@95% (frequency of
        the token at 95% of the frequency-ranked list), avg_len (mean tokens
        per line) and weighted (see _rank_weighted_frequency).

        Directory entries that are not named ``<lang>_<type>_<size>k`` (for
        example ``.ipynb_checkpoints``) are skipped.

        :return: the statistics DataFrame, sorted by tokenizer and vocab size.
        """
        paths = self.collect_paths()

        name_pattern = re.compile(
            rf'{re.escape(self.lang)}_(unigram|bpe|hft)_(\d+(?:\.\d+)?)k'
        )

        records = []
        for tokenizer, (freqs_path, _train_path, tokenized_path) in paths[self.lang].items():
            parsed = name_pattern.fullmatch(tokenizer)
            if parsed is None:
                continue
            tokenizer_type = parsed.group(1)
            vocab_size = float(parsed.group(2))*1000

            with open(freqs_path, 'r', encoding='utf-8') as freq_file:
                freq_dict = ast.literal_eval(freq_file.read())

            ranked = sorted(freq_dict.items(), key=lambda item: int(item[1]), reverse=True)
            index = math.floor((len(ranked)/100)*95)
            freq_at_95 = ranked[index][1] if ranked else 0

            with open(tokenized_path, 'r', encoding='utf-8') as tokenized_text:
                if tokenizer_type == 'hft':
                    # Space-separated tokens; ignore the newline and empty
                    # fragments so blank lines count as zero tokens.
                    lengths = [
                        sum(1 for tok in line.rstrip('\n').split(' ') if tok)
                        for line in tokenized_text
                    ]
                else:
                    # Each line is a Python list literal. Parsing it (rather
                    # than splitting on ',') keeps comma tokens and empty
                    # lists from being miscounted.
                    lengths = [len(ast.literal_eval(line)) for line in tokenized_text]

            avg_len = sum(lengths)/len(lengths) if lengths else 0.0

            # Uses the file's stored order, exactly as metric() does.
            weighted = self._rank_weighted_frequency(freq_dict)

            records.append([self.lang, tokenizer_type, vocab_size, freq_at_95, avg_len, weighted])

        df = pd.DataFrame(
            records,
            columns=['lang', 'tokenizer', 'vocab_size', 'freq@95%', 'avg_len', 'weighted'],
        )
        # Sorting on both keys makes the row order deterministic.
        df = df.sort_values(by=['tokenizer', 'vocab_size'], ascending=True, na_position='last')

        os.makedirs('./stats', exist_ok=True)
        df.to_csv(f'./stats/{self.lang}.csv', sep='\t')
        return(df)


In [34]:
# Languages whose statistics are collected. Entries are switched on and off by
# (un)commenting them. This rebinds the notebook-level `datasets` name that the
# tokenizer-building cell also assigns.
datasets = [
            'am',
            'ar',
            'chr',
            'cs',
            'en',
            #'fi',
            #'ga',
            #'hi',
            #'it',
            'ja',
            #'lt',
            #'mr',
            #'my',
            #'sv',
            #'syr',
            #'zu',
            #'ojb',
            #'jak'
            ]

# collect_stats() writes ./stats/<lang>.csv; the returned DataFrame is discarded.
for dataset in datasets:
    print(dataset)
    Plotter(dataset).collect_stats()

am
ar
chr
cs
en
ja


In [None]:
"""generate env var and run from server screen ctrl+a d, and to reconnect screen -r

or redirect all the outputs on a file and run the process with nohup and & (running in bg)"""

In [None]:
class BleuTester:
    """
    TBA, future work.

    Planned pipeline: train an NMT model on text tokenized with the given
    tokenizers, translate, compute BLEU scores and plot the results.

    Only the constructor and generate_env_var are implemented; every other
    step raises NotImplementedError.
    """

    def __init__(self, pair, tokenizers):
        """
        :param pair: language pair written as ``'<src>-<tgt>'``, e.g. ``'en-mr'``.
        :param tokenizers: tokenizers to evaluate (stored, not used yet).
        """
        self.pair = pair.split('-')
        self.src_lang = self.pair[0]
        self.tgt_lang = self.pair[1]
        self.tokenizers = tokenizers

    def tokenize(self, ):
        """
        loads tokenizer,
        tokenizes train.lang,
        returns tokenized, speed
        """
        raise NotImplementedError('BleuTester.tokenize is future work')

    def generate_env_var(self, ):
        """
        Generates the shell ``export`` lines for the current run.

        NOTE(review): the file names are hard-coded placeholders and do not
        depend on the language pair; the dev targets point at ``.de`` files
        while the train targets point at ``.mr`` — confirm before use.

        :return: the export statements as one newline-separated string.
        """
        env_lines = [
            # No space after '=': with a space the shell would export an
            # empty DATA_PATH.
            'export DATA_PATH=../data',
            '',
            'export VOCAB_SOURCE=${DATA_PATH}/vocab.bpe.32000',
            'export VOCAB_TARGET=${DATA_PATH}/vocab.bpe.32000',
            'export TRAIN_SOURCES=${DATA_PATH}/toks_0.5k.en',
            'export TRAIN_TARGETS=${DATA_PATH}/toks_0.5k.mr',
            'export DEV_SOURCES=${DATA_PATH}/newstest2013.tok.bpe.32000.en',
            'export DEV_TARGETS=${DATA_PATH}/newstest2013.tok.bpe.32000.de',
            '',
            'export DEV_TARGETS_REF=${DATA_PATH}/newstest2013.tok.de',
            'export TRAIN_STEPS=1000000',
        ]
        env_vars = '\n'.join(env_lines)
        return env_vars

    def train_nmt(self,):
        """
        loads tokenized,
        trains model
        """
        raise NotImplementedError('BleuTester.train_nmt is future work')

    def translate(self, ):
        """
        loads model,
        loads dev or test,
        translates
        returns translation
        """
        raise NotImplementedError('BleuTester.translate is future work')

    def compute_bleu(self, ):
        """
        loads translation,
        computes bleu,
        returns list of bleu scores
        """
        raise NotImplementedError('BleuTester.compute_bleu is future work')

    def plot(self, ):
        """
        plots results
        """
        raise NotImplementedError('BleuTester.plot is future work')

    def run(self):
        """
        runs the whole thing
        """
        raise NotImplementedError('BleuTester.run is future work')

In [7]:
#utils

def big_to_small(pair, size):
    """
    Samples ``size`` aligned lines from a big parallel corpus.

    Reads ``./data_big/<src>_<tgt>/train.<src>`` and ``train.<tgt>``, draws
    the same random line indices from both, and writes the result to
    ``./data/<src>_<tgt>/train.<src>`` and ``train.<tgt>``. The global random
    generator is seeded with a fixed value, so the sample is reproducible.

    :param pair: language pair written as ``'<src>_<tgt>'``.
    :param size: number of lines to sample.
    :raises ValueError: if the two files differ in line count, or if
        ``size`` exceeds the number of lines.
    """
    random.seed(20220713)

    pair = pair.split('_')
    src_lang, tgt_lang = pair[0], pair[1]

    big_dir = f'./data_big/{src_lang}_{tgt_lang}'
    small_dir = f'./data/{src_lang}_{tgt_lang}'

    with open(f'{big_dir}/train.{src_lang}', 'r', encoding='utf-8') as big_src:
        src_lines = big_src.readlines()
    with open(f'{big_dir}/train.{tgt_lang}', 'r', encoding='utf-8') as big_tgt:
        tgt_lines = big_tgt.readlines()

    # The same indices are used for both sides, so the files must be aligned.
    if len(src_lines) != len(tgt_lines):
        raise ValueError(
            f'parallel files differ in length: {len(src_lines)} vs {len(tgt_lines)} lines'
        )

    sample = random.sample(range(len(src_lines)), size)

    os.makedirs(small_dir, exist_ok=True)

    with open(f'{small_dir}/train.{src_lang}', 'w', encoding='utf-8') as small_src:
        for i in sample:
            print(src_lines[i].strip('\n'), file=small_src)

    with open(f'{small_dir}/train.{tgt_lang}', 'w', encoding='utf-8') as small_tgt:
        for i in sample:
            print(tgt_lines[i].strip('\n'), file=small_tgt)

def xml_to_raw(in_file, out_file):
    """
    Generates a raw txt file from an XML file.

    The text of every ``<s>`` element, in document order, is written to
    ``out_file`` on its own line (UTF-8).

    :param in_file: path of the XML file to read.
    :param out_file: path of the text file to write (overwritten).
    """

    import xml.etree.ElementTree as ET

    tree = ET.parse(in_file)
    root = tree.getroot()

    with open(out_file, 'w', encoding='utf-8') as out:
        for child in root.iter('s'):
            # An element without text has text=None. Write an empty line
            # instead of the literal string "None", which would otherwise
            # end up in the training corpus.
            print(child.text or '', file=out)


In [25]:

# Convert the Jakalteko XML corpus into the raw train file for lang 'jak'.
xml_to_raw(
    './data/jak/train/Jakalteko-NT.xml',
    './data/jak/train/train.jak',
)