In [None]:
"""
SENTENCEPIECE v0.1

EN-DE : Euparl, News, CCrawl, 
DE-EN : idem

EN-LT : Eupar

EN-MR : LoRes21
EN-GA : LoRes21

tokenized with sacremoses

Lenght mu = arithmetic mean of target seqs after encoding
Freq@95% = least freq in the 95% of vocab (log)

vocab_sizes = [500, 1000, 2000, 4000, 8000, 16000, 32000, 48000, 64000]

VVV BPE and SentPiece voc_size are not comparable, SentPiece gives error over max value that changes with data. Still
    BPE uses final vocabulary size in sentence piece, even when another model cannot use that size
            
!!! get logs of training and tokenization speed and other output to df and save csv for final run
    > I need:
        > vocab_size
        > length 
        > freq@95%
    > check the correctness of freq@95 and avg_len stats
        > get percentiles of used tokens
    
!!! better plots
    > find good variables to correlate
    > grid plots, change df to include dataset, model, value
    > must plot all things together

!!! better self contained functions
    > selective run to be passed in init
    > separate make_freqs from tokenization

build BleuTester with trained NMT
"""

from pathlib import Path
import os
import sentencepiece as spm
import json
import time
import shutil
import ast
import pandas as pd
import seaborn as sns
import re


In [None]:
class TokBuilder:
    """
    builds tokenizers and freq dict for lang pair. can use unigram, bpe, char model_type
    """
    def __init__(self, pair, model_type, data_path):
        self.pair = pair
        self.langs = pair.split("_")
        self.src_lang = self.langs[0]
        self.tgt_lang = self.langs[1]
        self.model_type = model_type
        self.data_path = data_path

    def count_chars(self, lang):
        """
        returns number of unique chars in file for char vocab_size
        """

        file_path = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train.{lang}'

        with open(file_path, 'r') as file:   
            unique = []

            for line in file.readlines():
                for char in line:
                    if char not in unique:
                        unique.append(char)

        return int(len(unique))

    
    def make_batches(self, lang):
        """
        Makes batches of 5_000 lines from bigger txt file for the selectet lang
        """
        
        file_path = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train.{lang}'
        file = open(file_path, 'r')
        data = file.readlines()
        file.close()

        text_data = []
        file_count = 0

        for sample in data:
            sample = sample.replace('\n', '')
            text_data.append(sample)
            
            save_path = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/text/train_{file_count}.{lang}'
            
            if len(text_data) == 5_000:
                # once we hit the 5K mark, save to file
                with open(save_path, 'w+', encoding='utf-8') as fp:
                    fp.write('\n'.join(text_data))
                text_data = []
                file_count += 1

        with open(save_path, 'w+', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
    
    def gather_files(self, lang):
        """
        Returns the paths to the training batches for the selected lang
        """
        
        self.make_batches(lang)
        paths = [str(x) for x in Path(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/text').glob(f'**/*.{lang}')]
        return paths

    def train_tokenizer(self, lang, vocab_size):
        """
        Trains a SentencePiece tokenizer for the selected lang and vocab_size
        """
        
        print(f'Training tokenizer for {lang} with vocab_size of {vocab_size}')
        
        tokenizer_name = f'{lang}_{self.model_type}_{vocab_size/1000}k'
              
        paths = self.gather_files(lang)
        
        tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer_name}'
        
        if not os.path.isdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}'):
            os.mkdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}')
        
        if not os.path.isdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}'):
            os.mkdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}')
        
        if os.path.isdir(tokenizer_path):
            shutil.rmtree(tokenizer_path)        
        
        os.mkdir(tokenizer_path)
        
        if self.model_type == 'hft':
            
            #cmd0 = f'./pretokenize ./data/{self.pair}/train.{lang} > ./data/{self.pair}/train/tokenized/hft_pretokenized.{lang}' 
            cmd1 = f'./hftoks.py learn ./{self.data_path}/{self.pair}/train/tokenized/hft_pretokenized.{lang} {tokenizer_path}/{tokenizer_name}.vocab {vocab_size} 100'
            start = time.time()
            #os.system(cmd0)
            os.system(cmd1)
            end = time.time()
            print(f'Training time: {end-start}')
            return (end-start)
        
        else:
            
            sp_model = spm.SentencePieceProcessor()
            start = time.time()

            spm.SentencePieceTrainer.train(
                input=paths,
                model_prefix=f'{tokenizer_path}/{tokenizer_name}',
                vocab_size=vocab_size,
                unk_id=2,
                bos_id=-1,
                eos_id=1,
                pad_id=0,
                model_type=self.model_type,
                train_extremely_large_corpus=False,
                minloglevel=100
            )

            end = time.time()
        
            print(f'Training time: {end-start}')
            return (end-start)
   
    def tokenize(self, lang, tokenizer):
        """
        Tokenize train for lang
        """
        
        if self.model_type == 'hft':    
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.vocab'
        
            train_path = f'./{self.data_path}/{self.pair}/train/tokenized/hft_pretokenized.{lang}'
            
            start = time.time()
            
            out = f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}'
            cmd = f'python3 hftoks.py tokenize {tokenizer_path} <{train_path} > {out}'
            os.system(cmd)

            end = time.time()
            print(f'{lang} text tokenized in {end-start} with {tokenizer}')
            return (end-start)
        
        else:
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.model'

            sp = spm.SentencePieceProcessor()
            sp.load(f'{tokenizer_path}')
            
            if os.path.isfile(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}'):
                    os.remove(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}')
            
            with open(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train.{lang}', 'r') as text:
                
                start = time.time()               
                for line in text:
                    line = line.rstrip()
                    toks = sp.encode_as_pieces(line)
                    with open(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}', 'a+') as out:
                            print(toks, file=out)
                end = time.time()   
            print(f'{lang} text tokenized in {end-start} with {tokenizer}')
            return(end-start)

    def make_freqs(self, lang, tokenizer):
        """
        Makes frequency files for the selected lang and tokenizer
        """
        if self.model_type == 'hft':    
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.vocab'
            
            start = time.time()
            
            freqs_file = open(tokenizer_path, 'r')
            freqs = {}
            for line in freqs_file.readlines():
                
                line = line.split('\t')
                freqs[line[0].strip(' ')] = int(line[1].strip('\n'))
            
            freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
            with open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.freq', 'w+') as out:
                print(freqs, file=out)
            
            end=time.time()
            print(f"Made freqs for {tokenizer} in {end-start}")
            
        else:
            tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.model'
            
            start=time.time()
            sp = spm.SentencePieceProcessor()
            sp.load(f'{tokenizer_path}')

            vocabs = [sp.id_to_piece(id) for id in range(sp.get_piece_size())]

            freqs = {}
            with open(f'./{self.data_path}/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}', 'r') as f:
                for line in f:
                    for piece in line:
                        freqs.setdefault(piece, 0)
                        freqs[piece] += 1
                        
            freqs = dict(sorted(freqs.items(), key=lambda item: item[1], reverse=True))
            with open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.freq', 'w+') as out:
                print(freqs, file=out)
            
            end=time.time()
            print(f"Made freqs for {tokenizer} in {end-start}")

    def run(self, langs=None, vocab_sizes=None, train=True, tokenize=True, freqs=True):
        """
        Runs the training and frequency
        """
        
        df = pd.DataFrame(columns=['dataset', 'lang', 'tokenizer', 'vocab_size', 'train', 'token'])
        if not langs:
            langs = [self.src_lang, self.tgt_lang]
        
        for lang in langs:
           
            if not vocab_sizes:
            
                if self.model_type=='char':
                    vocab_sizes = [self.count_chars(lang)]
                elif self.model_type=='bpe': #merge operations
                    vocab_sizes = [500,
                                   1000,
                                   2000,
                                   4000,
                                   8000, 
                                   16000,
                                   32000, 
                                   #48000, too big for en-ga
                                   #64000 too big for en-mr
                                   ]
                elif self.model_type=='unigram': #final vocabulary size
                    vocab_sizes = [500,
                                   750,
                                   1500,
                                   3000,
                                   4000,
                                   6000,
                                   8000
                                   ]
                elif self.model_type=='hft': #final vocabulary size
                    vocab_sizes = [500,
                                   750,
                                   1500,
                                   3000,
                                   4000,
                                   6000,
                                   8000
                                   ]
            
            for size in vocab_sizes:
                tokenizer_name = f'{lang}_{self.model_type}_{size/1000}k'
                if train:
                    train_time = self.train_tokenizer(lang, size)
                if tokenize:
                    token_time = self.tokenize(lang, tokenizer_name)                   
                if freqs: 
                    self.make_freqs(lang, tokenizer_name)
                
                row = {'dataset':self.pair, 'lang':lang, 'tokenizer':tokenizer_name, 'vocab_size':size, 'train':train_time, 'token':token_time}
                df = df.append(row, ignore_index=True)
                
        df.to_csv(f'./run_{time.time()}.csv', sep='\t')
                

In [10]:
datasets = [
            #'en_hi',
            'lt_en'
           ]
model_types = [
              #'char',
              #'unigram',
              'bpe',
              #'hft'
              ]
               
for dataset in datasets:
    for model_type in model_types:
        print(dataset, model_type)
        model = TokBuilder(dataset, model_type=model_type, data_path='./data_big')
        model.run(langs=['en'])
print('Done')

lt_en bpe
Training tokenizer for en with vocab_size of 500
Training time: 8.015945672988892
en text tokenized in 47.78504824638367 with en_bpe_0.5k
Made freqs for en_bpe_0.5k in 18.73268747329712
Training tokenizer for en with vocab_size of 1000


  df = df.append(row, ignore_index=True)


Training time: 10.47726321220398
en text tokenized in 74.31485342979431 with en_bpe_1.0k
Made freqs for en_bpe_1.0k in 15.750118017196655
Training tokenizer for en with vocab_size of 2000


  df = df.append(row, ignore_index=True)


Training time: 10.013867855072021
en text tokenized in 72.7385926246643 with en_bpe_2.0k
Made freqs for en_bpe_2.0k in 14.896342515945435
Training tokenizer for en with vocab_size of 4000


  df = df.append(row, ignore_index=True)


Training time: 10.733312606811523
en text tokenized in 75.86577677726746 with en_bpe_4.0k
Made freqs for en_bpe_4.0k in 13.72604775428772
Training tokenizer for en with vocab_size of 8000


  df = df.append(row, ignore_index=True)


Training time: 11.868770122528076
en text tokenized in 82.51929187774658 with en_bpe_8.0k
Made freqs for en_bpe_8.0k in 13.161970853805542
Training tokenizer for en with vocab_size of 16000


  df = df.append(row, ignore_index=True)


Training time: 14.215688228607178
en text tokenized in 91.46743273735046 with en_bpe_16.0k
Made freqs for en_bpe_16.0k in 12.913257837295532
Training tokenizer for en with vocab_size of 32000


  df = df.append(row, ignore_index=True)


Training time: 15.332538843154907
en text tokenized in 75.91682291030884 with en_bpe_32.0k
Made freqs for en_bpe_32.0k in 12.726557731628418
Done


  df = df.append(row, ignore_index=True)


In [14]:
class Plotter:
    def __init__(self, dataset, just_tgt=False,):
        self.dataset = dataset
        self.pair = self.dataset.split('_')
        self.dataset_dir = f'./data_big/{dataset}'
        self.tokenizers_dir = f'./tokenizers/{dataset}'
        self.just_tgt = just_tgt
        
    def collect_paths(self):
        
        langs = self.pair
        if self.just_tgt:
            langs = [langs[1]]
       
        paths = {} #lang : {}
          
        for lang in langs:
            tokenizers = {} # tokenizer : ( freqs, train, tokenized)
            tokenizers_paths = [path for path in os.listdir(f'{self.tokenizers_dir}/{lang}')]
            
            for path in tokenizers_paths:
               # if "hft" not in path: #to remove after hftoks implementation

                    tokenizer_name = os.path.basename(path)
                    freqs = f'{self.tokenizers_dir}/{lang}/{tokenizer_name}/{tokenizer_name}.freq'
                    
                    if 'hft' in path:
                        train = f'{self.dataset_dir}/train/tokenized/hft_pretokenized.{lang}'
                    else:
                        train = f'{self.dataset_dir}/train.{lang}'
                    
                    tokenized = f'{self.dataset_dir}/train/tokenized/toks_{tokenizer_name}.{lang}'

                    tokenizers[path] = (freqs, train, tokenized)
            
            paths[lang] = tokenizers
        
        return (paths)
        
    def collect_stats(self):
        """
        do for all data
        
        for pair in pairs:
            for lang in pair:
                for tokenizer in lang_tokenizers:
                    collect stats
        
        return(df)
        """
        
        paths = self.collect_paths()
        
        df = pd.DataFrame(columns=['dataset', 'lang', 'tokenizer', 'vocab_size', 'freq@95%', 'avg_len'])
        
        last_index = len(df)
        for lang in paths.keys():
            for tokenizer in paths[lang].keys():
                    
                freqs_path = paths[lang][tokenizer][0]
                tokenized_path = paths[lang][tokenizer][2]

                tokenized_text = open(tokenized_path, 'r')
   
                freqs = ast.literal_eval(open(freqs_path).read())
                freqs = list(sorted(freqs.items(), key=lambda item: int(item[1]), reverse=True))

                freq_at_95 = freqs[int((len(freqs)/100)*95)][1]

                lines = tokenized_text.readlines()

                if 'hft' in tokenizer:
                    avg_len = 0

                    for line in lines:
                        line = line.split(' ')
                        new_line = [i for i in line if i not in ["𐋇","▁","𐊣","𐊼"]]
                        avg_len += len(new_line)

                    avg_len = avg_len/len(lines)
                    
                else:    
                    avg_len = 0

                    for line in lines:
                        line = line.split(',')
                        avg_len += len(line)

                    avg_len = avg_len/len(lines)

                vocab_size = float(re.sub(r'[^\d.]+',"", tokenizer))*1000

                if "unigram" in tokenizer:
                    tokenizer_type = "unigram"
                elif "bpe" in tokenizer:
                    tokenizer_type = "bpe"
                elif "char" in tokenizer:
                    tokenizer_type = "char" #char has just 1 value, add to another type?
                elif "hft" in tokenizer:
                    tokenizer_type = "hft"
                    

                row = {"dataset" : self.dataset,
                       "lang" : lang,                  
                       "tokenizer" : tokenizer_type,
                       "vocab_size" : vocab_size,
                       "freq@95%" : freq_at_95,
                       "avg_len" : avg_len}
                df = df.append(row, ignore_index=True)
        
        df = df.sort_values(by="vocab_size", axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
        with open(f'./{self.dataset}.csv', 'w+') as out:
            df.to_csv(out, sep='\t')
        return(df)
    
    def plot(self, value):
        """
        returns plot
        
        plot must be at the end of all data, and have vocab_size on x and freq@95% on y, tokenizer names
        do not matter
        """
        
        df = self.collect_stats()

        sns.set_theme(style="whitegrid")
        ax = sns.lineplot(data=df,
                    x="vocab_size", y=value, hue="tokenizer", style="tokenizer",
                    ci=None, markers=True, dashes=False, palette="tab10", linewidth=2.5, sort=True)
        
        ax.set(title={self.dataset},
                    xlabel="Vocabulary size",
                    ylabel=value,
                    )
        if value == 'freq@95%':
            ax.invert_yaxis()



In [15]:
p = Plotter('en_ga', just_tgt=False)
p2 = Plotter('en_mr', just_tgt=False)
p3 = Plotter('en_hi', just_tgt=False)

In [None]:
p.collect_stats()

In [None]:
p2.collect_stats()

In [16]:
p3.collect_stats()

  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append

  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)
  df = df.append(row, ignore_index=True)


Unnamed: 0,dataset,lang,tokenizer,vocab_size,freq@95%,avg_len
2,en_hi,en,bpe,500.0,1,33.669924
6,en_hi,en,unigram,500.0,1,33.68101
34,en_hi,hi,hft,500.0,1,57.327669
33,en_hi,hi,unigram,500.0,1,36.517914
25,en_hi,hi,bpe,500.0,1,36.802882
16,en_hi,en,hft,500.0,1,50.99497
21,en_hi,en,hft,750.0,1,50.99497
38,en_hi,hi,unigram,750.0,1,32.541738
24,en_hi,hi,hft,750.0,1,44.527349
15,en_hi,en,unigram,750.0,1,30.650875


In [None]:
"""generate env var and run from server screen ctrl+a d, and to reconnect screen -r

or redirect all the outputs on a file and run the process with nohup and & (running in bg)"""

In [None]:
class BleuTester:
    """
    trains nmt from tokenized with tokenizers,
    translates,
    computes bleu scores and plots results
    """
    
    def __init__(self, pair, tokenizers):
        self.pair = pair.split('-')
        self.src_lang = self.pair[0]
        self.tgt_lang = self.pair[1]
        
    def tokenize(self, ):
        """
        loads tokenizer, 
        tokenizes train.lang,
        returns tokenized, speed
        """
    
    def generate_env_var(self, ):
        """
        generate env_vars for current run
        """
        
        env_vars = 'export DATA_PATH= ../data

        export VOCAB_SOURCE=${DATA_PATH}/vocab.bpe.32000
        export VOCAB_TARGET=${DATA_PATH}/vocab.bpe.32000
        export TRAIN_SOURCES=${DATA_PATH}/toks_0.5k.en
        export TRAIN_TARGETS=${DATA_PATH}/toks_0.5k.mr
        export DEV_SOURCES=${DATA_PATH}/newstest2013.tok.bpe.32000.en
        export DEV_TARGETS=${DATA_PATH}/newstest2013.tok.bpe.32000.de

        export DEV_TARGETS_REF=${DATA_PATH}/newstest2013.tok.de
        export TRAIN_STEPS=1000000'
    
    def train_nmt(self,):
        """
        loads tokenized,
        trains model
        """
        
    def translate(self, ):
        """
        loads model,
        loads dev or test,
        translates
        returns translation
        """
    
    def compute_bleu(self, ):
        """
        loads translation,
        computes bleu,
        returns list of bleu scores
        """
    
    def plot(self, ):
        """
        plots results
        """
    def run(self):
        """
        runs the whole thing
        """class BleuTester:
    def __init__(self,):
        
    def train_nmt(self,)
    
    def compute_bleu(self,)
    
    def run(self):