In [37]:
"""
SENTENCEPIECE v0.1

REFERENCE:
https://colab.research.google.com/github/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb
https://github.com/google/sentencepiece

EN-DE : Euparl, News, CCrawl, 
DE-EN : idem

EN-LT : Eupar

EN-MR : LoRes21
EN-GA : LoRes21

tokenized with sacremoses

Lenght mu = arithmetic mean of target seqs after encoding
Freq@95% = least freq in the 95% of vocab (log)

vocab_sizes = [500, 1000, 2000, 4000, 8000, 16000, 32000, 48000, 64000]

!!! BPE and SentPiece voc_size are not comparable, SentPiece gives error over max value that changes with data
    > limit at 8k
    > bacause of max number of unique forms in train data?
    > clarify vocab_size vs merge operations. does sentencepiece uses vocab or merge for bpe implementation?
    
!!! in original paper tokenized with Sacremoses but it has no option to change voc_size
    > sentencepiece has a bpe mode and char, test that and adapt functions and vocab_sizes to accomodate that
        > sentencepiece implementation is good enough
        
!!! get logs of training and tokenization speed and other output to df and save csv for final runs
    > add incremental row index to df for storing
    > check the correctness of frea@95 and avg_len stats


build BleuTester with trained NMT
"""

from pathlib import Path
import os
import sentencepiece as spm
import json
import time
import shutil
import ast
import shutil
import pandas as pd
import seaborn as sns
import re

In [59]:
class TokBuilder:
    """
    builds tokenizers and freq dict for lang pair. can use unigram, bpe, char model_type
    """
    def __init__(self, pair, model_type):
        self.pair = pair.split('-')
        self.src_lang = self.pair[0]
        self.tgt_lang = self.pair[1]
        self.model_type = model_type

    def count_chars(self, lang):
        """
        returns number of unique chars in file for char vocab_size
        """

        file_path = f'./data/{self.src_lang}_{self.tgt_lang}/train.{lang}'

        with open(file_path, 'r') as file:   
            unique = []

            for line in file.readlines():
                for char in line:
                    if char not in unique:
                        unique.append(char)

        return int(len(unique))

    
    def make_batches(self, lang):
        """
        Makes batches of 5_000 lines from bigger txt file for the selectet lang
        """
        
        file_path = f'./data/{self.src_lang}_{self.tgt_lang}/train.{lang}'
        file = open(file_path, 'r')
        data = file.readlines()
        file.close()

        text_data = []
        file_count = 0

        for sample in data:
            sample = sample.replace('\n', '')
            text_data.append(sample)
            
            save_path = f'./data/{self.src_lang}_{self.tgt_lang}/train/text/train_{file_count}.{lang}'
            
            if len(text_data) == 5_000:
                # once we hit the 5K mark, save to file
                with open(save_path, 'w+', encoding='utf-8') as fp:
                    fp.write('\n'.join(text_data))
                text_data = []
                file_count += 1

        with open(save_path, 'w+', encoding='utf-8') as fp:
            fp.write('\n'.join(text_data))
    
    def gather_files(self, lang):
        """
        Returns the paths to the training batches for the selected lang
        """
        
        self.make_batches(lang)
        paths = [str(x) for x in Path(f'./data/{self.src_lang}_{self.tgt_lang}/train/text').glob(f'**/*.{lang}')]
        return paths

    def train_tokenizer(self, lang, vocab_size):
        """
        Trains a SentencePiece tokenizer for the selected lang and vocab_size
        """
        
        print(f'Training tokenizer for {lang} with vocab_size of {vocab_size}')
        
        tokenizer_name = f'{lang}_{self.model_type}_{vocab_size/1000}k'
        
        sp_model = spm.SentencePieceProcessor()
              
        paths = self.gather_files(lang)
        
        
        #the revoming part gets the script stuck after a while. removing manually makes it work
        
        tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer_name}'
        
        if not os.path.isdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}'):
            os.mkdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}')
        
        if not os.path.isdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}'):
            os.mkdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}')
        
        if os.path.isdir(tokenizer_path):
            shutil.rmtree(tokenizer_path)        
        
        os.mkdir(tokenizer_path)
        
        start = time.time()
        
        spm.SentencePieceTrainer.train(
            input=paths,
            model_prefix=f'{tokenizer_path}/{tokenizer_name}',
            vocab_size=vocab_size,
            unk_id=2,
            bos_id=-1,
            eos_id=1,
            pad_id=0,
            model_type=self.model_type,
            train_extremely_large_corpus=False
        )
 
        end = time.time()
        
        print(f'Training time: {end-start}')

        print("\n }-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{ \n")
       
    def make_freqs(self, lang, tokenizer, save_tokenized=False):
        """
        Makes frequency files for the selected lang and tokenizer
        """
        
        tokenizer_path = f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.model'
        
        sp = spm.SentencePieceProcessor()
        sp.load(f'{tokenizer_path}')
        
        vocabs = [sp.id_to_piece(id) for id in range(sp.get_piece_size())]

        if save_tokenized == True:
            if os.path.isfile(f'./data/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}'):
                os.remove(f'./data/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}')
        
        freq = {}
        with open(f'./data/{self.src_lang}_{self.tgt_lang}/train.{lang}', 'r') as f:
            start = time.time()
            for line in f:
                line = line.rstrip()
                toks = sp.encode_as_pieces(line)
                for piece in toks:
                    freq.setdefault(piece, 0)
                    freq[piece] += 1
            
                if save_tokenized == True:
                    with open(f'./data/{self.src_lang}_{self.tgt_lang}/train/tokenized/toks_{tokenizer}.{lang}', 'a+') as out:
                        print(toks, file=out)
            end = time.time()
        
        freq = sorted(freqs.items(), key=lambda item: item[1], reverse=True)
        with open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.freq', 'w+') as out:
            print(freq, file=out)

        print(f'{lang} text tokenized in {end-start} with {tokenizer}')
        print("\n }-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{}-<>-{ \n")

    
    def run(self
            #plot=False
           ):
        """
        Runs the training and frequency
        """
        
        langs = [self.src_lang, self.tgt_lang]
       
        for lang in langs:
            
            if self.model_type=='char':
                vocab_sizes = [self.count_chars(lang)]
            elif self.model_type=='bpe':
                vocab_sizes = [500,
                               1000,
                               2000,
                               4000,
                               8000, 
                               16000,
                               32000, 
                               #48000, too big for en-ga
                               #64000 too big for en-mr
                               ]
            elif self.model_type=='unigram':
                vocab_sizes = [500,
                               1000,
                               2000,
                               4000,
                               8000
                               ]
            
            for size in vocab_sizes:
                
                    self.train_tokenizer(lang, size)
                    tokenizer_name = f'{lang}_{self.model_type}_{size/1000}k'
                    self.make_freqs(lang, tokenizer_name, save_tokenized=True)
         
    def collect_stats(self, lang):
        
        tokenizers = os.listdir(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}')
        
        df = pd.DataFrame(columns=['tokenizer', 'vocab_size', 'freq@95%', 'avg_len'])
            
        for tokenizer in tokenizers:
        
            tokenizer_name = os.path.basename(tokenizer)
            tokenized_text = open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer}.freq', 'r')
            freqs = ast.literal_eval(open(f'./tokenizers/{self.src_lang}_{self.tgt_lang}/{lang}/{tokenizer}/{tokenizer_name}.freq').read())
            freqs = list(sorted(freqs.items(), key=lambda item: item[1], reverse=True))

            freq_at_95 = freqs[int((len(freqs)/100)*95)][1]

            lines = tokenized_text.readlines()
            
            avg_len = 0
            
            for line in lines:
                line = line.split(',')
                avg_len += len(line)
                
            avg_len = avg_len/len(lines)

            vocab_size = int(re.sub('\D',"", tokenizer_name))*1000
            
            for row in df:
                df.at[row[0], "tokenizer"] = tokenizer_name
                df.at[row[0], "vocab_size"] = vocab_size
                df.at[row[0], "freq@95%"] = freq_at_95
                df.at[row[0], "avg_len"] = avg_len
            
        print(df.head())
    
    def plot(tokenizers):
        """
        returns plot
        """
        pass
        
        
            

In [60]:
en_ga_char = TokBuilder('en-ga',  model_type='char')

In [None]:
en_ga_char.run()

In [61]:
en_ga_char.collect_stats('en')

         tokenizer vocab_size freq@95% avg_len
t  en_unigram_2.0k      20000        9  2044.0
v  en_unigram_2.0k      20000        9  2044.0
f  en_unigram_2.0k      20000        9  2044.0
a  en_unigram_2.0k      20000        9  2044.0


In [None]:
tokenizers = [str(x) for x in Path('./tokenizers/en_mr/').glob('**/*')]

plot(tokenizers)

In [None]:
"""generate env var and run from server screen ctrl+a d, and to reconnect screen -r

or redirect all the outputs on a file and run the process with nohup and & (running in bg)"""

In [None]:
class BleuTester:
    """
    trains nmt from tokenized with tokenizers,
    translates,
    computes bleu scores and plots results
    """
    
    def __init__(self, pair, tokenizers):
        self.pair = pair.split('-')
        self.src_lang = self.pair[0]
        self.tgt_lang = self.pair[1]
        
    def tokenize(self, ):
        """
        loads tokenizer, 
        tokenizes train.lang,
        returns tokenized, speed
        """
    
    def generate_env_var(self, ):
        """
        generate env_vars for current run
        """
        
        env_vars = 'export DATA_PATH= ../data

        export VOCAB_SOURCE=${DATA_PATH}/vocab.bpe.32000
        export VOCAB_TARGET=${DATA_PATH}/vocab.bpe.32000
        export TRAIN_SOURCES=${DATA_PATH}/toks_0.5k.en
        export TRAIN_TARGETS=${DATA_PATH}/toks_0.5k.mr
        export DEV_SOURCES=${DATA_PATH}/newstest2013.tok.bpe.32000.en
        export DEV_TARGETS=${DATA_PATH}/newstest2013.tok.bpe.32000.de

        export DEV_TARGETS_REF=${DATA_PATH}/newstest2013.tok.de
        export TRAIN_STEPS=1000000'
    
    def train_nmt(self,):
        """
        loads tokenized,
        trains model
        """
        
    def translate(self, ):
        """
        loads model,
        loads dev or test,
        translates
        returns translation
        """
    
    def compute_bleu(self, ):
        """
        loads translation,
        computes bleu,
        returns list of bleu scores
        """
    
    def plot(self, ):
        """
        plots results
        """
    def run(self):
        """
        runs the whole thing
        """class BleuTester:
    def __init__(self,):
        
    def train_nmt(self,)
    
    def compute_bleu(self,)
    
    def run(self):