# Main Imports

In [None]:
!pip3 install ipatok pykakasi pinyin_jyutping_sentence korean_romanizer

Collecting ipatok
  Downloading ipatok-0.4.2-py2.py3-none-any.whl (15 kB)
Collecting pykakasi
  Downloading pykakasi-2.2.1-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pinyin_jyutping_sentence
  Downloading pinyin_jyutping_sentence-1.3.tar.gz (12.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting korean_romanizer
  Downloading korean_romanizer-0.25.1-py3-none-any.whl (18 kB)
Collecting jaconv (from pykakasi)
  Downloading jaconv-0.3.4.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deprecated (from pykakasi)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Building wheels for collected packages: pinyin_jyutping_sentence, jaconv
  Building wheel for pinyin_jyutping_sentence (setup.py) 

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

from ipatok import tokenise
from datetime import datetime
import os
import pandas as pd

import shutil
import copy

import time
import math

import json
import pickle

import pinyin_jyutping_sentence
import pykakasi
from korean_romanizer.romanizer import Romanizer

import matplotlib.pyplot as plt
import statistics
import numbers
import csv

import matplotlib.ticker as ticker
import numpy as np

# Local Config

In [None]:
# Notebook-wide configuration and mutable run state, read by every helper below.
LC = {
    # --- run state / switches -------------------------------------------
    'is_initialized': False,                            # set to True by the initialization cell
    'is_colab': bool(os.getenv("COLAB_RELEASE_TAG")),   # truthy only when the env var is set
    'colab_auto_quit': True,
    'to_train': True,
    'lang': 'ja',
    'to_load': False,                                   # load an existing model instead of creating one
    'to_load_model': '2024-04-20_21-50-55',             # model folder / zip name used by loadModel
    'to_save': True,
    # --- locations ------------------------------------------------------
    'drive_mount': '/content/drive',
    'drive_root': '/content/drive/My Drive/seq2seq/',
    'src_data': 'training-v12-full.zip',
    'src_testing': 'testing-v12.zip',
    # --- training -------------------------------------------------------
    'batch_size': int(5800 * (1280 / (1536 + 40))), # can be 'auto' or a number
    'epoch_goal': 500,
    'data_root': 'data',
    'testing_root': 'testing',
    'model_root': 'models',
    'model_path': '',                                   # filled in by createModel / loadModel
    'model_export_path': '',                            # filled in by createModel / loadModel
    # Template copied into every freshly created model's config.
    'default_model_config': {
        'lang': False,
        'hidden_size': 1536,
        'max_length': 17,
        'epoch_count': 0,
        'last_trained_at': False,
        'created_at': False,
        'title': 'ja-v12-1536-full',
        # scores on the "testing" set
        'testing_high_total': 0,
        'testing_high_lv': 0,
        'testing_high_median': 0,
        'testing_high_epoch': 0,
        'testing_saved_total': 0,
        'testing_saved_lv': 0,
        'testing_latest_total': 0,
        'testing_latest_lv': 0,
        'testing_latest_median': 0,
        'testing_latest_epoch': 0,
        'testing_max_total': 0,
        # scores on the "verify" set
        'verify_high_total': 0,
        'verify_high_median': 0,
        'verify_high_epoch': 0,
        'verify_saved_total': 0,
        'verify_latest_total': 0,
        'verify_latest_median': 0,
        'verify_latest_epoch': 0,
        'verify_max_total': 0,
    },
    'model': False                                      # replaced by the dict from createModel / loadModel
}

# Classes

## Lang

In [None]:
# Reserved vocabulary ids: 0 is fed to the decoder as the start symbol, 1 ends a sequence.
SOS_token = 0
EOS_token = 1


class Lang:
    """Vocabulary that maps tokens to integer ids and counts their occurrences.

    Ids 0 and 1 are pre-assigned ("?" and "EOS"); every new token gets the
    next free id. When ``isIpa`` is true, words are split into an optional
    language prefix plus an IPA string that is normalised before its
    characters are added.
    """

    def __init__(self, isIpa = False):
        self.isIpa = isIpa
        self.token2index = {}
        self.token2count = {}
        self.index2token = {0: "?", 1: "EOS" }
        # Next free id == number of entries registered so far.
        self.n_tokens = len(self.index2token)

    def addWord(self, word):
        """Register every token of ``word`` (and its language prefix, if any)."""
        if self.isIpa:
            word, language = splitInput(word)
            if language:
                self.addToken(language)
            word = standardiseIpa(word)

        for token in word:
            self.addToken(token)

    def addToken(self, token):
        """Count ``token``, assigning it a fresh id the first time it is seen."""
        if token in self.token2index:
            self.token2count[token] += 1
            return

        new_index = self.n_tokens
        self.token2index[token] = new_index
        self.index2token[new_index] = token
        self.token2count[token] = 1
        self.n_tokens = new_index + 1

## Encoder

In [None]:
class EncoderRNN(nn.Module):
    """Embeds input token ids and runs them through a single batch-first GRU."""

    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super().__init__()
        self.hidden_size = hidden_size

        # Layers are created in this order so seeded initialisation stays reproducible.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        """Return ``(output, hidden)`` of the GRU for a batch of token-id sequences."""
        token_vectors = self.embedding(input)
        regularised = self.dropout(token_vectors)
        output, hidden = self.gru(regularised)
        return output, hidden

## Attention (Bahdanau)

In [None]:
class BahdanauAttention(nn.Module):
    """Additive attention: score = Va(tanh(Wa(query) + Ua(keys)))."""

    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Return ``(context, weights)`` where weights are softmax-normalised over the keys."""
        energy = torch.tanh(self.Wa(query) + self.Ua(keys))
        # Drop the trailing size-1 score dim, then add a dim at position 1 so bmm can use it.
        scores = self.Va(energy).squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        return torch.bmm(weights, keys), weights

## Decoder (with attention)

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: size of the embeddings and of the GRU hidden state.
        output_size: number of tokens in the output vocabulary.
        maxWordLength: fixed number of decoding steps performed by ``forward``.
        dropout_p: dropout probability applied to the input embeddings.
    """

    def __init__(self, hidden_size, output_size, maxWordLength, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        # GRU input is the token embedding concatenated with the attention context.
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)
        self.maxWordLength = maxWordLength

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        """Decode ``maxWordLength`` steps, starting from ``SOS_token``.

        Args:
            encoder_outputs: per-step encoder outputs; dim 0 is the batch.
            encoder_hidden: final encoder hidden state, used as the initial decoder state.
            target_tensor: optional target ids; when given, column ``i`` is fed as the
                input of step ``i + 1`` (teacher forcing). It must therefore have at
                least ``maxWordLength`` columns.

        Returns:
            ``(log_probs, hidden, attentions)``: per-step log-softmax scores
            concatenated along dim 1, the last GRU hidden state, and the
            per-step attention weights concatenated along dim 1.
        """
        batch_size = encoder_outputs.size(0)
        # Build the start tokens on the same device as the encoder outputs instead of
        # relying on a notebook-global `device` that is only defined in a later cell.
        decoder_input = torch.full(
            (batch_size, 1), SOS_token, dtype=torch.long, device=encoder_outputs.device
        )
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(self.maxWordLength):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1)
            else:
                # Without teacher forcing: use its own prediction as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        """Run one decoding step and return ``(logits, hidden, attn_weights)``."""
        embedded =  self.dropout(self.embedding(input))

        # Swap the first two dims of the hidden state so the batch comes first for attention.
        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

# Initialization Helper Functions

## Get Date Time

In [None]:
def getDateTime():
    """Return the current local time as a ``YYYY-MM-DD_HH-MM-SS`` string."""
    now = datetime.now()
    return format(now, "%Y-%m-%d_%H-%M-%S")

## Mount COLAB

In [None]:
def mountColab():
  """Mount Google Drive and stage the training/testing archives locally.

  Relies on `drive` (imported from `google.colab` in the initialization cell)
  being in scope, and on the `data_root` / `testing_root` directories existing.
  """
  drive.mount(LC['drive_mount'])

  # Copy both archives from the Drive project folder into the local working directories.
  !cp "{os.path.join(LC['drive_root'], LC['src_data'])}" "{os.path.join(LC['data_root'], LC['src_data'])}"
  !cp "{os.path.join(LC['drive_root'], LC['src_testing'])}" "{os.path.join(LC['testing_root'], LC['src_testing'])}"

  # Extract in place (-o overwrites existing files without prompting).
  !unzip -o "{os.path.join(LC['data_root'], LC['src_data'])}" -d "{os.path.join(LC['data_root'])}"
  !unzip -o "{os.path.join(LC['testing_root'], LC['src_testing'])}" -d "{os.path.join(LC['testing_root'])}"
  # NOTE(review): only the training archive is deleted after extraction; the testing archive is kept.
  !rm "{os.path.join(LC['data_root'], LC['src_data'])}"

## Get Pairs

In [None]:
def getPairs(filename):
    """Read a UTF-8 TSV file and return its rows as lists of column strings.

    Leading/trailing whitespace of the whole file is stripped before it is
    split into lines; each line is then split on tab characters.

    Args:
        filename: path of the TSV file to read.

    Returns:
        A list with one ``list[str]`` per line.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')

    # Split every line into its tab-separated columns
    return [line.split('\t') for line in lines]

## Get Testing File

In [None]:
def getTestingFile(filename):
    """Load and return the JSON document stored in the UTF-8 file ``filename``.

    Raises:
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(filename, encoding='utf-8') as f:
        return json.load(f)

## Create Lang

In [None]:
def createLang(pairs, pairIndex, isIpa):
    """Build a ``Lang`` vocabulary from column ``pairIndex`` of every pair."""
    vocabulary = Lang(isIpa)
    for word in (pair[pairIndex] for pair in pairs):
        vocabulary.addWord(word)
    return vocabulary

## Create Model

In [None]:
def createModel():
    """Create a fresh, untrained model bundle for ``LC['lang']``.

    Side effects: sets ``LC['model_path']`` and ``LC['model_export_path']``,
    and resolves ``LC['batch_size'] == 'auto'`` to the number of training pairs.

    Returns:
        A dict with the config, both vocabularies, the training pairs, the
        encoder/decoder modules, an empty loss history and the testing/verify data.
    """
    config = copy.copy(LC['default_model_config'])
    config['created_at'] = getDateTime()
    config['lang'] = LC['lang']

    lang_code = config['lang']
    title = config['title']
    hidden_size = config['hidden_size']

    LC['model_path'] = os.path.join(LC['model_root'], lang_code, title)
    LC['model_export_path'] = os.path.join(LC['drive_root'], LC['model_root'], lang_code, title + '.zip')

    pairs = getPairs(os.path.join(LC['data_root'], lang_code + '.tsv'))
    if LC['batch_size'] == 'auto':
        # 'auto' means a single batch containing every pair.
        LC['batch_size'] = len(pairs)

    # Column 0 holds the (IPA) input words, column 1 the output words.
    inputLang = createLang(pairs, 0, True)
    outputLang = createLang(pairs, 1, False)

    encoder = EncoderRNN(inputLang.n_tokens, hidden_size).to(device)
    decoder = AttnDecoderRNN(hidden_size, outputLang.n_tokens, config['max_length']).to(device)
    encoder = encoder.to(memory_format=torch.channels_last)
    decoder = decoder.to(memory_format=torch.channels_last)

    # The last entry of row 2 in each file is recorded as the maximum total.
    testing = getTestingFile(os.path.join(LC['testing_root'], 'testing_%s.json' % lang_code))
    config['testing_max_total'] = testing[2][-1]
    verifyData = getTestingFile(os.path.join(LC['testing_root'], 'verify_%s.json' % lang_code))
    config['verify_max_total'] = verifyData[2][-1]

    # Append a row with the romanised form of every non-numeric entry of row 2.
    testing.append([
        romanize(lang_code, item)
        for item in testing[2]
        if not isinstance(item, numbers.Number)
    ])

    return {
        'config': config,
        'input_lang': inputLang,
        'output_lang': outputLang,
        'pairs': pairs,
        'encoder': encoder,
        'decoder': decoder,
        'loss_plot': [],
        'testing': testing,
        'verify': verifyData
    }

## Load Model

In [None]:
def loadModel():
    LC['model_export_path'] = os.path.join(LC['drive_root'], LC['model_root'], LC['lang'], LC['to_load_model'] + '.zip')
    LC['model_path'] = os.path.join(LC['model_root'], LC['lang'], LC['to_load_model'])

    !mkdir -p "{LC['model_path']}"
    if LC['is_colab']:
        !unzip -o "{LC['model_export_path']}" -d "{LC['model_path']}"

    config = False
    with open(os.path.join(LC['model_path'], 'config.json'), 'r') as f:
        config = json.load(f)
    LC['lang'] = config['lang']

    loss_plot = False
    with open(os.path.join(LC['model_path'], 'loss_plot.json'), 'r') as f:
        loss_plot = json.load(f)

    pairs = getPairs(os.path.join(LC['model_path'], 'pairs.tsv'))

    inputLang = False
    with open(os.path.join(LC['model_path'], 'input_lang.pickle'), 'rb') as f:
        inputLang = pickle.load(f)

    outputLang = False
    with open(os.path.join(LC['model_path'], 'output_lang.pickle'), 'rb') as f:
        outputLang = pickle.load(f)

    encoder = EncoderRNN(inputLang.n_tokens, config['hidden_size']).to(device)
    decoder = AttnDecoderRNN(config['hidden_size'], outputLang.n_tokens, config['max_length']).to(device)

    encoder.load_state_dict(torch.load(os.path.join(LC['model_path'], 'encoder.pth')))
    decoder.load_state_dict(torch.load(os.path.join(LC['model_path'], 'decoder.pth')))
    encoder = encoder.to(memory_format=torch.channels_last)
    decoder = decoder.to(memory_format=torch.channels_last)

    testing = False
    with open(os.path.join(LC['model_path'], 'testing.json')) as f:
        testing = json.load(f)

    LC['model']['config']['testing_max_total'] = testing[2][len(testing[2]) - 1]

    verifyData = False
    with open(os.path.join(LC['model_path'], 'verify.json')) as f:
        verifyData = json.load(f)

    LC['model']['config']['verify_max_total'] = verifyData[2][len(verifyData[2]) - 1]

    return {
        'config': config,
        'input_lang': inputLang,
        'output_lang': outputLang,
        'pairs': pairs,
        'encoder': encoder,
        'decoder': decoder,
        'loss_plot': loss_plot,
        'testing': testing,
        'verify': verifyData
    }

## Save Model

In [None]:
def saveModel():
    print('Saving...')
    !mkdir -p "{LC['model_path']}"

    LC['model']['config']['testing_saved_total'] = LC['model']['config']['testing_latest_total']
    LC['model']['config']['testing_saved_lv'] = LC['model']['config']['testing_latest_lv']
    LC['model']['config']['verify_saved_total'] = LC['model']['config']['verify_latest_total']

    with open(os.path.join(LC['model_path'], 'config.json'), 'w') as f:
        json.dump(LC['model']['config'], f)

    with open(os.path.join(LC['model_path'], 'loss_plot.json'), 'w') as f:
        json.dump(LC['model']['loss_plot'], f)

    lines = []
    for pair in LC['model']['pairs']:
        lines.append("\t".join(pair))
    with open(os.path.join(LC['model_path'], 'pairs.tsv'), 'w') as f:
        f.write("\n".join(lines))

    with open(os.path.join(LC['model_path'], 'input_lang.pickle'), 'wb') as f:
        pickle.dump(LC['model']['input_lang'], f)

    with open(os.path.join(LC['model_path'], 'output_lang.pickle'), 'wb') as f:
        pickle.dump(LC['model']['output_lang'], f)

    torch.save(LC['model']['encoder'].state_dict(), os.path.join(LC['model_path'], 'encoder.pth'))
    torch.save(LC['model']['decoder'].state_dict(), os.path.join(LC['model_path'], 'decoder.pth'))

    with open(os.path.join(LC['model_path'], 'testing.json'), 'w') as f:
        json.dump(LC['model']['testing'], f)

    with open(os.path.join(LC['model_path'], 'verify.json'), 'w') as f:
        json.dump(LC['model']['verify'], f)

    df = pd.DataFrame(LC['model']['testing'])
    df.to_csv(os.path.join(LC['model_path'], 'testing.csv'), quoting=csv.QUOTE_NONNUMERIC)

    df = pd.DataFrame(LC['model']['verify'])
    df.to_csv(os.path.join(LC['model_path'], 'verify.csv'), quoting=csv.QUOTE_NONNUMERIC)

    if LC['is_colab']:
        !zip -q "{LC['model_export_path']}" -j "{os.path.join('.', LC['model_path'])}"/*

    print('Saved!')
    return True

## Standardise IPA

In [None]:
def standardiseIpa(word):
    """Return ``word`` re-assembled from its ipatok tokens (non-strict, with replacement)."""
    ipa_tokens = tokenise(
        word,
        strict=False,
        replace=True,
        diphthongs=False,
        tones=False,
        unknown=False,
    )
    return ''.join(ipa_tokens)

## Split Input

In [None]:
def splitInput(inputWord):
    """Split an optional three-character language prefix (``xx_``) off ``inputWord``.

    Returns:
        ``(word, language)``; ``language`` is the prefix including the
        underscore, or ``False`` when the third character is not ``_``.
    """
    has_prefix = len(inputWord) > 2 and inputWord[2] == "_"
    if has_prefix:
        return inputWord[3:], inputWord[:3]
    return inputWord, False

## Tokenize Word

In [None]:
def tokenizeWord(lang, word):
    """Split ``word`` into the tokens used by the vocabulary ``lang``.

    For IPA vocabularies the optional language prefix becomes the first token
    and the remainder is normalised with ``standardiseIpa`` — the same helper
    ``Lang.addWord`` uses when the vocabulary is built, so both code paths
    produce identical tokens. Every character of the resulting word is one token.

    Args:
        lang: object exposing an ``isIpa`` flag.
        word: the raw word to tokenize.

    Returns:
        List of token strings.
    """
    tokens = []
    if lang.isIpa:
        word, language = splitInput(word)
        if language:
            tokens.append(language)
        # Shared normalisation instead of a duplicated inline tokenise() call.
        word = standardiseIpa(word)

    tokens.extend(word)
    return tokens

## Get Indexes From Word

In [None]:
def indexesFromWord(lang, word):
    """Return the vocabulary id of each token of ``word`` (KeyError for unknown tokens)."""
    tokens = tokenizeWord(lang, word)
    return list(map(lang.token2index.__getitem__, tokens))

## Get Tensor From Word

In [None]:
def tensorFromWord(lang, word):
    """Encode ``word`` as a ``(1, n)`` long tensor of token ids terminated by ``EOS_token``."""
    ids_with_eos = indexesFromWord(lang, word) + [EOS_token]
    encoded = torch.tensor(ids_with_eos, dtype=torch.long, device=device)
    return encoded.view(1, -1)

## Get Dataloader

In [None]:
def get_dataloader(input_lang, output_lang, pairs, batch_size, maxWordLength):
    """Build a shuffling DataLoader over all ``pairs``.

    Every word is converted to token ids, terminated with ``EOS_token`` and
    right-padded with zeros to ``maxWordLength`` columns. Both id matrices are
    moved to ``device`` before being wrapped in the dataset.
    """
    shape = (len(pairs), maxWordLength)
    input_ids = np.zeros(shape, dtype=np.int32)
    target_ids = np.zeros(shape, dtype=np.int32)

    for row, (source_word, target_word) in enumerate(pairs):
        source = indexesFromWord(input_lang, source_word)
        target = indexesFromWord(output_lang, target_word)
        source = source + [EOS_token]
        target = target + [EOS_token]
        input_ids[row, :len(source)] = source
        target_ids[row, :len(target)] = target

    dataset = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device),
    )

    # NOTE(review): the dataset tensors already live on `device` while the loader
    # uses worker processes — confirm this combination is intended.
    return DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=batch_size,
        num_workers=2,
        persistent_workers=True,
    )

## Train Epoch

In [None]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, scaler):
    """Run one pass over ``dataloader`` with mixed precision and return the mean batch loss.

    The forward pass and loss run under CUDA autocast; the backward pass and
    both optimizer steps go through the gradient ``scaler``.
    """
    optimizers = (encoder_optimizer, decoder_optimizer)
    running_loss = 0

    for input_tensor, target_tensor in dataloader:
        for optimizer in optimizers:
            optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast():
            encoder_outputs, encoder_hidden = encoder(input_tensor)
            # Passing the targets makes the decoder use teacher forcing.
            decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

            vocab_size = decoder_outputs.size(-1)
            loss = criterion(decoder_outputs.view(-1, vocab_size), target_tensor.view(-1))

        # Outside autocast: scaled backward pass, then step both optimizers.
        scaler.scale(loss).backward()
        for optimizer in optimizers:
            scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

    return running_loss / len(dataloader)

## Get Time as Minutes

In [None]:
def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

## Get Time Since

In [None]:
def timeSince(since, percent):
    """Return ``'<elapsed> (- <remaining>)'`` given the start time and the fraction completed."""
    elapsed = time.time() - since
    # Extrapolate the total duration from the fraction done so far.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

## Romanize

In [None]:
def romanize(language, word):
    """Romanise ``word`` for ``'zh'``, ``'ja'`` or ``'ko'``; return it unchanged otherwise."""
    if language == 'zh':
        return pinyin_jyutping_sentence.pinyin(word)

    if language == 'ja':
        converter = pykakasi.kakasi()
        # Concatenate the Hepburn reading of every converted segment.
        return ''.join(segment['hepburn'] for segment in converter.convert(word))

    if language == 'ko':
        return Romanizer(word).romanize()

    return word

## Evaluate


In [None]:
def evaluate(word):
    """Run the current model on ``word`` and return the decoded output string.

    Decoding is greedy: the highest-scoring token of each step is taken, and
    output stops at the first ``EOS_token``.
    """
    model = LC['model']
    encoder, decoder = model['encoder'], model['decoder']
    input_lang, output_lang = model['input_lang'], model['output_lang']

    with torch.no_grad():
        input_tensor = tensorFromWord(input_lang, word)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden)

        _, best = decoder_outputs.topk(1)

        decoded_tokens = []
        for token_id in best.squeeze().tolist():
            if token_id == EOS_token:
                break
            decoded_tokens.append(output_lang.index2token[token_id])
    return ''.join(decoded_tokens)

## Test

In [None]:
def test(epoch):
    """Score the model on the testing table and record the results.

    Rows 0/1/2 of the table hold, per column, the language, the input word and
    the expected output. Columns whose row-0 value is ``'total'`` are summary
    slots: row 1 names the language to report (``'all'`` for the grand total).
    A new result row (ending with ``epoch``) is appended to the table and the
    ``testing_*`` fields of the model config are updated.

    Returns:
        ``(total_correct, median_of_per_language_totals)``.
    """
    model = LC['model']
    config = model['config']
    testing = model['testing']

    model['encoder'].eval()
    model['decoder'].eval()

    totals = {'total': 0}
    totalsarr = []
    res = []

    for i, row_lang in enumerate(testing[0]):
        if row_lang == 'total':
            summary_lang = testing[1][i]
            if summary_lang == 'all':
                res.append(totals['total'])
                continue

            lang_total = totals[summary_lang]
            res.append(lang_total)
            totalsarr.append(lang_total)
            if summary_lang == 'lv':
                config['testing_latest_lv'] = lang_total
                if lang_total > config['testing_high_lv']:
                    config['testing_high_lv'] = lang_total
            continue

        testword = testing[1][i]
        expected = testing[2][i]

        actual = evaluate(testword)
        totals.setdefault(row_lang, 0)
        if actual == expected:
            res.append('')
            totals[row_lang] += 1
            totals['total'] += 1
        else:
            # Wrong answers are logged together with their romanised form.
            res.append("%s\n(%s)" % (actual, romanize(LC['lang'], actual)))
    res.append(epoch)

    model['testing'].append(res)
    median = statistics.median(totalsarr)

    grand_total = totals['total']
    config['testing_latest_total'] = grand_total
    config['testing_latest_median'] = median
    config['testing_latest_epoch'] = epoch

    # A new high needs a better total, or an equal total with a better median.
    beats_total = grand_total > config['testing_high_total']
    ties_with_better_median = (
        grand_total == config['testing_high_total'] and median > config['testing_high_median']
    )
    if beats_total or ties_with_better_median:
        config['testing_high_total'] = grand_total
        config['testing_high_median'] = median
        config['testing_high_epoch'] = epoch

    # Return to training mode
    model['encoder'].train()
    model['decoder'].train()

    return grand_total, median

## Verify

In [None]:
def verify(epoch):
    """Score the model on the verify table and record the results.

    Same table layout as the testing table: rows 0/1/2 hold language, input
    word and expected output; a ``'total'`` in row 0 marks a summary column
    whose row-1 value names the language (``'all'`` for the grand total).
    A result row ending with ``epoch`` is appended to the table and the
    ``verify_*`` fields of the model config are updated.

    Returns:
        ``(total_correct, median_of_per_language_totals)``.
    """
    bundle = LC['model']
    cfg = bundle['config']
    table = bundle['verify']

    bundle['encoder'].eval()
    bundle['decoder'].eval()

    totals = {'total': 0}
    per_language = []
    results = []

    for col, kind in enumerate(table[0]):
        if kind != 'total':
            word = table[1][col]
            expected = table[2][col]

            actual = evaluate(word)
            totals.setdefault(kind, 0)
            if actual == expected:
                results.append('')
                totals[kind] += 1
                totals['total'] += 1
            else:
                results.append("%s\n(%s)" % (actual, romanize(LC['lang'], actual)))
        elif table[1][col] != 'all':
            language_score = totals[table[1][col]]
            results.append(language_score)
            per_language.append(language_score)
        else:
            results.append(totals['total'])
    results.append(epoch)

    bundle['verify'].append(results)
    median = statistics.median(per_language)

    overall = totals['total']
    cfg['verify_latest_total'] = overall
    cfg['verify_latest_median'] = median
    cfg['verify_latest_epoch'] = epoch

    # A new high needs a better total, or an equal total with a better median.
    previous_high = cfg['verify_high_total']
    if overall > previous_high or (overall == previous_high and median > cfg['verify_high_median']):
        cfg['verify_high_total'] = overall
        cfg['verify_high_median'] = median
        cfg['verify_high_epoch'] = epoch

    # Return to training mode
    bundle['encoder'].train()
    bundle['decoder'].train()

    return overall, median

## Train

In [None]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, test_every=50, plot_every=100):
    """Train for up to ``n_epochs`` epochs with periodic evaluation, saving and logging.

    Every ``test_every`` epochs the model is scored on the verify and testing
    sets; it is saved when the testing score equals the recorded high and one
    of the saved scores is beaten. Training stops early (and disables the
    final save via ``LC['to_save']``) once the testing score reaches its maximum.
    """
    start = time.time()
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    scaler = torch.cuda.amp.GradScaler()
    criterion = nn.NLLLoss()

    config = LC['model']['config']

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, scaler)
        print_loss_total += loss
        plot_loss_total += loss

        config['epoch_count'] += 1

        if epoch % test_every == 0:
            totalsV, medianV = verify(epoch)
            totalsT, medianT = test(epoch)

            # test() has already updated the high score, so equality marks a (shared) best.
            isHighScore = 'HIGH SCORE!' if totalsT == config['testing_high_total'] else ''
            testing_pct = str(round(totalsT / config['testing_max_total'] * 100, 2)) + '%'
            verify_pct = str(round(totalsV / config['verify_max_total'] * 100, 2)) + '%'
            print("%s %s total score %s/%s (%s), median %s. verify score %s/%s (%s), median %s." % (
                isHighScore, epoch,
                totalsT, config['testing_max_total'], testing_pct, medianT,
                totalsV, config['verify_max_total'], verify_pct, medianV,
            ))

            if (
                totalsT == config['testing_high_total']
                and LC['to_save']
                and (
                    totalsT > config['testing_saved_total']
                    or config['testing_latest_lv'] > config['testing_saved_lv']
                    or config['verify_latest_total'] > config['verify_saved_total']
                )
            ):
                config['last_trained_at'] = getDateTime()
                saveModel()

            if totalsT == config['testing_max_total']:
                # Perfect score: nothing left to learn, and no final save is needed.
                LC['to_save'] = False
                break

        if epoch % plot_every == 0:
            LC['model']['loss_plot'].append(plot_loss_total / plot_every)
            plot_loss_total = 0

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            progress = epoch / n_epochs
            print('%s (%d %d%%) %.8f' % (timeSince(start, progress), epoch, progress * 100, print_loss_avg))

# Initialization

In [None]:
# One-time notebook setup: directories, Colab data staging, torch settings, seeds,
# device selection and model creation/loading. Guarded by LC['is_initialized'] so
# re-running the cell does not repeat it.
if not LC['is_initialized']:
    %matplotlib inline

    # Create directories
    !mkdir -p "{LC['data_root']}" "{LC['model_root']}" "{LC['testing_root']}"

    if LC['is_colab']:
        # `drive` is used by mountColab(); `runtime` is used by the final exit cell.
        from google.colab import drive
        from google.colab import runtime
        mountColab()

    torch.multiprocessing.set_start_method('forkserver')

    torch.autograd.set_detect_anomaly(False, check_nan=False)
    # NOTE(review): the next two calls only construct objects and discard them
    # (no `with` block), so they presumably have no effect — confirm and remove.
    torch.autograd.profiler.profile(enabled=False)
    torch.autograd.profiler.emit_nvtx(enabled=False)
    torch.backends.cudnn.benchmark = True

    # Seed every RNG in use.
    torch.manual_seed(0)
    random.seed(0)
    np.random.seed(0)

    # Global device read by the model/tensor helpers defined above.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if LC['to_load']:
        LC['model'] = loadModel()
    else:
        LC['model'] = createModel()

    # Summary of the model that is now active.
    print("%s (%s):" % (LC['model']['config']['title'], LC['model']['config']['lang']));
    print("*  Input tokens: %s" % LC['model']['input_lang'].n_tokens);
    print("*  Output tokens: %s" % LC['model']['output_lang'].n_tokens);
    print("*  Hidden size: %s" % LC['model']['config']['hidden_size']);
    print("*  Max length: %s" % LC['model']['config']['max_length']);
    print("*  Epochs: %s" % LC['model']['config']['epoch_count']);
    print("*  Created at: %s" % LC['model']['config']['created_at']);
    print("*  Last trained at: %s" % LC['model']['config']['last_trained_at']);
    print("*  Pairs: %s" % len(LC['model']['pairs']));

    LC['is_initialized'] = True

Mounted at /content/drive
Archive:  data/training-v12-full.zip
  inflating: data/ja.tsv             
  inflating: data/ko.tsv             
  inflating: data/zh.tsv             
Archive:  testing/testing-v12.zip
  inflating: testing/testing_ja.json  
  inflating: testing/testing_ko.json  
  inflating: testing/testing_zh.json  
  inflating: testing/verify_ja.json  
  inflating: testing/verify_ko.json  
  inflating: testing/verify_zh.json  
ja-v12-1536-full (ja):
*  Input tokens: 99
*  Output tokens: 86
*  Hidden size: 1536
*  Max length: 17
*  Epochs: 0
*  Created at: 2024-05-05_16-28-32
*  Last trained at: False
*  Pairs: 183619


# Training

In [None]:
# Train the active model until LC['epoch_goal'] total epochs have been reached.
if LC['to_train']:
    # Switch both networks to training mode before building the loader.
    LC['model']['encoder'].train()
    LC['model']['decoder'].train()
    LC['model']['config']['last_trained_at'] = getDateTime()

    train_dataloader = get_dataloader(
        LC['model']['input_lang'],
        LC['model']['output_lang'],
        LC['model']['pairs'],
        LC['batch_size'],
        LC['model']['config']['max_length']
    )

    # get_dataloader() already places the dataset tensors on `device`, so no
    # separate transfer pass over the loader is needed here.

    # Only the epochs still missing from the goal are trained (relevant for loaded models).
    epoch_count = LC['epoch_goal'] - LC['model']['config']['epoch_count']
    print("Epochs to train: %s" % epoch_count)
    train(
        train_dataloader,
        LC['model']['encoder'],
        LC['model']['decoder'],
        epoch_count,
        print_every=5,
        test_every=5,
        plot_every=1
    )

Epochs to train: 500
HIGH SCORE! 5 total score 67/195 (34.36%), median 6. verify score 50/195 (25.64%), median 4.
Saving...
Saved!
2m 25s (- 240m 52s) (5 1%) 0.40373224
HIGH SCORE! 10 total score 96/195 (49.23%), median 8. verify score 95/195 (48.72%), median 7.
Saving...
Saved!
4m 46s (- 234m 8s) (10 2%) 0.11596207
HIGH SCORE! 15 total score 97/195 (49.74%), median 8. verify score 111/195 (56.92%), median 8.
Saving...
Saved!
7m 8s (- 230m 42s) (15 3%) 0.06364830
 20 total score 94/195 (48.21%), median 7. verify score 133/195 (68.21%), median 10.
9m 15s (- 222m 17s) (20 4%) 0.03845032
 25 total score 92/195 (47.18%), median 6. verify score 142/195 (72.82%), median 11.
11m 24s (- 216m 38s) (25 5%) 0.02244005
 30 total score 94/195 (48.21%), median 7. verify score 146/195 (74.87%), median 12.
13m 32s (- 212m 12s) (30 6%) 0.01338041
 35 total score 95/195 (48.72%), median 7. verify score 159/195 (81.54%), median 12.
15m 40s (- 208m 11s) (35 7%) 0.00776343
HIGH SCORE! 40 total score 99/195

In [None]:
# Final snapshot: unless the last test run reached the maximum score, save the
# model under a name suffixed with the total epoch count.
if LC['to_save'] and LC['model']['config']['testing_max_total'] != LC['model']['config']['testing_latest_total']:
    snapshot_name = LC['model']['config']['title'] + '-' + str(LC['model']['config']['epoch_count'])
    LC['model_path'] = os.path.join(LC['model_root'], LC['model']['config']['lang'], snapshot_name)
    LC['model_export_path'] = os.path.join(LC['drive_root'], LC['model_root'], LC['model']['config']['lang'], snapshot_name + '.zip')
    saveModel()

Saving...
Saved!


# Exit

In [None]:
# Print the current date/time of the runtime via the shell.
!date

Sun May  5 08:03:03 PM UTC 2024


In [None]:
# On Colab, release the runtime once everything has finished (opt-out via LC['colab_auto_quit']).
# `runtime` is imported from google.colab in the initialization cell.
if LC['is_colab'] and LC['colab_auto_quit']:
  runtime.unassign()