<a href="https://colab.research.google.com/github/emukans/en-lv-translator/blob/master/LV_EN_Transformer_translate_on_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# EN-LV Translator

In [0]:
# Install deps
# NOTE(review): the install is unpinned (-U pulls the newest release), while the
# cells below rely on TF 1.x-era APIs (tf.gfile, tf.flags, tf.contrib.eager).
# Consider pinning tensor2tensor and a matching tensorflow — TODO confirm versions.
!pip install -q -U tensor2tensor

# 1. Initialization


## 1.1. Mount Google Drive and make some directories

In [0]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; the working directories
# defined in the next cell are located under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import tensorflow as tf
import os

# All working directories live on the mounted Google Drive, under "t2t/".
DRIVE_DIR = '/content/drive/My Drive'
DATA_DIR = f"{DRIVE_DIR}/t2t/data"  # This folder contains the data
TMP_DIR = f"{DRIVE_DIR}/t2t/tmp"  # Passed as tmp_dir to data generation
TRAIN_DIR = f"{DRIVE_DIR}/t2t/train"  # This folder contains the model
EXPORT_DIR = f"{DRIVE_DIR}/t2t/export"  # This folder contains the exported model for production
TRANSLATIONS_DIR = f"{DRIVE_DIR}/t2t/translation"  # This folder contains all translated sequences
EVENT_DIR = f"{DRIVE_DIR}/t2t/event"  # Test the BLEU score
USR_DIR = f"{DRIVE_DIR}/t2t/user"  # This folder contains our data that we want to add

# Create every directory up front.
for work_dir in (DATA_DIR, TMP_DIR, TRAIN_DIR, EXPORT_DIR,
                 TRANSLATIONS_DIR, EVENT_DIR, USR_DIR):
  tf.gfile.MakeDirs(work_dir)

## 1.2. Prepare a problem

In [0]:
# !gcloud auth application-default login

In [0]:
from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_encoder
from tensor2tensor.data_generators import text_problems
from tensor2tensor.data_generators import translate
from tensor2tensor.data_generators import wiki_lm
from tensor2tensor.utils import registry

# Handle to TensorFlow's global flags object.
# NOTE(review): not referenced by any of the cells shown in this notebook.
FLAGS = tf.flags.FLAGS

# End-of-sentence token id, taken from the text encoder module.
EOS = text_encoder.EOS_ID


@registry.register_problem
class TranslateEnLv(translate.TranslateProblem):
  """English-Latvian translation problem assembled from several parallel corpora."""

  @property
  def approx_vocab_size(self):
    """Target size of the vocabulary (2**14)."""
    return 16384

  @property
  def vocab_type(self):
    """Use a subword vocabulary."""
    return text_problems.VocabType.SUBWORD

  @property
  def is_generate_per_split(self):
    """Samples are not generated separately per split; see `dataset_splits`."""
    return False

  @property
  def dataset_splits(self):
    """Shard counts per split.

    Because the splits are produced by this problem itself,
    `Text2TextProblem.dataset_splits` is overridden so that 90% of the data
    is kept for training and 5% each for evaluation and testing.
    """
    shard_plan = (
        (problem.DatasetSplit.TRAIN, 90),
        (problem.DatasetSplit.EVAL, 5),
        (problem.DatasetSplit.TEST, 5),
    )
    return [{"split": split, "shards": shards} for split, shards in shard_plan]

  def source_data_files(self, dataset_split):
    """Return `(archive, (english_file, latvian_file))` entries for every corpus.

    The same list is returned regardless of `dataset_split`.
    """
    corpora = [
        ('gs://translator-lv-en/annotations.tgz',
         ('annotations/annotations.en', 'annotations/annotations.lv')),
        ('gs://translator-lv-en/europarl.tgz',
         ('europarl/europarl-v8.lv-en.en', 'europarl/europarl-v8.lv-en.lv')),
        ('gs://translator-lv-en/rapid2016.tgz',
         ('rapid2016/rapid2016.en-lv.en', 'rapid2016/rapid2016.en-lv.lv')),
        ('gs://translator-lv-en/farewell.tgz',
         ('farewell/farewell.en', 'farewell/farewell.lv')),
        ('gs://translator-lv-en/dcep.tgz',
         ('dcep/dcep.en', 'dcep/dcep.lv')),
    ]
    return corpora

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Compile all corpora into one file pair in `tmp_dir` and yield its examples."""
    corpora = self.source_data_files(dataset_split)

    if dataset_split == problem.DatasetSplit.TRAIN:
      tag = "train"
    else:
      tag = "dev"

    # compile_data returns a path prefix; the two languages are stored in the
    # files "<prefix>.lang1" and "<prefix>.lang2".
    compiled_prefix = translate.compile_data(
        tmp_dir, corpora, f'{self.name}-compiled-{tag}')

    yield from text_problems.text2text_txt_iterator(
        compiled_prefix + ".lang1", compiled_prefix + ".lang2")

## 1.3. Init parameters





In [0]:
# Names used to look things up in the Tensor2Tensor registry.
PROBLEM = "translate_en_lv"    # problem name passed to problems.problem(...)
MODEL = "transformer"          # model name
HPARAMS = "transformer_base"   # hyper-parameter set name

# 2. Data generation 

Generate the data (download the dataset and generate the data).

In [0]:
from tensor2tensor.utils import registry
from tensor2tensor import problems

# Look the problem up by name, then generate its data files into DATA_DIR,
# using TMP_DIR as the temporary directory.
t2t_problem = problems.problem(PROBLEM)
t2t_problem.generate_data(data_dir=DATA_DIR, tmp_dir=TMP_DIR)

INFO:tensorflow:Found vocab file: /content/drive/My Drive/t2t/data/vocab.translate_en_lv.16384.subwords


INFO:tensorflow:Found vocab file: /content/drive/My Drive/t2t/data/vocab.translate_en_lv.16384.subwords


INFO:tensorflow:Skipping generator because outputs files exists at ['/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00000-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00001-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00002-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00003-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00004-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00005-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00006-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00007-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00008-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00009-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00010-of-00090', '/conten

INFO:tensorflow:Skipping generator because outputs files exists at ['/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00000-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00001-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00002-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00003-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00004-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00005-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00006-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00007-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00008-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00009-of-00090', '/content/drive/My Drive/t2t/data/translate_en_lv-unshuffled-train-00010-of-00090', '/conten

INFO:tensorflow:Skipping shuffle because output files exist


INFO:tensorflow:Skipping shuffle because output files exist


# 3. Train the model





## 3.1. Init parameters
---

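`batch_size`: how much data goes into one training batch. For Tensor2Tensor text problems it is counted in subword tokens rather than sentences; choose the largest value that fits in GPU memory.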

---
`train_steps`: the Transformer paper (https://arxiv.org/abs/1706.03762) reports 300k steps on 8 GPUs for the big Transformer. With a single GPU you would therefore need roughly 8 times as many steps to process the same amount of data.



In [0]:
# Training-loop settings
train_steps = 150_000            # total number of training steps
eval_steps = 100                 # number of steps per evaluation
save_checkpoints_steps = 2_000   # write a checkpoint every N steps

# Hyper-parameter overrides
batch_size = 2048
ALPHA = 0.1                      # learning rate

# Name of the schedule to run
schedule = "continuous_train_and_eval"

You can choose one of the following schedules:

* `train` – training only; gives bad quality
* `continuous_train_and_eval` (default)
* `train_and_evaluate`



## 3.2. Train the model

In [0]:
from tensor2tensor.utils.trainer_lib import create_run_config, create_experiment
from tensor2tensor.utils.trainer_lib import create_hparams
from tensor2tensor.utils import registry
from tensor2tensor import models
from tensor2tensor import problems

# Build the hyper-parameter object for the selected HPARAMS set.
hparams = create_hparams(HPARAMS)

# Override the defaults with the values chosen in the parameters cell.
# NOTE(review): whether `learning_rate` is honoured depends on the learning-rate
# schedule of the hparams set — TODO confirm for transformer_base.
hparams.learning_rate = ALPHA
hparams.batch_size = batch_size

























In [0]:
# Run configuration: checkpoints are written to TRAIN_DIR every
# `save_checkpoints_steps` steps. `schedule` is passed explicitly so the run
# config agrees with the schedule that is executed at the end of this cell.
RUN_CONFIG = create_run_config(
    model_dir=TRAIN_DIR,
    model_name=MODEL,
    save_checkpoints_steps=save_checkpoints_steps,
    schedule=schedule,
)

# Experiment object tying together model, problem, data and step budgets.
tensorflow_exp_fn = create_experiment(
    run_config=RUN_CONFIG,
    hparams=hparams,
    model_name=MODEL,
    problem_name=PROBLEM,
    data_dir=DATA_DIR,
    train_steps=train_steps,
    eval_steps=eval_steps,
    schedule=schedule,
    use_xla=True,  # For acceleration
)

# Run the schedule selected in the parameters cell. Previously the `schedule`
# variable was ignored and `train_and_evaluate()` was always called.
run_schedule = getattr(tensorflow_exp_fn, schedule, None)
if not callable(run_schedule):
  raise ValueError(f"Unknown schedule: {schedule!r}")
run_schedule()



















Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.


Instructions for updating:
When switching to tf.estimator.Estimator, use tf.estimator.RunConfig instead.








INFO:tensorflow:Configuring DataParallelism to replicate the model.


INFO:tensorflow:Configuring DataParallelism to replicate the model.


INFO:tensorflow:schedule=continuous_train_and_eval


INFO:tensorflow:schedule=continuous_train_and_eval


INFO:tensorflow:worker_gpu=1


INFO:tensorflow:worker_gpu=1


INFO:tensorflow:sync=False


INFO:tensorflow:sync=False












INFO:tensorflow:datashard_devices: ['gpu:0']


INFO:tensorflow:datashard_devices: ['gpu:0']


INFO:tensorflow:caching_devices: None


INFO:tensorflow:caching_devices: None


INFO:tensorflow:ps_devices: ['gpu:0']


INFO:tensorflow:ps_devices: ['gpu:0']


INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9e89cbfa58>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': None, '_log_step_count_steps': 100, '_protocol': None, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.95
}
allow_soft_placement: true
graph_options {
  optimizer_options {
    global_jit_level: OFF
  }
}
isolate_session_state: true
, '_save_checkpoints_steps': 2000, '_keep_checkpoint_max': 20, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/content/drive/My Drive/t2t/train', '_session_creation_timeout

INFO:tensorflow:Using config: {'_task_type': None, '_task_id': 0, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9e89cbfa58>, '_master': '', '_num_ps_replicas': 0, '_num_worker_replicas': 0, '_environment': 'local', '_is_chief': True, '_evaluation_master': '', '_train_distribute': None, '_eval_distribute': None, '_experimental_max_worker_delay_secs': None, '_device_fn': None, '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1.0
}
, '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_secs': None, '_log_step_count_steps': 100, '_protocol': None, '_session_config': gpu_options {
  per_process_gpu_memory_fraction: 0.95
}
allow_soft_placement: true
graph_options {
  optimizer_options {
    global_jit_level: OFF
  }
}
isolate_session_state: true
, '_save_checkpoints_steps': 2000, '_keep_checkpoint_max': 20, '_keep_checkpoint_every_n_hours': 10000, '_model_dir': '/content/drive/My Drive/t2t/train', '_session_creation_timeout





INFO:tensorflow:Using ValidationMonitor


INFO:tensorflow:Using ValidationMonitor


Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.


Instructions for updating:
Monitors are deprecated. Please use tf.train.SessionRunHook.












INFO:tensorflow:Skipping training since max_steps has already saved.


INFO:tensorflow:Skipping training since max_steps has already saved.


# 4. Prediction of a sentence


## 4.1. Configuration and helper function initialization

In [0]:
import tensorflow as tf

# Short alias for the eager-execution helpers under tf.contrib.
tfe = tf.contrib.eager
# NOTE(review): TensorFlow 1.x presumably requires eager execution to be enabled
# before any graph ops exist, so this cell may need a fresh runtime if the
# training cells ran first — TODO confirm.
tfe.enable_eager_execution()
# Shorthand for the estimator mode keys; Modes.PREDICT is used when the model is built.
Modes = tf.estimator.ModeKeys

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [0]:
#Config

from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.layers import common_layers
from tensor2tensor.utils import trainer_lib
from tensor2tensor.utils import t2t_model
from tensor2tensor.utils import registry
from tensor2tensor.utils import metrics
import numpy as np

# Problem object for PROBLEM (the historical variable name is kept unchanged).
enfr_problem = problems.problem(PROBLEM)

# Location of the subword vocabulary file inside DATA_DIR.
vocab_name = "vocab.translate_en_lv.16384.subwords"
vocab_file = os.path.join(DATA_DIR, vocab_name)

# Encoders keyed by feature name; the "inputs" entry is used by encode/decode.
encoders = enfr_problem.feature_encoders(DATA_DIR)

# Newest checkpoint found in TRAIN_DIR; printed so the restored step is visible.
ckpt_path = tf.train.latest_checkpoint(TRAIN_DIR)
print(ckpt_path)

def translate(inputs):
  """Translate one input string and return the decoded output string.

  The string is encoded, run through `translate_model.infer` with variables
  restored from `ckpt_path`, and the "outputs" entry is decoded back to text.

  NOTE(review): this function shadows the `translate` module imported from
  tensor2tensor.data_generators in the problem-definition cell; re-running
  that cell rebinds the name to the module.
  """
  features = encode(inputs)
  # Variables created inside this context are initialised from the checkpoint.
  with tfe.restore_variables_on_create(ckpt_path):
    predictions = translate_model.infer(features)
  return decode(predictions["outputs"])

def encode(input_str, output_str=None):
  """Convert an input string into the features dict used for inference.

  Args:
    input_str: text to encode with the "inputs" encoder.
    output_str: accepted for compatibility; not used.

  Returns:
    A dict with a single "inputs" tensor of shape [1, length, 1].
  """
  token_ids = encoders["inputs"].encode(input_str)
  ids_with_eos = token_ids + [1]  # add EOS id
  # Make it 3D: one batch entry, `length` positions, one channel.
  return {"inputs": tf.reshape(ids_with_eos, [1, -1, 1])}

def decode(integers):
  """Convert model output ids into a string.

  Args:
    integers: array-like of token ids (any shape); it is flattened first.

  Returns:
    The text for the ids that precede the first EOS id (1), or for all ids
    when no EOS is present.
  """
  # Flatten with reshape(-1) instead of np.squeeze: squeeze turns a one-token
  # sequence into a 0-d array, which cannot be iterated and raised TypeError.
  ids = np.asarray(integers).reshape(-1).tolist()
  if 1 in ids:
    ids = ids[:ids.index(1)]  # drop EOS and everything after it
  # NOTE(review): the "inputs" encoder is used for the output text as in the
  # original code; presumably inputs and targets share one vocabulary — confirm.
  return encoders["inputs"].decode(ids)

# Hyper-parameters for inference, created with the problem and its data directory.
# NOTE: this rebinds the `hparams` name that the training cell also uses.
hparams = trainer_lib.create_hparams(
    HPARAMS, data_dir=DATA_DIR, problem_name=PROBLEM)

# Look up the model class by name and instantiate it in PREDICT mode.
model_cls = registry.model(MODEL)
translate_model = model_cls(hparams, Modes.PREDICT)

/content/drive/My Drive/t2t/train/model.ckpt-150000
INFO:tensorflow:Setting T2TModel mode to 'infer'


INFO:tensorflow:Setting T2TModel mode to 'infer'


INFO:tensorflow:Setting hparams.dropout to 0.0


INFO:tensorflow:Setting hparams.dropout to 0.0


INFO:tensorflow:Setting hparams.label_smoothing to 0.0


INFO:tensorflow:Setting hparams.label_smoothing to 0.0


INFO:tensorflow:Setting hparams.layer_prepostprocess_dropout to 0.0


INFO:tensorflow:Setting hparams.layer_prepostprocess_dropout to 0.0


INFO:tensorflow:Setting hparams.symbol_dropout to 0.0


INFO:tensorflow:Setting hparams.symbol_dropout to 0.0


INFO:tensorflow:Setting hparams.attention_dropout to 0.0


INFO:tensorflow:Setting hparams.attention_dropout to 0.0


INFO:tensorflow:Setting hparams.relu_dropout to 0.0


INFO:tensorflow:Setting hparams.relu_dropout to 0.0














## 4.2. Translation of the sentences

In [0]:
# Predict: translate one English sentence and show it next to the result.
en_input = "It matters because we want to feed our people with food."
lv_result = translate(en_input)

print(f"Input: {en_input}", f"Output: {lv_result}", sep="\n")







INFO:tensorflow:Greedy Decoding


INFO:tensorflow:Greedy Decoding


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.




















Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.




















Instructions for updating:
Use `tf.cast` instead.


Instructions for updating:
Use `tf.cast` instead.


Input: The people prefer democracy. They likes freedom.
Output: Cilvēki dod priekšroku demokrātijai, viņi arī dod brīvību.


## 4.3. Persist translations

In [0]:
# Append the latest translation, tagged with the step budget, to a log file
# in TRANSLATIONS_DIR.
# `with` closes the handle even if the write fails; UTF-8 is set explicitly
# because the Latvian output contains non-ASCII characters.
with open(os.path.join(TRANSLATIONS_DIR, "outputs_lv.txt"), "a", encoding="utf-8") as file_input:
  file_input.write(f'{train_steps}: input - "{en_input}", output - "{lv_result}"\n')