In [22]:
import utils
import yaml
from ipywidgets import interact, fixed, Textarea
from functools import partial
%load_ext yamlmagic

The yamlmagic extension is already loaded. To reload it, use:
  %reload_ext yamlmagic


# Task 1: Sentence-Level Direct Assessment - WMT20

# Train Predictor

In [0]:
# If using terminal, the following will download and extract the data
# $ curl http://geolite.maxmind.com/download/geoip/database/GeoLite2-Country.tar.gz | tar -xz  -C /data/training/

# Download Data
OK_url = 'https://www.quest.dcs.shef.ac.uk/wmt20_files_qe/training_en-de.tar.gz'
utils.download_kiwi(OK_url)

# Extract Data
import tarfile
my_tar = tarfile.open('./data/training/training_en-de.tar.gz')
my_tar.extractall('./data/training') # specify which folder to extract to
my_tar.close()

In [0]:
# Reduction of Data:

# If using terminal, the following will get x amounts of rows
# head -n 100 ./data/training/train.ende.en > tinytrainen

import pandas as pd
tinytrainen = pd.read_csv('./data/training/train.ende.en',chunksize=10000, sep='None, /n', engine='python')
readme = tinytrainen.get_chunk(10000)
readme.to_csv(r'./data/training/tinytrainen', index=False, header=False)

In [0]:
tinytrainde = pd.read_csv('./data/training/train.ende.de',chunksize=10000, sep='None, /n', engine='python')
reader = tinytrainde.get_chunk(10000)
reader.to_csv(r'./data/training/tinytrainde', index=False, header=False)

In [23]:
# Check tinytrain files size
import os
os.path.getsize('./data/training/tinytrainde')

1475606

In [24]:
%%yaml train_predictor
#### Train Predictor  ####

model: predictor

# Model Files will be saved here
output-dir: ./runs/predictor

#### MODEL SPECIFIC OPTS ####

## PREDICTOR ##

# LSTM Settings (Both SRC and TGT)
hidden-pred: 400
rnn-layers-pred: 2
# If set, takes precedence over other embedding params
embedding-sizes: 200
# Source, Target, and Target Softmax Embedding
source-embeddings-size: 200
target-embeddings-size: 200
out-embeddings-size: 200
# Dropout
dropout-pred: 0.5
# Set to true to predict from target to source
# (To create a source predictor for source tag prediction)
predict-inverse: false

### TRAIN OPTS ###
epochs: 6
# Eval and checkpoint every n samples
# Disable by setting to zero (default)
checkpoint-validation-steps: 5000
# If False, never save the Models
checkpoint-save: true
# Keep Only the n best models according to the main metric (Perplexity by default)
# Ueful to avoid filling the harddrive during a long run
checkpoint-keep-only-best: 1
# If greater than zero, Early Stop after n evaluation cycles without improvement
checkpoint-early-stop-patience: 0

optimizer: adam
# Print Train Stats Every n batches
log-interval: 100
# Learning Rate
# 1e-3 * (batch_size / 32) seems to work well
learning-rate: 2e-3
learning-rate-decay: 0.6
learning-rate-decay-start: 2
train-batch-size: 64
valid-batch-size: 64

### DATA OPTS ###

# Source and Target Files
train-source: ./data/training/tinytrainen
train-target: ./data/training/tinytrainde
# Optionally load more data which is used only for vocabulary creation.
# This is useful to reduce OOV words if the parallel data
# and QE data are from different domains.
# extend-source-vocab: data/WMT17/word_level/en_de/train.src
# extend-target-vocab: data/WMT17/word_level/en_de/train.pe
# Optionally Specify Validation Sets
# valid-source: data/WMT17/word_level/en_de/dev.src
# valid-target: data/WMT17/word_level/en_de/dev.pe
# If No valid is specified, randomly split the train corpus
split: 0.99


## VOCAB ##

# Load Vocabulary from a previous run.
# This is needed e.g. for training a source predictor via the flag
# predict-inverse: True
# If set, the other vocab options are ignored.
# load-vocab: /mnt/data/datasets/kiwi/trained_models/predest/en_de/vocab.torch

source-vocab-size: 45000
target-vocab-size: 45000
# Remove Sentences not in the specified Length Range
source-max-length: 50
source-min-length: 1
target-max-length: 50
target-min-length: 1
# Require Minimum Frequency of words
source-vocab-min-frequency: 1
target-vocab-min-frequency: 1


### GENERAL OPTS ###

# Experiment Name for MLFlow
# experiment-name: EN-DE Pretrain Predictor
# Do not set or set to negative number for CPU
# gpu-id: 0

<IPython.core.display.Javascript object>

In [0]:
utils.save_config(train_predictor, './runs/predictor/train_predictor.yaml')

In [0]:
import kiwi

predictor_config = './runs/predictor/train_predictor.yaml'
kiwi.train(predictor_config)

# Train Estimator

In [0]:
#development file

file = open('./data/traindev/dev.ende.df.short.tsv')
data = file.readlines()[1:]
file.close()

de = open('./data/traindev/wmt20_dev.de', 'w')
en = open('./data/traindev/wmt20_dev.en', 'w')
hter = open('./data/traindev/wmt20_dev.hter_avg', 'w')
for d in data:
	d = d.split('\t')
	print(d)
	de.write(d[1] + "\n")
	en.write(d[2] + "\n")
	hter.write(d[4] + "\n")
de.close()
en.close()
hter.close()

In [0]:
#train file

file = open('./data/traindev/train.ende.df.short.tsv')
data = file.readlines()[1:]
file.close()


de = open('./data/traindev/wmt20_train.de', 'w')
en = open('./data/traindev/wmt20_train.en', 'w')
hter = open('./data/traindev/wmt20_train.hter_avg', 'w')
for d in data:
	d = d.split('\t')
	print(d)
	de.write(d[1] + "\n")
	en.write(d[2] + "\n")
	hter.write(d[4] + "\n")
de.close()
en.close()
hter.close()

In [26]:
%%yaml train_estimator
### Train Predictor Estimator ###

model: estimator

# Model Files will be saved here
output-dir: ./runs/estimator

#### MODEL SPECIFIC OPTS ####

## ESTIMATOR ##

# If load-model points to a pretrained Estimator,
# These settings are ignored.

# LSTM Settings
hidden-est: 125
rnn-layers-est: 1
dropout-est: 0.0
# Use linear layer to reduce dimension prior to LSTM
mlp-est: True

# Multitask Learning Settings #

# Continue training the predictor on the postedited text.
# If set, will do an additional forward pass through the predictor
# Using the SRC, PE pair and add the `Predictor` loss for the tokens in the
# postedited text PE. Recommended if you have access to PE
# Requires setting train-pe, valid-pe
token-level: False
# Predict Sentence Level Scores
# Requires setting train-sentence-scores, valid-sentence-scores
sentence-level: True
# Use probabilistic Loss for sentence scores instead of squared error.
# If set, the model will output mean and variance of a truncated Gaussian
# distribution over the interval [0, 1], and use log-likelihood loss instead
# of mean squared error.
# Seems to improve performance
sentence-ll: False
# Predict Binary Label for each sentence, indicating hter == 0.0
# Requires setting train-sentence-scores, valid-sentence-scores
binary-level: False

# WMT 20 Format Settings #

# Predict target tags. Requires train-target-tags, valid-target-tags to be set.
predict-target: false
target-bad-weight: 2.5
# Predict source tags. Requires train-source-tags, valid-source-tags to be set.
predict-source: false
source-bad-weight: 2.5
# Predict gap tags. Requires train-target-tags, valid-target-tags to be set.
# and wmt18-format set to true
predict-gaps: false
target-bad-weight: 2.5


### TRAIN OPTS ###
epochs: 10
# Additionally Eval and checkpoint every n training steps
# Explicitly disable by setting to zero (default)
checkpoint-validation-steps: 0
# If False, never save the Models
checkpoint-save: true
# Keep Only the n best models according to the main metric (F1Mult by default)
# USeful to avoid filling the harddrive during a long run
checkpoint-keep-only-best: 3
# If greater than zero, Early Stop after n evaluation cycles without improvement
checkpoint-early-stop-patience: 0


# Print Train Stats Every n batches
log-interval: 100
# LR. Currently ADAM is only optimizer supported.
# 1e-3 * (batch_size / 32) seems to work well
learning-rate: 1e-3

train-batch-size: 8
valid-batch-size: 8



### LOADING ###

# Load pretrained (sub-)model.
# If set, the model architecture params are ignored.
# As the vocabulary of the pretrained model will be used,
# all vocab-params will also be ignored.

# (i) load-pred-source or load-pred-target: Predictor instance
#     -> a new Estimator is initialized with the given predictor(s).
# (ii) load-model: Estimator instance.
#                  As the Predictor is a submodule of the Estimator,
#                  load-pred-{source,target} will be ignored if this is set.

# load-model: path_to_estimator
# load-pred-source: path_to_predictor_source_target
load-pred-target: ./runs/predictor/best_model.torch


###  DATA ###

# Set to True to use target_tags in WMT format
wmt20-format: false

train-source: ./data/traindev/wmt20_train.en
train-target: ./data/traindev/wmt20_train.de
# train-pe: /content/drive/My Drive/Proyectos/Machine Learning/Colab Notebooks/data/train.pe
# train-target-tags: /content/drive/My Drive/Proyectos/Machine Learning/Colab Notebooks/data/train.tags
train-sentence-scores: ./data/traindev/wmt20_train.hter_avg


valid-source: ./data/traindev/wmt20_dev.en
valid-target: ./data/traindev/wmt20_dev.de
# valid-pe: /content/drive/My Drive/Proyectos/Machine Learning/Colab Notebooks/WMT20/data/dev.pe
# valid-target-tags: /content/drive/My Drive/Proyectos/Machine Learning/Colab Notebooks/WMT20/data/dev.tags
valid-sentence-scores: ./data/traindev/wmt20_dev.hter_avg

### GENERAL OPTS ###

# Experiment Name for MLFlow
experiment-name: EN-DE Train Estimator
# Do not set or set to negative number for CPU
# gpu-id: 0

<IPython.core.display.Javascript object>

In [0]:
utils.save_config(train_estimator, './runs/estimator/train_estimator.yaml')
utils.save_config(train_estimator, './experiments/train_estimator.yaml')

In [0]:
import kiwi

estimator_config = './runs/estimator/train_estimator.yaml'
kiwi.train(estimator_config)