# Demo Main

### First, let's import the libraries and configure the logger.
Initializing the Model Manager will print the lists of implemented models and data loaders.

In [1]:
import logging
import os

import matplotlib.pyplot as plt

from config import basic_conf as conf
from config.constants import HyperParamKey, PathKey
from libs import ModelManager as mm

%matplotlib inline

In [2]:
# Configure project logging at DEBUG verbosity and fetch the '__main__' logger.
conf.init_logger(logging.DEBUG)
logger = logging.getLogger('__main__')

# Directory holding the IWSLT vi-en corpus. Set the IWSLT_DATA_PATH environment
# variable to point at your own copy of the data; the original author's local
# path is kept only as the fallback so existing setups keep working unchanged.
DATA_PATH = os.environ.get('IWSLT_DATA_PATH', '/Users/xliu/Downloads/iwslt-vi-en/')

# Settings handed to the Model Manager as control overrides: where the data
# lives and which input language ('vi') to use.
config_update = {'data_path': DATA_PATH,
                 PathKey.INPUT_LANG: 'vi'}
mgr = mm.ModelManager(mode='notebook', control_overrides=config_update)

[2018-11-04 16:31:24] [INFO] Initializing Model Manager, version 0.4.0 ...
[2018-11-04 16:31:24] [INFO] 
=== Models Available ===
BagOfWords
[2018-11-04 16:31:24] [INFO] 
=== Loaders Available ===
IMDB
IWSLT
[2018-11-04 16:31:24] [INFO] 
*********** Model Manager Details ***********
-- self.hparams.num_epochs = 1
-- self.hparams.lr = 0.01
-- self.hparams.train_plus_val_size = 25000
-- self.hparams.test_size = 25000
-- self.hparams.val_size = 5000
-- self.hparams.voc_size = 100000
-- self.hparams.train_loop_check_freq = 100
-- self.hparams.embedding_dim = 50
-- self.hparams.batch_size = 32
-- self.hparams.ngram_size = 2
-- self.hparams.remove_punc = True
-- self.hparams.check_early_stop = True
-- self.hparams.es_look_back = 5
-- self.hparams.es_req_prog = 0.01
-- self.hparams.optim_enc = <class 'torch.optim.adam.Adam'>
-- self.hparams.optim_dec = <class 'torch.optim.adam.Adam'>
-- self.hparams.scheduler = <class 'torch.optim.lr_scheduler.ExponentialLR'>
-- self.hparams.scheduler_gamma =

### Now let's load the data for the translation task: IWSLT

This might take a few minutes if the loader needs to generate the vocabulary instead of loading it from file.

In [3]:
# Load the IWSLT data through the loader registered under that name.
# In the recorded run below, generating the token2id / id2token mappings
# accounts for most of the wall-clock time of this call.
mgr.load_data(mm.loaderRegister.IWSLT)

[2018-11-04 16:31:27] [INFO] Loading data using IWSLT ...
[2018-11-04 16:31:27] [INFO] Get source language datum list...
[2018-11-04 16:31:28] [INFO] Get target language datum list...
[2018-11-04 16:34:54] [INFO] Generated token2id, id2token for both src/target languages!
[2018-11-04 16:34:54] [INFO] Convert token to index for source language ...
[2018-11-04 16:34:56] [INFO] Convert token to index for target language ...
[2018-11-04 16:34:57] [INFO] Datum list loaded for both src/target languages!


Take a look at the actual vocabulary size returned by the data loader:

In [4]:
# Display the loader-reported parameters; in the recorded run this holds the
# actual vocabulary size for the source and target languages.
mgr.lparams

{'act_vocab_size': {'source': 41962, 'target': 54117}}

Let's look at the first 5 samples in the train set:
- tokens 
- token_indices

Notice that an EOS_IDX (1) is appended to the end of each sentence's token indices.

In [5]:
# Preview the first five source-language samples: the raw tokens, then the
# matching token indices followed by a blank line between samples.
source_split = mgr.dataloader.data['source'][0]
for sample_no in range(5):
    sample = source_split[sample_no]
    print(sample.tokens)
    print(sample.token_indices, '\n')

['khoa_học', 'đằng_sau', 'một', 'tiêu_đề', 'về', 'khí_hậu']
[192, 1297, 8, 3224, 31, 849, 1] 

['trong', '4', 'phút', ',', 'chuyên_gia', 'hoá_học', 'khí_quyển', 'Rachel', 'Pike', 'giới_thiệu', 'sơ_lược', 'về', 'những', 'nỗ_lực', 'khoa_học', 'miệt_mài', 'đằng_sau', 'những', 'tiêu_đề', 'táo_bạo', 'về', 'biến_đổi', 'khí_hậu', ',', 'cùng', 'với', 'đoàn', 'nghiên_cứu', 'của', 'mình', '-', '-', 'hàng', 'ngàn', 'người', 'đã', 'cống_hiến', 'cho', 'dự_án', 'này', '-', '-', 'một', 'chuyến', 'bay', 'mạo_hiểm', 'qua', 'rừng_già', 'để', 'tìm_kiếm', 'thông_tin', 'về', 'một', 'phân_tử', 'then_chốt', '.']
[15, 413, 383, 3, 882, 1065, 2008, 9396, 16190, 743, 9397, 31, 9, 1022, 192, 7376, 1297, 9, 3224, 2668, 31, 706, 849, 3, 149, 25, 3036, 156, 10, 73, 22, 22, 170, 675, 19, 17, 3109, 27, 318, 21, 22, 22, 8, 610, 465, 1686, 134, 10017, 29, 467, 222, 31, 8, 633, 3931, 4, 1] 

['tôi', 'muốn', 'cho', 'các', 'bạn', 'biết', 'về', 'sự', 'to_lớn', 'của', 'những', 'nỗ_lực', 'khoa_học', 'đã', 'góp_phần', 'làm_nê

### Try loading the indexers from file (double-check the vocab)

In [6]:
# Call the IWSLT loader a second time. In the recorded run below the log
# reports that the vocabulary was found and loaded rather than regenerated.
mgr.load_data(mm.loaderRegister.IWSLT)

[2018-11-04 16:35:39] [INFO] Loading data using IWSLT ...
[2018-11-04 16:35:40] [INFO] Get source language datum list...
[2018-11-04 16:35:41] [INFO] Get target language datum list...
[2018-11-04 16:35:42] [INFO] Vocabulary found and loaded! (token2id, id2token, vocabs)
[2018-11-04 16:35:42] [INFO] Convert token to index for source language ...
[2018-11-04 16:35:43] [INFO] Convert token to index for target language ...
[2018-11-04 16:35:44] [INFO] Datum list loaded for both src/target languages!


In [7]:
# Print the first five source-language samples again (tokens, then token
# indices) so they can be compared with the output from before the reload.
source_split = mgr.dataloader.data['source'][0]
for sample_no in range(5):
    sample = source_split[sample_no]
    print(sample.tokens)
    print(sample.token_indices, '\n')

['khoa_học', 'đằng_sau', 'một', 'tiêu_đề', 'về', 'khí_hậu']
[192, 1297, 8, 3224, 31, 849, 1] 

['trong', '4', 'phút', ',', 'chuyên_gia', 'hoá_học', 'khí_quyển', 'Rachel', 'Pike', 'giới_thiệu', 'sơ_lược', 'về', 'những', 'nỗ_lực', 'khoa_học', 'miệt_mài', 'đằng_sau', 'những', 'tiêu_đề', 'táo_bạo', 'về', 'biến_đổi', 'khí_hậu', ',', 'cùng', 'với', 'đoàn', 'nghiên_cứu', 'của', 'mình', '-', '-', 'hàng', 'ngàn', 'người', 'đã', 'cống_hiến', 'cho', 'dự_án', 'này', '-', '-', 'một', 'chuyến', 'bay', 'mạo_hiểm', 'qua', 'rừng_già', 'để', 'tìm_kiếm', 'thông_tin', 'về', 'một', 'phân_tử', 'then_chốt', '.']
[15, 413, 383, 3, 882, 1065, 2008, 9396, 16190, 743, 9397, 31, 9, 1022, 192, 7376, 1297, 9, 3224, 2668, 31, 706, 849, 3, 149, 25, 3036, 156, 10, 73, 22, 22, 170, 675, 19, 17, 3109, 27, 318, 21, 22, 22, 8, 610, 465, 1686, 134, 10017, 29, 467, 222, 31, 8, 633, 3931, 4, 1] 

['tôi', 'muốn', 'cho', 'các', 'bạn', 'biết', 'về', 'sự', 'to_lớn', 'của', 'những', 'nỗ_lực', 'khoa_học', 'đã', 'góp_phần', 'làm_nê