# Demo Main

### First, let's import the libraries and configure the logger.
Initializing the `ModelManager` will log the lists of implemented models and data loaders.

In [1]:
from config import basic_conf as conf
from libs import ModelManager as mm
from config.constants import HyperParamKey, PathKey
import logging

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Configure logging at INFO level through the project's config helper.
conf.init_logger(logging.INFO)
logger = logging.getLogger('__main__')
# Create the ModelManager in notebook mode; the log output of this cell lists
# the available models/loaders and the current hyper-parameters.
mgr = mm.ModelManager(mode='notebook')

[2018-11-04 21:36:10] [INFO] Initializing Model Manager, version 0.5.1 ...
[2018-11-04 21:36:10] [INFO] 
=== Models Available ===

[2018-11-04 21:36:10] [INFO] 
=== Loaders Available ===
IWSLT
[2018-11-04 21:36:10] [INFO] 
*********** Model Manager Details ***********
-- self.hparams.num_epochs = 1
-- self.hparams.lr = 0.01
-- self.hparams.voc_size = 100000
-- self.hparams.embedding_dim = 50
-- self.hparams.batch_size = 32
-- self.hparams.train_loop_check_freq = 100
-- self.hparams.check_early_stop = True
-- self.hparams.es_look_back = 5
-- self.hparams.es_req_prog = 0.01
-- self.hparams.optim_enc = <class 'torch.optim.adam.Adam'>
-- self.hparams.optim_dec = <class 'torch.optim.adam.Adam'>
-- self.hparams.scheduler = <class 'torch.optim.lr_scheduler.ExponentialLR'>
-- self.hparams.scheduler_gamma = 0.95
-- self.hparams.criterion = <class 'torch.nn.modules.loss.CrossEntropyLoss'>
-- self.cparams.save_best_model = True
-- self.cparams.save_each_epoch = True
-- self.cparams.input_lang = z

### Now let's load the data for translation task: IWSLT

This might take a few minutes if the vocabulary has to be generated instead of being loaded from file.

In [3]:
# Load the data with the IWSLT loader selected from the loader register.
mgr.load_data(mm.loaderRegister.IWSLT)

[2018-11-04 21:36:13] [INFO] Loading data using IWSLT ...
[2018-11-04 21:36:13] [INFO] Get source language datum list...
[2018-11-04 21:36:14] [INFO] Get target language datum list...
[2018-11-04 21:36:14] [INFO] Vocabulary found and loaded! (token2id, id2token, vocabs)
[2018-11-04 21:36:14] [INFO] Convert token to index for source language ...
[2018-11-04 21:36:15] [INFO] Convert token to index for target language ...
[2018-11-04 21:36:16] [INFO] Datum list loaded for both src/target languages!
[2018-11-04 21:36:16] [INFO] Loading raw data into the DataLoaders ...


Take a look at the data returned by the data loader (the actual vocabulary size is shown further below via `mgr.lparams`):

In [4]:
# Short handle on the manager's data loader object, used by the inspection cells below.
l = mgr.dataloader

In [5]:
# First datum of the source-language training split.
di = l.data['source']['train'][0]

In [6]:
# Vocabulary indices of this datum's tokens (a list of ints, see output).
di.token_indices

[3805, 3496, 4, 205, 3, 3, 1759, 3, 3, 34428, 3, 3]

In [7]:
# Number of examples in the source-language training split.
len(l.data['source']['train'])

213377

In [8]:
# Number of examples in the target-language training split; the output shows
# it matches the source-side count.
len(l.data['target']['train'])

213377

In [13]:
# Raw (untokenized-to-index) text of the first source-language training datum.
di = l.data['source']['train'][0]
di.raw_text

'深海 海中 的 生命   大卫   盖罗 \n'

In [14]:
# Raw text of the first target-language training datum. Note that `di` is
# rebound here and now refers to the target side.
di = l.data['target']['train'][0]
di.raw_text

'Life in the deep oceans\n'

In [16]:
# Batch loader for the training split.
dl = l.loaders['train']

In [17]:
# Peek at a single batch: print the first one and stop iterating.
# `batch` stays bound to that first batch for the cells below.
for batch in dl:
    print(batch)
    break

[tensor([[ 36, 244,  99,  ...,   3,   3,   3],
        [ 11,  29,   5,  ...,   0,   0,   0],
        [ 10,  73,  61,  ...,   0,   0,   0],
        ...,
        [ 11,  55,   5,  ...,   0,   0,   0],
        [ 11,  64, 108,  ...,   0,   0,   0],
        [ 50,   3,   3,  ...,   0,   0,   0]], device='cuda:0'), tensor([[ 29, 226, 125,  ..., 467,   4,   3],
        [ 19,  10,  22,  ...,   0,   0,   0],
        [ 13, 246,  64,  ...,   0,   0,   0],
        ...,
        [ 10,  17,  40,  ...,   0,   0,   0],
        [ 12,  22,  59,  ...,   0,   0,   0],
        [827,   3, 697,  ...,   0,   0,   0]], device='cuda:0'), tensor([93, 73, 58, 52, 49, 46, 42, 41, 40, 31, 31, 31, 29, 27, 22, 22, 21, 19,
        18, 17, 17, 16, 16, 12, 12, 11, 10, 10, 10,  9,  8,  8],
       device='cuda:0'), tensor([66, 56, 50, 40, 25, 36, 42, 28, 41, 25, 26, 29, 24, 20, 16, 19, 18, 12,
        15, 13, 19, 21, 11,  8, 12, 15,  9,  6, 12, 11,  6,  6],
       device='cuda:0')]


In [20]:
# Shape of the first batch tensor. The output shows 32 rows (the configured
# batch_size); presumably these are padded source token indices — TODO confirm.
batch[0].shape

torch.Size([32, 93])

In [27]:
# Shape of the second batch tensor; presumably padded target token indices — TODO confirm.
batch[1].shape

torch.Size([32, 66])

In [25]:
# Largest value in the third batch tensor. In the output it equals the width of
# batch[0], so this is presumably the per-example source lengths — TODO confirm.
batch[2].max()

tensor(93, device='cuda:0')

In [26]:
# Largest value in the fourth batch tensor. In the output it equals the width of
# batch[1], so this is presumably the per-example target lengths — TODO confirm.
batch[3].max()

tensor(66, device='cuda:0')

In [28]:
# Print the raw text of training example 5, source side first and then the
# target side. `di` ends up bound to the target-side datum.
for side in ('source', 'target'):
    di = l.data[side]['train'][5]
    print(di.raw_text)

泰坦 泰坦尼克 泰坦尼克号 坦尼 尼克 号   是 拿 了 不少 票房 冠军   但 事实 事实上 它 并 不是 关于 于海洋 海洋 的 最 刺激 的 故事  

The truth of the matter is that the Titanic -- even though it &apos;s breaking all sorts of box office records -- it &apos;s not the most exciting story from the sea .



In [9]:
# Parameters reported back by the loader; the output shows the actual
# vocabulary sizes for the source and target languages.
mgr.lparams

{'act_vocab_size': {'source': 89753, 'target': 69103}}

Let's look at the first 5 samples in the train set:
- tokens 
- token_indices

Notice that an `EOS_IDX` (1) is appended to the end of each sentence.

In [None]:
# Show the tokens and their vocabulary indices for the first 5 source-side
# training samples. The split is selected by name ('train'), the same way the
# executed inspection cells access `data['source']['train']`; indexing the
# per-language container with `[0]` does not select the train split.
source_train = mgr.dataloader.data['source']['train']
for i in range(5):
    datum = source_train[i]
    print(datum.tokens)
    print(datum.token_indices, '\n')

### Try loading the indexers from file (double-check the vocab)

In [None]:
# Load the IWSLT data again; presumably this time the vocabulary/indexers are
# read from the saved files rather than regenerated — verify in the log output.
mgr.load_data(mm.loaderRegister.IWSLT)

In [None]:
# After reloading, print the tokens and vocabulary indices of the first 5
# source-side training samples so they can be compared with the earlier run.
# The split is selected by name ('train'), the same way the executed
# inspection cells access `data['source']['train']`; indexing the
# per-language container with `[0]` does not select the train split.
source_train = mgr.dataloader.data['source']['train']
for i in range(5):
    datum = source_train[i]
    print(datum.tokens)
    print(datum.token_indices, '\n')