In [170]:
# automatically reload edited modules
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [171]:
import logging
import sys
import os
import yaml
import imp
import pprint
import json
from argparse import ArgumentParser
import numpy as np
import torch
from cl.data import *
#from train import train
from cl.model import MLP
import utils
import tensorflow as tf
import numpy as np
from copy import deepcopy

In [172]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from IPython import display

2019-07-16 07:27:44,386 DEBUG Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [173]:
# import image captioning model
from models import *

In [174]:
# accuracy plotting (could have graph of bleu, meteor, cider ...)
def plot_test_acc(plot_handles):
    """
    Decorate the current matplotlib figure as a test-accuracy plot and show it.

    Parameters
    ----------
    plot_handles
        Forwarded to ``plt.legend`` as its ``handles`` argument.
    """
    plt.legend(handles=plot_handles, loc="center right")

    # Axis decoration; the y-axis is pinned to the range [0, 1].
    plt.ylim(0, 1)
    plt.ylabel("Test Accuracy")
    plt.xlabel("Iterations")

    # Render the current figure in the cell output, then request a clear with
    # wait=True so the next call can replace this rendering.
    current_figure = plt.gcf()
    display.display(current_figure)
    display.clear_output(wait=True)

In [175]:
# Route log records (DEBUG and above) to stdout, prefixed with a timestamp
# and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
)

def make_paths_absolute(dir_, cfg):
    """
    Make all values for keys ending with `_path` absolute to dir_.

    Nested dictionaries are processed recursively. `cfg` is modified in place
    and also returned.

    Parameters
    ----------
    dir_ : str
        Directory that relative paths are resolved against.
    cfg : dict
        Configuration mapping. Anything that is not a dict (for example
        ``None``) is returned unchanged.

    Returns
    -------
    cfg : dict
    """
    if not isinstance(cfg, dict):
        # Nothing to walk (e.g. an empty configuration loaded as None).
        return cfg

    # Iterate over a snapshot so that re-assigning values is always safe.
    for key, value in list(cfg.items()):
        if isinstance(value, dict):
            cfg[key] = make_paths_absolute(dir_, value)
        elif isinstance(key, str) and key.endswith("_path") and isinstance(value, str):
            # os.path.join keeps `value` untouched when it is already absolute.
            cfg[key] = os.path.abspath(os.path.join(dir_, value))
    return cfg

def load_and_print_cfg(yaml_filepath):
    """
    Load and print a YAML configuration file.

    Values stored under keys ending in `_path` are passed through
    `make_paths_absolute`, using the directory of the YAML file as base.

    Parameters
    ----------
    yaml_filepath : str
        Path of the YAML experiment definition file.

    Returns
    -------
    cfg : dict
        The loaded configuration; an empty dict when the file is empty.
    """
    # Read YAML experiment definition file.
    # yaml.safe_load replaces the former yaml.load(stream): calling yaml.load
    # without an explicit Loader is deprecated in PyYAML 5.1+ and rejected by
    # PyYAML 6, and the safe loader only builds plain Python containers and
    # scalars, which is all a configuration file needs.
    with open(yaml_filepath, 'r') as stream:
        cfg = yaml.safe_load(stream)
    if cfg is None:
        # An empty document loads as None; normalise to an empty configuration.
        cfg = {}
    cfg = make_paths_absolute(os.path.dirname(yaml_filepath), cfg)

    # Print the configuration - just to make sure that you loaded what you
    # wanted to load
    pp = pprint.PrettyPrinter(indent=4)
    pp.pprint(cfg)

    return cfg

In [176]:
# Step 1: load the experiment configuration from sat_cl.yaml (the helper also
# pretty-prints it so the loaded values can be checked by eye).
cfg = load_and_print_cfg("sat_cl.yaml")

{   'dataset': {   'data_folder': 'caption data',
                   'data_name': 'coco_5_cap_per_img_5_min_word_freq'},
    'evaluate': {'augmentation_factor': 32, 'batch_size': 1000},
    'model': None,
    'optimizer': {'lr': '1e-1', 'weight_decay': 0},
    'train': {   'batch_size': 128,
                 'epochs_per_task': 3,
                 'fisher_estimation_sample_size': 1024,
                 'hidden_dropout_prob': 0.5,
                 'hidden_layer_num': 2,
                 'hidden_size': 400,
                 'input_dropout_prob': 0.2,
                 'lamda': 40,
                 'task_number': 3,
                 'test_size': 1024}}




In [177]:
# True when PyTorch reports an available CUDA device.
cuda = torch.cuda.is_available()

# Read word map: a JSON file named WORDMAP_<data_name>.json inside the
# configured data folder.
word_map_file = os.path.join(cfg['dataset']['data_folder'], 'WORDMAP_' + cfg['dataset']['data_name'] + '.json')
with open(word_map_file, 'r') as j:
    word_map = json.load(j)

In [178]:
# Second, we solve the data loading problem
# (loading from a checkpoint is skipped for now)

# Import modules for Show-attend-and-tell
import time
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from datasets import *
from utils import *
from nltk.translate.bleu_score import corpus_bleu

In [179]:
# Parameters for the image captioning system (TODO: move them into the config file)

# --- Model parameters ---
emb_dim = 512        # word-embedding dimension
attention_dim = 512  # size of the attention linear layers
decoder_dim = 512    # size of the decoder RNN
dropout = 0.5
# Device used for the model and for PyTorch tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Enable only when the model inputs have a fixed size; otherwise it adds a lot
# of computational overhead
cudnn.benchmark = True

# --- Training parameters ---
start_epoch = 0
epochs = 1                    # epochs to train for (unless early stopping triggers first)
epochs_since_improvement = 0  # epochs elapsed since validation BLEU last improved
batch_size = 32
workers = 1                   # data-loading workers; right now, only 1 works with h5py
encoder_lr = 1e-4             # encoder learning rate (used when fine-tuning)
decoder_lr = 4e-4             # decoder learning rate
grad_clip = 5.0               # gradients are clipped at this absolute value
alpha_c = 1.0                 # weight of the 'doubly stochastic attention' regulariser, as in the paper
best_bleu4 = 0.0              # best BLEU-4 score so far
print_freq = 100              # print training/validation stats every this many batches
fine_tune_encoder = False     # whether the encoder is fine-tuned
checkpoint = None             # checkpoint path, or None when there is none

In [180]:
# Caption decoder; the vocabulary size is the number of entries in the word map.
decoder = DecoderWithAttention(
    attention_dim=attention_dim,
    embed_dim=emb_dim,
    decoder_dim=decoder_dim,
    vocab_size=len(word_map),
    dropout=dropout,
)
# Only parameters with requires_grad set are handed to the optimizer.
decoder_optimizer = torch.optim.Adam(
    params=[param for param in decoder.parameters() if param.requires_grad],
    lr=decoder_lr,
)

encoder = Encoder()
encoder.fine_tune(fine_tune_encoder)
# The encoder gets an optimizer only when it is being fine-tuned.
if fine_tune_encoder:
    encoder_optimizer = torch.optim.Adam(
        params=[param for param in encoder.parameters() if param.requires_grad],
        lr=encoder_lr,
    )
else:
    encoder_optimizer = None

In [181]:
# Place both networks on the selected device (GPU if available); this is the
# counterpart of mlp.cuda() in the EWC implementation.
decoder, encoder = decoder.to(device), encoder.to(device)

In [182]:
# Loss function: cross-entropy, moved to the training device.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

# Per-channel normalisation transform, later composed into the dataset transforms.
normalize = transforms.Normalize(
    std=[0.229, 0.224, 0.225],
    mean=[0.485, 0.456, 0.406],
)

In [184]:
# Debug aid: show the types of the objects used by the training loop.
# NOTE(review): train_loader is not assigned at notebook level in the cells visible here
# (it is only a local inside train()), so this cell appears to depend on leftover kernel state.
# NOTE(review): train_loader is passed twice; the second argument was presumably meant to be
# val_loader — confirm.
print(type(train_loader), type(train_loader), type(normalize), type(criterion))

<class 'torch.utils.data.dataloader.DataLoader'> <class 'torch.utils.data.dataloader.DataLoader'> <class 'torchvision.transforms.transforms.Normalize'> <class 'torch.nn.modules.loss.CrossEntropyLoss'>


In [189]:
# Draw one random index permutation per task; the number of tasks comes from
# cfg['train']['task_number'] and the permutation length is size^2 * channels.
permutations = [
    np.random.permutation(
        (DATASET_CONFIGS['coco']['size'] ** 2) * DATASET_CONFIGS['coco']['channels']
    )
    for _task in range(cfg['train']['task_number'])
]
print(permutations)
print(np.shape(permutations))

# prepare ms-coco datasets: one train and one test dataset per permutation.
train_datasets = [get_dataset('coco', permutation=perm) for perm in permutations]
test_datasets = [get_dataset('coco', train=False, permutation=perm) for perm in permutations]

[array([106012, 181293, 118277, ..., 180420, 157178,  47879]), array([ 99834, 123478, 141323, ...,  49483,  53492,  51708]), array([ 72837,  30020,  47935, ..., 165583, 156438,  32777])]
(3, 196608)


In [185]:
# Function to train the model
def train(train_datasets, test_datasets, encoder, decoder, epochs_per_task, batch_size, test_size, consolidate,
          fisher_estimation_sample_size, lr, weight_decay, loss_log_interval, eval_log_interval, cuda,
          criterion, encoder_optimizer, decoder_optimizer, epoch):
    """
    Train the encoder/decoder over a sequence of tasks, adding an EWC penalty to the loss.

    For every task the caption data loaders are built, the model is trained for the
    configured epochs and, between tasks, the Fisher information is estimated and
    consolidated into both networks (when `consolidate` is true).

    Besides its arguments the function reads these notebook-level names: cfg, normalize,
    workers, start_epoch, epochs, epochs_since_improvement, fine_tune_encoder, device,
    alpha_c, grad_clip and print_freq.

    :param train_datasets: sequence with one entry per task; each entry is passed to
        `estimate_fisher`. NOTE(review): the batches themselves come from CaptionDataset,
        not from these entries.
    :param test_datasets: currently unused.
    :param encoder: encoder network; must provide train(), ewc_loss(), estimate_fisher()
        and consolidate().
    :param decoder: decoder network; same requirements as `encoder`.
    :param epochs_per_task: currently unused - the epoch range is taken from the
        notebook-level `start_epoch` and `epochs`.
    :param batch_size: batch size of the data loaders.
    :param test_size: currently unused.
    :param consolidate: if true, estimate the Fisher information and consolidate after
        every task except the last one.
    :param fisher_estimation_sample_size: forwarded to `estimate_fisher`.
    :param lr: currently unused (the optimizers are passed in ready-made).
    :param weight_decay: currently unused.
    :param loss_log_interval: currently unused (`print_freq` controls logging).
    :param eval_log_interval: currently unused.
    :param cuda: forwarded to `ewc_loss`.
    :param criterion: loss applied to the packed scores and targets.
    :param encoder_optimizer: optimizer for the encoder, or None when it is not trained.
    :param decoder_optimizer: optimizer for the decoder.
    :param epoch: currently unused - overwritten by the epoch loop variable.
    """

    decoder.train()  # train mode (dropout and batchnorm is used)
    encoder.train()

    batch_time = AverageMeter()  # forward prop. + back prop. time
    data_time = AverageMeter()  # data loading time
    losses = AverageMeter()  # loss (per word decoded)
    top5accs = AverageMeter()  # top5 accuracy

    start = time.time()
    # loop over multiple tasks
    for task, train_dataset in enumerate(train_datasets, 1):

        # Under review: are train_loader and val_loader correct, and equivalent to the
        # data_loader used in the EWC code? (val_loader is not used further down yet.)
        train_loader = torch.utils.data.DataLoader(
            CaptionDataset(cfg['dataset']['data_folder'], cfg['dataset']['data_name'], 'TRAIN', transform=transforms.Compose([normalize])),
            batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
        val_loader = torch.utils.data.DataLoader(
            CaptionDataset(cfg['dataset']['data_folder'], cfg['dataset']['data_name'], 'VAL', transform=transforms.Compose([normalize])),
            batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)

        # loop over epochs
        for epoch in range(start_epoch, epochs):
            # Decay learning rate if there is no improvement for 8 consecutive epochs, and terminate training after 20
            if epochs_since_improvement == 20:
                break
            if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
                adjust_learning_rate(decoder_optimizer, 0.8)
                if fine_tune_encoder:
                    adjust_learning_rate(encoder_optimizer, 0.8)

            # loop over batches
            for i, (imgs, caps, caplens) in enumerate(train_loader):
                data_time.update(time.time() - start)

                # Move to GPU, if available
                imgs = imgs.to(device)
                caps = caps.to(device)
                caplens = caplens.to(device)

                # Forward prop.
                imgs = encoder(imgs)
                scores, caps_sorted, decode_lengths, alphas, sort_ind = decoder(imgs, caps, caplens)

                # Since we decoded starting with <start>, the targets are all words after <start>, up to <end>
                targets = caps_sorted[:, 1:]

                # Remove timesteps that we didn't decode at, or are pads
                # pack_padded_sequence is an easy trick to do this
                scores, _ = pack_padded_sequence(scores, decode_lengths, batch_first=True)
                targets, _ = pack_padded_sequence(targets, decode_lengths, batch_first=True)

                # Calculate loss
                loss = criterion(scores, targets)

                # Add doubly stochastic attention regularization
                loss += alpha_c * ((1. - alphas.sum(dim=1)) ** 2).mean()

                # Add the EWC penalty of both networks. The penalty is reshaped to a
                # 0-dim tensor so it can be added in place to the scalar loss. reshape()
                # is used instead of the former in-place resize_(): it is out-of-place,
                # which avoids resizing a tensor that takes part in autograd.
                ewc_loss = decoder.ewc_loss(cuda=cuda) + encoder.ewc_loss(cuda=cuda)
                loss += ewc_loss.reshape(())

                # Back prop.
                decoder_optimizer.zero_grad()
                if encoder_optimizer is not None:
                    encoder_optimizer.zero_grad()
                loss.backward()

                # Clip gradients
                if grad_clip is not None:
                    clip_gradient(decoder_optimizer, grad_clip)
                    if encoder_optimizer is not None:
                        clip_gradient(encoder_optimizer, grad_clip)

                # Update weights
                decoder_optimizer.step()
                if encoder_optimizer is not None:
                    encoder_optimizer.step()

                # Keep track of metrics
                top5 = accuracy(scores, targets, 5)
                losses.update(loss.item(), sum(decode_lengths))
                top5accs.update(top5, sum(decode_lengths))
                batch_time.update(time.time() - start)

                start = time.time()

                # Print status
                if i % print_freq == 0:
                    print('Epoch: [{0}][{1}/{2}]\t'
                          'Batch Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                          'Data Load Time {data_time.val:.3f} ({data_time.avg:.3f})\t'
                          'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                          'Top-5 Accuracy {top5.val:.3f} ({top5.avg:.3f})'.format(epoch, i, len(train_loader),
                                                                                  batch_time=batch_time,
                                                                                  data_time=data_time, loss=losses,
                                                                                  top5=top5accs))

        if consolidate and task < len(train_datasets):
            # estimate the fisher information of the parameters and consolidate
            # them in the network.
            print(
                '=> Estimating diagonals of the fisher information matrix...',
                flush=True, end='',
            )
            # Each network estimates and consolidates its own Fisher information (the
            # previous code referenced an undefined name `model` here).
            # WIP: which object in SAT corresponds to the train_dataset used by EWC?
            # NOTE(review): assumes encoder/decoder expose estimate_fisher() alongside
            # ewc_loss()/consolidate() - confirm against models.py.
            encoder.consolidate(encoder.estimate_fisher(
                train_dataset, fisher_estimation_sample_size
            ))
            decoder.consolidate(decoder.estimate_fisher(
                train_dataset, fisher_estimation_sample_size
            ))
            print(' Done!')



In [188]:
# Train over all tasks. Every argument after the two dataset lists is passed by
# keyword: a positional argument is not allowed after keyword arguments.
train(train_datasets,
      test_datasets,
      encoder=encoder,
      decoder=decoder,
      epochs_per_task=cfg['train']['epochs_per_task'],
      batch_size=64, test_size=1024, consolidate=True,
      fisher_estimation_sample_size=1024,
      lr=1e-3, weight_decay=1e-5,
      loss_log_interval=30,
      eval_log_interval=50,
      cuda=False,
      criterion=criterion,
      encoder_optimizer=encoder_optimizer,
      decoder_optimizer=decoder_optimizer,
      epoch=start_epoch)  # `epoch` is not defined at notebook level; start from start_epoch

Epoch: [0][0/17702]	Batch Time 2.703 (2.703)	Data Load Time 2.486 (2.486)	Loss 10.0094 (10.0094)	Top-5 Accuracy 1.408 (1.408)
Epoch: [0][100/17702]	Batch Time 0.221 (0.234)	Data Load Time 0.000 (0.025)	Loss 6.0695 (6.6234)	Top-5 Accuracy 39.276 (34.915)


Traceback (most recent call last):
  File "/home/dexter/miniconda3/envs/advanced_ml/lib/python3.5/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/home/dexter/miniconda3/envs/advanced_ml/lib/python3.5/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/home/dexter/miniconda3/envs/advanced_ml/lib/python3.5/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/home/dexter/miniconda3/envs/advanced_ml/lib/python3.5/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: 