In [1]:
"""Train the model"""

%load_ext autoreload
%autoreload 2

import argparse
import logging
import os

import numpy as np
import torch
import torch.optim as optim
from torch.autograd import Variable
from tqdm import tqdm

import utils
import model.net as net
import model.data_loader as data_loader
from evaluate import evaluate

In [2]:
# Notebook-level configuration; these three names are read by the cells below.
# Dataset locations used previously on other machines (kept for reference):
#data_dir = '/home/dxue/CS230/data/FETAL'
#data_dir = '/home/sbakr/data'
# Root of the dataset; a 'train' sub-folder of it is listed in a later cell.
data_dir = '/home/data'
# Experiment folder: params.json is read from it and train.log / metrics are written to it.
model_dir = 'experiments/base_model'
# Name of a checkpoint (without the '.pth.tar' extension) to resume from, or None to start fresh.
restore_file = None

In [3]:
def train(model, optimizer, loss_fn, dataloader, metrics, params):
    """Run one pass over `dataloader`, updating `model` after every batch.

    Args:
        model: (torch.nn.Module) the neural network
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters; `params.cuda` and `params.save_summary_steps` are read here

    Returns:
        None. The mean of every collected metric (plus the loss) is written to the log.
    """

    # set model to training mode
    model.train()

    # per-batch metric summaries and a running average of the loss for the progress bar
    summ = []
    loss_avg = utils.RunningAverage()

    # Use tqdm for progress bar
    with tqdm(total=len(dataloader)) as t:
        for i, (train_batch, labels_batch) in enumerate(dataloader):
            # move to GPU if available
            # (`non_blocking` replaces the old `async` keyword, which is reserved in Python 3.7+)
            if params.cuda:
                train_batch = train_batch.cuda(non_blocking=True)
                labels_batch = labels_batch.cuda(non_blocking=True)

            # compute model output and loss
            output_batch = model(train_batch)
            loss = loss_fn(output_batch, labels_batch)

            # clear previous gradients, compute gradients of all variables wrt loss
            optimizer.zero_grad()
            loss.backward()

            # performs updates using calculated gradients
            optimizer.step()

            # the loss is a 0-dim tensor; `.item()` extracts it as a Python number
            loss_value = loss.item()

            # Evaluate summaries only once in a while
            if i % params.save_summary_steps == 0:
                # detach from the graph, move to cpu, convert to numpy arrays
                output_np = output_batch.detach().cpu().numpy()
                labels_np = labels_batch.detach().cpu().numpy()

                # compute all metrics on this batch
                summary_batch = {name: metric_fn(output_np, labels_np)
                                 for name, metric_fn in metrics.items()}
                summary_batch['loss'] = loss_value
                summ.append(summary_batch)

            # update the average loss
            loss_avg.update(loss_value)

            t.set_postfix(loss='{:05.3f}'.format(loss_avg()))
            t.update()

    # nothing was summarised (the dataloader yielded no batches): avoid indexing summ[0]
    if not summ:
        logging.warning("- Train metrics: no batches were processed")
        return

    # compute mean of all metrics in summary
    metrics_mean = {metric: np.mean([x[metric] for x in summ]) for metric in summ[0]}
    metrics_string = " ; ".join("{}: {:05.3f}".format(k, v) for k, v in metrics_mean.items())
    logging.info("- Train metrics: " + metrics_string)



In [4]:
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, loss_fn, metrics, params, model_dir,
                       restore_file=None):
    """Train the model and evaluate every epoch.

    After each epoch a checkpoint is saved through `utils.save_checkpoint`, and the validation
    metrics are written to JSON files inside `model_dir` (latest always, best when improved).

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters; `params.num_epochs` is read here
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional- name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    # (built from this function's own arguments; no global `args` exists in the notebook)
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # one full pass over the training set
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        # `>=` means a tie with the current best also counts as a new best
        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If this epoch is the best so far, record it
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)



In [5]:


# Load the hyperparameters from <model_dir>/params.json.
# (There is no argument parser in the notebook; model_dir comes from the configuration cell.)
json_path = os.path.join(model_dir, 'params.json')
# Stop early with a readable message when the experiment folder has no params.json
assert os.path.isfile(json_path), "No json configuration file found at {}".format(json_path)
params = utils.Params(json_path)

 


In [6]:
 # use GPU if available
params.cuda = torch.cuda.is_available()

# Fix the random seed (CPU generator, and the CUDA one when a GPU is used)
# so that experiments are reproducible
torch.manual_seed(230)
if params.cuda:
    torch.cuda.manual_seed(230)

# Configure logging, handing utils.set_logger the path <model_dir>/train.log
utils.set_logger(os.path.join(model_dir, 'train.log'))

  # Create the input data pipeline


In [7]:
# Sanity check: print the training folder and the first entry os.listdir returns for it
path = os.path.join(data_dir, 'train')
print(path)
filenames = os.listdir(path)
print(filenames[0])

/home/data/train
1_4107.npy


In [8]:
logging.info("Loading the datasets...")

# Request the 'train' and 'val' loaders in one call, then pick each split out by name
dataloaders = data_loader.fetch_dataloader(['train', 'val'], data_dir, params)
train_dl, val_dl = dataloaders['train'], dataloaders['val']

logging.info("- done.")

# Build the network (moved to the GPU when one is in use) and its Adam optimizer
model = net.Net(params)
if params.cuda:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

# The loss function and the metrics dictionary are both provided by the model module
loss_fn = net.loss_fn
metrics = net.metrics

# Kick off the train / evaluate loop
logging.info("Starting training for {} epoch(s)".format(params.num_epochs))
train_and_evaluate(model, train_dl, val_dl, optimizer, loss_fn, metrics, params,
                   model_dir, restore_file)

Loading the datasets...
- done.
Starting training for 10 epoch(s)
Epoch 1/10
100%|██████████| 11/11 [00:11<00:00,  1.32it/s, loss=106.109]
- Train metrics: accuracy: 0.000 ; loss: 87.034
- Eval metrics : accuracy: 0.000 ; loss: 1308.916


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 2/10
100%|██████████| 11/11 [00:09<00:00,  1.36it/s, loss=78.438]
- Train metrics: accuracy: 0.000 ; loss: 93.560
- Eval metrics : accuracy: 0.000 ; loss: 217.926


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 3/10
100%|██████████| 11/11 [00:10<00:00,  1.50it/s, loss=81.941]
- Train metrics: accuracy: 0.000 ; loss: 82.726
- Eval metrics : accuracy: 0.000 ; loss: 275.462


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 4/10
100%|██████████| 11/11 [00:09<00:00,  1.39it/s, loss=81.497]
- Train metrics: accuracy: 0.000 ; loss: 90.439
- Eval metrics : accuracy: 0.000 ; loss: 441.535


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 5/10
100%|██████████| 11/11 [00:10<00:00,  1.75it/s, loss=81.421]
- Train metrics: accuracy: 0.000 ; loss: 89.472
- Eval metrics : accuracy: 0.000 ; loss: 226.344


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 6/10
100%|██████████| 11/11 [00:09<00:00,  1.71it/s, loss=85.529]
- Train metrics: accuracy: 0.000 ; loss: 96.509
- Eval metrics : accuracy: 0.000 ; loss: 179.661


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 7/10
100%|██████████| 11/11 [00:08<00:00,  2.00it/s, loss=80.954]
- Train metrics: accuracy: 0.000 ; loss: 81.757
- Eval metrics : accuracy: 0.000 ; loss: 254.614


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 8/10
100%|██████████| 11/11 [00:09<00:00,  1.74it/s, loss=86.650]
- Train metrics: accuracy: 0.000 ; loss: 118.363
- Eval metrics : accuracy: 0.000 ; loss: 106.338


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 9/10
100%|██████████| 11/11 [00:10<00:00,  1.56it/s, loss=81.910]
- Train metrics: accuracy: 0.000 ; loss: 91.140
- Eval metrics : accuracy: 0.000 ; loss: 74.227


Checkpoint Directory exists! 


- Found new best accuracy
Epoch 10/10
100%|██████████| 11/11 [00:08<00:00,  2.10it/s, loss=81.449]
- Train metrics: accuracy: 0.000 ; loss: 90.164
- Eval metrics : accuracy: 0.000 ; loss: 79.149


Checkpoint Directory exists! 


- Found new best accuracy
