To Do:
* Review and clean up this script
* Update training script to save best weights periodically
* Update training script to save plots periodically
* Update training script to save performance information periodically
* Add means to detect training completion (and optionally shut down the instance)
* Develop a method to run inference using the weights
* Develop an method to visualize the inference results



In [1]:
# This cell loads test and training splits.

from torchvision import transforms
from dvmcar import DvmCarDataset

# load_data - return dvmcar training and validation datasets
#
# locale - specifies environment in which code is exectuting since paths may need
#   to be modified.
#     "Lambda Labs"
#     "Default"
# scale - fraction of dataset to use, 1 uses the entire dataset

def load_data(locale="Default", scale=1):

    # Depending on the locale...
    if locale=="Lambda Labs":
        # Use lambda labs paths
        work_def = '/home/ubuntu/WorkLab/data/dvmcar/dvmcar.zip'
        persist_def = '/home/ubuntu/worklab/dvmcar.zip'
    else:
        # Use default paths
        work_def = '/data/dvmcar/dvmcar.zip'
        persist_def = None

    # Set partitions for train, test, and validate subsets
    partition0  = 0.8*scale
    partition1  = 0.9*scale
    partition2  = 1.0*scale

    # Define corresponding split arguments for the dataset constructor
    train_split = [0,          partition0]
    val_split   = [partition0, partition1]
    test_split  = [partition1, partition2]
    
    # Resnet input height & width?
    input_size  = 224

    # Specify training transform stack
    # Not too sure what random resize crop does...
    # Per Derek - maybe color space & other distortions would be useful?
    train_transform = transforms.Compose([
            transforms.RandomResizedCrop(input_size),
            transforms.RandomHorizontalFlip(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ])

    # Specify validation transform stack
    val_transform = transforms.Compose([
            transforms.Resize(input_size),
            transforms.CenterCrop(input_size),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ])

    train_data  = DvmCarDataset(split = train_split, transform = train_transform, work = work_def, persist = persist_def)
    val_data    = DvmCarDataset(split =   val_split, transform =   val_transform, work = work_def, persist = persist_def)
    
    return(train_data, val_data)

(train_data, val_data) = load_data()

print('Training split contains {:7} images.'.format(len(train_data)))
print('Validation split contains {:7} images.'.format(len(val_data)))

Work file /data/dvmcar/dvmcar.zip is already available.
Using existing /data/dvmcar\resized_DVM_v2.zip.
Using existing /data/dvmcar\Confirmed_fronts.zip.
Using existing /data/dvmcar\tables_V2.0.zip.
Work file /data/dvmcar/dvmcar.zip is already available.
Using existing /data/dvmcar\resized_DVM_v2.zip.
Using existing /data/dvmcar\Confirmed_fronts.zip.
Using existing /data/dvmcar\tables_V2.0.zip.
Training split contains 1161427 images.
Validation split contains  145178 images.


In [5]:
# Import a lot of stuff
from IPython.display import display, clear_output
import matplotlib.pyplot as plt
import time
import os
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
from torchvision import models

%matplotlib inline

class model_manager:
    
    def __init__(self, datasets):
        
        # Training and validation datasets
        self.datasets = datasets
        
        # Output class count
        self.classes = datasets[0].classes
        
        # Maximum number of epochs 
        self.max_epochs = 100

        # Maximum time in minutes
        self.max_time = 24*60

        # Model save interval (minutes)
        self.save_interval = 15

        # Flag for feature extracting. When False, we finetune the whole model, 
        #   when True we only update the reshaped layer params
        self.feature_extract = True
        
        #
        self.model_path = "dvmcar.weights"

        # Models to choose from [resnet, alexnet, vgg, squeezenet, densenet, inception]
        self.model_name = "resnet"

        # Training minibatch size
        self.batch_size  = 50

        # Shuffle data between epochs
        self.shuffle = True 
        
        # Use pretrained model weights
        self.use_pretrained = True
        
        self.model, self.image_size = self.initialize_model()
        
        self.training_start = 0
        self.epoch_start = 0
        self.chunk_start = 0
        
        
    # Initialize a pretrained model 
    #
    # model_name - specifies the pretrained model type
    # self.classes - specifies the number of output classes

    def initialize_model(self):

        model = None
        image_size = 0

        if self.model_name == "resnet":
            """ Resnet50
            """
            model = models.resnet50(weights=ResNet50_Weights.DEFAULT)
            fc_input_features = model.fc.in_features
            model.fc = nn.Linear(fc_input_features, self.classes)
            image_size = 224

        elif self.model_name == "alexnet":
            """ Alexnet
            """
            model = models.alexnet(pretrained=self.use_pretrained)
            fc_input_features = model.classifier[6].in_features
            model.classifier[6] = nn.Linear(fc_input_features,self.classes)
            image_size = 224

        elif self.model_name == "vgg":
            """ VGG11_bn
            """
            model = models.vgg11_bn(pretrained=self.use_pretrained)
            #set_parameter_requires_grad(model, self.feature_extract)
            fc_input_features = model.classifier[6].in_features
            model.classifier[6] = nn.Linear(fc_input_features,self.classes)
            image_size = 224

        elif self.model_name == "squeezenet":
            """ Squeezenet
            """
            model = models.squeezenet1_0(pretrained=self.use_pretrained)
            model.classifier[1] = nn.Conv2d(512, self.classes, kernel_size=(1,1), stride=(1,1))
            model.self.classes = self.classes
            image_size = 224

        elif self.model_name == "densenet":
            """ Densenet
            """
            model = models.densenet121(pretrained=self.use_pretrained)
            fc_input_features = model.classifier.in_features
            model.classifier = nn.Linear(fc_input_features, self.classes) 
            image_size = 224

        elif self.model_name == "inception":
            """ Inception v3 
            Be careful, expects (299,299) sized images and has auxiliary output
            """
            model = models.inception_v3(pretrained=self.use_pretrained)
            # Handle the auxilary net
            fc_input_features = model.AuxLogits.fc.in_features
            model.AuxLogits.fc = nn.Linear(fc_input_features, self.classes)
            # Handle the primary net
            fc_input_features = model.fc.in_features
            model.fc = nn.Linear(fc_input_features,self.classes)
            image_size = 299

        else:
            print("Invalid model name, exiting...")
            exit()
            
        if self.feature_extract:
            for param in model.parameters():
                param.requires_grad = False
                
        return model, image_size
    
    def log_training_start(self):
        now = time.time()
        self.training_start = now
        print('Started training at {}'.format(now))
    
    def log_epoch_start(self, epoch):
        now = time.time()
        self.epoch_start = now
        print('Started epoch {}/{} at {}'.format(epoch, self.max_epochs, now))
        
    def log_chunk_start(self, chunk, chunks):
        now = time.time()
        self.chunk_start = now
        print('Started chunk {}/{} at {}'.format(chunk, chunks, now))
    
    def log_chunk_complete(self, chunk, chunks):
        now = time.time()
        print('Completed chunk {}/{} at {} elapsed {}'.format(chunk, chunks, now, now-self.chunk_start))
    
    def log_epoch_complete(self, epoch):
        now = time.time()
        print('Completed epoch {}/{} at {} elapsed {}'.format(epoch, self.max_epochs, now, now-self.epoch_start))
    
    def log_training_complete(self, criteria):
        now = time.time()
        print('Completed training based on {} at {} elapsed {}'.format(criteria, now, now-self.training_start))
        
    def update_plots(self, outputs):
        
        # Get indices (labels) sorting the vector in decreasing order
        top_indices = torch.argsort(outputs, 1, descending=True)

        # Find predicted ranking of the true class for each result in the batch
        n = [(top_indices[k]==labels.data[k]).nonzero().squeeze().item() for k in range(len(labels.data))]

        # For each result in the batch...
        for idx in n:

            # Increment
            rank_count[idx] += 1

        # Update plot

        fig.suptitle('Top N Summary: phase={}, epoch={:3}, sample={:8}'.format(
            phase, epoch, sample))

        clear_output(wait = True)
        x = np.arange(classes)+1

        ax1.cla()
        ax1.plot(x,          rank_count /sample, label='P(rank==N)')
        ax1.plot(x,np.cumsum(rank_count)/sample, label='P(rank in 1..N)')
        ax1.set_title('All classes')
        ax1.legend()
        ax1.grid()
        ax1.set_ylabel('Probability')
        ax1.set_xlabel('N')    

        max_n = 20
        ax2.cla()
        ax2.plot(x[:max_n],          rank_count[:max_n] /sample, label='P(rank==N)', linestyle='None', marker='o')
        ax2.plot(x[:max_n],np.cumsum(rank_count[:max_n])/sample, label='P(rank in 1..N)', linestyle='None', marker='o')
        ax2.set_title('Top {} classes'.format(max_n))
        ax2.legend()
        ax2.grid()
        ax2.set_ylabel('Probability')
        ax2.set_xlabel('N')    

        display(fig)    

        #print('Epoch={:3} / Sample={:8}'.format(epoch, sample), end='\r', flush=True)        
    
    def train_model(self, dataloaders, criterion, optimizer, is_inception=False):

        # Log training start
        self.log_training_start()
        
        # Clear validation accuracy history
        val_acc_history = []

        # Save best weights
        torch.save(self.model.state_dict(), self.model_path)
        
        # Clear best accuracy
        best_acc = 0.0
        
        # Clear epoch counter
        epoch = 0
        
        # Clear got worse counter
        got_worse = 0

        # Clear completion flag
        training_complete = False
        
        # While training remains incomplete...
        while not training_complete:
        
            # Log epoch start
            self.log_epoch_start(epoch)

            # For each phase...
            for phase in ['train', 'val']:
                
                # Depending on the phase
                if phase == 'train':
                    model.train()  # Set model to training mode
                else:
                    model.eval()   # Set model to evaluate mode

                # Clear accumulators
                running_loss = 0.0
                running_corrects = 0
                sample = 0

                # Clear class accumulators
                rank_count = np.zeros(classes, dtype=np.float)

                # For each minibatch...
                for inputs, labels in dataloaders[phase]:
                    
                    # Move inputs and labels to GPU
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # Update sample counter
                    sample += inputs.size(0)

                    # Clear gradients
                    optimizer.zero_grad()

                    # forward
                    # track history if only in train
                    with torch.set_grad_enabled(phase == 'train'):
                        # Get model outputs and calculate loss
                        # Special case for inception because in training it has an auxiliary output. In train
                        #   mode we calculate the loss by summing the final output and the auxiliary output
                        #   but in testing we only consider the final output.
                        if is_inception and phase == 'train':
                            # From https://discuss.pytorch.org/t/how-to-optimize-inception-model-with-auxiliary-classifiers/7958
                            outputs, aux_outputs = model(inputs)
                            loss1 = criterion(outputs, labels)
                            loss2 = criterion(aux_outputs, labels)
                            loss = loss1 + 0.4*loss2
                        else:
                            outputs = model(inputs)
                            loss = criterion(outputs, labels)

                        # Use maximal class activations as predictions
                        _, preds = torch.max(outputs, 1)
                        
                        self.update_metrics(outputs)

                        # backward + optimize only if in training phase
                        if phase == 'train':
                            loss.backward()
                            optimizer.step()

                    # statistics
                    running_loss += loss.item() * inputs.size(0)
                    running_corrects += torch.sum(preds == labels.data)
                    
                    now = time.time()
                    elapsed = now - self.training_start
                    
                    if elapsed/60 >= self.max_time:
                        training_complete = True
                        completion_criteria = "Time"

                epoch_loss = running_loss / len(dataloaders[phase].dataset)
                epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

                print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

                # If validation phase...
                if phase == 'val':
                
                    # Update eopoch accuracy list
                    val_acc_history.append(epoch_acc)
                    
                    # If validation performance improved...
                    if epoch_acc > best_acc:
                        
                        # Update best performance
                        best_acc = epoch_acc
                        
                        # Save best model weights
                        torch.save(model.state_dict(), self.model_path)
                        
                    else:
                        got_worse += 1
                        if got_worse > 1:
                            training_complete = True
                            completion_criteria = "Accuracy"
                            

            # Log epoch complete
            self.log_epoch_complete(epoch)
            
            # Increment epoch counter
            epoch += 1
            
            if epoch>=self.max_epochs:
                training_complete = True
                completion_criteria = "Epochs"
                
        # Report completion
        self.log_training_complete(completion_criteria)

        #time_elapsed = time.time() - start_time
        #print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        #print('Best val Acc: {:4f}'.format(best_acc))

        # load best model weights
        model.load_state_dict(best_model_wts)
        model.load_state_dict(torch.load(self.model_path))
        model.eval()
        return model, val_acc_history
    
manager = model_manager((train_data, val_data))

NameError: name 'ResNet50_Weights' is not defined

In [3]:
# Initialize the model for this run
model_ft, input_size = initialize_model(model_name, train_data.classes, feature_extract, use_pretrained=True)

# Print the model we just instantiated
# print(model_ft)

print("Initializing Datasets and Dataloaders...")

# Create training and validation datasets
image_datasets = {'train': train_data, 'val' : val_data}

# Create training and validation dataloaders
dataloaders_dict = {x: torch.utils.data.DataLoader(image_datasets[x], batch_size=batch_size, shuffle=True, num_workers=4) for x in ['train', 'val']}

# Detect if we have a GPU available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Send the model to GPU
model_ft = model_ft.to(device)

# Gather the parameters to be optimized/updated in this run. If we are
#  finetuning we will be updating all parameters. However, if we are 
#  doing feature extract method, we will only update the parameters
#  that we have just initialized, i.e. the parameters with requires_grad
#  is True.
params_to_update = model_ft.parameters()
print("Params to learn:")
if feature_extract:
    params_to_update = []
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            params_to_update.append(param)
            print("\t",name)
else:
    for name,param in model_ft.named_parameters():
        if param.requires_grad == True:
            print("\t",name)

# Observe that all parameters are being optimized
optimizer_ft = optim.Adam(params_to_update, lr=0.05)

# Setup the loss fxn
criterion = nn.CrossEntropyLoss()

print("Training model...")

# Train and evaluate
model_ft, hist = train_model(model_ft, dataloaders_dict, criterion, optimizer_ft, num_epochs=max_epochs, is_inception=(model_name=="inception"))

NameError: name 'initialize_model' is not defined

In [None]:
# Writes a csv containing every make and model combination in the dataset
def util_list_make_model(dataset):
    
    maker_list = list(dataset.basic_df['Automaker'])
    model_list = list(dataset.basic_df['Genmodel'])

    mm_fields = list(set([r[0]+','+r[1] for r in zip(maker_list, model_list)]))
    mm_fields.sort()

    with open('dvmcars_make_model.csv','w') as out:
        out.write('dvmcars-make,dvmcars-model\n')
        for r in mm_fields:
            out.write(r+'\n')
            
# util_list_make_model(train_data)