In [114]:
%pip install accelerate

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [94]:
import os
import torch
import torchvision
from torch import nn
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torchvision import transforms
from collections import OrderedDict

In [95]:
import accelerate
import models
import importlib
import helper
import config

In [96]:
import time

In [97]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# Config

In [134]:
# Load the project-wide settings once at notebook level and pull out the
# compute device from the 'device' entry. Both names are notebook globals.
global_config = config.get_global_configuration()
device = global_config['device']

cpu


In [None]:
# Re-import the local project modules so that edits made to them on disk are
# picked up without restarting the kernel. Objects created before the reload
# (e.g. an already-fetched config dict or model instance) are not refreshed.
importlib.reload(models)
importlib.reload(helper)
importlib.reload(config)

In [105]:
# Instantiate one model of each architecture with default constructor arguments.
m1 = models.LayerwiseConfigurableCNN()
m2 = models.LayerwiseConfigurableMLP()

# Training

In [126]:
def get_top1_pos(outputs, targets):
    """Count the samples whose highest-scoring class equals the target label.

    `outputs` holds one row of class scores per sample and `targets` the
    matching labels; both must have the same length. The return value is the
    number of hits (a count, not a ratio).
    """
    best_class = np.argmax(outputs, axis=1)
    assert len(best_class) == len(targets)

    # 1 where the prediction matches the label, 0 elsewhere.
    hit_flags = np.asarray(best_class == targets, dtype=int)
    return hit_flags.sum()

def get_top5_pos(outputs, targets, k=5):
    """Count the samples whose target label is among the k highest-scoring classes.

    Args:
        outputs: sequence of per-sample class-score rows (all the same length).
        targets: sequence of integer class labels, one per row of `outputs`.
        k: how many top classes to consider (default 5). If there are fewer
            than `k` classes, every class is considered instead of raising.

    Returns:
        The number of hits as a plain `int` (a count, not a ratio).

    Raises:
        AssertionError: if `outputs` and `targets` differ in length (same
            contract as `get_top1_pos`).
    """
    labels = np.asarray(targets)
    scores = np.asarray(outputs)
    assert len(scores) == len(labels)

    if len(labels) == 0 or k <= 0:
        return 0

    # np.argpartition rejects a kth beyond the number of classes, so clamp it.
    k = min(k, scores.shape[1])

    # Indices of the k largest scores in each row (order within the k is arbitrary).
    top_k = np.argpartition(scores, -k, axis=1)[:, -k:]
    return int(np.sum(np.any(top_k == labels[:, None], axis=1)))

def evaluate_model(model, data_loader, loss_function, device='cpu'):
    """Evaluate `model` over every batch of `data_loader`.

    The model is put into evaluation mode and run under `torch.no_grad()` so
    that no autograd graph is built and layers such as dropout / batch-norm
    behave deterministically. The model's previous train/eval mode is restored
    before returning, so this can be called in the middle of a training loop.

    Args:
        model: a `torch.nn.Module` mapping an input batch to per-class scores.
        data_loader: iterable yielding `(inputs, targets)` batches.
        loss_function: callable `loss_function(outputs, targets)` returning a
            scalar tensor.
        device: device the batches are moved to before the forward pass; it
            should be the device the model lives on.

    Returns:
        `(total_loss, top1_acc, top5_acc)` where `total_loss` is the sum of
        the per-batch losses (not averaged) and the accuracies are fractions
        of the evaluated samples.

    Raises:
        ValueError: if `data_loader` yields no samples.
    """
    output_data = []
    targets_data = []
    current_loss = 0.0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, targets in data_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                # Perform forward pass
                outputs = model(inputs)

                # Metrics and loss are computed on the CPU; `.cpu()` is a
                # no-op for tensors that are already there.
                outputs = outputs.detach().cpu()
                targets = targets.detach().cpu()

                output_data.extend(outputs.numpy())
                targets_data.extend(targets.numpy())

                current_loss += loss_function(outputs, targets).item()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    N = len(targets_data)
    if N == 0:
        raise ValueError("evaluate_model received an empty data_loader")

    top1_acc = get_top1_pos(output_data, targets_data) / N
    top5_acc = get_top5_pos(output_data, targets_data) / N

    return current_loss, top1_acc, top5_acc


def train_model(model, device='cpu', epochs=None, invariant=False, debug=False):
    """Train `model` and record per-epoch metrics.

    The loss function and optimizer classes come from
    `config.get_model_configuration()`; the train/test loaders come from
    `helper.get_dataset`. The model is evaluated on both loaders on every
    epoch where `epoch % 5 == 0` (zero-based) and on the final epoch; on all
    other epochs the accuracy / test-loss fields are recorded as `pd.NA`.

    Args:
        model: the `torch.nn.Module` to train; expected to already live on
            `device`.
        device: device the batches are moved to for training and evaluation.
        epochs: number of epochs; defaults to the configured "num_epochs".
        invariant: forwarded to `helper.get_dataset`.
        debug: currently unused; kept for interface compatibility.

    Returns:
        `(model, results_df, last_epoch_loss)` where `results_df` has one
        'train' row and one 'test' row per epoch and `last_epoch_loss` is the
        summed training loss of the final epoch (0.0 if no epoch ran).
    """
    model_config = config.get_model_configuration()

    loss_function = model_config.get("loss_function")()
    optimizer = model_config.get("optimizer")(model.parameters(), lr=1e-4)
    trainloader = helper.get_dataset(train=True, invariant=invariant)
    testloader = helper.get_dataset(train=False, invariant=invariant)

    # NOTE: an `accelerate.Accelerator` integration was sketched here
    # previously but is not active.

    if epochs is None:
        epochs = model_config.get("num_epochs")

    entries = []
    # Defined up front so that `epochs == 0` still returns cleanly.
    current_loss = 0.0

    for epoch in range(epochs):
        print(f'Starting epoch {epoch+1}')

        # Running sum of the per-batch training losses for this epoch.
        current_loss = 0.0

        st_time = time.time()
        for inputs, targets in trainloader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass and loss
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            current_loss += loss.item()

            # Backward pass and parameter update
            loss.backward()
            optimizer.step()

        end_time = time.time()

        if epoch % 5 == 0 or epoch == (epochs - 1):
            # Evaluate on the same device the model is on; the default of
            # 'cpu' would mismatch whenever training runs elsewhere.
            test_loss, test_top1_acc, test_top5_acc = evaluate_model(
                model, testloader, loss_function, device=device)
            # The post-epoch train loss is not recorded; the 'train' row keeps
            # the running loss accumulated during the epoch instead.
            _, train_top1_acc, train_top5_acc = evaluate_model(
                model, trainloader, loss_function, device=device)
            print(f'Train Acc: {train_top1_acc}')
            print(f'Test Acc: {test_top1_acc}')
        else:
            test_loss, test_top1_acc, test_top5_acc = pd.NA, pd.NA, pd.NA
            train_top1_acc, train_top5_acc = pd.NA, pd.NA

        elapsed_time = round(end_time - st_time, 1)
        train_entry = {'type': 'train', 'epoch': epoch, 'top1': train_top1_acc, 'top5': train_top5_acc,
                       'loss': current_loss, 'time': elapsed_time}

        print(f'Loss: {current_loss}')
        print(f'Time: {elapsed_time}')

        test_entry = {'type': 'test', 'epoch': epoch, 'top1': test_top1_acc, 'top5': test_top5_acc,
                      'loss': test_loss, 'time': pd.NA}

        entries.extend([train_entry, test_entry])

    # Return trained model
    return model, pd.DataFrame(entries), current_loss

In [72]:
# Fresh CNN instance; the direct training call below is currently disabled.
cnn = models.LayerwiseConfigurableCNN()
# NOTE(review): the disabled line below refers to `mlp`, not `cnn` — looks like a copy-paste slip.
# mlp = mlp.to(device)
# cnn, cnn_df, loss = train_model(cnn)

In [73]:
# Fresh MLP instance; the direct training call below is currently disabled.
mlp = models.LayerwiseConfigurableMLP()
# mlp = mlp.to(device)
# mlp, mlp_df, loss = train_model(mlp)

In [127]:
def add_config_columns(results_df):
    """Annotate `results_df` with the configuration the run was made under.

    Adds the constant columns 'optimizer', 'hidden_layer_dim', 'batch_size'
    and 'invariant'. The frame is modified in place and also returned.

    Both configurations are fetched at call time rather than read from a
    notebook-level variable, so the recorded values stay correct after
    `importlib.reload(config)` and match what the training functions used.
    """
    model_config = config.get_model_configuration()
    current_global_config = config.get_global_configuration()

    results_df['optimizer'] = str(model_config['optimizer'])
    results_df['hidden_layer_dim'] = model_config['hidden_layer_dim']
    results_df['batch_size'] = model_config['batch_size']
    results_df['invariant'] = current_global_config['invariant']

    return results_df

In [137]:
def greedy_layerwise_training(model, rnd=0):
    """Perform greedy layer-wise training.

    For each round `num_layers` in `range(num_layers_to_add)` (from the
    global configuration) the model is grown, if necessary, to have at least
    `num_layers` hidden blocks. One block is then activated: the input block
    in round 0, otherwise the most recently added hidden block. Every block
    before the active one is frozen, and the model is trained with
    `train_model`.

    Args:
        model: a model exposing `input_block`, `hidden_blocks`,
            `add_hidden_block()`, `activate_layers()`, `freeze_layers()`,
            `num_trainable_weights()` and `get_name()`.
        rnd: currently unused; kept for interface compatibility.

    Returns:
        `(model, results_df)` where `results_df` concatenates the per-round
        training logs with 'layer', 'layer_params', configuration and 'model'
        columns added.
    """
    print("NEW!")
    global_config = config.get_global_configuration()
    device = global_config.get('device')
    model = model.to(device)
    torch.manual_seed(42)

    # Loss of the best round so far, used only for the progress messages.
    loss_comparable = float('inf')

    dfs = []
    for num_layers in range(global_config.get("num_layers_to_add")):
        if len(model.hidden_blocks) < num_layers:
            # Add layer to model; the new block must be moved to the device too.
            model.add_hidden_block()
            model = model.to(device)

        # Only the newest block is trained in this round.
        if num_layers == 0:
            active_layer = model.input_block
        else:
            active_layer = model.hidden_blocks[num_layers - 1]
        model.activate_layers([active_layer])
        if num_layers > 0:
            # Everything in front of the active block keeps its trained weights.
            earlier_hidden = [model.hidden_blocks[i] for i in range(num_layers - 1)]
            model.freeze_layers([model.input_block] + earlier_hidden)

        # Print which model is trained
        print("="*100)
        if num_layers > 0:
            print(f">>> TRAINING THE MODEL WITH {num_layers} ADDITIONAL LAYERS:")
        else:
            print(f">>> TRAINING THE BASE MODEL:")

        # Train the model
        model, df, end_loss = train_model(model, device=device, invariant=global_config['invariant'])
        df['layer'] = num_layers
        df['layer_params'] = model.num_trainable_weights()
        dfs.append(df)

        # Compare loss
        if num_layers > 0 and end_loss < loss_comparable:
            print("="*50)
            print(f">>> RESULTS: Adding this layer has improved the model loss from {loss_comparable} to {end_loss}")
            loss_comparable = end_loss
        elif num_layers > 0:
            print("="*50)
            print(f">>> RESULTS: Adding this layer did not improve the model loss from {loss_comparable} to {end_loss}")
        elif num_layers == 0:
            loss_comparable = end_loss

    # Process is complete
    print("Training process has finished.")

    results_df = pd.concat(dfs)
    results_df = add_config_columns(results_df)
    results_df['model'] = model.get_name()

    return model, results_df

In [140]:
def full_backprop_training(model):
    """Train the model repeatedly, growing it by one hidden block between rounds.

    Runs `num_layers_to_add` rounds (from the global configuration). Each
    round calls `train_model` on the model as it currently stands; no blocks
    are explicitly frozen or activated here. A new hidden block is added
    before every round except the first, so the returned model contains only
    blocks that have been through at least one training round.

    Args:
        model: a model exposing `hidden_blocks`, `add_hidden_block()`,
            `num_trainable_weights()` and `get_name()`.

    Returns:
        `(model, results_df)` where `results_df` concatenates the per-round
        training logs with 'layer', 'layer_params', configuration and 'model'
        columns added.
    """
    print("NEW!")
    global_config = config.get_global_configuration()
    device = global_config.get('device')
    model = model.to(device)
    torch.manual_seed(42)

    dfs = []
    for i in range(global_config.get("num_layers_to_add")):
        if i > 0:
            # Grow the network before the next round, never after the last
            # one, so no untrained block is left on the returned model.
            model.add_hidden_block()

        # Also moves a freshly added block onto the device.
        model = model.to(device)
        model, df, end_loss = train_model(model, device=device, invariant=global_config['invariant'])
        print(i)
        print(end_loss)

        df['layer'] = len(model.hidden_blocks)
        df['layer_params'] = model.num_trainable_weights()
        dfs.append(df)

    results_df = pd.concat(dfs)
    results_df = add_config_columns(results_df)
    results_df['model'] = model.get_name()

    return model, results_df

# Experiments

In [None]:
# Run full_backprop_training on a fresh MLP; keeps the returned model and results table.
mlp_bp_model, mlp_bp_results_df = full_backprop_training(models.LayerwiseConfigurableMLP())

NEW!
cpu
Files already downloaded and verified
Files already downloaded and verified
Starting epoch 1
Train Acc: 0.36598
Test Acc: 0.3645
Loss: 388.7958953380585
Time: 13.6
Starting epoch 2
Loss: 358.3926159143448
Time: 12.5
Starting epoch 3
Loss: 345.3890894651413
Time: 12.4
Starting epoch 4
Loss: 336.8449845314026
Time: 12.4
Starting epoch 5
Loss: 328.4441432952881
Time: 13.3
Starting epoch 6
Train Acc: 0.44436
Test Acc: 0.4379
Loss: 322.1038612127304
Time: 15.1
Starting epoch 7
Loss: 317.38067531585693
Time: 16.0
Starting epoch 8
Loss: 312.77988970279694
Time: 13.1
Starting epoch 9
Loss: 308.75481486320496
Time: 15.0
Starting epoch 10
Loss: 304.6808851957321
Time: 12.8
Starting epoch 11
Train Acc: 0.48186
Test Acc: 0.4639
Loss: 302.07377898693085
Time: 13.3
0
302.07377898693085
Files already downloaded and verified
Files already downloaded and verified
Starting epoch 1
Train Acc: 0.45498
Test Acc: 0.4414
Loss: 319.44641733169556
Time: 13.2
Starting epoch 2
Loss: 310.14966428279877
T

In [None]:
# Run greedy_layerwise_training on a fresh MLP; keeps the returned model and results table.
mlp_model, mlp_results_df = greedy_layerwise_training(models.LayerwiseConfigurableMLP())

In [None]:
# Run greedy_layerwise_training on a fresh CNN; keeps the returned model and results table.
cnn_model, cnn_results_df = greedy_layerwise_training(models.LayerwiseConfigurableCNN())

In [None]:
# Run full_backprop_training on a fresh CNN; keeps the returned model and results table.
cnn_bp_model, cnn_bp_results_df = full_backprop_training(models.LayerwiseConfigurableCNN())

Questions

1) Understanding the curvature of the loss function - how to compute the Hessian
2) Should I freeze the output layer?
3) What does it mean to set W_L using the neural collapse property?
4) 

Lecture

1) Understanding, as a mathematician, the critical points of L
2) Goal


For deep (l >= 3) nonlinear networks, bad local (non-global) minima exist that are difficult to escape.
Morse function. A function L: R^d -> R is Morse if at every critical point p in R^d the Hessian Hess(L)(p) is nonsingular (i.e. has no zero eigenvalues).

1) If L is Morse, one can understand the topology of u by computing all the critical points of L and the geometry near them
2) Almost every C^2 function is Morse (the Morse functions form an open, dense subset of C^2)