In [114]:
%pip install accelerate

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import torch
import torchvision
from torch import nn
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torchvision import transforms
from collections import OrderedDict
import datetime

In [2]:
# import accelerate
import models
import importlib
import helper
import config

In [3]:
import time

In [4]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [5]:
# Base directory that the training functions prefix to their `results/...csv`
# output paths. Kept relative (to the kernel's working directory) so the
# notebook does not depend on any one machine's absolute path.
workdir = './'

# Config

In [6]:
# Load the project-wide settings once and pull out the compute device.
# `device` is read as a module-level global by `prep_model` and by the
# model-construction cells further down.
global_config = config.get_global_configuration()
device = global_config['device']

In [21]:
# Re-import the local project modules so edits to their source files are
# picked up without restarting the kernel.
# NOTE: objects created before the reload (e.g. `global_config`, `device`,
# existing model instances) are not refreshed by this cell.
importlib.reload(models)
importlib.reload(helper)
importlib.reload(config)

<module 'config' from '/home/ec2-user/SageMaker/layerwise-nn-optimization/config.py'>

In [125]:
# Scratch instance for interactive inspection; `m2` is not referenced by any
# later cell visible in this notebook.
# m1 = models.LayerwiseConfigurableCNN()
m2 = models.LayerwiseConfigurableMLP(device)

# Training

In [8]:
def get_top1_pos(outputs, targets):
    """Count the samples whose highest-scoring class equals the target label.

    Args:
        outputs: per-sample score rows, indexable as a 2-D (sample, class) array.
        targets: true class index for each sample, in the same order.

    Returns:
        The number of samples for which the argmax over classes matches the
        corresponding target.
    """
    predicted = np.argmax(outputs, axis=1)
    assert len(predicted) == len(targets)

    is_hit = (predicted == targets)
    return np.sum(is_hit.astype(int))

def get_top5_pos(outputs, targets, k=5):
    """Count the samples whose target label is among the k highest scores.

    Args:
        outputs: sequence of per-sample score rows (one 1-D row per sample).
        targets: true class index for each sample, in the same order.
        k: how many top-scoring classes to consider (default 5).

    Returns:
        Number of samples whose target index is one of the k largest entries
        of its score row.

    Raises:
        ValueError: if `k` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    hits = 0
    for i in range(len(targets)):
        scores = np.asarray(outputs[i])
        # argpartition rejects a kth outside [-n, n-1]; clamp so rows with
        # fewer than k classes are handled instead of raising.
        top = min(k, len(scores))
        if top == 0:
            continue
        top_k = np.argpartition(scores, -top)[-top:]
        if targets[i] in set(top_k):
            hits += 1

    return hits

def evaluate_model(model, data_loader, loss_function, device='cpu'):
    """Run `model` over `data_loader` and report loss and top-1/top-5 accuracy.

    The pass runs without gradient tracking. When `model` is an `nn.Module`
    it is switched to eval mode for the duration of the pass, and every
    submodule's previous train/eval flag is restored afterwards, so calling
    this in the middle of training does not disturb the training setup.

    Args:
        model: callable network mapping an input batch to a score tensor.
        data_loader: iterable yielding `(inputs, targets)` tensor batches.
        loss_function: callable taking `(outputs, targets)` and returning a
            scalar tensor.
        device: device each batch is moved to before the forward pass.

    Returns:
        `(loss, top1_acc, top5_acc)`. `loss` is the sum of the per-batch loss
        values divided by the number of samples; the accuracies are fractions
        of samples.

    Raises:
        ValueError: if `data_loader` yields no samples.
    """
    output_data = []
    targets_data = []
    current_loss = 0

    # Record each submodule's mode individually: a blanket model.train()
    # afterwards would also flip submodules that were deliberately left in
    # eval mode before this call.
    saved_modes = []
    if isinstance(model, nn.Module):
        saved_modes = [(module, module.training) for module in model.modules()]
        model.eval()

    try:
        with torch.no_grad():
            for inputs, targets in data_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                # Perform forward pass
                outputs = model(inputs)

                # .cpu() is a no-op for tensors that already live on the CPU
                outputs = outputs.cpu()
                targets = targets.cpu()

                output_data.extend(outputs.detach().numpy())
                targets_data.extend(targets.detach().numpy())

                loss = loss_function(outputs, targets)
                current_loss += loss.item()
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    N = len(targets_data)
    if N == 0:
        raise ValueError("data_loader yielded no samples to evaluate")

    top1_acc = get_top1_pos(output_data, targets_data) / N
    top5_acc = get_top5_pos(output_data, targets_data) / N

    # NOTE(review): if loss_function averages within each batch, this value is
    # the mean loss divided by roughly the batch size rather than a per-sample
    # mean. Left unchanged so new results stay comparable with saved ones.
    return current_loss / float(N), top1_acc, top5_acc


def train_model(model, device='cpu', epochs=None, invariant=False, output=False, eval_every=1):
    """Train `model` and collect per-epoch train/test metrics.

    The loss function, optimizer class, learning rate, weight decay and the
    default epoch count are read from `config.get_model_configuration()`; the
    train and test loaders come from `helper.get_dataset`.

    Args:
        model: network optimised in place; must expose `parameters()` and
            `network_norm()`.
        device: device each batch is moved to before the forward pass.
        epochs: number of epochs; falls back to the configured `num_epochs`
            when None.
        invariant: forwarded to `helper.get_dataset` for both loaders.
        output: when True, print per-epoch progress.
        eval_every: evaluate on the train and test loaders every this many
            epochs, and always after the final epoch. Epochs that are skipped
            are recorded with `pd.NA` metrics. The default of 1 evaluates
            after every epoch.

    Returns:
        `(model, results, last_epoch_loss)`. `results` is a DataFrame with one
        'train' and one 'test' row per epoch. `last_epoch_loss` is the sum of
        the batch losses of the final epoch (NaN when no epoch ran).

    Raises:
        ValueError: if `eval_every` is smaller than 1.
    """
    if eval_every < 1:
        raise ValueError("eval_every must be at least 1")

    model_config = config.get_model_configuration()
    print(device)

    loss_function = model_config.get("loss_function")()
    optimizer = model_config.get("optimizer")(model.parameters(),
                                              lr=model_config.get('learning_rate'),
                                              weight_decay=model_config.get('weight_decay'))
    trainloader = helper.get_dataset(train=True, invariant=invariant)
    testloader = helper.get_dataset(train=False, invariant=invariant)

    entries = []

    if epochs is None:
        epochs = model_config.get("num_epochs")

    # Bound before the loop so the return below is valid even when epochs == 0
    current_loss = float('nan')

    for epoch in range(epochs):
        if output:
            print(f'Starting epoch {epoch+1}')

        # Running sum of the batch losses for this epoch
        current_loss = 0.0

        st_time = time.time()
        for inputs, targets in trainloader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Perform forward pass
            outputs = model(inputs)

            # Compute loss
            loss = loss_function(outputs, targets)
            current_loss += loss.item()

            # Perform backward pass
            loss.backward()

            # Perform optimization
            optimizer.step()

        # Only the optimisation loop is timed; evaluation below is excluded
        end_time = time.time()

        if (epoch % eval_every == 0) or (epoch == (epochs - 1)):
            test_loss, test_top1_acc, test_top5_acc = evaluate_model(model, testloader, loss_function, device=device)
            train_loss, train_top1_acc, train_top5_acc = evaluate_model(model, trainloader, loss_function, device=device)
            if output:
                print(f'Train Acc: {train_top1_acc}')
                print(f'Test Acc: {test_top1_acc}')
        else:
            test_loss, test_top1_acc, test_top5_acc = pd.NA, pd.NA, pd.NA
            train_loss, train_top1_acc, train_top5_acc = pd.NA, pd.NA, pd.NA

        elapsed_time = round(end_time - st_time, 1)
        train_entry = {'type': 'train', 'epoch': epoch, 'top1': train_top1_acc, 'top5': train_top5_acc,
                       'loss': train_loss, 'time': elapsed_time,
                       'network_norm': model.network_norm()}

        if output:
            print(f'Loss: {current_loss}')
            print(f'Time: {elapsed_time}')

        test_entry = {'type': 'test', 'epoch': epoch, 'top1': test_top1_acc, 'top5': test_top5_acc,
                      'loss': test_loss, 'time': pd.NA}

        entries.extend([train_entry, test_entry])

    # Return trained model
    return model, pd.DataFrame(entries), current_loss

In [11]:
def add_config_columns(model, strategy, results_df):
    """Annotate a results frame with the configuration it was produced under.

    Adds constant-valued columns taken from the model and global
    configuration, plus the model name, the training strategy and a combined
    `model_strategy` label. `results_df` is modified in place and also
    returned.

    Args:
        model: object exposing `get_name()`.
        strategy: label stored in the `train_strategy` column.
        results_df: DataFrame to annotate.

    Returns:
        The same `results_df`, with the extra columns added.
    """
    model_cfg = config.get_model_configuration()
    global_cfg = config.get_global_configuration()

    # The optimizer entry is stored as its string representation
    results_df['optimizer'] = str(model_cfg['optimizer'])

    # (output column, source configuration, key) in output column order
    copied_settings = (
        ('hidden_layer_dim', model_cfg, 'hidden_layer_dim'),
        ('batch_size', model_cfg, 'batch_size'),
        ('batch_norm', model_cfg, 'batch_norm'),
        ('weight_decay', model_cfg, 'weight_decay'),
        ('learning_rate', model_cfg, 'learning_rate'),
        ('invariant', global_cfg, 'invariant'),
        ('condition', global_cfg, 'condition'),
        ('max_epochs', model_cfg, 'num_epochs'),
    )
    for column, source, key in copied_settings:
        results_df[column] = source[key]

    results_df['model'] = model.get_name()
    results_df['train_strategy'] = strategy
    results_df['model_strategy'] = results_df['model'] + '_' + results_df['train_strategy']

    return results_df

In [12]:
def prep_model(model, num_layers):
    """Grow `model` to `num_layers` layers and move it to the notebook device.

    Hidden blocks are appended one at a time until the number of hidden
    blocks plus one reaches `num_layers`. Uses the module-level `device` both
    for the new blocks and for the final placement.

    Args:
        model: object exposing `hidden_blocks`, `add_hidden_block(device)` and
            `to(device)`.
        num_layers: target layer count.

    Returns:
        The result of `model.to(device)`.
    """
    while num_layers - len(model.hidden_blocks) > 1:
        model.add_hidden_block(device)

    return model.to(device)

In [13]:
def greedy_layerwise_training(model, output=False):
    """Train `model` one block at a time, for several rounds.

    All blocks start frozen. In every round each block in turn (the input
    block first, then each hidden block) is activated, trained with
    `train_model`, and frozen again. Per-epoch metrics from every training
    call are concatenated, annotated with the active configuration and
    written to a timestamped CSV under `<workdir>results/`.

    Args:
        model: layer-wise configurable network; grown to the configured
            `num_layers` by `prep_model` before training.
        output: when True, print per-layer banners and loss comparisons.

    Returns:
        `(model, results_df)`: the trained model and the combined metrics.
    """
    global_config = config.get_global_configuration()
    torch.manual_seed(42)

    rnds = global_config['rounds']
    device = global_config.get('device')
    num_layers = global_config.get("num_layers")

    # End loss of the previously trained block, used only for the printed comparison
    loss_comparable = float('inf')

    dfs = []

    model = prep_model(model, num_layers)
    model.freeze_layers([model.input_block] + model.hidden_blocks)

    for rnd in range(rnds):
        t0 = time.time()
        print(f"Round: {rnd}")
        for layer_num in range(num_layers):
            active_block = model.input_block if layer_num == 0 else model.hidden_blocks[layer_num - 1]
            model.activate_layer(active_block)

            # Print which model is trained. This keys off the block index of
            # this iteration, not the constant total layer count.
            if output:
                print("="*100)
                if layer_num > 0:
                    print(f">>> TRAINING THE MODEL WITH {layer_num} ADDITIONAL LAYERS:")
                else:
                    print(f">>> TRAINING THE BASE MODEL:")

            # Train the model
            model, df, end_loss = train_model(model, device=device, invariant=global_config['invariant'])
            df['layer'] = layer_num
            df['round'] = rnd

            trainable_weights = model.num_trainable_weights()
            df['trainable_params'] = trainable_weights
            dfs.append(df)

            # Compare loss
            if output:
                print(f'Num Trainable Weights: {trainable_weights}')
                print(f'Expected Trainable Weights: {sum(p.numel() for p in active_block.parameters() if p.requires_grad)}')
                print("="*50)
                if end_loss < loss_comparable:
                    print(f">>> RESULTS: Adding this layer has improved the model loss from {loss_comparable} to {end_loss}")
                else:
                    print(f">>> RESULTS: Adding this layer did not improve the model loss from {loss_comparable} to {end_loss}")
            loss_comparable = end_loss

            # Freeze Active Layer
            model.freeze_layer(active_block)
        t1 = time.time()
        elapsed_time = round(t1 - t0, 1)
        print(elapsed_time)

    # Process is complete
    print("Training process has finished.")

    results_df = pd.concat(dfs)
    strat = 'layerwise'
    results_df = add_config_columns(model, strat, results_df)

    results_path = workdir + f'results/{helper.get_datetime_str(datetime.datetime.now())}_{model.get_name()}_{strat}.csv'
    # Create the output directory first so a long run is not lost at save time
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    results_df.to_csv(results_path)
    print('finished')

    return model, results_df

In [14]:
def full_backprop_training(model, output=False, expand_rounds=False):
    """Train `model` end to end for several rounds.

    Each round is one `train_model` call on the whole network. Per-epoch
    metrics from every round are concatenated, annotated with the active
    configuration and written to a timestamped CSV under `<workdir>results/`.

    Args:
        model: network to train; grown to the configured `num_layers` by
            `prep_model` first.
        output: when True, print the round index and end loss of each round.
        expand_rounds: when True, run `rounds * num_layers` rounds instead of
            `rounds`, which matches the number of `train_model` calls made by
            the layer-wise procedure.

    Returns:
        `(model, results_df)`: the trained model and the combined metrics.
    """
    global_config = config.get_global_configuration()
    torch.manual_seed(42)

    device = global_config.get('device')
    rnds = global_config['rounds']
    num_layers = global_config.get("num_layers")

    model = prep_model(model, num_layers)
    dfs = []

    rnds = rnds * num_layers if expand_rounds else rnds
    for rnd in range(rnds):
        print(f"Round: {rnd}")
        t0 = time.time()
        model, df, end_loss = train_model(model, device=device, invariant=global_config['invariant'])
        t1 = time.time()
        elapsed_time = round(t1 - t0, 1)
        print(elapsed_time)

        if output:
            # Previously printed an undefined name `i`; the round index is the
            # only loop counter in scope here.
            print(rnd)
            print(end_loss)

        df['trainable_params'] = model.num_trainable_weights()
        df['round'] = rnd
        dfs.append(df)

    results_df = pd.concat(dfs)
    strat = 'backprop'
    results_df = add_config_columns(model, strat, results_df)

    results_path = workdir + f'results/{helper.get_datetime_str(datetime.datetime.now())}_{model.get_name()}_{strat}.csv'
    # Create the output directory first so a long run is not lost at save time
    os.makedirs(os.path.dirname(results_path), exist_ok=True)
    results_df.to_csv(results_path)
    print('finished')

    return model, results_df

In [15]:
def last_epoch_df(df):
    """Group a results frame by model, round and training strategy.

    The round column is looked up as 'rnd' when present and as 'round'
    otherwise; the training functions in this notebook write it as 'round'.

    Args:
        df: results DataFrame with 'model', 'train_strategy' and a round
            column.

    Returns:
        The pandas GroupBy object; no rows are filtered.

    NOTE(review): the name suggests the goal is to keep only each group's
    final-epoch rows (`grp[grp['epoch'] == grp['epoch'].max()]`). That
    reduction is not applied here and is left to the caller.
    """
    round_col = 'rnd' if 'rnd' in df.columns else 'round'
    return df.groupby(by=['model', round_col, 'train_strategy'])

# Run Training Methods

In [16]:
def generate_simplex_etf(in_dim, out_dim, seed=520):
    """Build a simplex equiangular tight frame embedded in `in_dim` dimensions.

    The frame is `sqrt(K / (K - 1)) * U @ (I_K - (1/K) * 1 1^T)` with
    `K = out_dim`, where `U` holds the first `K` columns of a random
    orthogonal `in_dim x in_dim` matrix. Its columns have unit norm, sum to
    zero, and every pair has cosine `-1 / (K - 1)`.

    Args:
        in_dim: ambient (feature) dimension; must be at least `out_dim`.
        out_dim: number of frame vectors (classes); must be at least 2.
        seed: random state used to draw the orthogonal matrix.

    Returns:
        A float32 torch tensor of shape `(in_dim, out_dim)`.

    Raises:
        ValueError: if `out_dim < 2` or `in_dim < out_dim`.
    """
    # Imported here because the notebook's import cells do not import scipy
    from scipy.stats import ortho_group

    if out_dim < 2:
        raise ValueError("out_dim must be at least 2 to form a simplex ETF")
    if in_dim < out_dim:
        raise ValueError("in_dim must be at least out_dim")

    # The scale factor multiplies the whole centring matrix, not only the identity
    centring = np.identity(out_dim) - np.ones((out_dim, out_dim)) / float(out_dim)
    M = np.sqrt(out_dim / (out_dim - 1)) * centring
    U = ortho_group.rvs(in_dim, random_state=seed)[:, :out_dim]

    ETF = np.matmul(U, M)

    return torch.from_numpy(ETF.astype(np.float32))

In [17]:
# Layer-wise run on a fresh MLP. The call returns the trained model and the
# combined per-epoch metrics frame, and also writes that frame to a CSV.
mlp_model = models.LayerwiseConfigurableMLP(device)
mlp_model, mlp_results_df = greedy_layerwise_training(mlp_model)

Round: 0
cuda:0
cuda:0
cuda:0
85.0
Round: 1
cuda:0
cuda:0
cuda:0
84.5
Round: 2
cuda:0
cuda:0
cuda:0
86.0
Round: 3
cuda:0
cuda:0
cuda:0
86.8
Round: 4
cuda:0
cuda:0
cuda:0
85.7
Training process has finished.
finished


In [None]:
# End-to-end backprop baseline on a fresh MLP. `expand_rounds=True` runs
# rounds * num_layers rounds, i.e. as many train_model calls as the
# layer-wise run makes.
mlp_bp_model = models.LayerwiseConfigurableMLP(device)
mlp_bp_model, mlp_bp_results_df = full_backprop_training(mlp_bp_model, expand_rounds=True)

Round: 0
cuda:0
43.9
Round: 1
cuda:0


In [42]:
# Load a previously saved object from disk; `etf` is not used by any other
# cell visible in this notebook.
# NOTE(review): torch.load unpickles the file, so only load files you created
# or otherwise trust.
etf = torch.load('./CNN16_simplex_etf.pt')

In [19]:
# Layer-wise run on a fresh CNN. The call returns the trained model and the
# combined per-epoch metrics frame, and also writes that frame to a CSV.
cnn_model = models.LayerwiseConfigurableCNN(device)
cnn_model, cnn_results_df = greedy_layerwise_training(cnn_model)

Round: 0
cuda:0
cuda:0
cuda:0
85.5
Round: 1
cuda:0
cuda:0
cuda:0
86.5
Round: 2
cuda:0
cuda:0
cuda:0
86.2
Round: 3
cuda:0
cuda:0
cuda:0
86.4
Round: 4
cuda:0
cuda:0
cuda:0
86.8
Training process has finished.
finished


In [20]:
# End-to-end backprop baseline on a fresh CNN.
# NOTE(review): this rebinds `cnn_model` and `cnn_results_df`, so the
# layer-wise CNN results from the earlier cell are no longer reachable in
# memory (each run does write its own CSV). The MLP cells avoid this by using
# separate `mlp_bp_*` names; consider `cnn_bp_*` here.
cnn_model = models.LayerwiseConfigurableCNN(device)
cnn_model, cnn_results_df = full_backprop_training(cnn_model, expand_rounds=True)

Round: 0
cuda:0
28.9
Round: 1
cuda:0
28.7
Round: 2
cuda:0
29.2
Round: 3
cuda:0
28.7
Round: 4
cuda:0
28.1
Round: 5
cuda:0
28.9
Round: 6
cuda:0
29.0
Round: 7
cuda:0
28.6
Round: 8
cuda:0
29.4
Round: 9
cuda:0
28.7
Round: 10
cuda:0
29.0
Round: 11
cuda:0
29.1
Round: 12
cuda:0
28.5
Round: 13
cuda:0
29.1
Round: 14
cuda:0
29.1
finished


# By Epoch Graphs

Questions

1) Understanding the curvature of the loss function - how to compute the Hessian
2) Should I freeze the output layer?
3) What does it mean to set W_L using the neural collapse property?
4) Skip connections - does that mean that each layer needs to have the same output dimension as the final output dimension? Do we just sum them up at the end?
5) Training Resources
6) Weight Decay? Batch Normalization?
7) Width of Hidden Layers in MLP

8) Depth of MLP
9) Number of Channels in CNN
10) Kernel Size in CNN

Lecture

1) Understanding, as a mathematician, the critical points of L
2) Goal


For deep (l >= 3) nonlinear networks, bad local (non-global) minima exist that are difficult to escape.
Morse function. A function L: R^d -> R is Morse if at every critical point p in R^d the Hessian Hess(L)(p) is nonsingular (i.e. has no zero eigenvalues).

1) If L is Morse, can understand the topology of u by computing all the critical points of L and geometry near them
2) Almost every C^2 function is Morse (Morse functions are open and dense in C^2)

Morse-Bott function
Allow for non-isolated critical points
L: R^d -> R is Morse-Bott if its critical locus is a closed submanifold and Hess(L) is nonsingular in the directions normal to that submanifold

Geometry changes significantly across regimes
- width > n
- width > poly(n)
- width > sqrt(n)

As soon as there exists 