## Double descent in a one-layer neural network: loss surfaces

In [None]:
#!/usr/bin/env python
# coding: utf-8

%matplotlib inline
from matplotlib import cm
import matplotlib.pyplot as plt
import pandas as pd

import argparse
import os
import datetime
import pathlib
import random
import json
import numpy as np
import math

import torch

import sys
sys.path.append('../code/')
from linear_utils import linear_model
from train_utils import save_config, prune_data

In [None]:
# Experiment options in command-line form, one option group per entry; the
# joined string is split on whitespace and handed to the argparse parser.
cli_args = ' '.join([
    '--seed 12',
    '--save-results',
    '--jacobian',
    '--risk-loss L2',
    '-d 2',
    '-n 10',
    '--sigmas geo',
    '--s-range 0.5 0.5',
    '--beta 5.0 5.0',
    '--sigma_noise 5.0',
    '--num-layers 1',
    '--no-bias',
])


In [None]:
# Command-line interface of the training scripts, re-used here so the notebook
# accepts exactly the same options. The options are parsed from the `cli_args`
# string rather than from sys.argv.
parser = argparse.ArgumentParser(description='CLI parameters for training')
parser.add_argument('--config', type=str, default='', metavar='CONFIG',
                    help='Config file')
parser.add_argument('--root', type=str, default='', metavar='DIR',
                    help='Root directory')
# argparse applies `type` only to string defaults, so the default has to be an
# int already (1e4 would leave args.iterations as the float 10000.0).
parser.add_argument('-t', '--iterations', type=int, default=10000, metavar='ITERATIONS',
                    help='Iterations (default: 1e4)')
parser.add_argument('-n', '--samples', type=int, default=100, metavar='N',
                    help='Number of samples (default: 100)')
parser.add_argument('--print-freq', type=int, default=100,
                    help='CLI output printing frequency (default: 100)')
parser.add_argument('--gpu', type=int, default=None,
                    help='Number of GPUS to use')
parser.add_argument('--disable-cuda', action='store_true', default=False,
                    help='Disable CUDA')
parser.add_argument('--seed', type=int, default=None,
                    help='Random seed')
parser.add_argument('-d', '--dim', type=int, default=50, metavar='DIMENSION',
                    help='Feature dimension (default: 50)')
parser.add_argument('--hidden', type=int, default=200, metavar='DIMENSION',
                    help='Hidden layer dimension (default: 200)')
parser.add_argument('--batch-norm', action='store_true', default=False,
                    help='Use batch norm')
parser.add_argument('--no-bias', action='store_true', default=False,
                    help='Do not use bias')
parser.add_argument('--linear', action='store_true', default=False,
                    help='Linear activation function')
parser.add_argument('--sigmas', type=str, default=None,
                    help='Sigmas')
# NOTE: nargs='*' yields a list when the option is given, but the default stays
# the plain float 0.0 when it is omitted.
parser.add_argument('--sigma_noise', nargs='*', type=float, default=0.0,
                    help='Output noise.')
parser.add_argument('--beta', nargs='*', type=float, default=None,
                    help='True model parameters.')
parser.add_argument('--coupled_noise', action='store_true', default=False,
                    help='Couple noise in output to large eigenvalues.')
parser.add_argument('-r', '--s-range', nargs='*', type=float,
                    help='Range for sigmas')
parser.add_argument('-w', '--scales', nargs='*', type=float,
                    help='scale of the weights')
parser.add_argument('--first_layer_lr', type=float, default=1e-4, metavar='FIRST LR',
                    help='First layer lr')
parser.add_argument('--lr_factor', type=float, default=1e-4, metavar='LR RATIO',
                    help='Factor with which first layer lr is multiplied to obtain second layer lr')
parser.add_argument('--normalized', action='store_true', default=False,
                    help='normalize sample norm across features')
parser.add_argument('--risk-loss', type=str, default='MSE', metavar='LOSS',
                    help='Loss for validation')
parser.add_argument('--jacobian', action='store_true', default=False,
                    help='compute the SVD of the jacobian of the network')
parser.add_argument('--save-results', action='store_true', default=False,
                    help='Save the results for plots')
parser.add_argument('--plot', action='store_true', default=False,
                    help='Plot the results')
parser.add_argument('--eigen', action='store_true', default=False,
                    help='Compute eigenvalue')
parser.add_argument('--pcs', type=int, default=None,
                    help='Number of PCs to use in data.')
parser.add_argument('--transform-data', action='store_true', default=False,
                    help='Use data in transformed space')
parser.add_argument('--low-rank-eval', action='store_true', default=False,
                    help='Evaluate performance of low-rank train data.')
parser.add_argument('--weight-eval', action='store_true', default=False,
                    help='Evaluate MSE of weights (linear model).')
parser.add_argument('--details', type=str, metavar='N',
                    default='no_detail_given',
                    help='details about the experimental setup')
parser.add_argument('--num-layers', type=int, default=2,
                    help='number of model layers (1, 2 or 5)')
parser.add_argument('--freeze-layer', type=int, default=None,
                    help='Freezing model layer.')
parser.add_argument('--scaling-layer', action='store_true', default=False,
                    help='Use ScalingLayer as last layer (for analysis).')

args = parser.parse_args(cli_args.split())

# directories: honour --root when it is given, otherwise fall back to the
# parent of the current working directory
root = pathlib.Path(args.root) if args.root else pathlib.Path.cwd().parent

# one output directory per start time (second resolution), created under `root`
current_date = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
args.outpath = root / 'results' / 'two_layer_nn' / current_date

if args.save_results:
    args.outpath.mkdir(exist_ok=True, parents=True)

# seed every random number generator used in this notebook
if args.seed is not None:
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

# CUDA is used whenever it is available and not explicitly disabled
args.device = 'cuda' if (not args.disable_cuda and torch.cuda.is_available()) else 'cpu'
print(args.device)

# learning rates: [first layer, first layer * lr_factor]
args.lr = [args.first_layer_lr, args.first_layer_lr*args.lr_factor]

# --sigma_noise is declared with nargs='*': it is a list when passed on the
# command line but the bare float default when omitted, so only unwrap
# single-element sequences (calling len() on the float default would raise).
if isinstance(args.sigma_noise, (list, tuple)) and len(args.sigma_noise) == 1:
    args.sigma_noise = args.sigma_noise[0]


In [None]:
def get_dataset(args, return_weights=False):
    """Sample a training set and an evaluation set from the linear data model.

    Args:
        args: parsed options. Reads ``dim``, ``sigma_noise``, ``beta``,
            ``sigmas``, ``s_range``, ``coupled_noise``, ``transform_data``,
            ``samples`` and ``device``. ``args.beta`` is replaced in place by
            a numpy array when it is not None. Normalisation is always passed
            as ``False`` here, independent of ``args.normalized``.
        return_weights: when True, ``lin_model.beta`` is appended to the
            returned tuple.

    Returns:
        ``(Xs, ys, Xt, yt)`` as torch tensors on ``args.device``, with the
        targets reshaped to a single column; followed by ``lin_model.beta``
        if ``return_weights`` is set. Both sets contain ``args.samples``
        draws.
    """
    def on_device(array):
        # same conversion for inputs and targets
        return torch.Tensor(array).to(args.device)

    if args.beta is not None:
        args.beta = np.array(args.beta)

    lin_model = linear_model(
        args.dim,
        sigma_noise=args.sigma_noise,
        beta=args.beta,
        normalized=False,
        sigmas=args.sigmas,
        s_range=args.s_range,
        coupled_noise=args.coupled_noise,
        transform_data=args.transform_data,
    )

    # training set is drawn first ...
    train_inputs, train_targets = lin_model.sample(args.samples, train=True)
    Xs = on_device(train_inputs)
    ys = on_device(train_targets.reshape((-1, 1)))

    # ... followed by the set used for the empirical risk
    test_inputs, test_targets = lin_model.sample(args.samples, train=False)
    Xt = on_device(test_inputs)
    yt = on_device(test_targets.reshape((-1, 1)))

    dataset = (Xs, ys, Xt, yt)
    if not return_weights:
        return dataset
    return dataset + (lin_model.beta,)
    
    
def get_model(args):
    """Build the one-layer network used for the loss-surface scan.

    The model is a ``torch.nn.Sequential`` holding a single linear layer that
    maps ``args.dim`` inputs to one output. The layer has a bias unless
    ``args.no_bias`` is set, and the module is moved to ``args.device``.
    """
    use_bias = not args.no_bias
    layer = torch.nn.Linear(args.dim, 1, bias=use_bias)
    return torch.nn.Sequential(layer).to(args.device)


In [None]:
# Train/test data plus the `beta` vector of the linear model that generated them
Xs, ys, Xt, yt, ws = get_dataset(args, return_weights=True)

# Singular values of the training inputs. The tensor is copied to host memory
# first: numpy cannot read a CUDA tensor, and args.device selects CUDA
# automatically whenever it is available.
u, s, vh = np.linalg.svd(Xs.cpu().numpy())
print(s)

model = get_model(args)

In [None]:
# Loss definitions.
# Training loss: squared error summed over all samples.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Risk: mean absolute error when --risk-loss is 'L1'; every other value
# re-uses the training loss object.
if args.risk_loss == 'L1':
    risk_fn = torch.nn.L1Loss(reduction='mean')
else:
    risk_fn = loss_fn

In [None]:
# Evaluate the training loss and the test risk on a regular grid over the two
# weights (u, v) of the linear layer.
w_min, w_max, w_step = -10.0, 10.0, 0.1
u, v = np.arange(w_min, w_max, w_step), np.arange(w_min, w_max, w_step)
U, V = np.meshgrid(u, v)

U, V = U.astype(np.float32), V.astype(np.float32)

nu, nv = u.shape[0], v.shape[0]
# rows are indexed by v (j), columns by u (i)
losses, risks = np.zeros((nv, nu)), np.zeros((nv, nu))

# The scan only makes sense for a layer with exactly two weights.
if tuple(model[0].weight.shape) != (1, 2):
    raise ValueError(
        "loss-surface scan needs a layer with weight shape (1, 2), got "
        f"{tuple(model[0].weight.shape)}"
    )

with torch.no_grad():
    weight = model[0].weight
    for i in range(nu):
        for j in range(nv):
            # Overwrite the existing parameter in place. Assigning a freshly
            # built Parameter would put the weights on the CPU and break the
            # forward pass when the data lives on a CUDA device.
            weight.copy_(torch.tensor([[float(U[j, i]), float(V[j, i])]]))

            # Train
            losses[j, i] = loss_fn(model(Xs), ys).item()

            # Test
            risks[j, i] = risk_fn(model(Xt), yt).item()

            # TODO: also record the MSE of the weights?
        

In [None]:
# Directory holding the stored training runs. Runs on transformed data live in
# their own sub-directory and get a tag (`sup`) that is used in plot file names.
RESULTS_DIR = "../results/one_layer_results_l2" + ("/transform_data" if args.transform_data else "")
sup = "transformed_data" if args.transform_data else ''

def append_id(filename, id):
    """Return *filename* with ``_<id>`` inserted in front of its extension.

    The extension is determined with ``os.path.splitext``, so only a dot in
    the final path component counts. A name without an extension simply gets
    the suffix appended, and dots in directory names (e.g. ``../results/run``)
    are left alone.

    Args:
        filename: file name or path to extend.
        id: suffix to insert; formatted with ``str()``.

    Returns:
        The new file name, e.g. ``append_id("a/b.csv", "x") == "a/b_x.csv"``.
    """
    stem, extension = os.path.splitext(filename)
    return f"{stem}_{id}{extension}"

def get_run(lr1, lr2, batch_norm, uniform_noise=False, coupled_noise=None, ext=''):
    """Load four columns of a stored training run from ``RESULTS_DIR``.

    The file name starts as ``lr=<lr1>_<lr2>.csv`` and receives, in this
    order, the suffixes ``batch_norm`` (or ``uniform_noise`` when batch norm
    is off), ``coupled_noise_<value>`` when ``coupled_noise`` is not None, and
    finally ``ext`` (always appended, even when empty).

    Returns:
        Columns 0, 1, 4 and 5 of the header-less CSV as a tuple of pandas
        Series. The column choice is hard-coded; this notebook unpacks them
        as risk, loss, u and v.
    """
    suffixes = []
    if batch_norm:
        suffixes.append("batch_norm")
    elif uniform_noise:
        suffixes.append("uniform_noise")
    if coupled_noise is not None:
        suffixes.append(f"coupled_noise_{coupled_noise}")
    suffixes.append(ext)

    csv_path = os.path.join(RESULTS_DIR, f"lr={lr1}_{lr2}.csv")
    for suffix in suffixes:
        csv_path = append_id(csv_path, suffix)

    frame = pd.read_csv(csv_path, header=None)

    return tuple(frame[column] for column in (0, 1, 4, 5))
    
# Load the stored run: risk, loss and the two weight coordinates per row.
risk_sample, loss_sample, u_sample, v_sample = get_run(
    0.001,
    0.001,
    batch_norm=False,
    uniform_noise=False,
    coupled_noise=None,
    ext='5.0_dim_2_samples_10_linear',
)

In [None]:
# Heat maps of the training-loss (left) and test-risk (right) surfaces with
# the stored weight trajectory drawn on top of both.
fig, ax = plt.subplots(1, 2, figsize=(10, 3))

# Surfaces
im1 = ax[0].imshow(losses)
im2 = ax[1].imshow(risks)
fig.colorbar(im1, ax=ax[0])
fig.colorbar(im2, ax=ax[1])

# Trajectory converted from weight values to image (pixel) coordinates
path_x = (u_sample - w_min) * nu / (w_max - w_min)
path_y = (v_sample - w_min) * nv / (w_max - w_min)

# About four tick intervals per axis; never a zero step on tiny grids
tick_step = max(1, nu // 4)
x_ticks = list(range(0, nu + 1, tick_step))
y_ticks = list(range(0, nv + 1, tick_step))

for panel in ax:
    # Label the pixel positions with the weight values they stand for
    panel.set_xticks(x_ticks)
    panel.set_xticklabels([w_min + i * (w_max - w_min) / nu for i in x_ticks])
    panel.set_xlabel("u")

    panel.set_yticks(y_ticks)
    panel.set_yticklabels([w_min + j * (w_max - w_min) / nv for j in y_ticks])
    panel.set_ylabel("v")

    # Sampled path
    panel.plot(path_x, path_y, '*', color='r', markersize=1)

    # Start
    panel.plot(path_x.iloc[0], path_y.iloc[0], '*', color='g')

    # End: last recorded sample, whatever the length of the run
    panel.plot(path_x.iloc[-1], path_y.iloc[-1], '*', color='r')

plt.savefig(f"../plots/one_layer_loss_surface_{sup}_seed_{args.seed}")

In [None]:
# Risk values of the stored run against their Series index, logarithmic x-axis
fig, ax = plt.subplots()
ax.semilogx(risk_sample)

In [None]:
# Notebook output: show the raw `u_sample` series returned by get_run
u_sample