## Double descent in a two-layer neural network
This notebook contains the relevant code for the following figure in the paper "*Early stopping in deep networks: Double descent and how to eliminate it*":

- Figure 3

In [1]:
#!/usr/bin/env python
# coding: utf-8

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import argparse
import os
import datetime
import pathlib
import random
import json
import numpy as np
import math

import torch

import sys
sys.path.append('code/')
from linear_utils import linear_model
from train_utils import save_config, prune_data

In [11]:
# Experiment configuration, spelled exactly as it would be typed on a command
# line; a later cell splits this string on whitespace and feeds it to argparse.
cli_args = (
    '--seed 12 -d 2 -n 10 '                # seed, feature dimension, sample count
    '--sigmas geo --s-range 0.15 1.0 '     # sigma scheme and its range
    '--beta 10.0 1.5 --sigma_noise 5.0'    # true parameters and output noise
)


In [12]:
"""
A fully-connected ReLU network with one hidden layer, trained to predict y from x
by minimizing the MSE loss.
"""

# Build the command-line interface and parse the options stored in `cli_args`
# (the notebook supplies the arguments as a string instead of reading sys.argv).
parser = argparse.ArgumentParser(description='CLI parameters for training')
parser.add_argument('--root', type=str, default='', metavar='DIR',
                    help='Root directory')
# argparse applies `type` only to *string* defaults, so the default has to be an
# int already; the former float literal 1e4 came through as 10000.0.
parser.add_argument('-t', '--iterations', type=int, default=10000, metavar='ITERATIONS',
                    help='Iterations (default: 1e4)')
parser.add_argument('-n', '--samples', type=int, default=100, metavar='N',
                    help='Number of samples (default: 100)')
# Help text now states the actual default (it previously claimed 1000).
parser.add_argument('--print-freq', type=int, default=100,
                    help='CLI output printing frequency (default: 100)')
parser.add_argument('--gpu', type=int, default=None,
                    help='Number of GPUS to use')
parser.add_argument('--seed', type=int, default=None,
                    help='Random seed')
parser.add_argument('-d', '--dim', type=int, default=50, metavar='DIMENSION',
                    help='Feature dimension (default: 50)')
parser.add_argument('--hidden', type=int, default=200, metavar='DIMENSION',
                    help='Hidden layer dimension (default: 200)')
parser.add_argument('--no-bias', action='store_true', default=False,
                    help='Do not use bias')
parser.add_argument('--linear', action='store_true', default=False,
                    help='Linear activation function')
parser.add_argument('--sigmas', type=str, default=None,
                    help='Sigmas')
# NOTE(review): with nargs='*' a value given on the command line arrives as a
# list of floats, whereas the default stays a bare float.
parser.add_argument('--sigma_noise', nargs='*', type=float, default=0.0,
                    help='Output noise.')
parser.add_argument('--beta', nargs='*', type=float, default=None,
                    help='True model parameters.')
parser.add_argument('--coupled_noise', action='store_true', default=False,
                    help='Couple noise in output to large eigenvalues.')
parser.add_argument('-r','--s-range', nargs='*', type=float,
                    help='Range for sigmas')
parser.add_argument('-w','--scales', nargs='*', type=float,
                    help='scale of the weights')
# NOTE(review): same list-vs-float asymmetry as --sigma_noise (nargs='*' with a
# scalar default).
parser.add_argument('--lr', type=float, default=1e-4, nargs='*', metavar='LR',
                    help='learning rate (default: 1e-4)')
parser.add_argument('--normalized', action='store_true', default=False,
                    help='normalize sample norm across features')
parser.add_argument('--risk-loss', type=str, default='MSE', metavar='LOSS',
                    help='Loss for validation')
parser.add_argument('--jacobian', action='store_true', default=False,
                    help='compute the SVD of the jacobian of the network')
parser.add_argument('--save-results', action='store_true', default=False,
                    help='Save the results for plots')
parser.add_argument('--pcs', type=int, default=None,
                    help='Number of PCs to use in data.')
parser.add_argument('--transform-data', action='store_true', default=False,
                    help='Use data in transformed space')
parser.add_argument('--details', type=str, metavar='N',
                    default='no_detail_given',
                    help='details about the experimental setup')

# Parse the whitespace-separated tokens of the notebook-level argument string.
args = parser.parse_args(cli_args.split())

# directories
# Base directory: --root when supplied, otherwise the parent of the working dir.
root = pathlib.Path(args.root) if args.root else pathlib.Path.cwd().parent

# Each run writes into its own timestamped folder. The path is built from
# `root` so that --root is honoured; with the default (empty) --root this is
# the same location as before.
current_date = datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S')
args.outpath = root / 'results' / 'two_layer_nn' / current_date

# argparse yields a list for --sigma_noise (nargs='*') but a plain float when
# the option is omitted; unwrap single-element lists and leave scalars alone
# instead of calling len() on a float.
if isinstance(args.sigma_noise, list) and len(args.sigma_noise) == 1:
    args.sigma_noise = args.sigma_noise[0]

# Only create the output folder when results are going to be written.
if args.save_results:
    args.outpath.mkdir(exist_ok=True, parents=True)

# Seed every random number generator in use so that runs are repeatable.
if args.seed is not None:
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

In [13]:
d_out = 1  # dimension of the response y


def _to_column_tensors(features, targets):
    """Wrap a sampled (X, y) pair as tensors on `device`, with y reshaped to one column."""
    feature_tensor = torch.Tensor(features).to(device)
    target_tensor = torch.Tensor(targets.reshape((-1, 1))).to(device)
    return feature_tensor, target_tensor


# Linear data-generating model configured from the parsed arguments.
# NOTE(review): `normalized` is fixed to False here, so the --normalized flag
# has no effect on the generator — confirm this is intended.
lin_model = linear_model(
    args.dim,
    sigma_noise=args.sigma_noise,
    beta=args.beta,
    normalized=False,
    sigmas=args.sigmas,
    s_range=args.s_range,
    coupled_noise=args.coupled_noise,
    transform_data=args.transform_data,
)

# Training set: drawn first so the order of random draws stays the same.
Xs, ys = lin_model.sample(args.samples)
Xs, ys = _to_column_tensors(Xs, ys)

# Optional pruning of the training inputs when --pcs is given.
# NOTE(review): only Xs is passed through prune_data; Xt below is not — verify.
if args.pcs:
    Xs = prune_data(Xs, args.pcs)

# Second, independent draw of the same size, used for empirical risk calculation.
Xt, yt = lin_model.sample(args.samples)
Xt, yt = _to_column_tensors(Xt, yt)

In [16]:
# Singular value decomposition of the transposed training inputs.
# Convert the tensor to a NumPy array explicitly: handing a torch.Tensor to
# NumPy relies on implicit conversion, which fails once `device` is a GPU.
Xs_np = Xs.detach().cpu().numpy()
U, S, Vh = np.linalg.svd(Xs_np.T, full_matrices=True)
print(S)  # singular values

[4.251015   0.26966172]
