In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.special import eval_legendre, legendre
%matplotlib inline

In [None]:
def Legendre(k, x):
    """Evaluate the Legendre polynomial of degree ``k`` at ``x``.

    Parameters
    ----------
    k : int
        Degree of the polynomial; must be a nonnegative integer.
    x : float or array_like
        Point(s) at which the polynomial is evaluated.

    Returns
    -------
    float or numpy.ndarray
        ``P_k(x)``, with the same shape as ``x``.

    Raises
    ------
    ValueError
        If ``k`` is negative or not an integer.
    """
    degree = int(k)
    if degree != k or degree < 0:
        raise ValueError("k must be a nonnegative integer.")
    # Evaluate directly instead of building a full polynomial object
    # (roots + power-basis coefficients) only to call it once.
    return eval_legendre(degree, x)
def f_x(x, Q_f, aq):
    """Evaluate the target f(x) = sum_{q=0}^{Q_f} a_q * L_q(x).

    Parameters
    ----------
    x : float or array_like
        Point(s) at which the target is evaluated.
    Q_f : int
        Highest Legendre degree in the sum.
    aq : array_like
        The ``Q_f + 1`` coefficients ``a_0 .. a_{Q_f}``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar when ``x`` is a scalar, otherwise an array shaped like ``x``.
    """
    # One call yields L_0(x) .. L_{Q_f}(x) along the last axis, instead of
    # constructing Q_f + 1 separate polynomial objects for every evaluation.
    basis = np.polynomial.legendre.legvander(x, Q_f)
    values = np.sum(basis * aq, axis=-1)
    # legvander promotes a scalar x to a length-1 array; undo that so scalar
    # callers (e.g. ``y[i] = f_x(x[i], ...)``) still receive a scalar.
    return values[0] if np.ndim(x) == 0 else values

def get_weights(Q_f, sigma, N, target=None):
    """Draw a random target and training set, then fit 2nd- and 10th-order polynomials.

    The coefficients ``a_q`` are drawn from a standard normal and rescaled by
    ``1 / sqrt(sum_{q=0}^{Q_f} 1 / (2q + 1))``.  With x uniform on [-1, 1] this
    gives the target unit expected square.  The q = 0 term belongs in the sum
    because ``a_0`` is drawn and used like every other coefficient.

    Training data are ``y_n = f(x_n) + sigma * eps_n`` with ``x_n`` uniform on
    [-1, 1] and ``eps_n`` standard normal.  Both models are least-squares fits
    on the monomial features ``1, x, ..., x^d``.

    Parameters
    ----------
    Q_f : int
        Highest Legendre degree of the target.
    sigma : float
        Standard deviation of the additive noise.
    N : int
        Number of training points.
    target : callable, optional
        Function called as ``target(x_n, Q_f, a_q)`` for each scalar ``x_n``;
        defaults to ``f_x``.

    Returns
    -------
    w_2 : numpy.ndarray, shape (3,)
        Weights of the 2nd-order fit (constant term first).
    w_10 : numpy.ndarray, shape (11,)
        Weights of the 10th-order fit (constant term first).
    a_q : numpy.ndarray, shape (Q_f + 1,)
        Rescaled target coefficients.
    """
    if target is None:
        target = f_x

    a_q = np.random.normal(size=Q_f + 1)
    degrees = np.arange(Q_f + 1)
    a_q = a_q / np.sqrt(np.sum(1.0 / (2 * degrees + 1)))

    x = np.random.uniform(-1, 1, N)
    eps = np.random.normal(size=N)
    # The target is called point by point so any scalar-only implementation works.
    y = np.array([target(x_n, Q_f, a_q) for x_n in x], dtype=float) + sigma * eps

    # Columns are x^0 .. x^10; the first three columns are the 2nd-order features.
    design_10 = np.vander(x, 11, increasing=True)
    # lstsq minimises ||X w - y|| directly.  It yields the same minimiser as
    # (X^T X)^{-1} X^T y when X has full column rank, without explicitly
    # inverting the badly conditioned normal matrix of a degree-10 monomial basis.
    w_2 = np.linalg.lstsq(design_10[:, :3], y, rcond=None)[0]
    w_10 = np.linalg.lstsq(design_10, y, rcond=None)[0]
    return w_2, w_10, a_q
def get_out_of_sample(Q_f, sigma, N, w_2, w_10, a_q, target=None):
    """Estimate the out-of-sample error of both fits on a fresh noisy test set.

    A new test set of ``N`` points is drawn: ``x`` uniform on [-1, 1] and
    ``y = f(x) + sigma * eps`` with ``eps`` standard normal.  The error of each
    model is the mean over the test points of ``(w^T z(x) - y)^2``, where
    ``z(x) = (1, x, ..., x^d)``.

    Parameters
    ----------
    Q_f : int
        Highest Legendre degree of the target (passed through to ``target``).
    sigma : float
        Standard deviation of the additive test noise.
    N : int
        Number of test points.
    w_2 : array_like, shape (3,)
        Weights of the 2nd-order model (constant term first).
    w_10 : array_like, shape (11,)
        Weights of the 10th-order model (constant term first).
    a_q : array_like
        Target coefficients (passed through to ``target``).
    target : callable, optional
        Function called as ``target(x_n, Q_f, a_q)`` for each scalar ``x_n``;
        defaults to ``f_x``.

    Returns
    -------
    Eout2, Eout10 : float
        Mean squared test error of the 2nd- and 10th-order model.
    """
    if target is None:
        target = f_x

    x = np.random.uniform(-1, 1, N)
    clean = np.array([target(x_n, Q_f, a_q) for x_n in x], dtype=float)
    y = clean + sigma * np.random.normal(size=N)

    # Columns are x^0 .. x^10; the first three columns are the 2nd-order features.
    design_10 = np.vander(x, 11, increasing=True)
    Eout2 = np.mean((design_10[:, :3].dot(w_2) - y) ** 2)
    Eout10 = np.mean((design_10.dot(w_10) - y) ** 2)
    return Eout2, Eout10

In [None]:
# Experiment 1: out-of-sample error of both fits over a grid of
# sample size N and target complexity Q_f.
np.random.seed(0)  # fixed seed so a fresh Restart & Run All reproduces the results

N_TEST_SETS = 100   # number of independent test sets averaged per grid point
TEST_SET_SIZE = 10  # number of points drawn for each test set

# N is from 20 to 120 with step 5
N = np.arange(20, 120 + 1, 5)
nsize = N.size
# Q_f is from 1 to 100 with step 1
Q_f = np.arange(1, 100 + 1)
qfsize = Q_f.size
# Flatten the grid: entry i pairs N[i] with Q_f[i]; Q_f varies fastest.
N = np.repeat(N, qfsize)
Q_f = np.tile(Q_f, nsize)
sigma = np.sqrt(0.1)
Eout2 = np.zeros(nsize * qfsize)
Eout10 = np.zeros(nsize * qfsize)
Eout2_test = np.zeros(N_TEST_SETS)
Eout10_test = np.zeros(N_TEST_SETS)
# For every (N, Q_f) pair one target and one training set are drawn and both
# models are fitted once; the fitted models are then scored on N_TEST_SETS
# fresh test sets and the scores are averaged.
# NOTE(review): only the test set is resampled.  If the intent was to average
# over independent experiments (new target and training set each time), the
# get_weights call has to move inside the inner loop -- confirm.
for i in range(nsize * qfsize):
    w_2, w_10, a_q = get_weights(Q_f[i], sigma, N[i])
    for j in range(N_TEST_SETS):
        Eout2_test[j], Eout10_test[j] = get_out_of_sample(
            Q_f[i], sigma, TEST_SET_SIZE, w_2, w_10, a_q)
    Eout2[i] = Eout2_test.mean()
    Eout10[i] = Eout10_test.mean()

In [None]:
# Overfit measure: out-of-sample error of the 10th-order fit minus that of the
# 2nd-order fit.  Positive values mean the 10th-order fit had the larger error.
# Values above 0.2 are set to 0.2 and values below -0.2 are set to -0.2.
Overfit = np.clip(Eout10 - Eout2, -0.2, 0.2)
data = pd.DataFrame({'Overfit': Overfit, 'Q_f': Q_f, 'N': N})
data.to_csv("overfit.csv")

In [None]:
def frange(x, y, jump):
    """Yield ``x, x + jump, x + 2*jump, ...`` for as long as the value is ``< y``.

    Each value is computed as ``x + k * jump`` rather than by repeatedly adding
    ``jump`` to a running total, so floating-point error does not accumulate
    from one step to the next.

    Parameters
    ----------
    x : number
        First value produced (if ``x < y``).
    y : number
        Exclusive upper bound.
    jump : number
        Positive step between consecutive values.

    Raises
    ------
    ValueError
        On first iteration, if ``x < y`` and ``jump`` is not positive; such a
        sequence would never reach ``y``.
    """
    if x < y and not jump > 0:
        raise ValueError("jump must be positive")
    k = 0
    value = x
    while value < y:
        yield value
        k += 1
        value = x + k * jump
# Experiment 2: out-of-sample error of both fits over a grid of
# sample size N and noise level, for a fixed target complexity.
np.random.seed(0)  # fixed seed so a fresh Restart & Run All reproduces the results

N_TEST_SETS = 100   # number of independent test sets averaged per grid point
TEST_SET_SIZE = 10  # number of points drawn for each test set

# N is from 20 to 120 with step 5
N = np.arange(20, 120 + 1, 5)
nsize = N.size
# The grid is laid out in the noise variance sigma^2: it starts at 0 and
# advances in steps of 0.05 while the value is < 2.  sigma is the square root
# of each grid value.
sigma = np.sqrt(np.array(list(frange(0, 2, 0.05))))
sigmasize = sigma.size
# Flatten the grid: entry i pairs N[i] with sigma[i]; sigma varies fastest.
N = np.repeat(N, sigmasize)
sigma = np.tile(sigma, nsize)
Q_f = 20
Eout2 = np.zeros(nsize * sigmasize)
Eout10 = np.zeros(nsize * sigmasize)
Eout2_test = np.zeros(N_TEST_SETS)
Eout10_test = np.zeros(N_TEST_SETS)
# For every (N, sigma) pair one target and one training set are drawn and both
# models are fitted once; the fitted models are then scored on N_TEST_SETS
# fresh test sets and the scores are averaged.
# NOTE(review): only the test set is resampled.  If the intent was to average
# over independent experiments (new target and training set each time), the
# get_weights call has to move inside the inner loop -- confirm.
for i in range(nsize * sigmasize):
    w_2, w_10, a_q = get_weights(Q_f, sigma[i], N[i])
    for j in range(N_TEST_SETS):
        Eout2_test[j], Eout10_test[j] = get_out_of_sample(
            Q_f, sigma[i], TEST_SET_SIZE, w_2, w_10, a_q)
    Eout2[i] = Eout2_test.mean()
    Eout10[i] = Eout10_test.mean()
    # Report progress once per N value instead of once per grid point.
    if (i + 1) % sigmasize == 0:
        print("finished N = {}".format(N[i]))

In [None]:
# Overfit measure: out-of-sample error of the 10th-order fit minus that of the
# 2nd-order fit.  Positive values mean the 10th-order fit had the larger error.
# Values above 0.2 are set to 0.2 and values below -0.2 are set to -0.2.
Overfit = np.clip(Eout10 - Eout2, -0.2, 0.2)
data = pd.DataFrame({'Overfit': Overfit, 'sigma': sigma, 'N': N})
data.to_csv("overfit_sigma2.csv")