## Part h: Cross-validation and resampling techniques

In [1]:
# Import libraries and functions:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# import our own implementations
import importlib, OLS, prepare_data, polynomial_features,K_fold
importlib.reload(OLS)
importlib.reload(prepare_data)
importlib.reload(polynomial_features)

from prepare_data import prepare_data
from polynomial_features import polynomial_features
from OLS import OLS_parameters
from Ridge import Ridge_parameters
from K_fold import k_fold_split
from sklearn.model_selection import KFold
from sklearn.linear_model import Lasso

# Styling function
from Set_latex_params import set_mpl_latex_style
#set_mpl_latex_style()

In [2]:
# Prepare data: prepare_data(n=100) is unpacked into seven values; only the
# first two (x, y) and the last (y_noisy) are kept, the middle four are discarded.
# NOTE(review): the cross-validation cells below index into y, not y_noisy —
# confirm that this is the intended target.
x, y, _, _, _, _, y_noisy = prepare_data(n=100)


In [3]:
# Sanity run of our own k-fold splitter: fit a degree-5 polynomial with OLS on
# each training fold and report the mean test MSE over the five folds.
rs = 6114  # seed handed to k_fold_split (shuffling is switched off in this cell)
deg = 5

folds = k_fold_split(x, y, k=5, shuffle=False, random_state=rs)
mses = []

for fit_rows, eval_rows in folds:
    x_fit, y_fit = x[fit_rows], y[fit_rows]
    x_eval, y_eval = x[eval_rows], y[eval_rows]

    # intercept=False is passed to polynomial_features (presumably omitting the
    # constant column); the training-target mean is added to the prediction instead.
    # NOTE(review): the fit itself uses the un-centred target and un-centred
    # features, so adding the mean afterwards may bias the prediction — confirm.
    design_fit = polynomial_features(x_fit, deg, intercept=False)
    design_eval = polynomial_features(x_eval, deg, intercept=False)

    beta = OLS_parameters(design_fit, y_fit)
    y_hat = design_eval @ beta + y_fit.mean()

    # scikit-learn's documented order is (y_true, y_pred); MSE is symmetric.
    mses.append(mean_squared_error(y_eval, y_hat))

print(f"Mean across folds {np.mean(mses)}")


Mean across folds 11.549058220651961


### Sanity check with imported library
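When shuffling is enabled, there is randomness in which elements end up in the train and test sets, so we cannot expect the two implementations to give identical results, only that they behave similarly. In the check below shuffling is disabled (`shuffle=False`) in both runs, so the splits are deterministic.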

In [4]:
# Same experiment as the previous cell, but the fold indices come from
# scikit-learn's KFold (5 splits, no shuffling) instead of our own splitter.
deg = 5

folds = KFold(n_splits=5, shuffle=False)
mses = []

for fit_rows, eval_rows in folds.split(x):
    x_fit, y_fit = x[fit_rows], y[fit_rows]
    x_eval, y_eval = x[eval_rows], y[eval_rows]

    # intercept=False is passed to polynomial_features (presumably omitting the
    # constant column); the training-target mean is added to the prediction instead.
    # NOTE(review): the fit itself uses the un-centred target and un-centred
    # features, so adding the mean afterwards may bias the prediction — confirm.
    design_fit = polynomial_features(x_fit, deg, intercept=False)
    design_eval = polynomial_features(x_eval, deg, intercept=False)

    beta = OLS_parameters(design_fit, y_fit)
    y_hat = design_eval @ beta + y_fit.mean()

    # scikit-learn's documented order is (y_true, y_pred); MSE is symmetric.
    mses.append(mean_squared_error(y_eval, y_hat))

print(f"Mean across folds {np.mean(mses)}")




Mean across folds 11.549058220651961


Here we did not shuffle, and ran both the imported and our self-made k-fold cross-validation. We got the same MSE from both, which indicates that our algorithm works as intended.

In [None]:
# Compare the cross-validated test MSE of OLS, Ridge and Lasso as a function of
# polynomial degree, using our own k-fold split (k=2, unshuffled) per degree.
# Results are collected in mse_plot_OLS / mse_plot_Ridge / mse_plot_Lasso,
# one entry per degree.
max_degrees = 15


alpha_lasso = 1e-1
# NOTE(review): alpha_ridge is defined but never handed to Ridge_parameters,
# whose signature is not visible here, so Ridge runs with whatever default it
# has. Pass alpha_ridge once the parameter name/position is confirmed.
alpha_ridge = 1e-3

mse_plot_OLS = []
mse_plot_Ridge = []
mse_plot_Lasso = []

# range() stops before max_degrees, so degrees 1 .. max_degrees - 1 are evaluated.
for deg in range(1, max_degrees):

    # Rebuilt for every degree in case k_fold_split returns a one-shot iterator.
    folds = k_fold_split(x, y, k=2, shuffle=False, random_state=rs)
    mses_OLS = []
    mses_Ridge = []
    mses_Lasso = []

    for train_idx, test_idx in folds:
        x_train_f = x[train_idx]
        x_test_f = x[test_idx]

        y_train_f = y[train_idx]
        y_test_f = y[test_idx]

        # No intercept is fitted by any of the models; the training-target mean
        # is added back to every prediction instead.
        y_offset = y_train_f.mean()

        X_train_f = polynomial_features(x_train_f, deg, intercept=False)
        X_test_f = polynomial_features(x_test_f, deg, intercept=False)

        # Scale with statistics from the training fold only (no test leakage).
        # StandardScaler centres the columns by default, so for the usual
        # least-squares / penalised least-squares solutions the no-intercept
        # coefficients do not depend on the target's mean.
        scaler = StandardScaler()
        X_train_fs = scaler.fit_transform(X_train_f)
        X_test_fs = scaler.transform(X_test_f)

        # OLS
        ols_betas = OLS_parameters(X_train_fs, y_train_f)
        pred_y_f_ols = np.dot(X_test_fs, ols_betas) + y_offset
        mse_f_ols = mean_squared_error(y_test_f, pred_y_f_ols)

        # Ridge — predict with the Ridge coefficients (previously ols_betas was
        # used here, so the "Ridge" curve was just a copy of the OLS curve).
        ridge_betas = Ridge_parameters(X_train_fs, y_train_f)
        pred_y_f_ridge = np.dot(X_test_fs, ridge_betas) + y_offset
        mse_f_ridge = mean_squared_error(y_test_f, pred_y_f_ridge)

        # Lasso — fit_intercept=False, so the target mean has to be added back
        # by hand, exactly as for OLS and Ridge.
        lasso = Lasso(alpha=alpha_lasso, fit_intercept=False, max_iter=10000, random_state=rs)
        lasso.fit(X_train_fs, y_train_f)
        pred_y_f_lasso = lasso.predict(X_test_fs) + y_offset
        mse_f_lasso = mean_squared_error(y_test_f, pred_y_f_lasso)

        mses_OLS.append(mse_f_ols)
        mses_Ridge.append(mse_f_ridge)
        # Store the fold's MSE value (the list used to be appended to itself).
        mses_Lasso.append(mse_f_lasso)

    mse_plot_OLS.append(np.mean(mses_OLS))
    mse_plot_Ridge.append(np.mean(mses_Ridge))
    # Per-degree mean goes into the plot list, not back into the per-fold list.
    mse_plot_Lasso.append(np.mean(mses_Lasso))



1


NameError: name 'k_fold_split' is not defined

1
0
1


KeyboardInterrupt: 