In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import warnings

In [2]:
df_college = pd.read_csv('college.csv')
df_college.isna().any().any()

False

In [3]:
x_list = df_college.columns.to_list()
x_list.remove('Apps')
x_list.remove('Private')
x = df_college[x_list]
y = df_college['Apps']
x_std = (x - x.mean())/y.std(0)
y_std = y - y.mean()

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train_std, x_test_std, y_train_std, y_test_std = train_test_split(x_std, y_std, test_size=0.2, random_state=42)

In [5]:
def repeat_test(ridge_true, std_true):
    warnings.filterwarnings("ignore")
    if ridge_true:
        model = Ridge()
        result_str = 'For ridge model '
    else:
        model = Lasso()
        result_str = 'For lasso model '
    if std_true:
        x_train_temp = x_train_std
        y_train_temp = y_train_std
        x_test_temp = x_test_std
        y_test_temp = y_test_std
        result_str += 'with standardization'
    else:
        x_train_temp = x_train
        y_train_temp = y_train
        x_test_temp = x_test
        y_test_temp = y_test
        result_str += 'without standardization'
    alphas = np.logspace(0,8,100)
    grid_ridge= GridSearchCV(model, param_grid = dict(alpha = alphas), cv=5, scoring='neg_mean_squared_error')
    grid_ridge.fit(x_train_temp, y_train_temp)
    alpha_without_ridge = grid_ridge.best_estimator_.alpha
    result_str += "\nthe best alpha is %.2f and the best score is %.4f" %(alpha_without_ridge, grid_ridge.best_score_)
    ridge = Ridge(alpha = alpha_without_ridge)
    ridge.fit(x_train_temp, y_train_temp)
    RSS = np.sum((ridge.predict(x_test_temp) - y_test_temp) ** 2)
    tr_R2 = ridge.score(x_train_temp, y_train_temp)
    te_R2 = ridge.score(x_test_temp, y_test_temp)
    tr_er = mean_squared_error(y_train_temp, ridge.predict(x_train_temp))
    te_er = mean_squared_error(y_test_temp, ridge.predict(x_test_temp))
    result_str += '\ntraining error: ' + str(tr_er) + '\ntesting error: ' + str(te_er) + "\nThe R^2 for training set:" + str(tr_R2)
    result_str += "\nThe R^2 for test set:" + str(te_R2)
    result_str += "\nRSS: %.2f" %(RSS)
    result1 = pd.DataFrame(ridge.coef_).transpose()
    result1.columns = x_list
    result1['intercept'] = ridge.intercept_ 
    result1 = result1.transpose()
    result1.columns = ['coefficient']
    result1
    result_dict = {'RSS': RSS, 'training R2': tr_R2, 'testing R2': te_R2, 'training error': tr_er, 'testing R2': te_er, 'alpha': alpha_without_ridge}
    return result_str, result1, result_dict

In [6]:
rs_str, rs1, rs_dict = repeat_test(True, True)
rn_str, rn1, rn_dict = repeat_test(True, False)
ls_str, ls1, ls_dict = repeat_test(False, True)
ln_str, ln1, ln_dict = repeat_test(False, False)


In [7]:
print(rs_str)
print('-'*20)
print(rn_str)
print('-'*20)
print(ls_str)
print('-'*20)
print(ln_str)

For ridge model with standardization
the best alpha is 1.00 and the best score is -1572313.6346
training error: 1250765.8458008112
testing error: 1589538.1301434354
The R^2 for training set:0.9185173955752879
The R^2 for test set:0.8804561448160135
RSS: 247967948.30
--------------------
For ridge model without standardization
the best alpha is 2056.51 and the best score is -1298199.1190
training error: 1022037.3601631394
testing error: 1444655.7529332454
The R^2 for training set:0.9334181803852097
The R^2 for test set:0.8913522646331353
RSS: 225366297.46
--------------------
For lasso model with standardization
the best alpha is 1.00 and the best score is -1470487.6623
training error: 1250765.8458008112
testing error: 1589538.1301434354
The R^2 for training set:0.9185173955752879
The R^2 for test set:0.8804561448160135
RSS: 247967948.30
--------------------
For lasso model without standardization
the best alpha is 220.51 and the best score is -1297819.5340
training error: 1021140.80741

In [8]:
coefficient = pd.concat([rs1, rn1, ls1, ln1], axis=1)
rename_columns = ['ridge_std', 'ridge_not_std', 'lasso_std', 'lasso_not_std']
coefficient.columns = rename_columns
coefficient

Unnamed: 0,ridge_std,ridge_not_std,lasso_std,lasso_not_std
Accept,5970.593651,1.661353,5970.593651,1.664409
Enroll,-1726.879874,-0.996341,-1726.879874,-1.001874
Top10perc,843.214745,47.019417,843.214745,50.525712
Top25perc,690.596803,-12.245495,690.596803,-14.661253
F.Undergrad,227.410203,0.074535,227.410203,0.073456
P.Undergrad,-433.924746,-0.01774,-433.924746,-0.015181
Outstate,-227.68031,-0.101645,-227.68031,-0.101315
Room.Board,579.110401,0.136495,579.110401,0.136372
Books,639.138318,0.097486,639.138318,0.083943
Personal,31.189998,0.046206,31.189998,0.047102


In [9]:
factors = pd.DataFrame.from_dict([rs_dict, rn_dict, ls_dict, rn_dict])
factors['type'] = rename_columns
factors.set_index('type')
factors

Unnamed: 0,RSS,training R2,testing R2,training error,alpha,type
0,247967900.0,0.918517,1589538.0,1250766.0,1.0,ridge_std
1,225366300.0,0.933418,1444656.0,1022037.0,2056.512308,ridge_not_std
2,247967900.0,0.918517,1589538.0,1250766.0,1.0,lasso_std
3,225366300.0,0.933418,1444656.0,1022037.0,2056.512308,lasso_not_std
