In [16]:
import pandas as pd
import numpy as np 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [9]:
df = pd.read_csv("https://liangfgithub.github.io/Data/Ames_data.csv")
testID = pd.read_csv(
    'https://liangfgithub.github.io/Data/project1_testIDs.dat',delim_whitespace=' ',header=None)


In [10]:
def numeric_convert(frame):
    # We may want to normalize data as well 
    for col in frame:
        try:
            frame[col] = pd.to_numeric(frame[col])
        except:
            frame[col] = pd.factorize(frame[col])[0]
    
    return frame

def get_split(frame, index):

    frame = frame.drop('Garage_Yr_Blt', axis=1)

    num_rows = np.arange(len(frame))

    test_index = testID.iloc[:,index]
    train_index = np.array([i for i in num_rows if i not in test_index])

    xtest = numeric_convert(frame.iloc[test_index,1:-1].copy())
    xtrain = numeric_convert(frame.iloc[train_index,1:-1].copy())

    # convert to log to get better model
    ytest = np.log(frame.iloc[test_index,-1].copy())
    ytrain = np.log(frame.iloc[train_index,-1].copy())

    return xtrain,xtest,ytrain,ytest




In [11]:
def clean_data(df, cols):
    for col in cols:
        df = df.drop(col, axis=1)
    return df

In [12]:
bad_cols = ['PID', 'Garage_Yr_Blt']
# Set up data for use with scikit-learn
frame = clean_data(df, bad_cols)
cvsplits = []
num_rows = np.arange(len(frame))
for index in range(0,10):
    test_index = testID.iloc[:,index]
    train_index = np.array([i for i in num_rows if i not in test_index])
    cvsplits.append((train_index, test_index.values))

x = numeric_convert(frame.iloc[:,:-1].copy())
for c in x:
    x[c] = x[c] / np.max(np.abs(x[c]))

# convert to log to get better model
y = np.log(frame.iloc[:,-1].copy())

In [17]:
lasso = Lasso()
lambda_dict = {
    'alpha': np.exp(np.linspace(-8, -20, num=80))
}
ridge = Ridge()
lambda_dict = {
    'alpha': np.exp(np.linspace(-8, -20, num=80))
}
ridge_cv_res = GridSearchCV(ridge, lambda_dict, verbose=4, cv=cvsplits, scoring='neg_root_mean_squared_error', return_train_score=True).fit(x,y)
#for i in range(0,10):
#    print(f'Split {i} lowest RMS, and runtime, lambda for lowest RMS: {np.min(np.abs(lasso_cv_res.cv_results_[f'split{i}_test_score']))}, )
#np.min(np.abs(las.cv_results_['split0_test_score']))


Fitting 10 folds for each of 80 candidates, totalling 800 fits
[CV 1/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.146) total time=   0.0s
[CV 2/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.167) total time=   0.0s
[CV 4/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.163) total time=   0.0s
[CV 5/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.155) total time=   0.0s
[CV 6/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.146) total time=   0.0s
[CV 7/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=0.00033546262790251185;, score=(train=-0.149, test=-0.162) total time=   0.0s
[CV 10/10] END alpha=0.00033546262790251185;, score=(tr

[CV 6/10] END alpha=9.951622384054425e-05;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=9.951622384054425e-05;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=9.951622384054425e-05;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=9.951622384054425e-05;, score=(train=-0.149, test=-0.162) total time=   0.0s
[CV 10/10] END alpha=9.951622384054425e-05;, score=(train=-0.149, test=-0.155) total time=   0.0s
[CV 1/10] END alpha=8.549192699463132e-05;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=8.549192699463132e-05;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=8.549192699463132e-05;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=8.549192699463132e-05;, score=(train=-0.149, test=-0.162) total time=   0.0s
[CV 5/10] END alpha=8.549192699463132e-05;, score=(train=-0.149, test=-0.155) total time=   0.0s
[CV 6/10] END alpha=8.5491926

[CV 1/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=2.5361494949683764e-05;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha

[CV 2/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=6.463322273094375e-06;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha=5.552480335262054e-06;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=5.5524803

[CV 10/10] END alpha=1.917368352190205e-06;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=1.6471637373410722e-06;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha

[CV 3/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=4.19776124874006e-07;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha=3.6061928836167033e-07;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=3.6061928836167033e-07;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=3.6061928836167

[CV 7/10] END alpha=1.0697903980006664e-07;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=1.0697903980006664e-07;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=1.0697903980006664e-07;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=1.0697903980006664e-07;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha=9.190304763972481e-08;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=9.190304763972481e-08;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=9.190304763972481e-08;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=9.190304763972481e-08;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=9.190304763972481e-08;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=9.190304763972481e-08;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=9.190

[CV 5/10] END alpha=2.726337749670557e-08;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=2.726337749670557e-08;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=2.726337749670557e-08;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=2.726337749670557e-08;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=2.726337749670557e-08;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=2.726337749670557e-08;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha=2.342129341955425e-08;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=2.342129341955425e-08;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=2.342129341955425e-08;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=2.342129341955425e-08;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=2.3421293

[CV 9/10] END alpha=8.087781326270034e-09;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=8.087781326270034e-09;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 1/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 2/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 3/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 4/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 5/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.154) total time=   0.0s
[CV 6/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.145) total time=   0.0s
[CV 7/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=6.948012937085725e-09;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=6.9480129

[CV 7/10] END alpha=2.061153622438558e-09;, score=(train=-0.149, test=-0.149) total time=   0.0s
[CV 8/10] END alpha=2.061153622438558e-09;, score=(train=-0.149, test=-0.166) total time=   0.0s
[CV 9/10] END alpha=2.061153622438558e-09;, score=(train=-0.149, test=-0.161) total time=   0.0s
[CV 10/10] END alpha=2.061153622438558e-09;, score=(train=-0.149, test=-0.154) total time=   0.0s


In [18]:
for i in range(0, 10):
    score = np.abs(lasso_cv_res.cv_results_[f'split{i}_test_score'])
    best_idx = np.where(score == np.min(score))
    score = score[best_idx][0]
    a = lasso_cv_res.cv_results_['param_alpha'][best_idx][0]
    print(f'Split {i} RMSE: {score}, alpha: {a}')


Split 0 RMSE: 0.14475847299862998, alpha: 2.061153622438558e-09
Split 1 RMSE: 0.14884987236690522, alpha: 2.061153622438558e-09
Split 2 RMSE: 0.1656972360347579, alpha: 2.061153622438558e-09
Split 3 RMSE: 0.16106085948906285, alpha: 2.061153622438558e-09
Split 4 RMSE: 0.1541795410120169, alpha: 2.061153622438558e-09
Split 5 RMSE: 0.1447323471093002, alpha: 2.061153622438558e-09
Split 6 RMSE: 0.14885521289644119, alpha: 2.061153622438558e-09
Split 7 RMSE: 0.16562979056799257, alpha: 2.061153622438558e-09
Split 8 RMSE: 0.160902892439138, alpha: 2.061153622438558e-09
Split 9 RMSE: 0.15417083004121765, alpha: 2.061153622438558e-09


In [20]:
rf = RandomForestRegressor(criterion='squared_error')
lambda_dict = {
    'alpha': np.exp(np.linspace(2, -13, num=80))
}
rv_cv_res = GridSearchCV(rf, lambda_dict, verbose=4, cv=cvsplits, scoring='neg_root_mean_squared_error', return_train_score=True).fit(x,y)
#for i in range(0,10):

Fitting 10 folds for each of 200 candidates, totalling 2000 fits


ValueError: Invalid parameter alpha for estimator RandomForestRegressor(). Check the list of available parameters with `estimator.get_params().keys()`.