In [2]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
%matplotlib inline

from sklearn.model_selection import KFold
def kfold(k, predictor, X):
    kf = KFold(n_splits = k, random_state = 0)
    count = 0
    coeff_used = 0 
    for trains, tests in kf.split(X):
        x_train = X.iloc[trains, 1:-1]
        y_train = X.iloc[trains, -1]
        x_test = X.iloc[tests, 1:-1]
        y_test = X.iloc[tests, -1]
        predictor.fit(x_train, y_train)
        predictions = predictor.predict(x_test)
        count += np.sqrt(mean_squared_error(predictions, y_test))
    return count/k

from collections import Counter

  from numpy.core.umath_tests import inner1d


In [4]:
# Load training and test sets (assumes you have these in current working directory)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [5]:
# Split training set into X and y (removing first column containing IDs)
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

In [9]:
print y_train

0       0.901355
1       0.913550
2       0.884824
3       0.977236
4       0.921138
5       0.902891
6       0.913731
7       0.964770
8       0.906504
9       0.915537
10      0.910659
11      0.963505
12      0.910840
13      0.921590
14      0.913821
15      0.905601
16      0.913821
17      0.923848
18      0.910840
19      0.908762
20      0.909485
21      0.943812
22      0.879313
23      0.913550
24      0.933424
25      0.897832
26      0.910750
27      0.885637
28      0.890696
29      0.938663
          ...   
5301    0.923848
5302    0.908220
5303    0.932701
5304    0.990967
5305    0.909756
5306    0.934056
5307    0.887624
5308    0.966125
5309    0.906865
5310    0.907498
5311    0.991147
5312    0.894761
5313    0.917706
5314    0.910117
5315    0.897561
5316    0.880036
5317    0.902439
5318    0.914905
5319    0.907498
5320    0.939115
5321    0.908582
5322    0.896387
5323    0.914905
5324    0.938844
5325    0.916441
5326    0.976694
5327    0.993044
5328    0.9185

In [7]:
# Define function to compute RMSE
def scoreRMSE(predictor, X, true_y):
    predictions = predictor.predict(X)
    return np.sqrt(mean_squared_error(predictions, true_y))

In [23]:
for max_depth in [2,4,6,8,10]:
    print max_depth, kfold(max_depth, GradientBoostingRegressor(max_depth = max_depth), train)

2 0.027159124300025404
4 0.027087834207946002
6 0.027236975974814486
8 0.027420647191375807
10

KeyboardInterrupt: 

In [24]:
clf = GradientBoostingRegressor(n_estimators=1000, max_depth=4)
clf.fit(X_train, y_train)
print ("Training RMSE: ", scoreRMSE(clf, X_train, y_train))

 ('Training RMSE: ', 0.012151221347168686)


In [27]:
# Remove first column to make predictions
X_test = test.iloc[:, 1:]
X_test.head()

Unnamed: 0,Feat 1,Feat 2,Feat 3,Feat 4,Feat 5,Feat 6,Feat 7,Feat 8,Feat 9,Feat 10,...,Feat 242,Feat 243,Feat 244,Feat 245,Feat 246,Feat 247,Feat 248,Feat 249,Feat 250,Feat 251
0,0.999849,0.174118,0.999819,0.997841,0.133333,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.728471,0.054397,0.649,0.416164,0.053998,0.667391
1,0.999958,0.164706,1.0,0.996741,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.497255,0.037736,0.375,0.165514,0.101973,0.50665
2,0.999666,0.174118,0.999479,0.997376,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.688941,0.019309,1.0,0.192069,0.1207,0.498784
3,0.999735,0.174118,0.999655,0.997173,0.133333,0.0,0.0,0.0,0.363636,0.166667,...,0.0,0.0,0.0,0,0.654118,0.019089,0.333,0.451252,0.16418,0.774466
4,0.999806,0.164706,0.999551,0.997234,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.627451,0.160433,0.882,0.147407,0.0,0.48124


In [28]:
# Make predictions using linear regression model fitted above
predictions = clf.predict(X_test)

In [29]:
# Format predictions to be compatible with Kaggle upload
sample_submission = pd.DataFrame(data=predictions, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + X_test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("clf1000,4.csv", index=False)

In [10]:
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("sample_submission.csv", index=False)

In [59]:
sample_submission = pd.DataFrame(data=predictions, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + X_test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()

Unnamed: 0,Id,Predicted
0,1,0.933655
1,2,0.909929
2,3,0.91578
3,4,0.924992
4,5,0.935605


In [60]:
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("rfr_imp.csv", index=False)

In [31]:
from sklearn.model_selection import GridSearchCV

params = {
    'min_samples_split': [2, 4, 6, 8, 10, 12, 14],
    'min_samples_leaf': [5, 15, 25, 35, 45, 55, 65]
}

gbR = GradientBoostingRegressor(n_estimators=80, max_depth=6, max_features='sqrt', subsample=0.8)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 49 candidates, totalling 196 fits
[CV] min_samples_leaf=5, min_samples_split=2 .........................
[CV] min_samples_leaf=5, min_samples_split=2 .........................
[CV] min_samples_leaf=5, min_samples_split=2 .........................
[CV] min_samples_leaf=5, min_samples_split=2 .........................
[CV]  min_samples_leaf=5, min_samples_split=2, score=0.06489819361928284, total=   0.8s
[CV]  min_samples_leaf=5, min_samples_split=2, score=0.05908813808983959, total=   0.8s
[CV]  min_samples_leaf=5, min_samples_split=2, score=0.06647496959134724, total=   0.8s
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV]  min_samples_leaf=5, min_samples_split=2, score=0.09423728523129671, total=   0.8s
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV]  min_samples_

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    5.7s


[CV]  min_samples_leaf=5, min_samples_split=14, score=0.051211510885879274, total=   0.9s
[CV]  min_samples_leaf=5, min_samples_split=14, score=0.06353651001431238, total=   0.8s
[CV] min_samples_leaf=15, min_samples_split=2 ........................
[CV] min_samples_leaf=15, min_samples_split=2 ........................
[CV]  min_samples_leaf=5, min_samples_split=14, score=0.08685585684315977, total=   0.9s
[CV]  min_samples_leaf=5, min_samples_split=14, score=0.052296640789551874, total=   0.9s
[CV] min_samples_leaf=15, min_samples_split=2 ........................
[CV] min_samples_leaf=15, min_samples_split=2 ........................
[CV]  min_samples_leaf=15, min_samples_split=2, score=0.06091703231260948, total=   0.8s
[CV] min_samples_leaf=15, min_samples_split=4 ........................
[CV]  min_samples_leaf=15, min_samples_split=2, score=0.058984941088492386, total=   0.9s
[CV] min_samples_leaf=15, min_samples_split=4 ........................
[CV]  min_samples_leaf=15, min_sample

[CV] min_samples_leaf=25, min_samples_split=12 .......................
[CV]  min_samples_leaf=25, min_samples_split=12, score=0.05259485252622631, total=   0.8s
[CV] min_samples_leaf=25, min_samples_split=14 .......................
[CV]  min_samples_leaf=25, min_samples_split=12, score=0.07800426386376569, total=   0.8s
[CV] min_samples_leaf=25, min_samples_split=14 .......................
[CV]  min_samples_leaf=25, min_samples_split=12, score=0.061460123702442464, total=   0.9s
[CV] min_samples_leaf=25, min_samples_split=14 .......................
[CV]  min_samples_leaf=25, min_samples_split=12, score=0.08404789302047855, total=   1.0s
[CV] min_samples_leaf=25, min_samples_split=14 .......................
[CV]  min_samples_leaf=25, min_samples_split=14, score=0.05516111883187835, total=   0.9s
[CV] min_samples_leaf=35, min_samples_split=2 ........................
[CV]  min_samples_leaf=25, min_samples_split=14, score=0.06866031116278648, total=   0.9s
[CV] min_samples_leaf=35, min_sam

[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   28.9s


[CV]  min_samples_leaf=45, min_samples_split=6, score=0.05725541752969809, total=   0.8s
[CV] min_samples_leaf=45, min_samples_split=8 ........................
[CV]  min_samples_leaf=45, min_samples_split=6, score=0.06924781451996265, total=   0.8s
[CV] min_samples_leaf=45, min_samples_split=8 ........................
[CV]  min_samples_leaf=45, min_samples_split=6, score=0.07591099636426735, total=   0.8s
[CV] min_samples_leaf=45, min_samples_split=8 ........................
[CV]  min_samples_leaf=45, min_samples_split=6, score=0.08109953535486925, total=   0.8s
[CV] min_samples_leaf=45, min_samples_split=8 ........................
[CV]  min_samples_leaf=45, min_samples_split=8, score=0.04839318792125069, total=   0.7s
[CV] min_samples_leaf=45, min_samples_split=10 .......................
[CV]  min_samples_leaf=45, min_samples_split=8, score=0.07768280812564854, total=   0.7s
[CV] min_samples_leaf=45, min_samples_split=10 .......................
[CV]  min_samples_leaf=45, min_samples_s

[CV] min_samples_leaf=65, min_samples_split=4 ........................
[CV]  min_samples_leaf=65, min_samples_split=4, score=0.06264464497592648, total=   0.8s
[CV] min_samples_leaf=65, min_samples_split=6 ........................
[CV]  min_samples_leaf=65, min_samples_split=4, score=0.06432428517958755, total=   0.8s
[CV] min_samples_leaf=65, min_samples_split=6 ........................
[CV]  min_samples_leaf=65, min_samples_split=4, score=0.07510601343101353, total=   0.8s
[CV]  min_samples_leaf=65, min_samples_split=4, score=0.08954041110466027, total=   0.7s
[CV] min_samples_leaf=65, min_samples_split=6 ........................
[CV] min_samples_leaf=65, min_samples_split=6 ........................
[CV]  min_samples_leaf=65, min_samples_split=6, score=0.055507601715092614, total=   0.7s
[CV] min_samples_leaf=65, min_samples_split=8 ........................
[CV]  min_samples_leaf=65, min_samples_split=6, score=0.06863209125708847, total=   0.6s
[CV] min_samples_leaf=65, min_samples_s

[Parallel(n_jobs=4)]: Done 196 out of 196 | elapsed:   45.0s finished


{'min_samples_leaf': 45, 'min_samples_split': 4}


In [33]:
params = {
    'max_features': [11, 13, 15, 17, 19]
}

gbR = GradientBoostingRegressor(n_estimators=80, max_depth=6, min_samples_split=4, min_samples_leaf=45, subsample=0.8)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 5 candidates, totalling 20 fits
[CV] max_features=11 .................................................
[CV] max_features=11 .................................................
[CV] max_features=11 .................................................
[CV] max_features=11 .................................................
[CV] ....... max_features=11, score=0.05388722334820051, total=   0.6s
[CV] ....... max_features=11, score=0.06117859054834251, total=   0.6s
[CV] ....... max_features=11, score=0.07561895835158172, total=   0.6s
[CV] max_features=13 .................................................
[CV] max_features=13 .................................................
[CV] ....... max_features=11, score=0.08245608460486387, total=   0.6s
[CV] max_features=13 .................................................
[CV] max_features=13 .................................................
[CV] ....... max_features=13, score=0.06150012539134053, total=   0.7s
[CV] max_features

[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    4.6s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    4.6s finished


{'max_features': 13}


In [36]:
params = {
    'subsample': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85]
}

gbR = GradientBoostingRegressor(n_estimators=80, max_depth=6, min_samples_split=4, min_samples_leaf=45, max_features=13)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 6 candidates, totalling 24 fits
[CV] subsample=0.6 ...................................................
[CV] subsample=0.6 ...................................................
[CV] subsample=0.6 ...................................................
[CV] subsample=0.6 ...................................................
[CV] ........ subsample=0.6, score=0.061974258748876965, total=   0.7s
[CV] ........ subsample=0.6, score=0.059311594020733605, total=   0.7s
[CV] subsample=0.65 ..................................................
[CV] ......... subsample=0.6, score=0.07215398022295316, total=   0.7s
[CV] subsample=0.65 ..................................................
[CV] subsample=0.65 ..................................................
[CV] ......... subsample=0.6, score=0.07236006463706679, total=   0.7s
[CV] subsample=0.65 ..................................................
[CV] ......... subsample=0.65, score=0.0565595148553697, total=   0.6s
[CV] ........ sub

[Parallel(n_jobs=4)]: Done  24 out of  24 | elapsed:    5.1s finished


{'subsample': 0.8}


In [43]:
params = {
    'n_estimators': [300, 400, 500, 600]
}

gbR = GradientBoostingRegressor(learning_rate=0.01, max_depth=6, min_samples_split=4, min_samples_leaf=45, subsample=0.8)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 4 candidates, totalling 16 fits
[CV] n_estimators=300 ................................................
[CV] n_estimators=300 ................................................
[CV] n_estimators=300 ................................................
[CV] n_estimators=300 ................................................
[CV] ...... n_estimators=300, score=0.06801625668927713, total=  24.4s
[CV] n_estimators=400 ................................................
[CV] ...... n_estimators=300, score=0.09040542307890742, total=  24.8s
[CV] n_estimators=400 ................................................
[CV] ...... n_estimators=300, score=0.07791422879202248, total=  24.9s
[CV] n_estimators=400 ................................................
[CV] ...... n_estimators=300, score=0.06493616105500932, total=  24.9s
[CV] n_estimators=400 ................................................
[CV] ....... n_estimators=400, score=0.0668936697487349, total=  30.6s
[CV] n_estimators

[Parallel(n_jobs=4)]: Done  16 out of  16 | elapsed:  2.3min finished


{'n_estimators': 400}


In [51]:
params = {
    'max_depth': [26, 30, 35],
    'min_samples_split': [8, 12, 16]
}

gbR = GradientBoostingRegressor(learning_rate=0.01, n_estimators=400, min_samples_leaf=45, max_features=13, subsample=0.8)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] max_depth=26, min_samples_split=8 ...............................
[CV] max_depth=26, min_samples_split=8 ...............................
[CV] max_depth=26, min_samples_split=8 ...............................
[CV] max_depth=26, min_samples_split=8 ...............................
[CV]  max_depth=26, min_samples_split=8, score=0.07161568274125973, total=   4.9s
[CV]  max_depth=26, min_samples_split=8, score=0.09454373768274471, total=   4.7s
[CV]  max_depth=26, min_samples_split=8, score=0.07932834165418556, total=   4.9s
[CV] max_depth=26, min_samples_split=12 ..............................
[CV]  max_depth=26, min_samples_split=8, score=0.08091927916876851, total=   4.8s
[CV] max_depth=26, min_samples_split=12 ..............................
[CV] max_depth=26, min_samples_split=12 ..............................
[CV] max_depth=26, min_samples_split=12 ..............................
[CV]  max_depth=26, min_samples_split=12, sc

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   29.5s


[CV]  max_depth=35, min_samples_split=8, score=0.07891738445413632, total=   5.7s
[CV]  max_depth=35, min_samples_split=8, score=0.06720723392651418, total=   5.7s
[CV] max_depth=35, min_samples_split=12 ..............................
[CV] max_depth=35, min_samples_split=12 ..............................
[CV]  max_depth=35, min_samples_split=8, score=0.08561673424594596, total=   5.7s
[CV] max_depth=35, min_samples_split=12 ..............................
[CV]  max_depth=35, min_samples_split=8, score=0.09124292814537938, total=   5.7s
[CV] max_depth=35, min_samples_split=12 ..............................
[CV]  max_depth=35, min_samples_split=12, score=0.07000843636163923, total=   5.5s
[CV]  max_depth=35, min_samples_split=12, score=0.0824485484791696, total=   5.5s
[CV] max_depth=35, min_samples_split=16 ..............................
[CV] max_depth=35, min_samples_split=16 ..............................
[CV]  max_depth=35, min_samples_split=12, score=0.0765703266648069, total=   5.5s

[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed:   46.9s finished


{'max_depth': 26, 'min_samples_split': 8}


In [53]:
params = {
    'min_samples_split': [4, 12, 20],
    'min_samples_leaf': [5, 10, 15]
}

gbR = GradientBoostingRegressor(learning_rate=0.01, n_estimators=400, max_depth=26, min_samples_split=4, min_samples_leaf=10, max_features=13, subsample=0.8)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV] min_samples_leaf=5, min_samples_split=4 .........................
[CV]  min_samples_leaf=5, min_samples_split=4, score=0.08495548281614862, total=  17.3s
[CV]  min_samples_leaf=5, min_samples_split=4, score=0.08190060757991124, total=  17.3s
[CV] min_samples_leaf=5, min_samples_split=12 ........................
[CV] min_samples_leaf=5, min_samples_split=12 ........................
[CV]  min_samples_leaf=5, min_samples_split=4, score=0.0764281448220564, total=  17.5s
[CV] min_samples_leaf=5, min_samples_split=12 ........................
[CV]  min_samples_leaf=5, min_samples_split=4, score=0.11001763837874523, total=  17.4s
[CV] min_samples_leaf=5, min_samples_split=12 ........................
[CV]  min_samples_lea

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.4min


[CV]  min_samples_leaf=15, min_samples_split=4, score=0.07166367094820758, total=   8.9s
[CV] min_samples_leaf=15, min_samples_split=12 .......................
[CV]  min_samples_leaf=15, min_samples_split=4, score=0.0859936090762018, total=   8.9s
[CV] min_samples_leaf=15, min_samples_split=12 .......................
[CV]  min_samples_leaf=15, min_samples_split=4, score=0.0828794990740086, total=   9.0s
[CV] min_samples_leaf=15, min_samples_split=12 .......................
[CV]  min_samples_leaf=15, min_samples_split=4, score=0.10783823176390639, total=   8.9s
[CV] min_samples_leaf=15, min_samples_split=12 .......................
[CV]  min_samples_leaf=15, min_samples_split=12, score=0.08840620873323812, total=   8.8s
[CV]  min_samples_leaf=15, min_samples_split=12, score=0.07436141535561347, total=   8.9s
[CV] min_samples_leaf=15, min_samples_split=20 .......................
[CV] min_samples_leaf=15, min_samples_split=20 .......................
[CV]  min_samples_leaf=15, min_samples_s

[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed:  1.8min finished


{'min_samples_leaf': 10, 'min_samples_split': 4}


In [55]:
params = {
    'max_features': [10, 13, 16],
    'subsample': [0.7, 0.8, 0.85]
}

gbR = GradientBoostingRegressor(learning_rate=0.01, n_estimators=400, max_depth=26, min_samples_split=4, min_samples_leaf=10, max_features=13, subsample=0.8)
gs = GridSearchCV(estimator=gbR, param_grid=params, cv=4, n_jobs=4, verbose=3)
gs.fit(X_train, y_train)

print(gs.best_params_)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] max_features=10, subsample=0.7 ..................................
[CV] max_features=10, subsample=0.7 ..................................
[CV] max_features=10, subsample=0.7 ..................................
[CV] max_features=10, subsample=0.7 ..................................
[CV]  max_features=10, subsample=0.7, score=0.07609571069255983, total=   9.3s
[CV] max_features=10, subsample=0.8 ..................................
[CV]  max_features=10, subsample=0.7, score=0.10752313526305612, total=   9.3s
[CV] max_features=10, subsample=0.8 ..................................
[CV]  max_features=10, subsample=0.7, score=0.08725070165731752, total=   9.6s
[CV]  max_features=10, subsample=0.7, score=0.0885290888346495, total=   9.6s
[CV] max_features=10, subsample=0.8 ..................................
[CV] max_features=10, subsample=0.8 ..................................
[CV]  max_features=10, subsample=0.8, score=0.08814876816

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.1min


[CV]  max_features=16, subsample=0.7, score=0.08267466590903638, total=  12.7s
[CV]  max_features=16, subsample=0.7, score=0.07212711311936348, total=  12.8s
[CV] max_features=16, subsample=0.8 ..................................
[CV] max_features=16, subsample=0.8 ..................................
[CV]  max_features=16, subsample=0.7, score=0.0851857647915184, total=  13.0s
[CV] max_features=16, subsample=0.8 ..................................
[CV]  max_features=16, subsample=0.7, score=0.10474721127572639, total=  13.0s
[CV] max_features=16, subsample=0.8 ..................................
[CV]  max_features=16, subsample=0.8, score=0.08062072931758968, total=  15.5s
[CV] max_features=16, subsample=0.85 .................................
[CV]  max_features=16, subsample=0.8, score=0.07681924412623065, total=  15.7s
[CV] max_features=16, subsample=0.85 .................................
[CV]  max_features=16, subsample=0.8, score=0.08882599604319152, total=  15.6s
[CV] max_features=16, 

[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed:  1.8min finished


{'max_features': 10, 'subsample': 0.8}


In [58]:
kfold(10, gs.best_estimator_, train)

0.026523640480080095

In [59]:
predictions = gs.best_estimator_.predict(test.iloc[:, 1:])

In [60]:
sample_submission = pd.DataFrame(data=predictions, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()

Unnamed: 0,Id,Predicted
0,1,0.9396
1,2,0.91155
2,3,0.913394
3,4,0.928524
4,5,0.933152


In [61]:
sample_submission.to_csv("tunedgbr.csv", index=False)