In [9]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.ensemble import RandomForestRegressor as RFR, AdaBoostRegressor as ABR, ExtraTreesRegressor as ETR, BaggingRegressor as BR
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
%matplotlib inline
from sklearn.model_selection import KFold
def kfold(k, predictor, X, shuffle=False, random_state=0):
    """Estimate the out-of-sample RMSE of ``predictor`` by k-fold cross-validation.

    Parameters
    ----------
    k : int
        Number of folds.
    predictor : estimator
        Object exposing ``fit(features, target)`` and ``predict(features)``.
        ``fit`` is called once per fold on that fold's training rows.
    X : pandas.DataFrame
        Full data table. The first column is skipped, the last column is used
        as the regression target, and the columns in between are the features.
    shuffle : bool, default False
        Whether rows are shuffled before being split into folds. The default
        keeps contiguous, unshuffled folds.
    random_state : int, default 0
        Seed for the shuffle; only used when ``shuffle`` is True.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    # A seed without shuffling has no effect, and scikit-learn >= 0.24 rejects
    # that combination with a ValueError, so only pass it when shuffling.
    kf = KFold(
        n_splits=k,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )

    # The column split does not depend on the fold, so do it once.
    features = X.iloc[:, 1:-1]
    target = X.iloc[:, -1]

    total_rmse = 0
    for train_idx, test_idx in kf.split(X):
        predictor.fit(features.iloc[train_idx], target.iloc[train_idx])
        predictions = predictor.predict(features.iloc[test_idx])
        # Documented argument order is (y_true, y_pred).
        total_rmse += np.sqrt(mean_squared_error(target.iloc[test_idx], predictions))
    return total_rmse / k

from collections import Counter

In [2]:
# Read the training and test sets from CSV files in the current working directory
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

In [3]:
# Preview the first five rows of the training set
train.head(n=5)

Unnamed: 0,Id,Feat 1,Feat 2,Feat 3,Feat 4,Feat 5,Feat 6,Feat 7,Feat 8,Feat 9,...,Feat 243,Feat 244,Feat 245,Feat 246,Feat 247,Feat 248,Feat 249,Feat 250,Feat 251,Target
0,1,0.998952,0.174118,0.999211,0.99646,0.133333,0.057143,0.0,0.0,0.0,...,0.0,0.0,0,0.612863,0.026812,0.522,0.217791,0.233629,0.540962,0.901355
1,2,0.999445,0.174118,0.999329,0.997079,0.133333,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.688941,0.07503,0.704,0.246119,0.14386,0.525384,0.91355
2,3,0.998759,0.0,0.99726,0.996325,0.0,0.085714,0.125,0.0,0.0,...,0.0,0.0,0,0.156863,0.436279,0.0,0.119091,0.162869,0.361124,0.884824
3,4,0.999619,0.174118,0.997969,0.997321,0.266667,0.057143,0.125,0.0,0.0,...,0.0,0.0,0,0.709647,0.075472,0.513,0.392743,0.377302,0.613776,0.977236
4,5,0.998278,0.174118,0.998427,0.996269,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0,0.364235,0.041818,0.2,0.096297,0.166459,0.408322,0.921138


In [4]:
# Preview the first five rows of the test set
test.head(n=5)

Unnamed: 0,Id,Feat 1,Feat 2,Feat 3,Feat 4,Feat 5,Feat 6,Feat 7,Feat 8,Feat 9,...,Feat 242,Feat 243,Feat 244,Feat 245,Feat 246,Feat 247,Feat 248,Feat 249,Feat 250,Feat 251
0,1,0.999849,0.174118,0.999819,0.997841,0.133333,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.728471,0.054397,0.649,0.416164,0.053998,0.667391
1,2,0.999958,0.164706,1.0,0.996741,0.066667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.497255,0.037736,0.375,0.165514,0.101973,0.50665
2,3,0.999666,0.174118,0.999479,0.997376,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.688941,0.019309,1.0,0.192069,0.1207,0.498784
3,4,0.999735,0.174118,0.999655,0.997173,0.133333,0.0,0.0,0.0,0.363636,...,0.0,0.0,0.0,0,0.654118,0.019089,0.333,0.451252,0.16418,0.774466
4,5,0.999806,0.164706,0.999551,0.997234,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.627451,0.160433,0.882,0.147407,0.0,0.48124


In [5]:
# Separate features from target: skip the leading ID column, take the last column as y
X_train, y_train = train.iloc[:, 1:-1], train.iloc[:, -1]

In [6]:
# Define function to compute RMSE
def scoreRMSE(predictor, X, true_y):
    """Return the root-mean-squared error of ``predictor`` on ``X``.

    ``predictor`` must already be fitted; its ``predict(X)`` output is
    compared against ``true_y``.
    """
    y_hat = predictor.predict(X)
    mse = mean_squared_error(y_hat, true_y)
    return np.sqrt(mse)

In [10]:
# 10-fold cross-validated RMSE of each ensemble regressor across a sweep of ensemble sizes
for n_est in range(50, 350, 25):
    for ens_name, ens_cls in (("RFR", RFR), ("ABR", ABR), ("BR", BR), ("ETR", ETR)):
        cv_rmse = kfold(10, ens_cls(n_estimators=n_est), train)
        print(f"{ens_name} n =", n_est, ":", cv_rmse)

RFR n = 50 : 0.027152880683807758
ABR n = 50 : 0.029322638314763914
BR n = 50 : 0.027034923144826695
ETR n = 50 : 0.02718804400477707
RFR n = 75 : 0.026947865558143304
ABR n = 75 : 0.02944534852472768
BR n = 75 : 0.027098553115467532
ETR n = 75 : 0.027135860388625505
RFR n = 100 : 0.026913787983221815
ABR n = 100 : 0.029054524755758704
BR n = 100 : 0.026940555757043738
ETR n = 100 : 0.027120922569592427
RFR n = 125 : 0.026905848566217323
ABR n = 125 : 0.02933265211059597
BR n = 125 : 0.02689425214395493
ETR n = 125 : 0.027102500282994647
RFR n = 150 : 0.02686969225376329
ABR n = 150 : 0.028953887313950648
BR n = 150 : 0.02687091485696306
ETR n = 150 : 0.027009915629994073
RFR n = 175 : 0.026902252522980803
ABR n = 175 : 0.029551181493814267
BR n = 175 : 0.026846609024094546
ETR n = 175 : 0.02702222898198551
RFR n = 200 : 0.026848159874397166
ABR n = 200 : 0.029416299994313038
BR n = 200 : 0.026886047812234914
ETR n = 200 : 0.026992788363546842
RFR n = 225 : 0.0268363225102952
ABR n = 2

In [12]:
# RMSE of each ensemble regressor measured on the very data it was fitted on
# (in-sample error), for a sweep of ensemble sizes.
for n_est in range(50, 350, 25):
    # sklearn's fit() returns the estimator itself, so construction and fitting can be chained
    rfr = RFR(n_estimators=n_est).fit(X_train, y_train)
    print("RFR n =", n_est, ":", scoreRMSE(rfr, X_train, y_train))

    abr = ABR(n_estimators=n_est).fit(X_train, y_train)
    print("ABR n =", n_est, ":", scoreRMSE(abr, X_train, y_train))

    br = BR(n_estimators=n_est).fit(X_train, y_train)
    print("BR n =", n_est, ":", scoreRMSE(br, X_train, y_train))

    etr = ETR(n_estimators=n_est).fit(X_train, y_train)
    print("ETR n =", n_est, ":", scoreRMSE(etr, X_train, y_train))

RFR n = 50 : 0.010459998106587457
ABR n = 50 : 0.028151063821895743
BR n = 50 : 0.01034430202642949
ETR n = 50 : 0.0021228399088563978
RFR n = 75 : 0.01032399737093872
ABR n = 75 : 0.029926560081168434
BR n = 75 : 0.010313168581899531
ETR n = 75 : 0.002122831449061378
RFR n = 100 : 0.010229984597762165
ABR n = 100 : 0.028404736816260247
BR n = 100 : 0.010205892617720832
ETR n = 100 : 0.002122830802281152
RFR n = 125 : 0.0102090449043317
ABR n = 125 : 0.029077167839701157
BR n = 125 : 0.010267924406875814
ETR n = 125 : 0.0021228265017234987
RFR n = 150 : 0.010116688958378568
ABR n = 150 : 0.02890635250886352
BR n = 150 : 0.010167699278428623
ETR n = 150 : 0.002122825485695412
RFR n = 175 : 0.010127141080715897
ABR n = 175 : 0.028213203332107056
BR n = 175 : 0.010124540857739922
ETR n = 175 : 0.002122825932854721
RFR n = 200 : 0.01014598470168394
ABR n = 200 : 0.02865194032684834
BR n = 200 : 0.010157547813377125
ETR n = 200 : 0.0021228240376605246
RFR n = 225 : 0.0100724017670848
ABR n 

In [13]:
from sklearn.neural_network import MLPRegressor as MLP

# Fit an MLP with default settings on the full training set and report its in-sample RMSE
mlp = MLP().fit(X_train, y_train)

train_rmse = scoreRMSE(mlp, X_train, y_train)
print("Training RMSE: ", train_rmse)

Training RMSE:  0.032418983721719974


In [98]:
# 10-fold CV RMSE of a five-hidden-layer MLP with logistic activations, trained by SGD
deep_mlp = MLP(
    hidden_layer_sizes=(1000, 500, 100, 100, 100),
    activation='logistic',
    alpha=0.00005,
    solver='sgd',
    learning_rate_init=0.0005,
)
kfold(10, deep_mlp, train)

0.02796802133524843

In [104]:
from sklearn.kernel_ridge import KernelRidge as KR

# 10-fold CV RMSE of kernel ridge regression with alpha=1
kfold(k=10, predictor=KR(alpha=1), X=train)

0.029742836179759823

In [107]:
from sklearn.svm import SVR

# 10-fold CV RMSE of a support vector regressor with default settings
kfold(k=10, predictor=SVR(), X=train)

0.028141823013705915

In [122]:
from sklearn.tree import DecisionTreeRegressor as DTR

# 10-fold CV RMSE of a single decision tree regressor with default settings
kfold(k=10, predictor=DTR(), X=train)

0.037553002433404196

In [124]:
from sklearn.neighbors import KNeighborsRegressor as KNR

# 10-fold CV RMSE of a 10-nearest-neighbours regressor with distance-weighted votes
knn_distance = KNR(n_neighbors=10, weights='distance')
kfold(10, knn_distance, train)

0.027492723581277672

In [136]:
# 10-fold CV RMSE of a 30-nearest-neighbours regressor (uniform weights, kd-tree search)
knn_uniform = KNR(n_neighbors=30, weights='uniform', algorithm='kd_tree')
kfold(10, knn_uniform, train)

0.027032063335554347