In [22]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
%matplotlib inline

from sklearn.model_selection import KFold
def kfold(k, predictor, X):
    kf = KFold(n_splits = k, random_state = 0)
    count = 0
    coeff_used = 0 
    for trains, tests in kf.split(X):
        x_train = X.iloc[trains, 1:-1]
        y_train = X.iloc[trains, -1]
        x_test = X.iloc[tests, 1:-1]
        y_test = X.iloc[tests, -1]
        predictor.fit(x_train, y_train)
        predictions = predictor.predict(x_test)
        count += np.sqrt(mean_squared_error(predictions, y_test))
    return count/k

from collections import Counter




In [5]:
# Load training and test sets (assumes you have these in current working directory)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [6]:
# Split training set into X and y (removing first column containing IDs)
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

In [9]:
print y_train

0       0.901355
1       0.913550
2       0.884824
3       0.977236
4       0.921138
5       0.902891
6       0.913731
7       0.964770
8       0.906504
9       0.915537
10      0.910659
11      0.963505
12      0.910840
13      0.921590
14      0.913821
15      0.905601
16      0.913821
17      0.923848
18      0.910840
19      0.908762
20      0.909485
21      0.943812
22      0.879313
23      0.913550
24      0.933424
25      0.897832
26      0.910750
27      0.885637
28      0.890696
29      0.938663
          ...   
5301    0.923848
5302    0.908220
5303    0.932701
5304    0.990967
5305    0.909756
5306    0.934056
5307    0.887624
5308    0.966125
5309    0.906865
5310    0.907498
5311    0.991147
5312    0.894761
5313    0.917706
5314    0.910117
5315    0.897561
5316    0.880036
5317    0.902439
5318    0.914905
5319    0.907498
5320    0.939115
5321    0.908582
5322    0.896387
5323    0.914905
5324    0.938844
5325    0.916441
5326    0.976694
5327    0.993044
5328    0.9185

In [7]:
# Define function to compute RMSE
def scoreRMSE(predictor, X, true_y):
    predictions = predictor.predict(X)
    return np.sqrt(mean_squared_error(predictions, true_y))

In [23]:
for max_depth in [2,4,6,8,10]:
    print max_depth, kfold(max_depth, GradientBoostingRegressor(max_depth = max_depth), train)

2 0.027159124300025404
4 0.027087834207946002
6 0.027236975974814486
8 0.027420647191375807
10

KeyboardInterrupt: 

In [24]:
clf = GradientBoostingRegressor(n_estimators=1000, max_depth=4)
clf.fit(X_train, y_train)
print ("Training RMSE: ", scoreRMSE(clf, X_train, y_train))

 ('Training RMSE: ', 0.012151221347168686)


In [27]:
# Remove first column to make predictions
X_test = test.iloc[:, 1:]
X_test.head()

Unnamed: 0,Feat 1,Feat 2,Feat 3,Feat 4,Feat 5,Feat 6,Feat 7,Feat 8,Feat 9,Feat 10,...,Feat 242,Feat 243,Feat 244,Feat 245,Feat 246,Feat 247,Feat 248,Feat 249,Feat 250,Feat 251
0,0.999849,0.174118,0.999819,0.997841,0.133333,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.728471,0.054397,0.649,0.416164,0.053998,0.667391
1,0.999958,0.164706,1.0,0.996741,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.497255,0.037736,0.375,0.165514,0.101973,0.50665
2,0.999666,0.174118,0.999479,0.997376,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.688941,0.019309,1.0,0.192069,0.1207,0.498784
3,0.999735,0.174118,0.999655,0.997173,0.133333,0.0,0.0,0.0,0.363636,0.166667,...,0.0,0.0,0.0,0,0.654118,0.019089,0.333,0.451252,0.16418,0.774466
4,0.999806,0.164706,0.999551,0.997234,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0,0.627451,0.160433,0.882,0.147407,0.0,0.48124


In [28]:
# Make predictions using linear regression model fitted above
predictions = clf.predict(X_test)

In [29]:
# Format predictions to be compatible with Kaggle upload
sample_submission = pd.DataFrame(data=predictions, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + X_test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("clf1000,4.csv", index=False)

In [10]:
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("sample_submission.csv", index=False)

In [59]:
sample_submission = pd.DataFrame(data=predictions, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + X_test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()

Unnamed: 0,Id,Predicted
0,1,0.933655
1,2,0.909929
2,3,0.91578
3,4,0.924992
4,5,0.935605


In [60]:
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("rfr_imp.csv", index=False)