In [16]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression as Lin_Reg
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.metrics import mean_squared_error
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cmx
import matplotlib.colors as colors
import scipy as sp
%matplotlib inline
from sklearn.model_selection import KFold
def kfold(k, predictor, X):
    kf = KFold(n_splits = k, random_state = 0)
    count = 0
    for trains, tests in kf.split(X):
        x_train = X.iloc[trains, 1:-1]
        y_train = X.iloc[trains, -1]
        x_test = X.iloc[tests, 1:-1]
        y_test = X.iloc[tests, -1]
        predictor.fit(x_train, y_train)
        predictions = predictor.predict(x_test)
        count += np.sqrt(mean_squared_error(predictions, y_test))
    return count/k

from collections import Counter

In [17]:
# Load training and test sets (assumes you have these in current working directory)
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [18]:
# Split training set into X and y (removing first column containing IDs)
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, -1]

# Remove first column to make predictions
X_test = test.iloc[:, 1:]

In [19]:
# Define function to compute RMSE
def scoreRMSE(predictor, X, true_y):
    predictions = predictor.predict(X)
    return np.sqrt(mean_squared_error(predictions, true_y))

In [101]:
# tune min_leaf
for min_leaf in [1,2,4, 6, 8, 10, 12, 15, 20]:
    print min_leaf
    rfr = RFR(n_estimators=50, min_samples_leaf=min_leaf)
    print kfold(5, rfr, train)
for min_leaf in [6,8,10]:
    print min_leaf
    rfr = RFR(n_estimators=50, min_samples_leaf=min_leaf)
    print kfold(5, rfr, train)

1
0.027169137663525363
2
0.027079539239580064
4
0.027066751093652107


In [20]:
# Fit unregularized linear regression and see RMSE on training set

# >>> regr = RandomForestRegressor(max_depth=2, random_state=0,
# ...                              n_estimators=100)
# >>> regr.fit(X, y)

rfR = RFR(n_estimators=100, min_samples_leaf=1)
rfR.fit(X_train, y_train)

print ("Training RMSE: ", scoreRMSE(rfR, X_train, y_train))

('Training RMSE: ', 0.010238567468084216)


In [112]:
# Make predictions using linear regression model fitted above
predictions = rfR.predict(X_test)

# Format predictions to be compatible with Kaggle upload
sample_submission = pd.DataFrame(data=predictions, columns=['Predicted'])
sample_submission.insert(0, "Id", range(1, 1 + X_test.shape[0]))
sample_submission['Id'] = sample_submission['Id'].astype(str)
sample_submission.head()
# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("rfr2000.csv", index=False)

# Save predictions to .csv file for upload to Kaggle
sample_submission.to_csv("sample_submission.csv", index=False)

In [21]:
# Get numerical feature importances
importances = list(rfR.feature_importances_)
featimp = [(feature, importance) for feature, importance in zip(X_train.columns, importances)]
featimp = sorted(featimp, key = lambda x: x[1], reverse = True)

In [22]:
# List of features sorted from most to least important
sorted_importances = [importance[1] for importance in featimp]
sorted_features = [importance[0] for importance in featimp]
print sum(sorted_importances)
# Cumulative importances
cumulative_importances = np.cumsum(sorted_importances)
for x in cumulative_importances:
    print x

0.9999999999999999
0.07991096431842995
0.13764876709783117
0.18961021499353706
0.22784413915442492
0.26549583332384646
0.2928338536446564
0.3201569324283597
0.34581825351223167
0.36991628371175195
0.393092466447579
0.4126117949179578
0.43186026826031054
0.44993705276758345
0.4673435908183174
0.48367656961462174
0.49877813786915753
0.5117460699088983
0.5231074120061423
0.5341294953233564
0.545031345715784
0.5556273165645229
0.5656178902146551
0.5755825273159407
0.5854691042005319
0.5951558308410391
0.6046481683429656
0.6140023265458029
0.6228857226959982
0.631478023436093
0.6400674034014762
0.6482552634042883
0.6562609516613919
0.6638668681490998
0.6714339901961427
0.6781750999437189
0.6848160414868039
0.6912589125143355
0.6971927411090463
0.7031082249118669
0.7089219226905774
0.7146796275373197
0.7203214393868185
0.7259559571962219
0.7314207297447991
0.73660148807594
0.7417145398071763
0.7467158603157223
0.751468184785936
0.7561848059835785
0.7608644917251207
0.765225483219405
0.769382

In [39]:
# Extract the names of the most important features
important_feature_names = [feature[0] for feature in featimp[0:49]]
target = "Target"
important_feature_names.append(target)

In [40]:
print important_feature_names

['Feat 251', 'Feat 249', 'Feat 247', 'Feat 250', 'Feat 248', 'Feat 246', 'Feat 4', 'Feat 1', 'Feat 3', 'Feat 172', 'Feat 151', 'Feat 13', 'Feat 125', 'Feat 177', 'Feat 171', 'Feat 150', 'Feat 6', 'Feat 149', 'Feat 131', 'Feat 127', 'Feat 12', 'Feat 5', 'Feat 9', 'Feat 175', 'Feat 153', 'Feat 120', 'Feat 124', 'Feat 176', 'Feat 183', 'Feat 126', 'Feat 128', 'Feat 10', 'Feat 214', 'Feat 165', 'Feat 233', 'Feat 232', 'Feat 14', 'Feat 130', 'Feat 84', 'Feat 219', 'Feat 7', 'Feat 200', 'Feat 197', 'Feat 154', 'Feat 182', 'Feat 133', 'Feat 185', 'Feat 39', 'Feat 166', 'Target']


In [41]:
kfold(10, rfR, train[important_feature_names])

0.026982833380728453