In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
import pdb
from sklearn.linear_model import LogisticRegression
import time


# For producing decision tree diagrams.
from IPython.core.display import Image, display
from sklearn.externals.six import StringIO

  'Matplotlib is building the font cache using fc-list. '


In [2]:
# import training data into frame. normalize, and split into train and dev. 

train_df = pd.read_csv('train.csv')

# check for missing values in frame. 
train_df.isnull().values.any()
print("Full training set shape:",train_df.shape)
predictors = list(train_df) # includes ID and target/class columns
train_id = predictors.pop(0) # pop out ID column
target = list.pop(predictors) # pop out target column
print(predictors)

# Do not want -ve vals, so applying a min-max scaler. Many columns are already 0 or 1 valued, so stick to the first 11 columns
train_df[predictors[0:10]] = (train_df[predictors[0:10]]-train_df[predictors[0:10]].min())/(train_df[predictors[0:10]].max()-train_df[predictors[0:10]].min())

train_df, dev_df = train_test_split(train_df)
print("Train and Dev dataset shapes are:",train_df.shape, dev_df.shape)


Full training set shape: (15120, 56)
['Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', 'Soil_Type36', 'Soil_Type37', 'Soil_Type38', 'Soil_Type39', 'Soil_Type40']
Train and Dev dataset shapes are: (11340, 56) (3780, 56)


In [None]:
# Attempting to use a tree-based classifier as the base model. Evaluate Random Forest and Gradient Boosting. 
# But first, try a basic decision tree. 

dt = DecisionTreeClassifier()
parameter_grid = {'min_impurity_decrease': 3. ** np.arange(-10, 5)}
param_searcher = GridSearchCV(dt, parameter_grid, cv=10)
param_searcher.fit(train_df[predictors], train_df[target])
dt = DecisionTreeClassifier(**param_searcher.best_params_)
scores = cross_val_score(dt, train_df[predictors], train_df[target], cv=10)
print("best gridsearch score with vanilla decision tree and params", param_searcher.best_score_,param_searcher.best_params_)
print("mean gridsearch score with vanilla decision tree",scores.mean())

In [5]:
# Next, evaluate a Random Forest with 500 trees. 

rf = RandomForestClassifier(n_estimators=500, oob_score=True)
scores = cross_val_score(rf, train_df[predictors], train_df[target], cv=10)
print ("Mean R^2 = {:.3}".format(scores.mean()))

Mean R^2 = 0.858


In [185]:
# Evaluate Random Forest on dev data. 

rf.fit(train_df[predictors], train_df[target])
rf.score(dev_df[predictors], dev_df[target])

0.8574074074074074

In [186]:
# Evaluate Gradient Boosting classifier with 500 estimators. 

gb = GradientBoostingClassifier(subsample=.7, n_estimators=500)
parameter_grid = {
    'max_depth': range(1, 6),
    'learning_rate': [.01, .05, .1],
    'max_features': [2, 5, 'auto']
}
param_searcher = GridSearchCV(gb, parameter_grid, cv=5)
param_searcher.fit(train_df[predictors], train_df[target])

# Evaluate GBC on dev data. 

gb = GradientBoostingRegressor(subsample=.7, n_estimators=500, **param_searcher.best_params_)
gb.fit(train_df[predictors], train_df[target])
gb.score(dev_df[predictors],dev_df[target])


0.72839848950018049

It looks like a Random Forest with 500 estimators exhibiting 86% accuracy on dev data is a fairly good  initial choice for a base model to improve upon. Will revisit data normalization, consider demensionality reduction and other prediction models in cells below. 