## Random Forest Regression 

Reference: 

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

http://scikit-learn.org/stable/auto_examples/ensemble/plot_bias_variance.html 

In [1]:
# Import modules
%matplotlib inline

import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from urllib.request import urlopen

# regression library for random forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression


plt.style.use('ggplot')
pd.set_option('display.max_columns', 500)

from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics 

from sklearn.metrics import mean_squared_error
from math import sqrt

### 1.0 Data Cleaning

In [2]:
# Loading data and cleaning dataset
UCI_data_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases\
/breast-cancer-wisconsin/wdbc.data'

In [3]:
names = ['id_number', 'diagnosis', 'radius_mean',
         'texture_mean', 'perimeter_mean', 'area_mean',
         'smoothness_mean', 'compactness_mean',
         'concavity_mean','concave_points_mean',
         'symmetry_mean', 'fractal_dimension_mean',
         'radius_se', 'texture_se', 'perimeter_se',
         'area_se', 'smoothness_se', 'compactness_se',
         'concavity_se', 'concave_points_se',
         'symmetry_se', 'fractal_dimension_se',
         'radius_worst', 'texture_worst',
         'perimeter_worst', 'area_worst',
         'smoothness_worst', 'compactness_worst',
         'concavity_worst', 'concave_points_worst',
         'symmetry_worst', 'fractal_dimension_worst']

In [4]:
breast_cancer = pd.read_csv(urlopen(UCI_data_URL), names=names)

In [5]:
# Setting 'id_number' as our index
breast_cancer.set_index(['id_number'], inplace = True)
# Converted to binary to help later on with models and plots
breast_cancer['diagnosis'] = breast_cancer['diagnosis'].map({'M':1, 'B':0})

In [6]:
breast_cancer.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave_points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
id_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [7]:
for col in breast_cancer:
    if ((breast_cancer[col].isnull().values.ravel().sum()) == 0):
        pass
    else:
        print(col)
        print((breast_cancer[col].isnull().values.ravel().sum()))
print('No missing Values found!')

No missing Values found!


In [8]:
# For later use in CART models
names_index = names[2:]

### 2.0 Feature Selection & Model Fitting

In [9]:
X = breast_cancer.iloc[:, breast_cancer.columns != 'diagnosis']
y = breast_cancer.iloc[:, breast_cancer.columns == 'diagnosis']

In [10]:
regr = RandomForestRegressor(max_depth=2, random_state=0)
training_set, test_set, class_set, test_class_set = train_test_split(X, y, test_size = 0.30, random_state = 42)
# Cleaning test sets to avoid future warning messages
class_set = class_set.values.ravel()
test_class_set = test_class_set.values.ravel()

clfModel = regr.fit(training_set, class_set)

In [11]:
np.random.seed(42)

param_dist = { 
        "n_estimators"      : [10,20,30, 40, 50],
        "max_features"      : ["sqrt", "log2", None],
        "min_samples_leaf" : [2,4,8],
        "max_depth" : [4,8,10,20,30,40],
        "bootstrap": [True, False],
        }

cv_rf = GridSearchCV(regr, cv = 5, param_grid=param_dist)

# use model to fit in X_train and y_train data. y_train as output 
cv_rf.fit(training_set, class_set)

print('Best Parameters using grid search: \n',
      cv_rf.best_params_)

Best Parameters using grid search: 
 {'bootstrap': False, 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 4, 'n_estimators': 30}


In [19]:
regr.set_params(n_estimators=30, max_features='log2', min_samples_leaf=4, max_depth=8)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [25]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_split=1e-07,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=1e-07,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [26]:
# use model with new settings
regr.fit(training_set, class_set)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

### 3.0 Result Evaluation - Test square root of error

In [27]:
target_pred = regr.predict(test_set)
target_pred

array([ 0.09178571,  0.98787879,  0.99333333,  0.        ,  0.        ,
        1.        ,  1.        ,  0.90218254,  0.83888889,  0.02222222,
        0.07599003,  0.97676768,  0.10108974,  0.74707792,  0.01212121,
        0.94666667,  0.03888889,  0.        ,  0.        ,  1.        ,
        0.11190476,  0.0037037 ,  1.        ,  0.00673077,  0.        ,
        0.07314935,  0.04018519,  0.0775641 ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.19667388,  0.03      ,  0.        ,
        0.02222222,  0.56601038,  0.14166667,  0.97025641,  0.12933169,
        0.        ,  0.98690476,  0.        ,  0.00673077,  0.24993025,
        0.02698413,  0.02222222,  0.05386243,  0.02723341,  0.04583333,
        0.93803419,  1.        ,  0.18874339,  0.22514671,  0.        ,
        0.06165825,  0.        ,  0.98666667,  0.77669312,  0.0037037 ,
        0.        ,  1.        ,  0.97357143,  0.13382275,  0.03034188,
        0.19698209,  1.        ,  0.96190476,  0.0125    ,  0.05

In [28]:
regr.set_params(oob_score=True)

regr.fit(training_set, class_set)
print('Accuracy of decision tree classifier on Train set: {:.3f}'.format(regr.oob_score_))

Accuracy of decision tree classifier on Train set: 0.834


In [29]:
regr.fit(test_set, test_class_set)
print('Accuracy of decision tree classifier on Test set: {:.3f}'.format(regr.oob_score_))

Accuracy of decision tree classifier on Test set: 0.829


In [30]:
# also a standard way to evaluate regression models 
rms = sqrt(mean_squared_error(test_class_set, target_pred))
rms

0.17347409738937866