## Comparing Ensemble Methods
Here I compare different ensemble methods: Random Forest, Gradient Boosting Decision Tree, and XGBoost.

There are 2 datasets, split into train and test sets the same way: test_size=0.33, random_state=42.

- Compare several parameter options by cross-validating with GridSearchCV.
- Compare mean error rates and the confusion matrices on test data.
- Compare run times and overall model quality for these datasets.

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from time import time
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import timeit

%matplotlib inline



In [2]:
def _load_and_split(features_csv, labels_csv):
    """Read a headerless feature/label CSV pair and split it into train/test.

    Returns a list holding the full feature frame, the full label frame and
    then the four pieces produced by train_test_split (features train/test,
    labels train/test). The split uses test_size=0.33 and a fixed seed of 42
    so both datasets are partitioned the same, reproducible way.
    """
    features = pd.read_csv(features_csv, header=None)
    labels = pd.read_csv(labels_csv, header=None)
    pieces = train_test_split(features, labels, test_size=0.33, random_state=42)
    return [features, labels] + list(pieces)


# With header=None the label frames have integer column names, which is why
# later cells pick the label column with y*_train[0].
X1, y1, X1_train, X1_test, y1_train, y1_test = _load_and_split('X1.csv', 'y1.csv')
X2, y2, X2_train, X2_test, y2_train, y2_test = _load_and_split('X2.csv', 'y2.csv')

In [3]:
# Show (rows, columns) for both feature frames and both label frames.
tuple(frame.shape for frame in (X1, y1, X2, y2))

((10000, 30), (10000, 1), (5000, 20), (5000, 1))

In [4]:
# Random Forest 1
# Grid-search forest size, split criterion and tree depth on dataset 1
# with 3-fold cross-validation.
RFparam_grid = {
    'n_estimators': [11, 12, 13],
    'criterion': ['gini', 'entropy'],
    'max_depth': [8, 9, 10],
}
RF = RandomForestClassifier(random_state=42)
RFmodel = GridSearchCV(RF, param_grid=RFparam_grid, cv=3)

# The timed region is the entire grid search, not a single model fit.
start_time = timeit.default_timer()
RFmodel.fit(X1_train, y1_train[0])  # [0] selects the label column as a Series
traintime = timeit.default_timer() - start_time
print(RFmodel.best_params_)

# Time prediction on the held-out test split.
start_time = timeit.default_timer()
RF_prediction_test = RFmodel.predict(X1_test)
predicttime = timeit.default_timer() - start_time

# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
RFcf = confusion_matrix(y1_test, RF_prediction_test)
print(RFcf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
RF1_mean_error_rate = 1.0 * (RFcf[0, 1] + RFcf[1, 0]) / RFcf[:2, :2].sum()
print("mean error rate = ", RF1_mean_error_rate, sep="")
print("train time = ", traintime, "   predict time = ", predicttime, sep="")

{'max_depth': 10, 'n_estimators': 12, 'criterion': 'gini'}
[[1394  287]
 [ 164 1455]]
mean error rate = 0.136666666667
train time = 11.945666968507046   predict time = 0.005732498899833516


In [5]:
# GBDT 1
# Grid-search the number of boosting stages, learning rate and tree depth
# on dataset 1 with 3-fold cross-validation.
GBparam_grid = {
    'n_estimators': [10, 11, 12],
    'learning_rate': [.05, .1, .2, .3],
    'max_depth': [6, 7, 8],
}
GB = GradientBoostingClassifier(random_state=42)
GBmodel = GridSearchCV(GB, param_grid=GBparam_grid, cv=3)

# The timed region is the entire grid search, not a single model fit.
start_time = timeit.default_timer()
GBmodel.fit(X1_train, y1_train[0])  # [0] selects the label column as a Series
traintime = timeit.default_timer() - start_time
print(GBmodel.best_params_)

# Time prediction on the held-out test split.
start_time = timeit.default_timer()
GB_prediction_test = GBmodel.predict(X1_test)
predicttime = timeit.default_timer() - start_time

# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
GBcf = confusion_matrix(y1_test, GB_prediction_test)
print(GBcf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
GB1_mean_error_rate = 1.0 * (GBcf[0, 1] + GBcf[1, 0]) / GBcf[:2, :2].sum()
print("mean error rate = ", GB1_mean_error_rate, sep="")
print("train time = ", traintime, "   predict time = ", predicttime, sep="")

{'learning_rate': 0.2, 'n_estimators': 12, 'max_depth': 7}
[[1521  160]
 [ 117 1502]]
mean error rate = 0.0839393939394
train time = 45.75317073445945   predict time = 0.002380519165448902


In [6]:
# XGBoost 1
# Same search space as the GBDT run so the two boosters are compared on
# equal footing: boosting rounds, learning rate and tree depth, 3-fold CV.
XGparam_grid = {
    'n_estimators': [10, 11, 12],
    'learning_rate': [.05, .1, .2, .3],
    'max_depth': [6, 7, 8],
}
XG = XGBClassifier()
XGmodel = GridSearchCV(XG, param_grid=XGparam_grid, cv=3)

# The timed region is the entire grid search, not a single model fit.
start_time = timeit.default_timer()
XGmodel.fit(X1_train, y1_train[0])  # [0] selects the label column as a Series
traintime = timeit.default_timer() - start_time
print(XGmodel.best_params_)

# Time prediction on the held-out test split.
start_time = timeit.default_timer()
XG_prediction_test = XGmodel.predict(X1_test)
predicttime = timeit.default_timer() - start_time

# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
XGcf = confusion_matrix(y1_test, XG_prediction_test)
print(XGcf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
XG1_mean_error_rate = 1.0 * (XGcf[0, 1] + XGcf[1, 0]) / XGcf[:2, :2].sum()
print("mean error rate = ", XG1_mean_error_rate, sep="")
print("train time = ", traintime, "   predict time = ", predicttime, sep="")

{'learning_rate': 0.3, 'n_estimators': 12, 'max_depth': 7}
[[1535  146]
 [ 113 1506]]
mean error rate = 0.0784848484848
train time = 14.094143873486559   predict time = 0.0038128505015606606


In [7]:
# Random Forest 2
# Same Random Forest grid search as for dataset 1, now on dataset 2.
RFparam_grid = {
    'n_estimators': [11, 12, 13],
    'criterion': ['gini', 'entropy'],
    'max_depth': [8, 9, 10],
}
RF = RandomForestClassifier(random_state=42)
RFmodel = GridSearchCV(RF, param_grid=RFparam_grid, cv=3)

# The timed region is the entire grid search, not a single model fit.
start_time = timeit.default_timer()
RFmodel.fit(X2_train, y2_train[0])  # [0] selects the label column as a Series
traintime = timeit.default_timer() - start_time
print(RFmodel.best_params_)

# Time prediction on the held-out test split.
start_time = timeit.default_timer()
RF2_prediction_test = RFmodel.predict(X2_test)
predicttime = timeit.default_timer() - start_time

# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
RF2cf = confusion_matrix(y2_test, RF2_prediction_test)
print(RF2cf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
RF2_mean_error_rate = 1.0 * (RF2cf[0, 1] + RF2cf[1, 0]) / RF2cf[:2, :2].sum()
print("mean error rate = ", RF2_mean_error_rate, sep="")
print("train time = ", traintime, "   predict time = ", predicttime, sep="")

{'max_depth': 10, 'n_estimators': 13, 'criterion': 'gini'}
[[768  82]
 [ 56 744]]
mean error rate = 0.0836363636364
train time = 5.365731102535463   predict time = 0.003749026776333153


In [8]:
# GBDT 2
# Same gradient-boosting grid search as for dataset 1, now on dataset 2.
GBparam_grid = {
    'n_estimators': [10, 11, 12],
    'learning_rate': [.05, .1, .2, .3],
    'max_depth': [6, 7, 8],
}
GB = GradientBoostingClassifier(random_state=42)
GBmodel = GridSearchCV(GB, param_grid=GBparam_grid, cv=3)

# The timed region is the entire grid search, not a single model fit.
start_time = timeit.default_timer()
GBmodel.fit(X2_train, y2_train[0])  # [0] selects the label column as a Series
traintime = timeit.default_timer() - start_time
print(GBmodel.best_params_)

# Time prediction on the held-out test split.
start_time = timeit.default_timer()
GB2_prediction_test = GBmodel.predict(X2_test)
predicttime = timeit.default_timer() - start_time

# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
GB2cf = confusion_matrix(y2_test, GB2_prediction_test)
print(GB2cf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
GB2_mean_error_rate = 1.0 * (GB2cf[0, 1] + GB2cf[1, 0]) / GB2cf[:2, :2].sum()
print("mean error rate = ", GB2_mean_error_rate, sep="")
print("train time = ", traintime, "   predict time = ", predicttime, sep="")

{'learning_rate': 0.3, 'n_estimators': 12, 'max_depth': 6}
[[770  80]
 [ 61 739]]
mean error rate = 0.0854545454545
train time = 19.583851328097666   predict time = 0.0015497528858219312


In [9]:
# XGBoost 2
# Same XGBoost grid search as for dataset 1, now on dataset 2.
XGparam_grid = {
    'n_estimators': [10, 11, 12],
    'learning_rate': [.05, .1, .2, .3],
    'max_depth': [6, 7, 8],
}
XG = XGBClassifier()
XGmodel = GridSearchCV(XG, param_grid=XGparam_grid, cv=3)

# The timed region is the entire grid search, not a single model fit.
start_time = timeit.default_timer()
XGmodel.fit(X2_train, y2_train[0])  # [0] selects the label column as a Series
traintime = timeit.default_timer() - start_time
print(XGmodel.best_params_)

# Time prediction on the held-out test split.
start_time = timeit.default_timer()
XG2_prediction_test = XGmodel.predict(X2_test)
predicttime = timeit.default_timer() - start_time

# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
XG2cf = confusion_matrix(y2_test, XG2_prediction_test)
print(XG2cf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
XG2_mean_error_rate = 1.0 * (XG2cf[0, 1] + XG2cf[1, 0]) / XG2cf[:2, :2].sum()
print("mean error rate = ", XG2_mean_error_rate, sep="")
print("train time = ", traintime, "   predict time = ", predicttime, sep="")

{'learning_rate': 0.3, 'n_estimators': 12, 'max_depth': 8}
[[771  79]
 [ 48 752]]
mean error rate = 0.0769696969697
train time = 8.08106106766867   predict time = 0.001996871579606818


In [10]:
# Sanity check: print the (rows, columns) shape of each train/test feature split.
for split in (X1_train, X1_test, X2_train, X2_test):
    print(split.shape)

(6700, 30)
(3300, 30)
(3350, 20)
(1650, 20)


In [11]:
# Side-by-side summary of the test-set error rate for every model/dataset pair.
error_rate_report = [
    ("RF1 mean error rate is ", RF1_mean_error_rate),
    ("GB1 mean error rate is ", GB1_mean_error_rate),
    ("XG1 mean error rate is ", XG1_mean_error_rate),
    ("RF2 mean error rate is ", RF2_mean_error_rate),
    ("GB2 mean error rate is ", GB2_mean_error_rate),
    ("XG2 mean error rate is ", XG2_mean_error_rate),
]
for prefix, rate in error_rate_report:
    print(prefix + str(rate))

RF1 mean error rate is 0.136666666667
GB1 mean error rate is 0.0839393939394
XG1 mean error rate is 0.0784848484848
RF2 mean error rate is 0.0836363636364
GB2 mean error rate is 0.0854545454545
XG2 mean error rate is 0.0769696969697


Data set 1 is twice the size of data set 2 (10,000 vs. 5,000 samples), so we expected to achieve better results on set 1. However, after tuning our parameters using grid search CV, we obtained the lowest mean error rate using XGBoost on set 2. Additionally, XGBoost was fast to fit the data, taking ~8-14 seconds for each grid search. XGBoost was designed to be a more efficient and flexible algorithm, so it is no surprise that it performed the best in our tests. If we had unlimited time and computing resources, we could cross-validate more extensively and tune additional parameters in order to further improve our model. However, given our constraints, we are pleased with the accuracy of our results.

## Diabetes Classification with Support Vector Machines

A comparison between a linear SVM and Gaussian radial basis kernel using the Pima Indian Women diabetes detection problem on the [dataset](http://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes).

Default parameters were kept except for the slack penalty "C", which was chosen using grid search cross-validation.

## SOLUTION

In [12]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV lives in
# sklearn.model_selection. confusion_matrix is imported here too, because the
# SVM cells of this section call both names and should not have to rely on
# another section's imports having been run first.
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

# Pre-split train/test CSVs for the diabetes problem (read with a header row).
data_train = pd.read_csv('diabetes_train-log.csv')
data_test = pd.read_csv('diabetes_test-log.csv')

# Predictor columns; 'classvariable' is the target column.
cols = ['numpreg', 'plasmacon', 'bloodpress', 'skinfold', 'seruminsulin', 'BMI', 'pedigreefunction', 'age']

# Use plain ndarrays instead of np.matrix: current scikit-learn releases
# reject np.matrix inputs, and ndarrays are accepted by every version.
# The targets are already 1-D, so no transpose is needed.
xtrain = np.asarray(data_train[cols])
ytrain = np.asarray(data_train['classvariable'])
xtest = np.asarray(data_test[cols])
ytest = np.asarray(data_test['classvariable'])



In [13]:
#a
# Linear-kernel SVM: choose the slack penalty C from a log-spaced grid using
# 10-fold cross-validation, then evaluate the refit model on the test set.
Cs = [.01, .1, 1, 10, 100, 1000]
SVCmodel = SVC(kernel="linear", random_state=42)
SVM = GridSearchCV(SVCmodel, param_grid={'C': Cs}, cv=10)
SVM.fit(xtrain, ytrain)
print(SVM.best_params_)

SVM_test_pred = SVM.predict(xtest)
# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
SVMcf = confusion_matrix(ytest, SVM_test_pred)
print(SVMcf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
print("mean error rate = ",
      1.0 * (SVMcf[0, 1] + SVMcf[1, 0]) / SVMcf[:2, :2].sum(),
      sep="")

{'C': 1000}
[[218  27]
 [ 45  67]]
mean error rate = 0.201680672269


In [14]:
#b
# RBF-kernel SVM: same log-spaced C grid and 10-fold cross-validation as the
# linear model, so the two kernels are compared under identical tuning.
Cs = [.01, .1, 1, 10, 100, 1000]
SVCrbfmodel = SVC(kernel="rbf", random_state=42)
SVMrbf = GridSearchCV(SVCrbfmodel, param_grid={'C': Cs}, cv=10)
SVMrbf.fit(xtrain, ytrain)
print(SVMrbf.best_params_)

SVMrbf_test_pred = SVMrbf.predict(xtest)
# confusion_matrix(y_true, y_pred): rows are true classes, columns predictions.
SVMrbf_cf = confusion_matrix(ytest, SVMrbf_test_pred)
print(SVMrbf_cf)

# Error rate = misclassified (off-diagonal) count over the four cells of the
# binary confusion matrix; the 1.0 factor keeps the arithmetic in floats.
print("mean error rate = ",
      1.0 * (SVMrbf_cf[0, 1] + SVMrbf_cf[1, 0]) / SVMrbf_cf[:2, :2].sum(),
      sep="")

{'C': 10}
[[205  40]
 [ 41  71]]
mean error rate = 0.226890756303


The mean error rate is lower for the linear SVM (0.202) than for the Gaussian radial basis kernel (0.227). This shows that the linear SVM is more accurate at predicting diabetes on this test set: it makes fewer errors overall (72 vs. 81), with far fewer false positives (27 vs. 40) but slightly more false negatives (45 vs. 41). We tested values of C on a logarithmic scale to efficiently compare vastly different slack penalties. Doing this showed that the linear SVM is optimized by a much larger slack penalty (C = 1000) than the RBF kernel (C = 10).