In [3]:
import numpy as np
from sklearn import datasets
from sklearn.cross_validation import train_test_split
import time
from xgboost import XGBClassifier

## Load Dataset

In [4]:
data = datasets.load_breast_cancer()
X = data.data
Y = data.target

#Split Training and Test Set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .3)

#### Initializing XGBoost Classifier with no parameters

In [6]:
begin = time.time()
XGB = XGBClassifier()
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)

The Misclassification rate is 		0.093567251462
Time it takes to fit and predict XGB 	0.0596899986267


#### Initializing XGBoost Classifier with max_depth parameter

In [8]:
begin = time.time()
XGB = XGBClassifier(max_depth = 2)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)
print "Less accurate; the time is about the same."

The Misclassification rate is 		0.093567251462
Time it takes to fit and predict XGB 	0.0311439037323
Less accurate; the time is about the same.


#### Initializing XGBoost Classifier with n_estimators parameter

In [9]:
begin = time.time()
XGB = XGBClassifier(n_estimators=3)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)
print "More accurate; Less time when 0 < n_estimators < 7."

The Misclassification rate is 		0.00584795321637
Time it takes to fit and predict XGB 	0.00474500656128
More accurate; Less time when 0 < n_estimators < 7.


#### Initializing XGBoost Classifier with nthread parameter

In [10]:
begin = time.time()
XGB = XGBClassifier(nthread=3)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)
print "Accuracy remains the same as the original; More time if int is higher, and less time if int is smaller."

The Misclassification rate is 		0.093567251462
Time it takes to fit and predict XGB 	0.0279731750488
Accuracy remains the same as the original; More time if int is higher, and less time if int is smaller.


#### Initializing XGBoost Classifier with gamma parameter

In [18]:
begin = time.time()
XGB = XGBClassifier(gamma = .1)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)
print "More accurate when the float is .1 and less so when it's higher; time is faster than original regardless of float."

The Misclassification rate is 		0.093567251462
Time it takes to fit and predict XGB 	0.0263950824738
More accurate when the float is .1 and less so when it's higher; time is faster than original regardless of float.


#### Initializing XGBoost Classifier with subsample parameter

In [19]:
begin = time.time()
XGB = XGBClassifier(subsample=.3)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)
print "More accurate when the float is .2 or .3 and less so when it's higher; time is faster than original regardless of float."

The Misclassification rate is 		0.0233918128655
Time it takes to fit and predict XGB 	0.0179171562195
More accurate when the float is .2 or .3 and less so when it's higher; time is faster than original regardless of float.


#### Initializing XGBoost Classifier with the three fastest and most accurate parameters

In [22]:
begin = time.time()
XGB = XGBClassifier(n_estimators = 3, gamma = .1)#, subsample=.2)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate is \t\t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))
end = time.time()
print "Time it takes to fit and predict XGB \t" + str(end - begin)
print "More accurate when the float is .2 or .3 and less so when it's higher; time is faster than original regardless of float."

The Misclassification rate is 		0.00584795321637
Time it takes to fit and predict XGB 	0.00460195541382
More accurate when the float is .2 or .3 and less so when it's higher; time is faster than original regardless of float.


The most accurate parameters that took the least time were n_estimators (at int = 3), gamma (at float = .1), and subsample (at float = .2 or .3) individually. Together the accuracy decreases because of the subsample which is the ratio of the training instance. But if we just had n_estimators, and gamma, accuracy is higher and the time it takes is faster. This is because n_estimators is the number of trees boosted and gamma is the minimum loss reduction required to make a further partition on a leaf node of the tree.


## Comparison's with Trees, Forests, and Gradient Boosting

#### Custom inputs for XGB and old models

In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

begin0 = time.time()
DTC = DecisionTreeClassifier(criterion = "entropy", splitter = "random")
DTC.fit(X_train, Y_train)
X_predict =  DTC.predict(X_test)
print "The Misclassification rate for DTC is\t" + str(sum((X_predict - Y_test)**2)/float(len(X_predict)))
end0 = time.time()

begin1 = time.time()
RFC = RandomForestClassifier(criterion = "entropy", 
                             max_depth = 5)
RFC.fit(X_train, Y_train)
X_predict = RFC.predict(X_test)
print "The Misclassification rate for RFC is \t" + str(sum((X_predict - Y_test)**2)/float(len(X_predict)))
end1 = time.time()

begin2 = time.time()
GBC = GradientBoostingClassifier(learning_rate = .9, n_estimators = 20, max_depth = 5)
GBC.fit(X_train, Y_train)
X_predict = GBC.predict(X_test)
print "The Misclassification rate for GBC is \t" + str(sum(X_predict - Y_test)**2/float(len(X_predict))) 
end2 = time.time()

begin3 = time.time()
XGB = XGBClassifier(n_estimators = 3, gamma = .1)#, subsample=.2)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate for XGB is \t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))+ "\n"
end3 = time.time()

print "Time it takes to fit and predict DTC\t" + str(end0 - begin0)
print "Time it takes to fit and predict RFC \t" + str(end1 - begin1)
print "Time it takes to fit and predict GBC \t" + str(end2 - begin2)
print "Time it takes to fit and predict XGB \t" + str(end3 - begin3)

The Misclassification rate for DTC is	0.0760233918129
The Misclassification rate for RFC is 	0.0526315789474
The Misclassification rate for GBC is 	0.0526315789474
The Misclassification rate for XGB is 	0.00584795321637

Time it takes to fit and predict DTC	0.00127696990967
Time it takes to fit and predict RFC 	0.0217299461365
Time it takes to fit and predict GBC 	0.0182859897614
Time it takes to fit and predict XGB 	0.00263595581055


The most accurate Classifier is the XGBoostClassifier, but the fastest is the DecisionTreeClassifier with XGBoost Classifier as a close second. So the one that has the best accuracy and the best time is XGBoostClassifier.

#### Custom inputs for XGB and default inputs for old models

In [28]:
begin0 = time.time()
DTC = DecisionTreeClassifier()
DTC.fit(X_train, Y_train)
X_predict =  DTC.predict(X_test)
print "The Misclassification rate for DTC is\t" + str(sum((X_predict - Y_test)**2)/float(len(X_predict)))
end0 = time.time()

begin1 = time.time()
RFC = RandomForestClassifier()
RFC.fit(X_train, Y_train)
X_predict = RFC.predict(X_test)
print "The Misclassification rate for RFC is \t" + str(sum((X_predict - Y_test)**2)/float(len(X_predict)))
end1 = time.time()

begin2 = time.time()
GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train)
X_predict = GBC.predict(X_test)
print "The Misclassification rate for GBC is \t" + str(sum(X_predict - Y_test)**2/float(len(X_predict))) 
end2 = time.time()

begin3 = time.time()
XGB = XGBClassifier(n_estimators = 3, gamma = .1)
XGB.fit(X_train, Y_train)
X_predict = XGB.predict(X_test)
print "The Misclassification rate for XGB is \t" + str(sum(X_predict - Y_test)**2/float(len(X_predict)))+ "\n"
end3 = time.time()

print "Time it takes to fit and predict DTC\t" + str(end0 - begin0)
print "Time it takes to fit and predict RFC \t" + str(end1 - begin1)
print "Time it takes to fit and predict GBC \t" + str(end2 - begin2)
print "Time it takes to fit and predict XGB \t" + str(end3 - begin3)

The Misclassification rate for DTC is	0.0818713450292
The Misclassification rate for RFC is 	0.0350877192982
The Misclassification rate for GBC is 	0.0233918128655
The Misclassification rate for XGB is 	0.00584795321637

Time it takes to fit and predict DTC	0.00353813171387
Time it takes to fit and predict RFC 	0.0194439888
Time it takes to fit and predict GBC 	0.0957210063934
Time it takes to fit and predict XGB 	0.00243401527405


The fastest and most accurate classifier was XGBoostClassifier. XGBoost performed the best because it implements machine learning algorithms under the Gradient Boosting framework and provides a parallel tree boosting. It uses a more regularized model formalization to control over-fitting, which gives it better performance.