In [2]:
from random import randint
from random import seed

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_covtype
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Fetch the Covertype dataset through scikit-learn. The rows are shuffled with
# a fixed random_state so that the "first my_size rows per class" subset taken
# further down is the same on every run.
data = fetch_covtype(random_state=11, shuffle=True)

###The Covertype dataset

####580,000+ examples of data obtained from 30 x 30 meter patches of US Forest
####The aim of collecting the data was to predict the dominant species of tree for each patch (covertype)
####7 species of tree have been labelled to create a 7 class prediction problem
####From each patch a total of 54 features have been extracted

In [4]:
# Show the description text that ships with the dataset
print(data.DESCR)

Forest covertype dataset.

A classic dataset for classification benchmarks, featuring categorical and
real-valued features.

The dataset page is available from UCI Machine Learning Repository

    http://archive.ics.uci.edu/ml/datasets/Covertype

Courtesy of Jock A. Blackard and Colorado State University.



In [5]:
# Display names of the 7 cover types, listed in class-label order (label 1 first)
covertypes = [
    'Spruce/Fir',         # 1
    'Lodgepole Pine',     # 2
    'Ponderosa Pine',     # 3
    'Cottonwood/Willow',  # 4
    'Aspen',              # 5
    'Douglas-fir',        # 6
    'Krummholz',          # 7
]

In [6]:
def my_confusion_matrix(y_test, y_hat, names):
    '''Build a confusion matrix with pd.crosstab and label it with class names.

    y_test are the known class labels
    y_hat are the predictions from the predictive model (same length as y_test)
    names are the display names of the classes, given in the order of the
    sorted class labels

    Returns a square DataFrame with one row per actual class ('Actual') and one
    column per predicted class ('Prediction'). A class that is never predicted
    (or never occurs in y_test) still gets its own all-zero column (or row), so
    every name stays attached to the right class.

    Raises ValueError when the number of distinct labels found in y_test and
    y_hat together does not match len(names).'''

    cf = pd.crosstab(y_test, y_hat)

    # crosstab only emits the labels it actually saw on each axis; take the
    # sorted union of both axes so rows and columns cover the same classes.
    labels = cf.index.union(cf.columns)
    if len(labels) != len(names):
        raise ValueError("found %d distinct class labels but %d names were given"
                         % (len(labels), len(names)))

    # Missing rows/columns are filled with zero counts (keeps the integer dtype)
    cf = cf.reindex(index=labels, columns=labels, fill_value=0)

    cf.columns = pd.Index(names, name='Prediction')
    cf.index = pd.Index(names, name='Actual')
    return cf

In [7]:
# data.data is the feature matrix; print its (rows, columns) shape
size = data.data
print(size.shape)

(581012, 54)


In [8]:
def my_shuffle(X, y, M):
    '''This function shuffles the order of the data set and keeps the labels in synch

    X is a numpy array of samples (one row per sample)
    y is a numpy array of labels whose first axis lines up with the rows of X
    M is the number of leading rows to shuffle; rows from index M onwards are
    left where they are

    Returns (Xa, ya): shuffled copies. X and y themselves are not modified.

    The first M rows are put into a uniformly random order with a Fisher-Yates
    shuffle. Random numbers come from the `random` module (via randint), so
    calling random.seed() beforehand makes the result repeatable.'''
    Xa = X.copy()
    ya = y.copy()

    # Nothing to reorder for zero or one row
    if M < 2:
        return (Xa, ya)

    # Fisher-Yates: walk from the back, swapping each position with a randomly
    # chosen position at or before it. Every permutation is equally likely.
    order = list(range(M))
    for i in range(M - 1, 0, -1):
        k = randint(0, i)
        order[i], order[k] = order[k], order[i]

    # Apply the same permutation to samples and labels so they stay in synch.
    # Reading from the untouched inputs avoids aliasing with the output copies.
    Xa[:M] = X[order]
    ya[:M] = y[order]

    return (Xa, ya)

In [9]:
# NOTE(review): M is not referenced by this cell; it is kept only in case
# another cell reads it.
M = 14000
N = 54
n_classes = 7
#specify number per class
my_size = 500

# my_shuffle draws from the `random` module, so fix its seed here. Without this
# X2/y2 (and therefore every score further down) change on each run.
seed(11)


def _class_counts(labels, n_classes):
    '''Return {class label: number of occurrences} for the classes 1..n_classes.'''
    counts = np.bincount(np.asarray(labels).ravel().astype(int), minlength=n_classes + 1)
    return dict((c, int(counts[c])) for c in range(1, n_classes + 1))


X1 = np.zeros((n_classes * my_size, N))
y1 = np.zeros((n_classes * my_size, 1))
features = list(range(N))

df1 = pd.DataFrame(data.data)
df1["y"] = data.target

# Balanced subset: the first my_size rows of each class, stacked class by class
for i in range(1, n_classes+1):
    j = i-1
    X1[j*my_size:i*my_size, :] = df1[df1.y == i][features][0:my_size].values
    y1[j*my_size:i*my_size, :] = i

print("shapes here %s %s" % (X1.shape, y1.shape))

#This is just a little sanity check that X1 has the correct number of samples in each class
td = _class_counts(y1, n_classes)
for i in range(1, n_classes+1):
    print("class  %d   %d" % (i, td[i]))
print("\n\n")
print("shapes here %s %s" % (X1.shape, y1.shape))

#Now shuffle the pack
X2, y2 = my_shuffle(X1, y1, n_classes * my_size)

# Flatten the labels to 1-D, which is what the estimators below expect,
# then repeat the sanity check on the shuffled copy
y2 = y2.ravel()
td = _class_counts(y2, n_classes)
for i in range(1, n_classes+1):
    print("class  %d   %d" % (i, td[i]))

shapes here (3500, 54) (3500, 1)
class  1   500
class  2   500
class  3   500
class  4   500
class  5   500
class  6   500
class  7   500



shapes here (3500, 54) (3500, 1)
class  1   500
class  2   500
class  3   500
class  4   500
class  5   500
class  6   500
class  7   500


In [10]:
# Standardisation is deliberately switched off: the models below are fed the
# raw, unscaled feature values. The commented-out line is the scaled
# alternative; note that it would fit the scaler on all rows, test rows included.
#Xs = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(X2)
Xs = X2

---
#Example Bagging
---

###The Sklearn Bagging Classifier encompasses a number of the averaging methods
###It has a comprehensive argument list:

- base_estimator is the classifier you are going to use with bagging
- n_estimators is the number of base estimators to use
- max_samples is the number of samples to draw from the complete training set, each random sample is used to train one of the base estimators
- max_features is the number of features to use from the complete training set
- bootstrap, if True, indicates to the algorithm to draw with replacement (remember you replace after each sample draw)
- bootstrap_features, if True, indicates to the algorithm to draw the features with replacement
- n_jobs allows for parallelization

---
#####Try the base classifier first
---

In [11]:
# A single stratified random split, half of the rows for training and half for
# testing, with a fixed random_state. Every evaluation loop below iterates over
# this same object.
mySSS = StratifiedShuffleSplit(y2, 1, test_size=0.5, random_state=11)

In [12]:
# Baseline: a plain 1-nearest-neighbour classifier
clfKNN = KNeighborsClassifier(n_neighbors=1)

for train_index, test_index in mySSS:
    X_train, y_train = Xs[train_index], y2[train_index]
    X_test, y_test = Xs[test_index], y2[test_index]

    # fit() returns the estimator itself, so training and prediction chain
    y_hat = clfKNN.fit(X_train, y_train).predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.682857142857
             precision    recall  f1-score   support

        1.0       0.56      0.48      0.52       250
        2.0       0.52      0.43      0.47       250
        3.0       0.66      0.57      0.61       250
        4.0       0.78      0.88      0.83       250
        5.0       0.76      0.88      0.81       250
        6.0       0.65      0.69      0.67       250
        7.0       0.78      0.85      0.81       250

avg / total       0.67      0.68      0.67      1750



---
#####Now wrap the Bagging Classifier around the base classifier
---

In [13]:
# 100 bagged 1-NN classifiers, each fitted on 70% of the training rows and
# 70% of the features
clfBag = BaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=1),
    n_estimators=100,
    max_samples=0.7,
    max_features=0.7,
    random_state=6,
    n_jobs=-1,
)

for train_index, test_index in mySSS:
    X_train, y_train = Xs[train_index], y2[train_index]
    X_test, y_test = Xs[test_index], y2[test_index]

    # fit() returns the estimator itself, so training and prediction chain
    y_hat = clfBag.fit(X_train, y_train).predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.702857142857
             precision    recall  f1-score   support

        1.0       0.60      0.50      0.54       250
        2.0       0.55      0.42      0.48       250
        3.0       0.73      0.59      0.65       250
        4.0       0.79      0.92      0.85       250
        5.0       0.72      0.90      0.80       250
        6.0       0.67      0.70      0.68       250
        7.0       0.79      0.89      0.84       250

avg / total       0.69      0.70      0.69      1750



#####Introducing cross_val_score

In [14]:
# Same bagged 1-NN ensemble as above, but single-threaded inside the estimator:
# the parallelism is handed to cross_val_score instead
clfBagB = BaggingClassifier(
    base_estimator=KNeighborsClassifier(n_neighbors=1),
    n_estimators=100,
    max_samples=0.7,
    max_features=0.7,
    random_state=6,
    n_jobs=1,
)

#NB: cross_val_score does the cross validation for you
#VIP: If you use cross_val_score together with a separate train test split, you can easily get train, cross-validation and test sets - see grid search notebook
#cv = number of folds of cross validation to do
#n_jobs allows for parallelized execution, which sometimes gives problems inside interpreters

scores = cross_val_score(clfBagB, Xs, y2, cv=2, scoring='accuracy', n_jobs=-1)
print(scores.mean())
print(scores.std())

0.701714285714
0.00514285714286


---
#Patches Example
---

###The Sklearn Ensemble Tree or Patches Classifiers have a number of important arguments
####Random Forests
####Extra Trees

- max_features is the number of features to consider when looking for the best split; for these classifiers the default is the square root of the total number of features
- max_depth is the maximum depth of the tree. The default is to have a tree such that each leaf node is 'pure'. This parameter is affected by 2 others: 1. min_samples_split, and, 2. max_leaf_nodes
- min_samples_split is the minimum number of samples required to split an internal node, the default being 2
- min_samples_leaf is the number of samples in newly created leaves
- max_leaf_nodes grows a tree until the leaves have this number. Setting this means max_depth will be ignored
- bootstrap will allow the model to use bootstrapped samples
- n_estimators is the number of trees in the forest
- criterion is the split quality measure, either 'gini' (the default) or 'entropy'
- n_jobs will parallelize and speed things up

##Random Forests

In [15]:
# Random forest of 150 trees, all cores, fixed seed
clfRF = RandomForestClassifier(
    n_estimators=150,
    n_jobs=-1,
    random_state=11,
)

for train_index, test_index in mySSS:
    X_train, y_train = Xs[train_index], y2[train_index]
    X_test, y_test = Xs[test_index], y2[test_index]

    # fit() returns the estimator itself, so training and prediction chain
    y_hat = clfRF.fit(X_train, y_train).predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.768571428571
             precision    recall  f1-score   support

        1.0       0.67      0.66      0.67       250
        2.0       0.65      0.49      0.56       250
        3.0       0.75      0.71      0.73       250
        4.0       0.86      0.96      0.91       250
        5.0       0.79      0.85      0.82       250
        6.0       0.73      0.75      0.74       250
        7.0       0.87      0.94      0.91       250

avg / total       0.76      0.77      0.76      1750



##Extra Trees

In [16]:
# Extra-trees ensemble with the same size and seed as the random forest
clfET = ExtraTreesClassifier(
    n_estimators=150,
    n_jobs=-1,
    random_state=11,
)

for train_index, test_index in mySSS:
    X_train, y_train = Xs[train_index], y2[train_index]
    X_test, y_test = Xs[test_index], y2[test_index]

    # fit() returns the estimator itself, so training and prediction chain
    y_hat = clfET.fit(X_train, y_train).predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.763428571429
             precision    recall  f1-score   support

        1.0       0.67      0.65      0.66       250
        2.0       0.63      0.53      0.58       250
        3.0       0.74      0.67      0.71       250
        4.0       0.87      0.95      0.91       250
        5.0       0.82      0.82      0.82       250
        6.0       0.69      0.77      0.73       250
        7.0       0.89      0.94      0.91       250

avg / total       0.76      0.76      0.76      1750



---
#Boosting Example
---

###Ada Boost has the capability to boost another algorithm - the default being DecisionTreeClassifier

####Other important parameters:

- base_estimator specifies the model for boosting
- n_estimators as above
- learning_rate shrinks the contribution of each classifier, and so is a trade-off with n_estimators
- no n_jobs, so cannot parallelize execution

##AdaBoost

In [24]:
# AdaBoost with its default base estimator and 100 boosting rounds
clfAB = AdaBoostClassifier(
    n_estimators=100,
    random_state=11,
)

for train_index, test_index in mySSS:
    X_train, y_train = Xs[train_index], y2[train_index]
    X_test, y_test = Xs[test_index], y2[test_index]

    # fit() returns the estimator itself, so training and prediction chain
    y_hat = clfAB.fit(X_train, y_train).predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.406285714286
             precision    recall  f1-score   support

        1.0       0.60      0.01      0.02       250
        2.0       0.00      0.00      0.00       250
        3.0       0.31      0.94      0.47       250
        4.0       0.00      0.00      0.00       250
        5.0       0.38      0.91      0.54       250
        6.0       0.00      0.00      0.00       250
        7.0       0.62      0.98      0.76       250

avg / total       0.27      0.41      0.26      1750



In [21]:
# Single decision tree for comparison with the boosted ensembles.
# random_state is fixed like for every other stochastic model in this notebook;
# without it the fitted tree, and so the score, can differ from run to run.
clfTree = DecisionTreeClassifier(criterion='entropy', random_state=11)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]

    clfTree.fit(X_train, y_train)

    y_hat = clfTree.predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.684571428571
             precision    recall  f1-score   support

        1.0       0.59      0.58      0.59       250
        2.0       0.51      0.48      0.50       250
        3.0       0.60      0.65      0.62       250
        4.0       0.84      0.91      0.87       250
        5.0       0.75      0.78      0.76       250
        6.0       0.62      0.52      0.57       250
        7.0       0.86      0.86      0.86       250

avg / total       0.68      0.68      0.68      1750



##GradientBoost

####Gradient Boosting is a specific tree based model boosting algorithm
####Other important parameters:
- loss. If loss is set to 'exponential' then this algorithm is the same as AdaBoost. This algorithm differs, therefore, as a result of a different loss function
- learning_rate - as for AdaBoost
- n_estimators - as for AdaBoost with the same caveat
- All of the Decision Tree parameters apply here too - max_features, max_depth, min_samples_split, min_samples_leaf, max_leaf_nodes
- no parallelization option

In [32]:
# Gradient boosting with 100 stages and otherwise default settings
clfGB = GradientBoostingClassifier(
    n_estimators=100,
    random_state=11,
)

# X_test / y_test from this loop are reused by the confusion-matrix cells below
for train_index, test_index in mySSS:
    X_train, y_train = Xs[train_index], y2[train_index]
    X_test, y_test = Xs[test_index], y2[test_index]

    # fit() returns the estimator itself, so training and prediction chain
    y_hat = clfGB.fit(X_train, y_train).predict(X_test)

    print(accuracy_score(y_test, y_hat))
    print(classification_report(y_test, y_hat))

0.733142857143
             precision    recall  f1-score   support

        1.0       0.66      0.65      0.65       250
        2.0       0.64      0.48      0.54       250
        3.0       0.67      0.68      0.67       250
        4.0       0.87      0.94      0.91       250
        5.0       0.74      0.81      0.78       250
        6.0       0.67      0.65      0.66       250
        7.0       0.83      0.92      0.88       250

avg / total       0.73      0.73      0.73      1750



---
###Just checking the confusion matrices code is all OK
---

In [55]:
# scikit-learn's own confusion matrix for the gradient boosting model.
# X_test and y_test are whatever the most recently run evaluation loop left behind.
confusion_matrix(y_test, clfGB.predict(X_test))

array([[162,  47,   0,   0,  10,   2,  29],
       [ 64, 139,   6,   0,  32,   8,   1],
       [  0,   2, 157,  24,   2,  65,   0],
       [  0,   0,   6, 235,   0,   9,   0],
       [  3,  18,   8,   0, 216,   5,   0],
       [  1,   6,  54,  16,   4, 169,   0],
       [ 24,   2,   0,   0,   0,   0, 224]])

In [56]:
# The same counts via pandas: rows are the known labels, columns the predictions
cm = pd.crosstab(
    y_test,
    clfGB.predict(X_test),
    rownames=["Actual"],
    colnames=["Predicted"],
)
cm

Predicted,1.0,2.0,3.0,4.0,5.0,6.0,7.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,162,47,0,0,10,2,29
2,64,139,6,0,32,8,1
3,0,2,157,24,2,65,0
4,0,0,6,235,0,9,0
5,3,18,8,0,216,5,0
6,1,6,54,16,4,169,0
7,24,2,0,0,0,0,224


In [57]:
# And once more through the helper defined near the top, which swaps the
# numeric class labels for the cover type names
my_confusion_matrix(y_test, clfGB.predict(X_test), covertypes)

Prediction,Spruce/Fir,Lodgepole Pine,Ponderosa Pine,Cottonwood/Willow,Aspen,Douglas-fir,Krummholz
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Spruce/Fir,162,47,0,0,10,2,29
Lodgepole Pine,64,139,6,0,32,8,1
Ponderosa Pine,0,2,157,24,2,65,0
Cottonwood/Willow,0,0,6,235,0,9,0
Aspen,3,18,8,0,216,5,0
Douglas-fir,1,6,54,16,4,169,0
Krummholz,24,2,0,0,0,0,224
