In [52]:
import numpy as np
import pandas as pd
from random import randint
from sklearn.datasets import fetch_covtype
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

In [2]:
data = fetch_covtype(random_state=11, shuffle=True)

###The Covertype dataset

####580,000+ examples of data obtained from 30 x 30 meter patches of US Forest
####The aim of collecting the data was to predict the dominant species of tree for each patch (covertype)
####7 species of tree have been labelled to create a 7 class prediction problem
####From each patch a total of 54 features have been extracted

In [3]:
print data.DESCR

Forest covertype dataset.

A classic dataset for classification benchmarks, featuring categorical and
real-valued features.

The dataset page is available from UCI Machine Learning Repository

    http://archive.ics.uci.edu/ml/datasets/Covertype

Courtesy of Jock A. Blackard and Colorado State University.



In [4]:
covertypes = ['Spruce/Fir', 'Lodgepole Pine', 'Ponderosa Pine', 'Cottonwood/Willow', 'Aspen', 'Douglas-fir', 'Krummholz']

In [5]:
def my_confusion_matrix(y_test, y_hat, names):
    '''This function uses the pd.crosstab function to create a confusion matrix:
    predictions are the predictions from the predictive mode
    y are the known class labels
    names are the names of the features used in the model'''
    
    cf = pd.crosstab(y_test, y_hat)
    cf.columns = names
    cf.index = names
    cf.columns.name = 'Prediction'
    cf.index.name = 'Actual'
    return cf

In [6]:
size = data.data
print size.shape

(581012, 54)


In [95]:
def my_shuffle(X, y):
    Xa = X.copy()
    ya = y.copy()
    
    for i in xrange(100000):
        n1 = randint(0, 13999)
        n2 = randint(0, 13999)
        
        tempx = Xa[n1, :].copy()
        tempy = ya[n1].copy()

        Xa[n1, :] = Xa[n2, :].copy()
        ya[n1] = ya[n2].copy()

        Xa[n2, :] = tempx
        ya[n2] = tempy
    
    return (Xa, ya)

In [96]:
X1 = np.zeros((14000, 54))
y1 = np.zeros((14000, 1))
features = range(54)
df1 = pd.DataFrame(data.data)
df1["y"] = data.target
#df1[df1.y == 1][features][0:2000]
for i in xrange(1, 8):
    j = i-1
    X1[j*2000:i*2000, :] = df1[df1.y == i][features][0:2000].values
    y1[j*2000:i*2000, :] = i


y1 = y1.ravel()
td = {}
for i in xrange(1, 8):
    td[i] = 0
for i in xrange(len(y1)):
    td[y1[i]] += 1
for i in xrange(1, 8):
    print "class ", i, " ", td[i]
print "\n\n"
y1 = y1.reshape(y1.shape[0], 1)
print "shapes here", X1.shape, y1.shape
X2, y2 = my_shuffle(X1, y1)

y2 = y2.ravel()
td = {}
for i in xrange(1, 8):
    td[i] = 0
for i in xrange(len(y2)):
    td[y2[i]] += 1
for i in xrange(1, 8):
    print "class ", i, " ", td[i]

class  1   2000
class  2   2000
class  3   2000
class  4   2000
class  5   2000
class  6   2000
class  7   2000



shapes here (14000, 54) (14000, 1)
class  1   2000
class  2   2000
class  3   2000
class  4   2000
class  5   2000
class  6   2000
class  7   2000


In [97]:
Xs = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True).fit_transform(X2)

In [10]:
#print Xs.shape

In [11]:
#X2[0,0:10]

In [12]:
#Xs[0,0:10]

---
#Example Bagging
---

###The Sklearn Bagging Classifier encompasses a number of the averaging methods
###It has a comprehensive argument list:

- base_estimator is the classifier you are going to use with bagging
- n_estimators is the number of base estimators to use
- max_samples is the number of samples to draw from the complete training set, each random sample is used to train one of the base estimators
- max_features is the number of features to use from the complete training set
- bootstrap, if True, indicates to the algorithm to draw with replacement (remember you replace after each sample draw)
- bootstrap_features, if True, indicates to the algorithm to draw the features with replacement

---
#####Try the base classifier first
---

In [103]:
mySSS = StratifiedShuffleSplit(y2, 1, test_size=0.5, random_state=11)

In [104]:
clfKNN = KNeighborsClassifier(n_neighbors = 1)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]
    
    clfKNN.fit(X_train, y_train)
    
    y_hat = clfKNN.predict(X_test)
    print accuracy_score(y_test, y_hat)
    print classification_report(y_test, y_hat)

0.777142857143
             precision    recall  f1-score   support

        1.0       0.71      0.64      0.67      1000
        2.0       0.64      0.59      0.61      1000
        3.0       0.76      0.68      0.72      1000
        4.0       0.88      0.93      0.90      1000
        5.0       0.80      0.91      0.85      1000
        6.0       0.73      0.75      0.74      1000
        7.0       0.90      0.95      0.92      1000

avg / total       0.77      0.78      0.77      7000



---
#####Now wrap the Bagging Classifier around the base classifier
---

In [105]:
clfBag = BaggingClassifier(base_estimator = KNeighborsClassifier(n_neighbors = 1), n_estimators = 100, max_samples = 0.7,\
                          max_features = 0.7, random_state = 6, n_jobs = -1)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]
    
    clfBag.fit(X_train, y_train)

    y_hat = clfBag.predict(X_test)
    print accuracy_score(y_test, y_hat)
    print classification_report(y_test, y_hat)

0.795571428571
             precision    recall  f1-score   support

        1.0       0.71      0.66      0.68      1000
        2.0       0.69      0.59      0.63      1000
        3.0       0.80      0.69      0.75      1000
        4.0       0.87      0.95      0.91      1000
        5.0       0.80      0.93      0.86      1000
        6.0       0.76      0.80      0.78      1000
        7.0       0.91      0.95      0.93      1000

avg / total       0.79      0.80      0.79      7000



#####Introducing cross_val_score

In [111]:
clfBagB = BaggingClassifier(base_estimator = KNeighborsClassifier(n_neighbors = 1), n_estimators = 100, max_samples = 0.7,\
                          max_features = 0.7, random_state = 6, n_jobs = 1)
scores = cross_val_score(clfBagB, Xs, y2, cv=2, scoring = 'accuracy', n_jobs = -1)
print np.mean(scores)
print np.std(scores)

0.790928571429
0.00192857142857


---
#Patches Example
---

###The Sklearn Ensemble Tree or Patches Classifiers have a number of important arguments

- max_features
- min_samples_leaf
- bootstrap
- n_estimators

##Random Forests

In [114]:
clfRF = RandomForestClassifier(n_estimators = 100, n_jobs = -1,random_state = 11)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]
    
    clfRF.fit(X_train, y_train)
    
    y_hat = clfRF.predict(X_test)
    print accuracy_score(y_test, y_hat)
    print classification_report(y_test, y_hat)

0.833571428571
             precision    recall  f1-score   support

        1.0       0.74      0.73      0.74      1000
        2.0       0.75      0.63      0.68      1000
        3.0       0.85      0.78      0.81      1000
        4.0       0.92      0.96      0.94      1000
        5.0       0.85      0.92      0.88      1000
        6.0       0.80      0.85      0.83      1000
        7.0       0.92      0.96      0.94      1000

avg / total       0.83      0.83      0.83      7000



##Extra Trees

In [115]:
clfET = ExtraTreesClassifier(n_estimators = 150, n_jobs = -1,random_state = 11)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]
    
    clfET.fit(X_train, y_train)
    y_hat = clfET.predict(X_test)
    print accuracy_score(y_test, y_hat)
    print classification_report(y_test, y_hat)

0.839285714286
             precision    recall  f1-score   support

        1.0       0.75      0.73      0.74      1000
        2.0       0.74      0.65      0.69      1000
        3.0       0.85      0.78      0.82      1000
        4.0       0.92      0.96      0.94      1000
        5.0       0.86      0.93      0.89      1000
        6.0       0.81      0.86      0.83      1000
        7.0       0.93      0.95      0.94      1000

avg / total       0.84      0.84      0.84      7000



---
#Boosting Example
---

##AdaBoost

In [119]:
clfAB = AdaBoostClassifier(n_estimators = 1000, random_state = 11)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]
    
    clfAB.fit(X_train, y_train)
    
    y_hat = clfAB.predict(X_test)
    print accuracy_score(y_test, y_hat)
    print classification_report(y_test, y_hat)

0.437142857143
             precision    recall  f1-score   support

        1.0       0.39      0.66      0.49      1000
        2.0       0.28      0.14      0.19      1000
        3.0       0.60      0.17      0.26      1000
        4.0       0.46      0.12      0.18      1000
        5.0       0.74      0.41      0.53      1000
        6.0       0.32      0.85      0.46      1000
        7.0       0.69      0.72      0.70      1000

avg / total       0.50      0.44      0.40      7000



##GradientBoost

####Gradient Boosting uses gradient descent
####Gradient Boosting has the following arguments that need to be set carefully:
- learning_rate
- max_depth
- subsample
- min_samples_leaf
- n_estimators

In [122]:
clfGB = GradientBoostingClassifier(n_estimators = 1000, random_state = 11)

for train_index, test_index in mySSS:
    X_train, X_test = Xs[train_index], Xs[test_index]
    y_train, y_test = y2[train_index], y2[test_index]
    
    clfGB.fit(X_train, y_train)
    
    y_hat = clfGB.predict(X_test)
    print accuracy_score(y_test, y_hat)
    print classification_report(y_test, y_hat)

0.820714285714
             precision    recall  f1-score   support

        1.0       0.74      0.70      0.72      1000
        2.0       0.70      0.64      0.67      1000
        3.0       0.81      0.77      0.79      1000
        4.0       0.94      0.95      0.95      1000
        5.0       0.84      0.91      0.87      1000
        6.0       0.79      0.82      0.80      1000
        7.0       0.91      0.95      0.93      1000

avg / total       0.82      0.82      0.82      7000



In [55]:
confusion_matrix(y_test, clf.predict(X_test))

array([[693, 188,   1,   0,  29,   6,  83],
       [220, 603,  18,   0, 119,  31,   9],
       [  1,  12, 755,  40,  16, 176,   0],
       [  0,   0,  28, 955,   0,  17,   0],
       [  7,  54,  12,   0, 914,  13,   0],
       [  1,  19, 129,  18,  17, 816,   0],
       [ 44,   1,   0,   0,   3,   0, 952]])

In [56]:
cm = pd.crosstab(y_test, clf.predict(X_test), rownames=["Actual"], colnames=["Predicted"])
cm

Predicted,1.0,2.0,3.0,4.0,5.0,6.0,7.0
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,693,188,1,0,29,6,83
2,220,603,18,0,119,31,9
3,1,12,755,40,16,176,0
4,0,0,28,955,0,17,0
5,7,54,12,0,914,13,0
6,1,19,129,18,17,816,0
7,44,1,0,0,3,0,952


In [57]:
my_confusion_matrix(y_test, clf.predict(X_test), covertypes)

Prediction,Spruce/Fir,Lodgepole Pine,Ponderosa Pine,Cottonwood/Willow,Aspen,Douglas-fir,Krummholz
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Spruce/Fir,693,188,1,0,29,6,83
Lodgepole Pine,220,603,18,0,119,31,9
Ponderosa Pine,1,12,755,40,16,176,0
Cottonwood/Willow,0,0,28,955,0,17,0
Aspen,7,54,12,0,914,13,0
Douglas-fir,1,19,129,18,17,816,0
Krummholz,44,1,0,0,3,0,952
