In [54]:
#load library with Iris
from sklearn.datasets import load_iris

#load classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

#set random seed 
np.random.seed(0)

# Load data

In [55]:
#Create an object called irits with iris dataset
iris = load_iris()

In [56]:
#create df with four feature variables
df = pd.DataFrame(iris.data, columns = iris.feature_names)

#view data
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [57]:
#Add a column, target names 
df['species'] = pd.Categorical.from_codes(iris.target,iris.target_names)

In [58]:
print(df.shape)
df.head()

(150, 5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Preprocess data

In [59]:
from sklearn.preprocessing import LabelEncoder

In [60]:
enc = LabelEncoder()

In [61]:
df['species'] = enc.fit_transform(df['species'])

In [63]:
print(df.shape)
df.head()

(150, 5)


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


# Create training and test data

In [65]:
from sklearn.model_selection import train_test_split

In [77]:
df.iloc[:,:4].head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [78]:
X_train, X_test, Y_train, Y_test = train_test_split(df.iloc[:,:4],df['species'],test_size=0.25)

In [79]:
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(112, 4) (112,)
(38, 4) (38,)


# Train random forest classifier

In [80]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs=2, random_state=0)

# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(X_train,Y_train )

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Apply classifier to test data

If you have been following along, you will know we only trained our classifier on part of the data, leaving the rest out. This is, in my humble opinion, the most important part of machine learning. Why? Because by leaving out a portion of the data, we have a set of data to test the accuracy of our model!

Let’s do that now.

In [81]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
clf.predict(X_test)

array([1, 2, 2, 1, 0, 2, 2, 1, 0, 1, 0, 2, 1, 2, 0, 2, 0, 1, 0, 1, 1, 0, 2,
       1, 2, 0, 1, 2, 1, 1, 1, 2, 2, 1, 0, 2, 1, 1])

What are you looking at above? Remember that we coded each of the three species of plant as 0, 1, or 2. What the list of numbers above is showing you is what species our model predicts each plant is based on the the sepal length, sepal width, petal length, and petal width. How confident is the classifier about each plant? We can see that too.

In [82]:
# View the predicted probabilities of the first 10 observations
clf.predict_proba(X_test)[0:10]

array([[ 0. ,  1. ,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.1,  0.9],
       [ 0. ,  0.7,  0.3],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  0. ,  1. ],
       [ 0. ,  0.1,  0.9],
       [ 0. ,  1. ,  0. ],
       [ 1. ,  0. ,  0. ],
       [ 0. ,  1. ,  0. ]])

There are three species of plant, thus [ 1. , 0. , 0. ] tells us that the classifier is certain that the plant is the first class. Taking another example, [ 0.9, 0.1, 0. ] tells us that the classifier gives a 90% probability the plant belongs to the first class and a 10% probability the plant belongs to the second class. Because 90 is greater than 10, the classifier predicts the plant is the first class.

# Evaluate classifier

Now that we have predicted the species of all plants in the test data, we can compare our predicted species with the that plant’s actual species.

In [93]:
# Create actual english names for the plants for each predicted plant class
preds = clf.predict(X_test)

# Create confusion matrix

A confusion matrix can be, no pun intended, a little confusing to interpret at first, but it is actually very straightforward. The columns are the species we predicted for the test data and the rows are the actual species for the test data. So, if we take the top row, we can wee that we predicted all 13 setosa plants in the test data perfectly. However, in the next row, we predicted 5 of the versicolor plants correctly, but mis-predicted two of the versicolor plants as virginica.

The short explanation of how to interpret a confusion matrix is: anything on the diagonal was classified correctly and anything off the diagonal was classified incorrectly.

In [94]:
# Create confusion matrix
cm = pd.crosstab(Y_test, preds, rownames=['Actual Species'], colnames=['Predicted Species'])
cm

Predicted Species,0,1,2
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,9,0,0
1,0,14,1
2,0,2,12


In [98]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [100]:
cm = confusion_matrix(Y_test,preds)
cm

array([[ 9,  0,  0],
       [ 0, 14,  1],
       [ 0,  2, 12]])

In [102]:
accuracy = accuracy_score(Y_test,preds)
accuracy

0.92105263157894735

# View feature importance 

In [103]:
list(zip(X_train, clf.feature_importances_))

[('sepal length (cm)', 0.13034999480728879),
 ('sepal width (cm)', 0.024729582303777614),
 ('petal length (cm)', 0.2814895848241673),
 ('petal width (cm)', 0.56343083806476635)]