# Random Forest Example using the iris dataset

Adapted from an example by Simplilearn published on SlideShare.net.

## Begin by importing the data and modules necessary

In [1]:
# Import the iris dataset from scikit-learn
from sklearn.datasets import load_iris

# Import random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Import pandas
import pandas as pd

# Import numpy
import numpy as np

# Seed NumPy's global random number generator so repeated runs draw the same numbers
np.random.seed(seed=0)

## Now we'll create an iris object with our data and make a dataframe

In [2]:
# Load the iris dataset that ships with scikit-learn
iris = load_iris()

# Build a dataframe of the measurements, one column per feature name
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first five rows
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


## Adding the species name and taking another look

In [3]:
# Attach the label column: translate the integer targets into species names
df['species'] = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)

# Preview the first five rows, now including the label
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Flag roughly 75% of the rows as training data and the rest as test data.
# The draw comes from a dedicated generator seeded with 0 rather than NumPy's
# global one, so re-running this cell on its own always reproduces the same
# split (the global generator advances on every draw and would give a new one).
# A fresh RandomState(0) produces the same numbers the global generator yields
# on its first draw after np.random.seed(0).
df['is_train'] = np.random.RandomState(0).uniform(0, 1, len(df)) <= .75

# View the top 5 rows again
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [5]:
# Split the rows into training and test dataframes using the is_train flag.
# The boolean column is used directly as a mask (negated with ~ for the test
# rows) instead of being compared against True / False.
train, test = df[df['is_train']], df[~df['is_train']]

# Compare the number of observations in each of the new dataframes
print('Number of observations in the training data: ', len(train))
print('Number of observations in the test data: ', len(test))

Number of observations in the training data:  118
Number of observations in the test data:  32


In [6]:
# Collect the feature column names from the dataset itself rather than slicing
# the first four dataframe columns by position, so the selection does not
# depend on how many columns df has or on the order they are in
features = pd.Index(iris.feature_names)

# View the feature list
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [7]:
# Encode the species names as integers and use them as the target.
# The categorical codes are each label's position in the column's category
# list (built from iris.target_names), so a predicted code can be used to index
# iris.target_names directly. pd.factorize instead numbers labels in order of
# first appearance, which matches only while the rows are sorted by species.
# Cast to int64 (the categorical codes are a small integer type).
y = train['species'].cat.codes.to_numpy(dtype='int64')

# View the target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2])

In [8]:
# Build the random forest classifier: two parallel jobs and a fixed random_state
clf = RandomForestClassifier(random_state=0, n_jobs=2)

# Fit the model on the training rows:
#   X -> only the feature columns of the training dataframe
#   y -> the integer-encoded species (our "target")
clf.fit(X=train[features], y=y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [9]:
# Predict the class of every observation in the test dataset
clf.predict(X=test[features])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [10]:
# Show the predicted class probabilities for the first 10 test observations
clf.predict_proba(X=test[features])[:10]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [12]:
# Translate the predicted class codes into species names
preds = iris.target_names.take(clf.predict(test[features]))

# Show the predicted species for the first five test observations
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [13]:
# For comparison, the actual species of the first five test observations
test['species'].head(n=5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [14]:
# Confusion matrix: actual species (rows) against predicted species (columns)
pd.crosstab(
    index=test['species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


In [15]:
# Total number of test observations, taken from the data instead of adding up
# confusion-matrix counts typed in by hand
len(test)

32

In [16]:
# Accuracy: the fraction of test observations whose predicted species matches
# the actual one, computed from the predictions rather than hand-copied counts
float((preds == test['species'].astype(str).to_numpy()).mean())

0.9375

93.75% accuracy. Not bad for identifying flowers, especially since the classes aren't extremely unbalanced. For data that is highly asymmetric (imbalanced), like a lot of cyber data, accuracy just won't do, at least not as a standalone measure for creating alerts.