# Random Forest with Iris dataset

In [80]:
#Load the library with the Iris dataset
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
import pandas as pd 
import numpy as np 

# Seed NumPy's global random number generator so that the random draws made
# later in the notebook are the same on every fresh run
np.random.seed(seed=0)


In [81]:
# Fetch the Iris dataset bundled with scikit-learn
iris = load_iris()

# Put the measurements into a dataframe, one column per feature name
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [56]:
# Attach the label the model will predict (the 'dependent variable'):
# iris.target holds one integer code per row and iris.target_names holds the
# species name belonging to each code
df['Species'] = pd.Categorical.from_codes(
    codes=iris.target,
    categories=iris.target_names,
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


# Creating the training and testing sets

In [59]:
# Draw one uniform random number between 0 and 1 for each row and flag the row
# as training data (True) when the draw is <= 0.75, or testing data (False) otherwise.
# This is a simple and cheap way to send roughly 75% of the rows to training.
random_draws = np.random.uniform(0, 1, len(df))
df['Train_values'] = random_draws <= 0.75
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,Train_values
0,5.1,3.5,1.4,0.2,setosa,False
1,4.9,3.0,1.4,0.2,setosa,False
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [63]:
# Create two new dataframes, one with the training data and one with the testing data.
# 'Train_values' is already a boolean column, so it is used directly as a mask
# (and negated with ~ for the test rows) instead of being compared to True/False.
is_train = df['Train_values']
training, testing = df[is_train], df[~is_train]

# Show the count of observations for the test and the training dataframes
print('No. of observations in the training data: ', len(training))
print('No. of observations in the testing data: ', len(testing))

No. of observations in the training data:  114
No. of observations in the testing data:  36


# Preprocess Data

In [64]:
# Collect the names of the feature columns.
# df was built from iris.feature_names before any other column was added, so the
# features are its leading columns; taking the count from the dataset avoids
# hard-coding the number 4.
features = df.columns[:len(iris.feature_names)]
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [67]:
# training['Species'] contains the actual species names as a pandas categorical.
# Encode each species name as an integer label 0, 1, 2 for the classifier.
# The categorical's own codes are used (rather than pd.factorize, which numbers
# values in order of first appearance) so that every code is the index of that
# species in iris.target_names regardless of the order of the training rows.
y = np.asarray(training['Species'].cat.codes, dtype=np.int64)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2], dtype=int64)

# Training the RFC

In [69]:
# Create a random forest classifier, clf means 'classifier'.
# n_estimators is stated explicitly: 10 is the value shown in the fitted
# estimator's output below, while scikit-learn 0.22 and later default to 100,
# which would change the predictions and importance scores on a re-run.
clf = RandomForestClassifier(n_estimators = 10, n_jobs = 2, random_state = 0)

# Train the classifier to learn how the training features relate to the encoded species labels
clf.fit(training[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

# Apply Classifier to test data

In [70]:
# Predict a class code for every row of the held-out test data
test_features = testing[features]
clf.predict(test_features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [72]:
# Predicted class probabilities for the first 15 test observations
test_probabilities = clf.predict_proba(testing[features])
test_probabilities[:15]

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0.1, 0.8, 0.1],
       [0. , 1. , 0. ],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 0.4, 0.6],
       [0. , 0.9, 0.1]])

#### Each row above gives the predicted probabilities of the three species (setosa, versicolor, virginica) for one test observation
1. [1., 0., 0.] means that the classifier is certain that the plant belongs to the first class
2. [0.1, 0.8, 0.1] means that the classifier gives the plant an 80% probability of belonging to the second class and a 10% probability for each of the other two classes

# Evaluating the classifier

In [77]:
# Translate each predicted class code into the English species name
predicted_codes = clf.predict(testing[features])
preds = iris.target_names[predicted_codes]

# Show the PREDICTED species for the first 5 test observations
preds[:5]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa'], dtype='<U10')

In [76]:
# Show the ACTUAL species for the first 5 test observations
testing['Species'].head(5)

0     setosa
1     setosa
10    setosa
18    setosa
31    setosa
Name: Species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

# Construction of a Confusion Matrix

In [78]:
# Cross-tabulate the actual species (rows) against the predicted species (columns)
pd.crosstab(
    index=testing['Species'],
    columns=preds,
    rownames=['Actual species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,9,0,0
versicolor,0,12,2
virginica,0,1,12


#### Anything on the diagonal was classified correctly and anything off the diagonal was classified incorrectly

# View Feature importance

In [79]:
# Pair every feature name with its importance score from the fitted classifier
list(zip(features, clf.feature_importances_))

[('sepal length (cm)', 0.12782355930474554),
 ('sepal width (cm)', 0.031885447100919936),
 ('petal length (cm)', 0.3697690569984059),
 ('petal width (cm)', 0.47052193659592856)]

#### Petal width has the highest importance score (about 0.47), so it contributed the most to the model's predictions, followed by petal length (about 0.37)