# Building a Random Forest Classifier

# Load Data and Initialize Environment

In [9]:
# Load the library with the iris dataset
from sklearn.datasets import load_iris
 
# Load scikit's random forest classifier library
from sklearn.ensemble import RandomForestClassifier

# Load Train Test Split
from sklearn.model_selection import train_test_split
 
# Load pandas
import pandas as pd
 
# Load numpy
import numpy as np
 
# Set random seed
np.random.seed(0)

# Create an object called iris with the iris data
iris = load_iris()
 
# Create a dataframe with the four feature variables
df = pd.DataFrame(iris.data, columns = iris.feature_names)

# Add a new column with the species names; this is what we are going to try to predict
df['species'] = pd.Categorical.from_codes(iris.target, iris.target_names)
 
# View the top 5 rows
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [23]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


# Pre-Process Data

In [15]:
# Create a list of the feature column's names
features = df.columns[:4]
 
# View features
features

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [16]:
# train['species'] contains the actual species names. Before we can use it,
# we need to convert each species name into a digit. So, in this case, there
# are three species, which have been coded as 0, 1, or 2.
y = pd.factorize(df['species'])[0]

# View target
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# Create Training and Test Data

The original split was unnecessarily manual, yet with randomness. Using an inbuilt method instead.

In [19]:
X = df.loc[:, df.columns != 'species']

In [25]:
splitsize = np.random.uniform(0, 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = splitsize, random_state = 580)

# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(X_train))
print('Number of observations in the test data:', len(X_test))

Number of observations in the training data: 49
Number of observations in the test data: 101


# Train Random Forest Classifier

In [29]:
# Create a random forest Classifier. By convention, clf means 'Classifier'
clf = RandomForestClassifier(n_jobs = 2, random_state = 0)
 
# Train the Classifier to take the training features and learn how they relate
# to the training y (the species)
clf.fit(X_train, y_train)
RandomForestClassifier(
    bootstrap = True, 
    class_weight = None, 
    criterion = 'gini',
    max_depth = None, 
    max_features = 'auto', 
    max_leaf_nodes = None,
    min_impurity_decrease = 1e-07,
    min_samples_leaf = 1,
    min_samples_split = 2,
    min_weight_fraction_leaf = 0.0,
    n_estimators = 10, 
    n_jobs = 2, 
    oob_score = False, 
    random_state = 0,
    verbose = 0, 
    warm_start = False
    )

In [32]:
# Apply the Classifier we trained to the test data (which, remember, it has never seen before)
preds = clf.predict(X_test)

# Confusion Matrix

In [33]:
# Create confusion matrix
pd.crosstab(
    y_test, 
    preds, 
    rownames = ['Actual Species'], 
    colnames = ['Predicted Species']
    )

Predicted Species,0,1,2
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,35,0,0
1,0,30,1
2,0,5,30


# Features and Importance Scores

In [34]:
# View a list of the features and their importance scores
list(zip(
    X_train, 
    clf.feature_importances_
    ))

[('sepal length (cm)', 0.08012751417376454),
 ('sepal width (cm)', 0.027200526890271814),
 ('petal length (cm)', 0.4008852856374727),
 ('petal width (cm)', 0.49178667329849096)]

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=51eeb538-78d0-4759-ab21-b1b8c45886c3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>