In [2]:
# Loading the library with the iris dataset
from sklearn.datasets import load_iris

from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

# Fixing the seed of NumPy's global random generator so random draws are repeatable
SEED = 0
np.random.seed(SEED)

In [3]:
# Creating an object called iris with the iris data; later cells read its
# data, feature_names, target and target_names attributes
iris = load_iris()

In [4]:
# Building a dataframe whose columns are the four measured features
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [5]:
# Attaching the species name of every flower as a categorical column
species = pd.Categorical.from_codes(codes=iris.target, categories=iris.target_names)
df['Species'] = species

df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
# Creating Test and Train Data
# Each row is assigned to the training set with probability TRAIN_FRACTION.
# The draw uses a dedicated RandomState seeded with 0 instead of the global
# generator: on a fresh run it yields the same numbers the globally seeded
# generator would, and re-running this cell no longer changes the split.
TRAIN_FRACTION = 0.75
split_rng = np.random.RandomState(0)
df['is_train'] = split_rng.uniform(0, 1, len(df)) <= TRAIN_FRACTION

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Species,is_train
0,5.1,3.5,1.4,0.2,setosa,True
1,4.9,3.0,1.4,0.2,setosa,True
2,4.7,3.2,1.3,0.2,setosa,True
3,4.6,3.1,1.5,0.2,setosa,True
4,5.0,3.6,1.4,0.2,setosa,True


In [7]:
# Splitting the rows into a training and a test dataframe via the is_train flag
is_train_mask = df['is_train']
train = df[is_train_mask]
test = df[~is_train_mask]

In [8]:
# Reporting how many observations ended up in each of the two dataframes
n_train = len(train)
n_test = len(test)
print('Number of observations in the training data:', n_train)
print('Number of observations in the testing data:', n_test)

Number of observations in the training data: 118
Number of observations in the testing data: 32


In [9]:
# Creating an Index of the feature columns' names
# The names are taken from the dataset itself rather than by slicing the
# first four dataframe columns, so the result does not depend on the
# position of the columns in df.
features = pd.Index(iris.feature_names)
print(features)

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')


In [10]:
# Converting each species name into digits
# The categorical codes are used (rather than pd.factorize, which numbers
# labels in order of first appearance) so that code k always stands for
# iris.target_names[k], whatever order the training rows come in. Predictions
# are later decoded through iris.target_names, so the two must agree.
y = np.asarray(train['Species'].cat.codes, dtype=np.intp)
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2]


In [11]:
# Creating a Random Forest Classifier
# n_jobs=2 runs fitting and prediction as two parallel jobs.
# n_estimators=10 is pinned explicitly: it is the value the recorded output
# was produced with, and the library default changed to 100 in later
# scikit-learn releases, which would silently change the results.
clf = RandomForestClassifier(n_estimators=10, n_jobs=2, random_state=0)

# Training the Classifier
clf.fit(train[features], y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [14]:
# Feature columns of the held-out test rows
test.loc[:, features]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
10,5.4,3.7,1.5,0.2
13,4.3,3.0,1.1,0.1
17,5.1,3.5,1.4,0.3
18,5.7,3.8,1.7,0.3
19,5.1,3.8,1.5,0.3
20,5.4,3.4,1.7,0.2
21,5.1,3.7,1.5,0.4
23,5.1,3.3,1.7,0.5


In [15]:
# Predicting a class code for every row of the test set
test_features = test[features]
clf.predict(test_features)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

In [18]:
# Class probabilities predicted for the first 20 test observations
test_probabilities = clf.predict_proba(test[features])
test_probabilities[:20]

array([[1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [1. , 0. , 0. ],
       [0. , 0.5, 0.5],
       [0. , 1. , 0. ],
       [0. , 0.9, 0.1],
       [0. , 0.2, 0.8],
       [0. , 0.3, 0.7],
       [0. , 1. , 0. ],
       [0. , 0.8, 0.2]])

In [19]:
# Translating each predicted class code into its species name
predicted_codes = clf.predict(test[features])
preds = iris.target_names[predicted_codes]

In [21]:
# View the PREDICTED species for the first 25 observations
preds[:25]

array(['setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'setosa', 'setosa', 'setosa', 'setosa', 'setosa',
       'setosa', 'versicolor', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica', 'virginica',
       'virginica', 'virginica', 'virginica'], dtype='<U10')

In [23]:
# The ACTUAL species of the first five test observations
test['Species'].head(5)

7     setosa
8     setosa
10    setosa
13    setosa
17    setosa
Name: Species, dtype: category
Categories (3, object): [setosa, versicolor, virginica]

In [24]:
# Confusion matrix: actual species as rows, predicted species as columns
pd.crosstab(
    index=test['Species'],
    columns=preds,
    rownames=['Actual Species'],
    colnames=['Predicted Species'],
)

Predicted Species,setosa,versicolor,virginica
Actual Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
setosa,13,0,0
versicolor,0,5,2
virginica,0,0,12


- Number of accurate predictions: 30
- Inaccurate: 2
- Model accuracy: 30/32 * 100% = 93.75% ≈ 94%

In [37]:
# Predicting the species of two new, hand-written flowers.
# The measurements are wrapped in a DataFrame that carries the training
# column names, so the classifier receives the same named features it was
# fitted on instead of an unlabeled nested list.
new_flowers = pd.DataFrame(
    [[5.0, 3.6, 1.4, 2.0], [4.0, 2.6, 1.4, 2.0]],
    columns=features,
)
preds1 = iris.target_names[clf.predict(new_flowers)]
preds1

array(['versicolor', 'virginica'], dtype='<U10')

### Why Random Forest?
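- Little overfitting: averaging many trees makes the model less prone to overfitting than a single decision tree
- High accuracy
- Can estimate missing data (supported by some Random Forest implementations)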