In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import joblib
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the iris dataset into a pandas DataFrame using seaborn's example-dataset loader
df = sns.load_dataset("iris")

In [3]:
# Inspect column names, non-null counts and dtypes before modelling
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


<div class="alert alert-info">Remember, normally you would do your EDA, data cleaning, feature selection, and feature engineering at this point. However, for this demo, we will skip all of that.</div>

### 3-Way splitting of the data

In [5]:
# The label we want the classifier to predict
prediction_col = 'species'

# Every other column in the frame is used as an input feature for the model
cols = df.columns
feature_cols = [name for name in cols if name != prediction_col]

# Feature matrix (X) and target vector (y)
X, y = df[feature_cols], df[prediction_col]

In [6]:
# Train-Validation-Test Split - First split
# Keep 70% of the rows for training and set the remaining 30% aside (X_temp / y_temp)
# to be divided again later. Stratifying on y and fixing the seed keeps the split
# class-balanced and repeatable.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [7]:
# Check the row count and dtypes of the feature rows set aside by the first split
X_temp.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45 entries, 107 to 58
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  45 non-null     float64
 1   sepal_width   45 non-null     float64
 2   petal_length  45 non-null     float64
 3   petal_width   45 non-null     float64
dtypes: float64(4)
memory usage: 1.8 KB


In [8]:
# Preview the first few set-aside feature rows
X_temp.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
107,7.3,2.9,6.3,1.8
63,6.1,2.9,4.7,1.4
133,6.3,2.8,5.1,1.5
56,6.3,3.3,4.7,1.6
127,6.1,3.0,4.9,1.8


In [9]:
# Inspect the set-aside labels (entry count and dtype)
y_temp.info()

<class 'pandas.core.series.Series'>
Index: 45 entries, 107 to 58
Series name: species
Non-Null Count  Dtype 
--------------  ----- 
45 non-null     object
dtypes: object(1)
memory usage: 720.0+ bytes


In [10]:
# Preview the first few set-aside labels
y_temp.head()

107     virginica
63     versicolor
133     virginica
56     versicolor
127     virginica
Name: species, dtype: object

In [11]:
# Train-Validation-Test Split - Second split
# Divide the set-aside rows in half: the first half becomes the test split and the
# second half the validation split, again stratified on the labels with a fixed seed.
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

In [12]:
# Report the dimensions of the test features, then preview the first rows
print(X_test.shape)
X_test.head()

(22, 4)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
77,6.7,3.0,5.0,1.7
111,6.4,2.7,5.3,1.9
69,5.6,2.5,3.9,1.1
133,6.3,2.8,5.1,1.5
141,6.9,3.1,5.1,2.3


In [13]:
# Report the dimensions of the validation features, then preview the first rows
print(X_val.shape)
X_val.head()

(23, 4)


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
18,5.7,3.8,1.7,0.3
38,4.4,3.0,1.3,0.2
134,6.1,2.6,5.6,1.4
57,4.9,2.4,3.3,1.0
127,6.1,3.0,4.9,1.8


In [14]:
# Report the number of test labels, then preview the first few
print(y_test.shape)
y_test.head()

(22,)


77     versicolor
111     virginica
69     versicolor
133     virginica
141     virginica
Name: species, dtype: object

In [15]:
# Report the number of validation labels, then preview the first few
print(y_val.shape)
y_val.head()

(23,)


18         setosa
38         setosa
134     virginica
57     versicolor
127     virginica
Name: species, dtype: object

Alright! We have the three distinct sets of data for our modeling.

### Building the Model
We will begin the process with a RandomForest classifier.

In [17]:
# Build a 100-tree random forest with a fixed seed so the fit is repeatable,
# then train it on the training split only.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)

In [18]:
# Predict species labels for the test split with the freshly trained model.
# NOTE(review): this notebook scores the "test" split while iterating on the model and
# scores the "validation" split at the end, after reloading the saved model. The usual
# convention is the reverse (validate while iterating, test once at the end) - confirm
# the naming is intentional.
pred_X_test = model.predict(X_test)

In [19]:
# Fraction of test-split rows whose predicted species matches the true label
accuracy_score(y_true=y_test, y_pred=pred_X_test)

0.8636363636363636

<div class="alert alert-info">Remember the process of constructing the model is iterative. So you might need to experiment with hyperparameters, changes to the features, model selection, etc. The goal is to construct the best model possible at this point.</div>

### Saving the Optimal Model

In [21]:
# Persist the fitted model to disk with joblib.
# The path is kept in a variable so the loading step can reuse it.
model_filename = "iris_rf_model.joblib"
joblib.dump(value=model, filename=model_filename)

['iris_rf_model.joblib']

<img align="left" style="padding-right:50px;" src="figures_wk6/saved_model.png" width=250><br>
If you go out to your file system, you should be able to see a copy of your saved model.

### Loading and Using the Saved Model
For this demo, we are loading our saved model in the same notebook. However, you don't have to be in the same notebook.

In [24]:
# Restore the persisted estimator from disk.
# joblib relies on pickle, so only load model files that come from a trusted source.
loaded_model = joblib.load(filename=model_filename)

In [25]:
# Make predictions on the validation set using the model reloaded from disk
y_val_pred = loaded_model.predict(X_val)

In [26]:
# Display the predicted species labels for the validation set
y_val_pred

array(['setosa', 'setosa', 'versicolor', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'virginica', 'setosa', 'versicolor',
       'versicolor', 'setosa', 'versicolor', 'virginica', 'versicolor',
       'versicolor', 'setosa', 'versicolor', 'virginica', 'setosa',
       'versicolor', 'virginica', 'setosa'], dtype=object)

In [27]:
# Fraction of validation-split rows whose predicted species matches the true label
accuracy_score(y_true=y_val, y_pred=y_val_pred)

0.9130434782608695