In [1]:
import numpy as np
import pandas as pd 
from pandas import Series, DataFrame

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Iris dataset from a CSV file located in the current working directory.
iris = pd.read_csv("Iris.csv")

In [4]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Preview the first rows to eyeball the feature columns and the `variety` label.
iris.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa


In [6]:
# Feature matrix: the four numeric measurement columns, in training order.
X = iris[
    [
        'sepal.length',
        'sepal.width',
        'petal.length',
        'petal.width',
    ]
]

In [7]:
# Target vector: the species label for each row.
Y = iris["variety"]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Hyper-parameter grid explored by the grid search.
# 'auto' is intentionally not listed under max_features: for classifiers it is
# an alias of 'sqrt' (so it only duplicated a third of the fits), and it was
# removed in scikit-learn 1.3, where those fits would fail.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [4, 5, 6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

In [12]:
from sklearn.model_selection import GridSearchCV

# Base estimator; the fixed seed makes the forest's randomness repeatable.
rfc = RandomForestClassifier(random_state=42)

# Exhaustive search over `param_grid` using 3-fold cross-validation on all cores.
CV_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
)

In [13]:
# Run the grid search on the training split only; the test split stays unseen.
CV_rfc.fit(X_train, Y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                  

In [14]:
# Report the winning configuration: heading, blank line, then the estimator repr.
print("\nBest estimator:", "", CV_rfc.best_estimator_, sep="\n")


Best estimator:

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)


In [15]:
# Keep a handle on the best estimator found by the grid search.
rfc2 = CV_rfc.best_estimator_

In [16]:
# Predicted labels for the held-out test features.
yprf_pred = rfc2.predict(X_test)

In [17]:
# Sanity check: inspect the container type of the test labels before converting them.
type(Y_test)

pandas.core.series.Series

In [18]:
# Convert the test labels to a NumPy array for the confusion-matrix call below.
Y_test_numpy = Y_test.to_numpy()

In [19]:
# Sanity check: confirm the conversion produced a NumPy array.
type(Y_test_numpy)

numpy.ndarray

In [20]:
# Import only the name that is used; a star import would dump every public
# pycm name into the notebook namespace and hide where names come from.
from pycm import ConfusionMatrix

# Confusion matrix and summary statistics on the held-out test split:
# ground-truth labels vs. the tuned forest's predictions.
cm = ConfusionMatrix(actual_vector=Y_test_numpy, predict_vector=yprf_pred)
print(cm)

Predict          Setosa           Versicolor       Virginica        
Actual
Setosa           12               0                0                

Versicolor       0                14               2                

Virginica        0                2                15               





Overall Statistics : 

95% CI                                                            (0.82796,0.99426)
ACC Macro                                                         0.94074
ARI                                                               0.73305
AUNP                                                              0.9298
AUNU                                                              0.93616
Bennett S                                                         0.86667
CBA                                                               0.91912
CSI                                                               0.83824
Chi-Squared                                                       70.81126
Chi-Squared

**Check for overfitting (evaluate on the training data)**

In [21]:
# Predictions on the *training* features, to compare against test performance.
# NOTE(review): despite its name, this variable holds predicted labels, not a
# boolean flag.
is_overfitting = rfc2.predict(X_train)

In [22]:
# Convert the training labels to a NumPy array for the confusion-matrix call below.
Y_train_numpy = Y_train.to_numpy()

In [23]:
# Confusion matrix on the training split. Ground-truth labels belong in
# `actual_vector` and the model's predictions in `predict_vector`; passing them
# the other way round transposes the matrix and swaps the per-class statistics
# (e.g. precision vs. recall).
cmOver = ConfusionMatrix(actual_vector=Y_train_numpy, predict_vector=is_overfitting)
print(cmOver)

Predict          Setosa           Versicolor       Virginica        
Actual
Setosa           38               0                0                

Versicolor       0                34               0                

Virginica        0                0                33               





Overall Statistics : 

95% CI                                                            (1.0,1.0)
ACC Macro                                                         1.0
ARI                                                               1.0
AUNP                                                              1.0
AUNU                                                              1.0
Bennett S                                                         1.0
CBA                                                               1.0
CSI                                                               1.0
Chi-Squared                                                       210.0
Chi-Squared DF                                   

**Save the model with joblib**

In [22]:
import joblib

In [23]:
# Persist the tuned forest to disk so it can be reloaded without retraining.
filename = 'finalized_model.sav'
joblib.dump(value=rfc2, filename=filename)

['finalized_model.sav']

**Load the model**

In [24]:
# Load the model back from disk and score it on the test split to confirm
# the persisted copy still works.
# NOTE(review): joblib.load unpickles the file, so only load files you trust.
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)

0.9111111111111111


**Test: predict a single hand-entered sample**

In [25]:
# Mapping from 1-based integer codes to species names.
# NOTE(review): the model in this notebook predicts string labels such as
# 'Versicolor', and this mapping is not used in the visible cells. Confirm
# whether it is still needed.
labels = dict(
    enumerate(
        ("Iris-setosa", "Iris-versicolor", "Iris-virginica"),
        start=1,
    )
)

In [26]:
# Hand-entered measurements for one sample, in the same order as the training columns.
sepallength, sepalwidth, petallength, petalwidth = 1, 2, 1, 1

In [27]:
# Build a single-row, four-column array (one sample) for the estimator.
pred_args = np.array([[sepallength, sepalwidth, petallength, petalwidth]])

In [28]:
# Predict the label for the hand-entered sample; [0] takes the first element
# of the returned predictions.
model_prediction = loaded_model.predict(pred_args)[0]

In [29]:
# Display the predicted label.
model_prediction

'Versicolor'

In [30]:
# Class-probability estimates for the same sample; [0] takes the first row.
# NOTE(review): presumably ordered like loaded_model.classes_ — confirm.
model_prediction_proba = loaded_model.predict_proba(pred_args)[0]

In [31]:
# Display the per-class probability estimates.
model_prediction_proba

array([0.47, 0.53, 0.  ])

In [32]:
# Show the last rows of the dataset for comparison with the hand-entered sample.
iris.tail()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
145,6.7,3.0,5.2,2.3,Virginica
146,6.3,2.5,5.0,1.9,Virginica
147,6.5,3.0,5.2,2.0,Virginica
148,6.2,3.4,5.4,2.3,Virginica
149,5.9,3.0,5.1,1.8,Virginica
