## Application of machine learning classification algorithms for predicting the species.

In [1]:
# Import Libraries

import numpy as np
import pandas as pd 

### Import Data 

In [2]:
# Read the iris measurements from the CSV file (path is relative to the working directory).
iris_data = pd.read_csv(filepath_or_buffer='iris.csv')

In [3]:
# Preview the first five rows of the loaded frame.
iris_data.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


### Clean Data

Check for duplicate values and null values

In [4]:
# Count missing values per column.
iris_data.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

#### Data Insights

No null values

In [6]:
# Show rows that exactly repeat an earlier row (the first occurrence is not flagged).
iris_data.loc[iris_data.duplicated(keep='first')]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
142,5.8,2.7,5.1,1.9,virginica


In [7]:
# Number of rows per species, to check class balance.
iris_data.loc[:, 'species'].value_counts()

versicolor    50
setosa        50
virginica     50
Name: species, dtype: int64

#### Data Insights

The dataset contains three species with a balanced 50 entries each. Therefore there is no need to delete the duplicate entry, as doing so might unbalance the dataset.

### Split the Dataset into Two

Input set and output set

The output set holds the labels (the species) that the model learns to predict.

In [96]:
# Inputs: every column except the label.
X = iris_data.drop('species', axis='columns')
# Output: the species label of each row.
y = iris_data.loc[:, 'species']

In [97]:
# Preview the first five rows of the input features.
X.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [98]:
# Preview the first five labels.
y.head(n=5)

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

### Create a Model using a Machine Learning Algorithm

In [99]:
from sklearn.tree import DecisionTreeClassifier

In [100]:
# Decision tree classifier with default hyper-parameters.
# random_state is fixed because scikit-learn permutes the features at every
# split; without a seed, ties between equally good splits can be broken
# differently from run to run, giving a different tree on a fresh kernel.
model = DecisionTreeClassifier(random_state=42)

In [101]:
# Train the tree on every row of the dataset.
model.fit(X=X, y=y)

DecisionTreeClassifier()

In [102]:
# make prediction

# The measurements are passed as a one-row DataFrame keyed by the training
# column names. This ties each value to its feature explicitly (instead of
# relying on positional order) and avoids the "X does not have valid feature
# names" warning that newer scikit-learn versions raise when a model fitted
# on a DataFrame is given a bare list.
predictions = model.predict(pd.DataFrame({
    'sepal_length': [3],
    'sepal_width': [2],
    'petal_length': [3],
    'petal_width': [1],
}))

In [103]:
# Display the predicted species label.
predictions

array(['versicolor'], dtype=object)

#### Data Insights

The iris flower with sepal_length=3, sepal_width=2, petal_length=3, petal_width=1 is most likely of the Versicolor species.


In [104]:
# make prediction

# The measurements are passed as a one-row DataFrame keyed by the training
# column names. This ties each value to its feature explicitly (instead of
# relying on positional order) and avoids the "X does not have valid feature
# names" warning that newer scikit-learn versions raise when a model fitted
# on a DataFrame is given a bare list.
predictions = model.predict(pd.DataFrame({
    'sepal_length': [5],
    'sepal_width': [3.7],
    'petal_length': [2.4],
    'petal_width': [0.2],
}))

predictions

array(['setosa'], dtype=object)

## Measure Model Accuracy

In [105]:
# split the data into two sets

from sklearn.model_selection import train_test_split

In [106]:
# Hold out 20% of the rows for testing.
# random_state fixes the shuffle so the same rows land in each subset on every
# run; stratify=y keeps the species proportions identical in the training and
# test subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# X_train and X_test are input data for training and testing

# y_train and y_test are output data for training and testing

In [107]:
# Retrain the tree using only the training subset.
model.fit(X=X_train, y=y_train)


DecisionTreeClassifier()

In [108]:
# Predict a species for every held-out test row.
predictions = model.predict(X=X_test)

In [111]:
# Accuracy: fraction of test rows whose predicted species equals the true one.

from sklearn.metrics import accuracy_score

score = accuracy_score(y_true=y_test, y_pred=predictions)

score

0.9666666666666667

In [113]:
# Accuracy on the held-out test rows.
# NOTE(review): this cell repeats the accuracy computation of the cell above
# and yields the same value; consider removing one of the two.

from sklearn.metrics import accuracy_score

score = accuracy_score(y_true=y_test, y_pred=predictions)

score

0.9666666666666667

In [114]:
# Predict one flower with the model trained on the training subset.
# The measurements are passed as a one-row DataFrame keyed by the training
# column names. This ties each value to its feature explicitly and avoids the
# "X does not have valid feature names" warning that newer scikit-learn
# versions raise when a model fitted on a DataFrame is given a bare list.
predictions = model.predict(pd.DataFrame({
    'sepal_length': [3],
    'sepal_width': [2],
    'petal_length': [3],
    'petal_width': [1],
}))
predictions

array(['versicolor'], dtype=object)

## Model Persistence

In [115]:
import joblib

# joblib provides functions for saving (dump) and loading (load) a model

In [116]:
# Serialize the fitted model to disk so it can be reused without retraining.
joblib.dump(value=model, filename='iris_predictor.joblib')

['iris_predictor.joblib']

In [117]:
# Restore the model from the file written by joblib.dump.
# joblib files are pickle-based: only load files from a trusted source.

model = joblib.load(filename='iris_predictor.joblib')

In [118]:
# Predict one flower with the reloaded model.
# The measurements are passed as a one-row DataFrame keyed by the training
# column names. This ties each value to its feature explicitly and avoids the
# "X does not have valid feature names" warning that newer scikit-learn
# versions raise when a model fitted on a DataFrame is given a bare list.
predictions = model.predict(pd.DataFrame({
    'sepal_length': [2],
    'sepal_width': [3],
    'petal_length': [4],
    'petal_width': [3.8],
}))

In [119]:
# Display the species label predicted by the reloaded model.
predictions

array(['virginica'], dtype=object)

## Export Model as a Visual Format

In [94]:
from sklearn import tree

# Rebuild the full feature matrix and label vector from the original frame.
X = iris_data.drop('species', axis='columns')
y = iris_data.loc[:, 'species']

In [95]:
# Refit on the full dataset so the exported tree reflects every row.
# NOTE: this replaces whatever `model` was fitted on before this cell.
model.fit(X, y)

# Write the fitted tree to a Graphviz .dot file.
# Feature and class labels are taken from the training frame and the fitted
# model instead of being hard-coded, so they always match the columns and
# classes the tree was actually trained on.
tree.export_graphviz(
    model,
    out_file='iris_predictor.dot',
    feature_names=list(X.columns),
    class_names=list(model.classes_),
    label='all',
    rounded=True,
    filled=True,
)