# Iris classification

In [2]:
import numpy as np
import pandas as pd

### Importing the data

In [3]:
df = pd.read_csv('./iris.csv', index_col=0)

df.shape

(150, 5)

In [4]:
df.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa
5,5.0,3.6,1.4,0.2,Iris-setosa


# Preparing the data

### Species classification

- Iris-setosa => 1
- Iris-versicolor => 2
- Iris-virginica => 3

In [52]:
def label(value):
    _dict = {
        1: 'Iris-Setosa',
        2: 'Iris-Versicolor',
        3: 'Iris-Virginica'
    }
    return _dict[value]

In [5]:
# change the labes for values
df.loc[df.Species == 'Iris-setosa', 'Species'] = 1
df.loc[df.Species == 'Iris-versicolor', 'Species'] = 2
df.loc[df.Species == 'Iris-virginica', 'Species'] = 3


df.head()

Unnamed: 0_level_0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,5.1,3.5,1.4,0.2,1
2,4.9,3.0,1.4,0.2,1
3,4.7,3.2,1.3,0.2,1
4,4.6,3.1,1.5,0.2,1
5,5.0,3.6,1.4,0.2,1


In [88]:
train_values = df.sample(frac=0.6).astype(float)
test_values = df.sample(frac=0.4).astype(float)

#train data
train_inputs = train_values.as_matrix(
    columns=['SepalLengthCm', 'SepalWidthCm',
    'PetalLengthCm', 'PetalWidthCm']).astype(float)

train_predicts = train_values['Species'].values.astype(int)


# test data
test_inputs = test_values.as_matrix(
    columns=['SepalLengthCm', 'SepalWidthCm',
    'PetalLengthCm', 'PetalWidthCm']).astype(float)

test_predicts = test_values['Species'].values.astype(int)


In [89]:
# methos to show the ratio of the dataset

def print_ratio(df, label):
    
    total_values = len(df.loc[df['Species']])

    iris_setosa_total = len(df.loc[df['Species'] == 1])
    iris_setosa_total_percent = iris_setosa_total / total_values  * 100

    iris_versicolor_total = len(df.loc[df['Species'] == 2])
    iris_versicolor_total_percent = iris_versicolor_total / total_values * 100

    iris_virginica_total = len(df.loc[df['Species'] == 3])
    iris_virginica_total_percent = iris_virginica_total / total_values * 100

    print(label)
    print('Iris-setosa: {0} {1:0.2f}%'.format(iris_setosa_total, iris_setosa_total_percent))
    print('Iris-versicolor: {0} {1:0.2f}%'.format(iris_versicolor_total, iris_versicolor_total_percent))
    print('Iris-virginica: {0} {1:0.2f}%'.format(iris_virginica_total, iris_virginica_total_percent))



print_ratio(df, 'Total')
print("")
print_ratio(train_values, 'Train values')
print("")
print_ratio(test_values, 'Test values')



Total
Iris-setosa: 50 33.33%
Iris-versicolor: 50 33.33%
Iris-virginica: 50 33.33%

Train values
Iris-setosa: 27 30.00%
Iris-versicolor: 30 33.33%
Iris-virginica: 33 36.67%

Test values
Iris-setosa: 18 30.00%
Iris-versicolor: 23 38.33%
Iris-virginica: 19 31.67%


### Training the model

In [90]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

# aqui ta o pulo do gato
model.fit(train_inputs, train_predicts)

GaussianNB(priors=None)

### Predict on train data

In [91]:
# metrics to accuracy
from sklearn import metrics

train_predicted_values = model.predict(train_inputs)

accuracy = metrics.accuracy_score(train_predicts, train_predicted_values)
print('Accuracy {0:0.4f}%'.format(accuracy))

Accuracy 0.9556%


### Predict on test data

In [92]:
test_predicted_values = model.predict(test_inputs)

accuracy = metrics.accuracy_score(test_predicts, test_predicted_values)
print('Accuracy {0:0.4f}%'.format(accuracy))

Accuracy 0.9667%


## Prediction

In [93]:
predicted = model.predict([
    [6.4, 3.2, 4.5, 1.5]
])


print('this flower is a {0}'.format(label(*predicted)))

this flower is a Iris-Versicolor
