


Bảng dữ liệu bao gồm các trường như sau:

1. Id: chỉ số đếm của từng mẫu
2. SepalLengthCm: độ dài của đài hoa
3. SepalWidthCm: độ rộng của đài hoa
4. PetalLengthCm: độ dài của cánh hoa
5. PetalWidthCm: độ rộng của cánh hoa
6. Species: phân loại của từng loài hoa diên vĩ (iris)

# Target
Phân loại cho từng loài hoa diên vĩ (setosa, virginica, versicolor) dựa theo các đặc trưng (dữ liệu) cho trước.

# Overview

In [None]:
# Gọi các thư viện cần thiết cho bài toán
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import plotly.graph_objs as go
import seaborn as sns

In [None]:
# Point at the dataset file, load it into a DataFrame and preview the first rows
data_path = '/Dataset/iris.csv'
data = pd.read_csv(data_path)
data.head()

In [None]:
# Start exploring the data fields: print the full per-column summary
data.info(verbose=True)

In [None]:
# Count the distinct values in each field
data.nunique()

In [None]:
# Summary statistics for the numeric fields
data.describe()

In [None]:
# Count missing (null) values per column, as absolute counts and as a ratio of rows
null_counts = data.isnull().sum()
null_info = pd.DataFrame({
    'Null count': null_counts,
    'Null ratio': null_counts / len(data),
})
# The Species row is left out of the report; the rest is ordered by ratio, worst first.
null_info = null_info.drop('Species').sort_values(by='Null ratio', ascending=False)
null_info

In [None]:
# Plot the null-value matrix of the dataset
import missingno as msno

# Trailing semicolon keeps the notebook from echoing the returned object.
msno.matrix(data);

In [None]:
# Show the share of each species in the dataset, as a percentage with two decimals.
# Maps the printed label to the value stored in the Species column.
species_labels = {
    'Setosa': 'Iris-setosa',
    'Versicolor': 'Iris-versicolor',
    'Virginica': 'Iris-virginica',
}
for label, species_name in species_labels.items():
    share = (data.Species == species_name).sum() / len(data) * 100
    print('% ' + label + ':', format(share, '.2f'))

# Visualization 

In [None]:
# Work on a scratch copy of the data for plotting and modification,
# so the original DataFrame stays untouched
data1 = data.copy()
data1.head()

## SepalLengthCm

In [None]:
# Frequency of each distinct sepal length (missing values are counted too)
data1.SepalLengthCm.value_counts(dropna=False)

In [None]:
# Left panel: distribution of sepal length. Right panel: per-species box plot
# with the individual samples overlaid as jittered points.
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(13, 6))

sns.histplot(data1.SepalLengthCm, bins=10, kde=True, ax=hist_ax)
hist_ax.set_title('Histplot diagram for SepalLengthCm')

sns.boxplot(x="Species", y="SepalLengthCm", data=data1, ax=box_ax)
sns.stripplot(
    x='Species',
    y='SepalLengthCm',
    data=data1,
    jitter=True,
    edgecolor='gray',
    ax=box_ax,
)
box_ax.set_title('Boxplot for SepalLengthCm')

## SepalWidthCm

In [None]:
# Frequency of each distinct sepal width (missing values are counted too)
data1.SepalWidthCm.value_counts(dropna=False)

In [None]:
# Left panel: distribution of sepal width. Right panel: per-species violin plot
# with the individual samples overlaid as jittered points.
plt.figure(figsize=(13, 6))

plt.subplot(1, 2, 1)
sns.histplot(data1.SepalWidthCm, bins=5, kde=True)
plt.title('Histplot diagram for SepalWidthCm')

plt.subplot(1, 2, 2)
# `size` is not a violinplot option (figure size is set by plt.figure above),
# so it is intentionally not passed here.
sns.violinplot(x="Species", y="SepalWidthCm", data=data1)
sns.stripplot(x='Species', y='SepalWidthCm', data=data1, jitter=True, edgecolor='gray')
plt.title('Violin plot for SepalWidthCm')

## PetalLengthCm

In [None]:
# Frequency of each distinct petal length (missing values are counted too)
data1.PetalLengthCm.value_counts(dropna=False)

In [None]:
# Left panel: distribution of petal length. Right panel: per-species box plot.
plt.figure(figsize=(13, 6))

plt.subplot(1, 2, 1)
sns.histplot(data1.PetalLengthCm, bins=10, kde=True)
plt.title('Histplot diagram for PetalLengthCm')

plt.subplot(1, 2, 2)
# Plot the petal length itself so the panel matches its title.
sns.boxplot(x="Species", y="PetalLengthCm", data=data1)
plt.title('Boxplot for PetalLengthCm')

## PetalWidthCm

In [None]:
# Frequency of each distinct petal width (missing values are counted too)
data1.PetalWidthCm.value_counts(dropna=False)

In [None]:
# Left panel: distribution of petal width. Right panel: per-species violin plot.
plt.figure(figsize=(13, 6))

plt.subplot(1, 2, 1)
sns.histplot(data1.PetalWidthCm, bins=5, kde=True)
plt.title('Histplot diagram for PetalWidthCm')

plt.subplot(1, 2, 2)
# `size` is not a violinplot option (figure size is set by plt.figure above),
# so it is intentionally not passed here.
sns.violinplot(x="Species", y="PetalWidthCm", data=data1)
plt.title('Violin plot for PetalWidthCm')

# Relationship

In [None]:
# Joint distribution of sepal length vs. sepal width, coloured by species.
# `height` is the figure-size parameter of jointplot (formerly called `size`).
sepal_grid = sns.jointplot(
    x="SepalLengthCm",
    y="SepalWidthCm",
    data=data1,
    height=5,
    hue="Species",
)
# A joint plot has several axes; title the whole figure instead of whichever
# axes happens to be current. y > 1 lifts the title above the top marginal plot.
sepal_grid.ax_joint.figure.suptitle(
    'Relationship between SepalWidthCm and SepalLengthCm', y=1.02
)

In [None]:
# Per-species linear regression of sepal width on sepal length.
sns.set_style('whitegrid')
species_markers = ['o', 'v', 'x']  # one marker per hue level
sns.lmplot(
    data=data1,
    x='SepalLengthCm',
    y='SepalWidthCm',
    hue="Species",
    markers=species_markers,
)
plt.title('Regression between SepalLengthCm and SepalWidthCm')

In [None]:
# Joint distribution of petal length vs. petal width, coloured by species.
# `height` is the figure-size parameter of jointplot (formerly called `size`).
petal_grid = sns.jointplot(
    x="PetalLengthCm",
    y="PetalWidthCm",
    data=data1,
    height=5,
    hue="Species",
)
# A joint plot has several axes; title the whole figure instead of whichever
# axes happens to be current. y > 1 lifts the title above the top marginal plot.
petal_grid.ax_joint.figure.suptitle(
    'Relationship between PetalWidthCm and PetalLengthCm', y=1.02
)

In [None]:
# Per-species linear regression of petal width on petal length.
sns.set_style('whitegrid')
species_markers = ['o', 'v', 'x']  # one marker per hue level
sns.lmplot(
    data=data1,
    x='PetalLengthCm',
    y='PetalWidthCm',
    hue="Species",
    markers=species_markers,
)
plt.title('Regression between PetalLengthCm and PetalWidthCm')

In [None]:
# 2x2 grid of bar plots: one panel per measurement, grouped by species.
plt.figure(figsize=(20, 15))
panels = [
    ('SepalLengthCm', 'cubehelix'),
    ('SepalWidthCm', 'Oranges'),
    ('PetalLengthCm', 'Oranges'),
    ('PetalWidthCm', 'cubehelix'),
]
for position, (column, palette_name) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    sns.barplot(x='Species', y=column, data=data1, palette=palette_name)

In [None]:
# Correlation heatmap of the numeric columns.
plt.figure(figsize=(12, 8))
# Correlation is only defined for numeric data; restrict to numeric columns
# explicitly so the string-valued Species column never reaches corr().
numeric_corr = data1.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='Dark2_r', linewidths=2)
plt.show()

# Summarize

In [None]:
# Pairwise scatter plots comparing the relationship between each pair of attributes
sns.pairplot(data=data1,kind='scatter')

In [None]:
# Pairwise plots comparing each pair of attributes, split (coloured) by species
sns.pairplot(data=data1,hue='Species');

# Applied Algorithms

In [None]:
#Metrics
from sklearn.metrics import make_scorer, accuracy_score,precision_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score

In [None]:
#Model Selection
from sklearn.model_selection import KFold,train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import  LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [None]:
# Remove the Id column (a per-sample counter) before modelling.
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
data.drop(columns="Id", inplace=True, errors="ignore")

In [None]:
# Separate the features from the label.
# Columns are selected by name rather than by position so the split stays
# correct regardless of column order or whether the Id column is still present.
feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = data[feature_columns].values
y = data['Species'].values

In [None]:
# Encode the string labels as integer class ids
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels, then map them; `le` is kept so the
# integer ids can be translated back to species names later.
le = LabelEncoder().fit(y)
y = le.transform(y)

In [None]:
# Split into train and test sets: 30% held out for testing, fixed seed for repeatability
splits = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = splits

## Naive Bayes

In [None]:
# Gaussian Naive Bayes: fit on the training split, predict the test split.
gaussian = GaussianNB()
gaussian.fit(X_train, y_train)
Y_pred = gaussian.predict(X_test)

# Test-set metrics (precision / recall / F1 are micro-averaged over the classes).
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred, average='micro')
recall = recall_score(y_test, Y_pred, average='micro')
f1 = f1_score(y_test, Y_pred, average='micro')

# Percentages rounded to two decimals: test accuracy and training accuracy.
accuracy_nb = round(accuracy * 100, 2)
acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)

print('Confusion matrix for Naive Bayes\n', cm)
for template, value in (
    ('accuracy_Naive Bayes: %.3f', accuracy),
    ('precision_Naive Bayes: %.3f', precision),
    ('recall_Naive Bayes: %.3f', recall),
    ('f1-score_Naive Bayes : %.3f', f1),
):
    print(template % value)

## Decision Tree

In [None]:
# Decision tree: fit on the training split, predict the test split.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
Y_pred = decision_tree.predict(X_test)

# Test-set metrics (precision / recall / F1 are micro-averaged over the classes).
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred, average='micro')
recall = recall_score(y_test, Y_pred, average='micro')
f1 = f1_score(y_test, Y_pred, average='micro')

# Percentages rounded to two decimals: test accuracy and training accuracy.
accuracy_dt = round(accuracy * 100, 2)
acc_decision_tree = round(decision_tree.score(X_train, y_train) * 100, 2)

print('Confusion matrix for DecisionTree\n', cm)
for template, value in (
    ('accuracy_DecisionTree: %.3f', accuracy),
    ('precision_DecisionTree: %.3f', precision),
    ('recall_DecisionTree: %.3f', recall),
    ('f1-score_DecisionTree : %.3f', f1),
):
    print(template % value)

In [None]:
# Draw the fitted decision tree.
from sklearn.tree import plot_tree

plt.figure(figsize=(15, 10))
# Plot the already-fitted estimator as-is. Refitting here would train a new
# tree (the classifier has no fixed random_state), so the picture could differ
# from the model whose metrics were reported.
plot_tree(decision_tree, filled=True)
plt.show()

## Random Forest

In [None]:
# Random forest with 100 trees: fit on the training split, predict the test split.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
Y_prediction = random_forest.predict(X_test)

# Test-set metrics (precision / recall / F1 are micro-averaged over the classes).
cm = confusion_matrix(y_test, Y_prediction)
accuracy = accuracy_score(y_test, Y_prediction)
precision = precision_score(y_test, Y_prediction, average='micro')
recall = recall_score(y_test, Y_prediction, average='micro')
f1 = f1_score(y_test, Y_prediction, average='micro')

# Percentages rounded to two decimals: test accuracy and training accuracy.
accuracy_rf = round(accuracy * 100, 2)
acc_random_forest = round(random_forest.score(X_train, y_train) * 100, 2)

print('Confusion matrix for Random Forest\n', cm)
for template, value in (
    ('accuracy_random_Forest : %.3f', accuracy),
    ('precision_random_Forest : %.3f', precision),
    ('recall_random_Forest : %.3f', recall),
    ('f1-score_random_Forest : %.3f', f1),
):
    print(template % value)

## KNN

In [None]:
# k-nearest neighbours with k=3: fit on the training split, predict the test split.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_test)

# Test-set metrics (precision / recall / F1 are micro-averaged over the classes).
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred, average='micro')
recall = recall_score(y_test, Y_pred, average='micro')
f1 = f1_score(y_test, Y_pred, average='micro')

# Percentages rounded to two decimals: test accuracy and training accuracy.
accuracy_knn = round(accuracy * 100, 2)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)

print('Confusion matrix for KNN\n', cm)
for template, value in (
    ('accuracy_KNN : %.3f', accuracy),
    ('precision_KNN : %.3f', precision),
    ('recall_KNN: %.3f', recall),
    ('f1-score_KNN : %.3f', f1),
):
    print(template % value)

In [None]:
# Test accuracy of KNN for many values of n_neighbors (1..49).
k_values = list(range(1, 50))

# Collect the scores in a plain list: growing a pandas Series with
# Series.append is no longer available in pandas 2.x, and a list is all
# the plot needs.
test_accuracies = []
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    test_accuracies.append(accuracy_score(y_test, prediction))

plt.subplots(figsize=(20, 5))
plt.plot(k_values, test_accuracies, marker="*")
plt.xticks(k_values)  # one tick per tested k
plt.show()

## MLP

In [None]:
# Multi-layer perceptron: fit on the training split, predict the test split.
MLP = MLPClassifier(random_state=1, max_iter=300)
MLP.fit(X_train, y_train)
# Predict with the MLP itself so the metrics below describe this model.
Y_pred = MLP.predict(X_test)

# Test-set metrics (precision / recall / F1 are micro-averaged over the classes).
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred, average='micro')
recall = recall_score(y_test, Y_pred, average='micro')
f1 = f1_score(y_test, Y_pred, average='micro')

# Percentages rounded to two decimals: test accuracy and training accuracy.
accuracy_MLP = round(accuracy * 100, 2)
acc_MLP = round(MLP.score(X_train, y_train) * 100, 2)

print('Confusion matrix for Multi Layer Perceptron\n', cm)
print('accuracy_Multi Layer Perceptron: %.3f' % accuracy)
print('precision_Multi Layer Perceptron: %.3f' % precision)
print('recall_Multi Layer Perceptron: %.3f' % recall)
print('f1-score_Multi Layer Perceptron : %.3f' % f1)

## Support Vector Machine (SVM)

In [None]:
# Linear support vector classifier: fit on the training split, predict the test split.
linear_svc = LinearSVC(max_iter=4000)
linear_svc.fit(X_train, y_train)
Y_pred = linear_svc.predict(X_test)

# Test-set metrics (precision / recall / F1 are micro-averaged over the classes).
cm = confusion_matrix(y_test, Y_pred)
accuracy = accuracy_score(y_test, Y_pred)
precision = precision_score(y_test, Y_pred, average='micro')
recall = recall_score(y_test, Y_pred, average='micro')
f1 = f1_score(y_test, Y_pred, average='micro')

# Percentages rounded to two decimals: test accuracy and training accuracy.
accuracy_svc = round(accuracy * 100, 2)
acc_linear_svc = round(linear_svc.score(X_train, y_train) * 100, 2)

print('Confusion matrix for SVC\n', cm)
for template, value in (
    ('accuracy_SVC: %.3f', accuracy),
    ('precision_SVC: %.3f', precision),
    ('recall_SVC: %.3f', recall),
    ('f1-score_SVC : %.3f', f1),
):
    print(template % value)

# Find best model

In [None]:
# Comparison table, one row per model:
#   Score          - accuracy on the training split (%)
#   Accuracy_score - accuracy on the test split (%)
model_rows = [
    ('KNN', acc_knn, accuracy_knn),
    ('Random Forest', acc_random_forest, accuracy_rf),
    ('Naive Bayes', acc_gaussian, accuracy_nb),
    (' Support Vector Machine', acc_linear_svc, accuracy_svc),
    ('Multi Layer Perceptron', acc_MLP, accuracy_MLP),
    ('Decision Tree', acc_decision_tree, accuracy_dt),
]
results = pd.DataFrame(model_rows, columns=['Model', 'Score', 'Accuracy_score'])

# Best test accuracy first, with a fresh 0..n-1 index.
result_df = results.sort_values(by='Accuracy_score', ascending=False)
result_df = result_df.reset_index(drop=True)
result_df.head(9)

In [None]:
# Bar chart of test accuracy per model, with the value written above each bar.
plt.subplots(figsize=(12, 8))
ax = sns.barplot(x='Model', y="Accuracy_score", data=result_df)
labels = result_df["Accuracy_score"]
for bar_position, score in enumerate(labels):
    # Place the text one unit above the bar top, centred on the bar.
    ax.text(
        bar_position,
        score + 1,
        str(score),
        horizontalalignment='center',
        size=15,
        color='black',
    )