In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [2]:
address = 'data\iris.data.csv'
df = pd.read_csv(filepath_or_buffer=address, header=None, sep=',')

df.columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width', 'Species']

In [3]:
df.head()

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df.Species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [6]:
X = df.iloc[:, 0:4]
X

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [8]:
Y = df.iloc[:,4]
Y

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: Species, Length: 150, dtype: object

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [10]:
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)

In [11]:
Y_predict = clf.predict(X_test)
Y_predict

array(['virginica', 'versicolor', 'setosa', 'virginica', 'setosa',
       'virginica', 'setosa', 'versicolor', 'versicolor', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'versicolor',
       'versicolor', 'setosa', 'versicolor', 'versicolor', 'setosa',
       'setosa', 'virginica', 'versicolor', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'versicolor', 'versicolor',
       'setosa', 'virginica', 'versicolor', 'setosa', 'virginica',
       'virginica', 'versicolor', 'setosa', 'virginica', 'versicolor',
       'versicolor', 'virginica', 'setosa', 'virginica', 'setosa',
       'setosa'], dtype=object)

In [13]:
accuracy = metrics.accuracy_score(Y_test, Y_predict)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9777777777777777


In [15]:
data = pd.DataFrame(X_test, columns=['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width'])
data['Prediction'] = Y_predict
data['Original'] = Y_test
data

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Prediction,Original
114,5.8,2.8,5.1,2.4,virginica,virginica
62,6.0,2.2,4.0,1.0,versicolor,versicolor
33,5.5,4.2,1.4,0.2,setosa,setosa
107,7.3,2.9,6.3,1.8,virginica,virginica
7,5.0,3.4,1.5,0.2,setosa,setosa
100,6.3,3.3,6.0,2.5,virginica,virginica
40,5.0,3.5,1.3,0.3,setosa,setosa
86,6.7,3.1,4.7,1.5,versicolor,versicolor
76,6.8,2.8,4.8,1.4,versicolor,versicolor
71,6.1,2.8,4.0,1.3,versicolor,versicolor


In [16]:
filtered_data = data[data['Prediction'] != data['Original']]
filtered_data

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width,Prediction,Original
83,6.0,2.7,5.1,1.6,virginica,versicolor


In [18]:
df.describe().round(2)

Unnamed: 0,Sepal Length,Sepal Width,Petal Length,Petal Width
count,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2
std,0.83,0.44,1.77,0.76
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [21]:
# Tukey methode
# IQR = 6.4 - 5.1 = 1.3
# (1.5)IQR = 1.95
# 5.1 - 1.95 = 3.15
# 6.4 + 1.95 = 8.35 ...
def Tukey_method_outliners(data):
    Q1 = data.quantile(0.25)
    Q3 = data.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - (1.5 * IQR)
    upper_bound = Q3 + (1.5 * IQR)
    return data[(data > upper_bound) & (data < lower_bound)]

In [22]:
Tukey_method_outliners(df['Sepal Length'])

Series([], Name: Sepal Length, dtype: float64)

In [23]:
Tukey_method_outliners(df['Sepal Width'])

Series([], Name: Sepal Width, dtype: float64)

In [24]:
Tukey_method_outliners(df['Petal Length'])

Series([], Name: Petal Length, dtype: float64)

In [25]:
Tukey_method_outliners(df['Petal Width'])

Series([], Name: Petal Width, dtype: float64)