In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,multilabel_confusion_matrix
from sklearn.metrics import recall_score,precision_score,roc_curve,classification_report
from sklearn.preprocessing import StandardScaler,MinMaxScaler

from scipy import stats
import warnings
warnings.filterwarnings("ignore")

import plotly.express as px

import pickle
import json

In [48]:
# Load Iris.csv (resolved relative to the current working directory) into a DataFrame
df=pd.read_csv("Iris.csv")
# Bare last expression: the notebook renders the frame as this cell's output
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [49]:
#check information about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [50]:
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [51]:
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [52]:
# Remove the Id column (a 1..150 row counter according to the describe() summary above).
# Rebinding instead of inplace=True, together with errors="ignore", keeps this cell
# idempotent: running it a second time no longer raises KeyError.
df=df.drop(columns="Id",errors="ignore")

In [53]:
# detecting outliers
class Detection():
    """IQR-fence outlier detection and replacement for one DataFrame column.

    Usage order matters: call :meth:`detect` first. It selects the column and
    computes the fences (``lower_tail`` / ``upper_tail``) that every other
    method reads. :meth:`mean` / :meth:`median` must be called before
    :meth:`replace_with_mean` / :meth:`replace_with_median`, because those
    read ``self.MEAN`` / ``self.MEDIAN``.

    The ``replace_with_*`` methods modify the wrapped DataFrame in place.
    """

    def __init__(self,Dataframe):
        """Keep a reference to ``Dataframe`` (it is not copied)."""
        self.Dataframe=Dataframe

    def _outlier_mask(self):
        """Return a boolean Series: True where the selected column is outside the fences."""
        values=self.Dataframe[self.col]
        return (values>self.upper_tail)|(values<self.lower_tail)

    def _replace_outliers(self,value):
        """Overwrite every outlier of the selected column with ``value``."""
        self.Dataframe.loc[self._outlier_mask(),self.col]=value

    def detect(self,col,n):
        """Select ``col``, compute its fences and print the fences and outliers.

        Parameters
        ----------
        col : column label in the wrapped DataFrame.
        n : multiplier applied to the inter-quartile range; the fences are
            ``q1 - n*iqr`` and ``q3 + n*iqr``.
        """
        self.col=col
        self.n=n
        q1=self.Dataframe[self.col].quantile(0.25)
        q3=self.Dataframe[self.col].quantile(0.75)
        iqr=q3-q1
        self.lower_tail=q1-self.n*iqr
        self.upper_tail=q3+self.n*iqr
        print(f"lower_tail for {self.col} is {self.lower_tail}")
        print(f"upper_tail for {self.col} is {self.upper_tail}")
        outliers=self.Dataframe.loc[self._outlier_mask(),self.col]
        print(f"Outliers for this {self.col} is {outliers}")

    def mean(self):
        """Store in ``self.MEAN`` and print the mean of the non-outlier values.

        The mask is the negation of the outlier mask, so values flagged by
        :meth:`detect` do not contribute to the statistic.
        """
        self.MEAN=self.Dataframe.loc[~self._outlier_mask(),self.col].mean()
        print(self.MEAN)

    def median(self):
        """Store in ``self.MEDIAN`` and print the median of the non-outlier values.

        The mask is the negation of the outlier mask, so values flagged by
        :meth:`detect` do not contribute to the statistic.
        """
        self.MEDIAN=self.Dataframe.loc[~self._outlier_mask(),self.col].median()
        print(self.MEDIAN)

    def replace_with_lowertail(self):
        """Replace every outlier (above or below the fences) with ``lower_tail``."""
        self._replace_outliers(self.lower_tail)

    def replace_with_uppertail(self):
        """Replace every outlier (above or below the fences) with ``upper_tail``."""
        self._replace_outliers(self.upper_tail)

    def replace_with_mean(self):
        """Replace every outlier with ``self.MEAN`` (set by a prior :meth:`mean` call)."""
        self._replace_outliers(self.MEAN)

    def replace_with_median(self):
        """Replace every outlier with ``self.MEDIAN`` (set by a prior :meth:`median` call)."""
        self._replace_outliers(self.MEDIAN)

    def replace_with_statisvalue(self,n):
        """Replace every outlier with the caller-supplied value ``n``."""
        self._replace_outliers(n)
        
out=Detection(df)

In [54]:
out.detect("SepalLengthCm",1.5)

lower_tail for SepalLengthCm is 3.1499999999999986
upper_tail for SepalLengthCm is 8.350000000000001
Outliers for this SepalLengthCm is Series([], Name: SepalLengthCm, dtype: float64)


In [55]:
out.detect("SepalWidthCm",1.5)

lower_tail for SepalWidthCm is 2.05
upper_tail for SepalWidthCm is 4.05
Outliers for this SepalWidthCm is 15    4.4
32    4.1
33    4.2
60    2.0
Name: SepalWidthCm, dtype: float64


In [56]:
out.median() #median of SepalWidthCm

3.0


In [57]:
out.replace_with_median() #replaced all outliers with median

In [58]:
out.detect("SepalWidthCm",1.5)

lower_tail for SepalWidthCm is 2.05
upper_tail for SepalWidthCm is 4.05
Outliers for this SepalWidthCm is Series([], Name: SepalWidthCm, dtype: float64)


In [59]:
out.detect("PetalLengthCm",1.5)

lower_tail for PetalLengthCm is -3.649999999999999
upper_tail for PetalLengthCm is 10.349999999999998
Outliers for this PetalLengthCm is Series([], Name: PetalLengthCm, dtype: float64)


In [60]:
out.detect("PetalWidthCm",1.5)

lower_tail for PetalWidthCm is -1.95
upper_tail for PetalWidthCm is 4.05
Outliers for this PetalWidthCm is Series([], Name: PetalWidthCm, dtype: float64)


In [61]:
# 3-D scatter of three of the measurements, coloured by species
px.scatter_3d(
    df,
    x="SepalLengthCm",
    y="SepalWidthCm",
    z="PetalLengthCm",
    color="Species",
)

# Train test split

In [83]:
# Target vector: the Species label of each row
y = df["Species"]
# Feature matrix: every remaining column
x = df.drop(columns="Species")

In [84]:
# 75/25 split, stratified on the labels, with a fixed seed for repeatability
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

# Model selection

In [85]:
# Baseline: KNeighborsClassifier with its default hyper-parameters.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNeighborsClassifier().fit(x_train, y_train)
knn_model

# Model evaluation

In [86]:
# Score the baseline model on the training split
y_train_pred = knn_model.predict(x_train)

# Compute every metric first ...
cm = confusion_matrix(y_train, y_train_pred)
mcm = multilabel_confusion_matrix(y_train, y_train_pred)
accuracy = accuracy_score(y_train, y_train_pred)
clf_report = classification_report(y_train, y_train_pred)

# ... then emit them in a single call, separated by a rule of 80 asterisks
print(
    cm,
    "*"*80,
    mcm,
    "*"*80,
    f"Training Accuracy:-{accuracy}",
    "*"*80,
    f"Training classification report:-{clf_report}",
    sep="\n",
)

[[38  0  0]
 [ 0 35  2]
 [ 0  1 36]]
********************************************************************************
[[[74  0]
  [ 0 38]]

 [[74  1]
  [ 2 35]]

 [[73  2]
  [ 1 36]]]
********************************************************************************
Training Accuracy:-0.9732142857142857
********************************************************************************
Training classification report:-                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        38
Iris-versicolor       0.97      0.95      0.96        37
 Iris-virginica       0.95      0.97      0.96        37

       accuracy                           0.97       112
      macro avg       0.97      0.97      0.97       112
   weighted avg       0.97      0.97      0.97       112



In [87]:
# Score the baseline model on the held-out test split
y_test_pred = knn_model.predict(x_test)

# Compute every metric first ...
cm = confusion_matrix(y_test, y_test_pred)
mcm = multilabel_confusion_matrix(y_test, y_test_pred)
accuracy = accuracy_score(y_test, y_test_pred)
clf_report = classification_report(y_test, y_test_pred)

# ... then emit them in a single call, separated by a rule of 80 asterisks
print(
    cm,
    "*"*80,
    mcm,
    "*"*80,
    f"Testing Accuracy:-{accuracy}",
    "*"*80,
    f"Testing classification report:-{clf_report}",
    sep="\n",
)

[[12  0  0]
 [ 0 13  0]
 [ 0  1 12]]
********************************************************************************
[[[26  0]
  [ 0 12]]

 [[24  1]
  [ 0 13]]

 [[25  0]
  [ 1 12]]]
********************************************************************************
Testing Accuracy:-0.9736842105263158
********************************************************************************
Testing classification report:-                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.93      1.00      0.96        13
 Iris-virginica       1.00      0.92      0.96        13

       accuracy                           0.97        38
      macro avg       0.98      0.97      0.97        38
   weighted avg       0.98      0.97      0.97        38



# Hyperparameter Tuning

In [88]:
# Search space: n_neighbors from 2 to 29 and p (the Minkowski power parameter) of 1 or 2
param_grid = {
    "n_neighbors": np.arange(2, 30),
    "p": [1, 2],
}

# Exhaustive search with 5-fold cross-validation on the training split
gscv_knn_model = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=5)
gscv_knn_model.fit(x_train, y_train)

In [89]:
gscv_knn_model.best_estimator_

In [90]:
# Build the tuned classifier from the hyper-parameters the grid search selected,
# so both n_neighbors and p follow the search result instead of a hand-copied value.
knn_model_after=KNeighborsClassifier(**gscv_knn_model.best_params_)
knn_model_after.fit(x_train,y_train)

In [91]:
# Score the tuned model on the training split
y_train_pred = knn_model_after.predict(x_train)

# Compute every metric first ...
cm = confusion_matrix(y_train, y_train_pred)
mcm = multilabel_confusion_matrix(y_train, y_train_pred)
accuracy = accuracy_score(y_train, y_train_pred)
clf_report = classification_report(y_train, y_train_pred)

# ... then emit them in a single call, separated by a rule of 80 asterisks
print(
    cm,
    "*"*80,
    mcm,
    "*"*80,
    f"Training Accuracy:-{accuracy}",
    "*"*80,
    f"Training classification report:-{clf_report}",
    sep="\n",
)

[[38  0  0]
 [ 0 35  2]
 [ 0  1 36]]
********************************************************************************
[[[74  0]
  [ 0 38]]

 [[74  1]
  [ 2 35]]

 [[73  2]
  [ 1 36]]]
********************************************************************************
Training Accuracy:-0.9732142857142857
********************************************************************************
Training classification report:-                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        38
Iris-versicolor       0.97      0.95      0.96        37
 Iris-virginica       0.95      0.97      0.96        37

       accuracy                           0.97       112
      macro avg       0.97      0.97      0.97       112
   weighted avg       0.97      0.97      0.97       112



In [92]:
# Score the tuned model on the held-out test split
y_test_pred = knn_model_after.predict(x_test)

# Compute every metric first ...
cm = confusion_matrix(y_test, y_test_pred)
mcm = multilabel_confusion_matrix(y_test, y_test_pred)
accuracy = accuracy_score(y_test, y_test_pred)
clf_report = classification_report(y_test, y_test_pred)

# ... then emit them in a single call, separated by a rule of 80 asterisks
print(
    cm,
    "*"*80,
    mcm,
    "*"*80,
    f"Testing Accuracy:-{accuracy}",
    "*"*80,
    f"Testing classification report:-{clf_report}",
    sep="\n",
)

[[12  0  0]
 [ 0 13  0]
 [ 0  1 12]]
********************************************************************************
[[[26  0]
  [ 0 12]]

 [[24  1]
  [ 0 13]]

 [[25  0]
  [ 1 12]]]
********************************************************************************
Testing Accuracy:-0.9736842105263158
********************************************************************************
Testing classification report:-                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.93      1.00      0.96        13
 Iris-virginica       1.00      0.92      0.96        13

       accuracy                           0.97        38
      macro avg       0.98      0.97      0.97        38
   weighted avg       0.98      0.97      0.97        38



## new data prediction

In [72]:
x.head(1).T

Unnamed: 0,0
SepalLengthCm,5.1
SepalWidthCm,3.5
PetalLengthCm,1.4
PetalWidthCm,0.2


In [73]:
y.head(1)

0    Iris-setosa
Name: Species, dtype: object

In [74]:
# Sample input for a manual prediction; the values match the first row
# displayed by x.head(1).T above (labelled Iris-setosa in y.head(1)).
SepalLengthCm = 5.1
SepalWidthCm = 3.5
PetalLengthCm = 1.4
PetalWidthCm = 0.2

In [75]:
# Zero-filled feature vector with one slot per feature column
test_array = np.zeros(x.shape[1])
test_array

array([0., 0., 0., 0.])

In [76]:
# Fill the first four slots in the column order shown by x.head(1).T
test_array[0:4] = [SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm]

In [77]:
test_array

array([5.1, 3.5, 1.4, 0.2])

In [93]:
# The model was fitted on a DataFrame, so hand it a one-row DataFrame carrying the same
# column names; this keeps feature names and order consistent with training.
knn_model_after.predict(pd.DataFrame([test_array],columns=x.columns))

array(['Iris-setosa'], dtype=object)

## Pickling model

In [None]:
# Persist the tuned KNN model (the one used for the prediction above).
# The context manager guarantees the file handle is flushed and closed.
with open('iris.pkl','wb') as model_file:
    pickle.dump(knn_model_after,model_file)

In [None]:
# Metadata to store next to the model: the feature column names, in order
project_data = {"columns": x.columns.tolist()}
project_data

In [None]:
# Serialise the metadata dictionary to project_data.json in the working directory
with open("project_data.json","w") as f:
    json.dump(project_data ,f)