In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC;
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score;
from sklearn.model_selection import GridSearchCV;



In [56]:
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
# Add column such that if the target is 0, 1, 2 then it should be setosa, versicolor, virginica respectively
df['target_name'] = df['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,target_name
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


## Data Cleaning And Validation

In [38]:
# Checking for data types and Null values
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64

## Data Visualization

In [None]:

# Count the number of each target_name
target_counts = df['target_name'].value_counts()
fig = px.pie(values=target_counts.values, 
             names=target_counts.index,
             title='Distribution of Iris Species',
             color_discrete_sequence=px.colors.qualitative.Set3)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(
    showlegend=True,
    width=600,
    height=400
)

fig.show()



## Split Data

In [39]:
#splitting Data into Training and Test set
#Dividing out data into independent and dependent variables we have
X=df.drop(['target'], axis=1)
y=df['target']

In [40]:
#Splitting our data into training and testing 
# Dedicating 20% of the data to be testing and 80% for training we have

X_train,X_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42, shuffle=True)

print("X_train Shape:", X_train.shape)
print("X_test Shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train Shape: (120, 4)
X_test Shape: (30, 4)
y_train shape: (120,)
y_test shape: (30,)


## Standardizing the Data to have common Scale

In [41]:
# Here we carry out feature scaling using the sklearn library StandardScaler.
# Feature scaling
sc = StandardScaler()
X_train_scale = sc.fit_transform(X_train)
X_test_scale = sc.fit_transform(X_test)


## Implementing Our Classification Models
- We will build 5 different models and compare their accuracy scores

In [42]:
# we create two dictionaries to store the results of our models

result_dic_train = {}
result_dic_test = {}
result_f1_recall ={}
result_confusion = {}

#### Implementing Logistic Regression

In [43]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier

# Model Training
reg = OneVsOneClassifier(LogisticRegression(random_state=42))
accuracies = cross_val_score(reg, X_train_scale, y_train)
reg.fit(X_train_scale, y_train)
y_pred = reg.predict(X_test_scale)

In [44]:
# Model evaluation
l_score = f1_score(y_test, y_pred, average='micro')
l_recall = recall_score(y_test, y_pred, average='micro')
l_confusion = confusion_matrix(y_test, y_pred)

#Printing our accuracy we have:
print("Train Score:", np.mean(accuracies))
print('Test Score', reg.score(X_test, y_test))
print('F1 Score', l_score)
print('Recall Score', l_recall)
print('Confusion Matrix', l_confusion)

Train Score: 0.95
Test Score 0.36666666666666664
F1 Score 0.9666666666666667
Recall Score 0.9666666666666667
Confusion Matrix [[10  0  0]
 [ 0  9  0]
 [ 0  1 10]]




In [45]:
#Adding the result in our dictionaries we have:
result_dic_train["Logistic Train Score"]=np.mean(accuracies)
result_dic_test["Logistic Test Score"]=reg.score(X_test, y_test)
result_f1_recall['Logistic Recall']=l_recall



#### Implementing K-Nearest Neighbors (KNN)

In [46]:
# Model Training
knn = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=3))
accuracies = cross_val_score(knn, X_train_scale, y_train, cv=5)
knn.fit(X_train_scale, y_train)
y2_pred = knn.predict(X_test)



In [47]:
# model Evaluation
kn_score = f1_score(y_test, y2_pred, average='micro')
kn_recall = recall_score(y_test, y2_pred, average='micro')
kn_confusion = confusion_matrix(y_test, y2_pred)

#Printing our accuracy we have:
print("Train Score:", np.mean(accuracies))
print('Test Score', knn.score(X_test, y_test))
print('F1 Score', kn_score)
print('Recall Score', kn_recall)
print('Confusion Matrix', kn_confusion)

Train Score: 0.95
Test Score 0.36666666666666664
F1 Score 0.36666666666666664
Recall Score 0.36666666666666664
Confusion Matrix [[ 0  0 10]
 [ 0  0  9]
 [ 0  0 11]]




In [48]:
#Adding the result in our dictionaries we have:
result_dic_train["KNN Train Score"]=np.mean(accuracies)
result_dic_test["KNN Test Score"]= reg.score(X_test, y_test)
result_f1_recall["KNN Recall"]= kn_recall



#### Implementing Decision Tree

In [51]:
# Model Training 
dtc = DecisionTreeClassifier(random_state=42)
accuracies =cross_val_score(dtc, X_train_scale, y_train, cv=5)
dtc.fit(X_train_scale, y_train)
y3_pred= dtc.predict(X_test)




In [53]:
# Model Evaluation
dt_score = f1_score(y_test, y3_pred, average='micro')
dt_recall = recall_score(y_test, y3_pred, average='micro')
dt_confusion = confusion_matrix(y_test, y3_pred)

#Printing our accuracy we have:
print("Train Score:", np.mean(accuracies))
print('Test Score', dtc.score(X_test, y_test))
print('F1 Score', dt_score)
print('Recall Score', dt_recall)
print('Confusion Matrix', dt_confusion)

Train Score: 0.95
Test Score 0.36666666666666664
F1 Score 0.36666666666666664
Recall Score 0.36666666666666664
Confusion Matrix [[ 0  0 10]
 [ 0  0  9]
 [ 0  0 11]]




In [54]:
#Adding the result in our dictionaries we have:
result_dic_train["Decision Train Score"]=np.mean(accuracies)
result_dic_test["Decision Test Score"]= reg.score(X_test, y_test)
result_f1_recall["Decision Recall Score"]= dt_recall



## Model Evaluation

In [55]:
#Recall Scores
df_result_recall = pd.DataFrame.from_dict(result_f1_recall,orient = "index",columns=["Score"])
df_result_recall

Unnamed: 0,Score
Logistic Recall,0.966667
KNN Recall,0.366667
Decision Recall Score,0.366667


### Exporting Our Model

In [50]:
import pickle

with open('log_model.sav', 'wb') as f:
    pickle.dump(reg, f)