In [27]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

## Data Collection and Preprocessing

In [2]:
iris_data = pd.read_csv("Iris.csv")

In [3]:
iris_data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [5]:
iris_data.shape

(150, 6)

In [6]:
iris_data.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [8]:
iris_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [9]:
# Checking for null value
iris_data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [7]:
iris_data['Species'].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [10]:
iris_data.groupby('Species').mean()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Iris-setosa,25.5,5.006,3.418,1.464,0.244
Iris-versicolor,75.5,5.936,2.77,4.26,1.326
Iris-virginica,125.5,6.588,2.974,5.552,2.026


### Convert the categorical data into numerical values

In [12]:
label_encoder = LabelEncoder()

In [13]:
# Encode the species names as integer class labels, stored back in the same column.
# NOTE: 'Species' is overwritten in place, so re-running this cell would re-fit the
# encoder on the already-encoded integers. Restart the kernel and run all cells instead.
iris_data['Species'] = label_encoder.fit_transform(iris_data['Species'])

In [16]:
iris_data['Species'].unique()

array([0, 1, 2])

In [52]:
iris_data.groupby('Species').mean()

Unnamed: 0_level_0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
Species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,25.5,5.006,3.418,1.464,0.244
1,75.5,5.936,2.77,4.26,1.326
2,125.5,6.588,2.974,5.552,2.026


### Splitting the dataframe into features and target

In [18]:
X = iris_data.drop(['Id','Species'], axis =1)
Y = iris_data['Species']

In [19]:
print(X)
print(Y)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0              5.1           3.5            1.4           0.2
1              4.9           3.0            1.4           0.2
2              4.7           3.2            1.3           0.2
3              4.6           3.1            1.5           0.2
4              5.0           3.6            1.4           0.2
..             ...           ...            ...           ...
145            6.7           3.0            5.2           2.3
146            6.3           2.5            5.0           1.9
147            6.5           3.0            5.2           2.0
148            6.2           3.4            5.4           2.3
149            5.9           3.0            5.1           1.8

[150 rows x 4 columns]
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int32


In [22]:
# Convert the feature frame and the target series to NumPy arrays
X = np.asarray(X)
Y = np.asarray(Y)

In [25]:
print(X[:20])
Y

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]]


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [20]:
# Split the data into training and test sets.
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# makes the shuffled split reproducible from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [21]:
print(X.shape, X_train.shape, X_test.shape)

(150, 4) (120, 4) (30, 4)


# Model Selection

## Comparing the models with default hyperparameters using Cross Validation

In [26]:
models = [LogisticRegression(max_iter=1000), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier(random_state=0)]

In [30]:
def compare_model_cv():
    """Print the 5-fold cross-validation scores and mean accuracy of each model.

    Reads the notebook-level ``models`` list and the ``X`` / ``Y`` arrays,
    scores every estimator with ``cross_val_score`` and prints, per model:
    the estimator, its per-fold scores, the mean score as a percentage
    rounded to two decimals, and a separator line. Returns nothing.
    """
    separator = "******************************************"

    for estimator in models:
        fold_scores = cross_val_score(estimator, X, Y, cv=5)

        # Mean of the fold scores, expressed as a percentage.
        mean_score = sum(fold_scores) / len(fold_scores)
        accuracy_pct = round(mean_score * 100, 2)

        print(estimator)
        print("Cross Validation: \n", fold_scores)
        print("Accuracy: \n", accuracy_pct)
        print(separator)

In [31]:
compare_model_cv()

LogisticRegression(max_iter=1000)
Cross Validation: 
 [0.96666667 1.         0.93333333 0.96666667 1.        ]
Accuracy: 
 97.33
******************************************
SVC(kernel='linear')
Cross Validation: 
 [0.96666667 1.         0.96666667 0.96666667 1.        ]
Accuracy: 
 98.0
******************************************
KNeighborsClassifier()
Cross Validation: 
 [0.96666667 1.         0.93333333 0.96666667 1.        ]
Accuracy: 
 97.33
******************************************
RandomForestClassifier(random_state=0)
Cross Validation: 
 [0.96666667 0.96666667 0.93333333 0.96666667 1.        ]
Accuracy: 
 96.67
******************************************


- SVC (with a linear kernel) has the highest mean cross-validation accuracy (98.0%) of the four models, before any hyperparameter tuning

### Building the model with SVC

In [32]:
model_svc = SVC(kernel='linear')

In [33]:
model_svc.fit(X_train, Y_train)

In [37]:
# Model evaluation on the training data
prediction_train = model_svc.predict(X_train)
train_accuracy = accuracy_score(Y_train, prediction_train)
print("Accuracy for training data: ", round(train_accuracy*100,2),"%")

Accuracy for training data:  99.17 %


In [38]:
# Model evaluation on the held-out test data
prediction_test = model_svc.predict(X_test)
test_accuracy = accuracy_score(Y_test, prediction_test)
print("Accuracy for test data: ", round(test_accuracy*100,2),"%")

Accuracy for test data:  100.0 %


## Comparing the models with Hyperparameter Tuning using GridSearchCV

In [39]:
model_list = [LogisticRegression(max_iter=10000), SVC(), KNeighborsClassifier(),RandomForestClassifier(random_state=0)]

In [40]:
# Hyperparameter grids to search, one entry per model.
# The entries are listed in the same order as the estimators in model_list,
# because the grid-search helper pairs models and grids by position.

model_hyperparameter = {
    # LogisticRegression: inverse regularization strength
    'lr_hyperparameter':{
        'C':[1, 2, 10, 20]
    },
    # SVC: kernel type and regularization parameter
    'SVC_hyperparameter':{
        'kernel': ['linear','poly','rbf','sigmoid'],
        'C': [1, 5, 10, 20]
    },
    # KNeighborsClassifier: number of neighbours consulted per prediction
    'knn_hyperparameter':{
        'n_neighbors':[3, 5, 10]
    },
    # RandomForestClassifier: number of trees in the forest
    'rfc_hyperparameter':{
        'n_estimators':[10, 20, 50, 100]
    }
}

In [44]:
# Creating a list of all the keys from the dictionary
key_model = list(model_hyperparameter.keys())
key_model

['lr_hyperparameter',
 'SVC_hyperparameter',
 'knn_hyperparameter',
 'rfc_hyperparameter']

In [49]:
def compare_model_gridSearch(models, hyperparameter_dict):
    """Grid-search every model and summarise the best result of each.

    Models and grids are paired by position: the n-th estimator in
    ``models`` is tuned with the n-th entry of ``hyperparameter_dict``.
    Each search is a 5-fold ``GridSearchCV`` fitted on the notebook-level
    ``X`` and ``Y`` arrays. The model and its grid are printed before
    each search.

    Parameters
    ----------
    models : iterable of estimators
        Unfitted scikit-learn estimators to tune.
    hyperparameter_dict : dict
        Maps an arbitrary label to a parameter grid. Entries must be in
        the same order as ``models``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns 'Model', 'Highest Score'
        (best mean cross-validation score) and 'Best Parameters'.

    Raises
    ------
    IndexError
        If ``models`` has more entries than ``hyperparameter_dict``.
    """
    # Take the grids from the argument itself rather than from a notebook
    # global, so the function works with any dictionary passed in and does
    # not depend on another cell having been executed first.
    parameter_grids = list(hyperparameter_dict.values())

    result = []

    for position, model in enumerate(models):

        parameters = parameter_grids[position]

        print(model)
        print(parameters)
        print("************************")

        classifier = GridSearchCV(model, parameters, cv=5)
        classifier.fit(X, Y)

        result.append({
            'Model': model,
            'Highest Score': classifier.best_score_,
            'Best Parameters': classifier.best_params_
        })

    result_dataframe = pd.DataFrame(result, columns=['Model', 'Highest Score', 'Best Parameters'])

    return result_dataframe

In [50]:
compare_model_gridSearch(model_list, model_hyperparameter)

LogisticRegression(max_iter=10000)
{'C': [1, 2, 10, 20]}
************************
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
************************
KNeighborsClassifier()
{'n_neighbors': [3, 5, 10]}
************************
RandomForestClassifier(random_state=0)
{'n_estimators': [10, 20, 50, 100]}
************************


Unnamed: 0,Model,Highest Score,Best Parameters
0,LogisticRegression(max_iter=10000),0.98,{'C': 20}
1,SVC(),0.986667,"{'C': 5, 'kernel': 'rbf'}"
2,KNeighborsClassifier(),0.98,{'n_neighbors': 10}
3,RandomForestClassifier(random_state=0),0.966667,{'n_estimators': 100}


- SVC with hyperparameters `C=5` and `kernel='rbf'` has the highest mean cross-validation score (0.9867)

## Building a Predictive System

In [55]:
# One sample to classify, in the same feature order used for training:
# (SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm).
# Other samples to try: (5.1,3.5,1.4,0.2) or (6.3,2.5,5.0,1.9)
input_data = (5.7,2.8,4.5,1.3)

# predict() expects a 2-D array, so turn the single sample into one row.
input_numpy = np.asarray(input_data)
input_reshape = input_numpy.reshape(1,-1)

# model_svc is the linear-kernel SVC fitted on the training split above.
prediction = model_svc.predict(input_reshape)
print(prediction)

# Decode the numeric label with the encoder that produced it, instead of a
# hard-coded if/elif mapping that would silently report the wrong species
# if the encoding ever changed.
print(label_encoder.inverse_transform(prediction)[0])

[1]
Iris-versicolor


