<a href="https://colab.research.google.com/github/crossboww/Machine_Learning/blob/main/ML_learning/Cross_Validation_with_Python_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing the Dependencies
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

#Importing the Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#Data Collection and Pre-Processing

In [None]:
heart_data = pd.read_csv("heart.csv")

In [None]:
heart_data.shape

(303, 14)

In [None]:
heart_data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [None]:
heart_data.isnull().sum()

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


#Splitting the Data into Feature and Target columns

In [None]:
x = heart_data.drop(columns = 'target', axis = 1)
y = heart_data['target']

In [None]:
print(x.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  
0   0     1  
1   0     2  
2   0     2  
3   0     2  
4   0     2  


In [None]:
print(y.head())

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64


In [None]:
#Counting the classes of values in Target Columns
y.value_counts()


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,165
0,138


1---------> non-healthy heart

0---------> healthy heart

# Splitting the data into Training and testing data

First we are doing with Train-test-split function

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state= 3)

In [None]:
print(x.shape, x_train.shape, x_test.shape)

(303, 13) (242, 13) (61, 13)


#Standerdize the Data

In [None]:
scaler = StandardScaler()

In [None]:
x_train_stnd = scaler.fit_transform(x_train)
x_test_stnd = scaler.transform(x_test)

In [None]:
print(x_train_stnd)

[[ 1.4589099   0.70272837  1.04280096 ... -0.67987914 -0.71481302
   1.12664163]
 [ 1.12213134 -1.42302495 -0.9134613  ... -0.67987914  1.3323438
  -0.48665585]
 [ 0.33631468  0.70272837 -0.9134613  ... -0.67987914  0.30876539
   1.12664163]
 ...
 [ 1.68342895  0.70272837  2.02093209 ... -0.67987914  0.30876539
  -0.48665585]
 [-0.44950197  0.70272837  1.04280096 ...  0.93316745 -0.71481302
  -0.48665585]
 [ 0.44857421  0.70272837  1.04280096 ...  0.93316745  1.3323438
   1.12664163]]


In [None]:
X = x_train_stnd
Y = y_train

#Training the model and Validation

In [None]:
models = [LogisticRegression(max_iter = 1000), SVC(kernel = 'linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [None]:
def train_test_val_accuracy():

  for model in models:

    #Training the model
    model.fit(X, Y)

    #Evaluting the Models
    test_data_predict = model.predict(x_test_stnd)

    accuracy = accuracy_score(y_test, test_data_predict)

    print("Accuracy Score for ", model, " = ", accuracy)

In [None]:
train_test_val_accuracy()

Accuracy Score for  LogisticRegression(max_iter=1000)  =  0.7868852459016393
Accuracy Score for  SVC(kernel='linear')  =  0.7704918032786885
Accuracy Score for  KNeighborsClassifier()  =  0.8032786885245902
Accuracy Score for  RandomForestClassifier()  =  0.7704918032786885


#**Cross Validation Techniques**

Cross validation for LogisticRegression Model

In [None]:
cv_score_lr = cross_val_score(LogisticRegression(max_iter=1000), x, y, cv = 5)

In [None]:
print(cv_score_lr)

[0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]


In [None]:
mean_cv_score_lr = sum(cv_score_lr) / len(cv_score_lr)

mean_cv_score_lr = mean_cv_score_lr*100

mean_cv_score_lr = round(mean_cv_score_lr, 2)

print(mean_cv_score_lr)

82.83


Creating the Function to Validate the Score of each models

In [None]:
models = [LogisticRegression(max_iter=1000), SVC(kernel = 'linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [None]:
def compare_model_cv():

  for model in models:

    cv_score = cross_val_score(model, x, y, cv = 5)

    mean_cv_score = sum(cv_score) / len(cv_score)

    mean_cv_score = mean_cv_score*100

    mean_cv_score = round(mean_cv_score, 2)

    print(cv_score)
    print("Cross Validation Accuracy Score for", model, " = " , mean_cv_score)
    print("-----------------------------------------------------")

In [None]:
compare_model_cv()

[0.80327869 0.86885246 0.85245902 0.86666667 0.75      ]
Cross Validation Accuracy Score for LogisticRegression(max_iter=1000)  =  82.83
-----------------------------------------------------
[0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
Cross Validation Accuracy Score for SVC(kernel='linear')  =  82.83
-----------------------------------------------------
[0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
Cross Validation Accuracy Score for KNeighborsClassifier()  =  64.39
-----------------------------------------------------
[0.81967213 0.86885246 0.78688525 0.8        0.75      ]
Cross Validation Accuracy Score for RandomForestClassifier()  =  80.51
-----------------------------------------------------
