# LEAVE ONE OUT cross validation method

In [17]:
# basic liberaries for data manupulation
import pandas as pd
import numpy as np

# liberaries for ploting graphs 
import seaborn as sns
import matplotlib.pyplot as plt
# parameters of graps
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['figure.dpi'] = 250
sns.set_theme(style='darkgrid')
 
# liberaries for using cross validation methods 
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report

# to ignore future warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/aishwaryamate/Datasets/refs/heads/main/Pima.csv', index_col=0)
df

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# Basic EDA

In [3]:
# checking null values

df.isna().sum()

preg     0
plas     0
pres     0
skin     0
test     0
mass     0
pedi     0
age      0
class    0
dtype: int64

In [4]:
# checking duplicated values

df.duplicated().sum()

0

In [5]:
# checking datatypes

df.dtypes

preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object

# CROSS VALIDATION TECHNIQUES
1. trian test split
2. KFold
3. Leave One Out

# defining x and y varibles

In [6]:
df.head(1)

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1


In [7]:
# all the independent features

x = df.drop(columns=['class'])
x

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [8]:
# target column

y = df['class']
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: class, Length: 768, dtype: int64

<p  style = "font-size : 35px; color : #34656d ; font-family : 'Comic Sans MS'; text-align : center; background-color : #00008B; border-radius: 5px 5px;"><strong>LEAVE ONE OUT</strong></p> 

In [11]:
df.shape

(768, 9)

In [10]:
# creating instance for leave one out
# it will leave one row for testing purpose and keeps other rows for training purpose
# cross_val_score uses this rows for training and testing of the models that I will mention inside cross validation score method
# then calculates accuracy of models 
# there are 768 rows in my dataset so it will create 768 parts of data

lv_one_out = LeaveOneOut()

In [12]:
# using Decision Tree model
score = cross_val_score(DecisionTreeClassifier(), x, y, cv=lv_one_out)

In [13]:
# each model will give me either 0% or 100% accuracy because only one data is select for training purpose
# and it is classification model so either classificationn will be False(0%) or True(100%)
score

array([1., 1., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1.,
       1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1.,
       0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 0., 1., 1.,
       1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1., 1.,
       1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 0.,
       0., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1.,
       1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1.,
       1., 1., 1., 1., 1.

In [15]:
# FINAL ACCURACY
score.mean()

0.70703125

<p  style = "font-size : 35px; color : #34656d ; font-family : 'Comic Sans MS'; text-align : center; background-color : #00008B; border-radius: 5px 5px;"><strong>LEAVE ONE OUT</strong></p>  

# USING LOGISTIC REGRESSION

In [18]:
# let's see
lv_one = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), x, y, cv = lv_one)
scores.mean()

0.7786458333333334

# LogisticRegression model is performing better than DecisionTreeClassifier