# Hold-Out validation

This notebook demonstrates how to perform hold-out validation on a sample dataset.

## Importing and loading data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Install a blanket 'ignore' filter: every Python warning raised after this
# line (deprecation notices included) is suppressed
import warnings
warnings.filterwarnings('ignore')

# Plotting library; the magic below makes figures render inline in the notebook
import matplotlib.pyplot as plt
%matplotlib inline

# Print the pandas and numpy versions this notebook is running with
print(pd.__version__)
print(np.__version__)

2.1.3
1.26.1


In [2]:
# Loading the data
raw_data = pd.read_csv('datasets/data_cleaned.csv')

# A CSV that was saved together with its row index comes back with a
# header-less first column, which pandas names 'Unnamed: 0'. It only holds the
# row number, so drop any such column here; otherwise it would survive the
# later `drop(['Survived'])` and be treated as a feature.
# If the file has no such column, the list is empty and nothing is dropped.
index_like_cols = [col for col in raw_data.columns if str(col).startswith('Unnamed')]
data = raw_data.drop(columns=index_like_cols)

# Checking the data
data.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [3]:
# Missing-value count per column (only the first five columns are displayed)
data.isna().sum().iloc[:5]

Survived    0
Age         0
Fare        0
Pclass_1    0
Pclass_2    0
dtype: int64

## Splitting the data

### Separating Dependent and Independent Variables

In [4]:
# Target: the 'Survived' column; features: every other column of the full dataset
data_y = data['Survived']
data_x = data.drop(columns=['Survived'])

print(data_x.shape, data_y.shape)

(891, 24) (891,)


### Creating validation and test sets

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. Stratifying on the target keeps the
# class ratio the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    stratify=data_y,
    random_state=50,
)

In [8]:
# Split the remaining training data once more: 20% of X_train becomes the
# validation set, again stratified on the target and with a fixed seed
train_x, val_x, train_y, val_y = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=51,
)

# Shapes of the train, validation and test partitions, in that order
for features, target in ((train_x, train_y), (val_x, val_y), (X_test, y_test)):
    print(features.shape, target.shape)

(569, 24) (569,)
(143, 24) (143,)
(179, 24) (179,)


### Checking distribution of target class in train, test and validation sets

In [9]:
# Share of each target class in the training set
print(train_y.value_counts().div(len(train_y)))

Survived
0    0.616872
1    0.383128
Name: count, dtype: float64


In [10]:
# Share of each target class in the validation set
print(val_y.value_counts().div(len(val_y)))

Survived
0    0.615385
1    0.384615
Name: count, dtype: float64


In [12]:
# Share of each target class in the test set
print(y_test.value_counts().div(len(y_test)))

Survived
0    0.614525
1    0.385475
Name: count, dtype: float64
