# Cross Validation (K-Fold technique)

In [1]:
# 1. Import libraries
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
import numpy as np

In [2]:
# 2. Load dataset: the red-wine file of the UCI Wine Quality dataset
# Source: https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# The file is read with ';' as the field separator instead of the default ','.
data = pd.read_csv(filepath_or_buffer=url, sep=';')

In [3]:
# Quick look at the loaded frame: (rows, columns) followed by the first five rows.
print("Dataset Shape:", data.shape)
print(data.head(n=5))

Dataset Shape: (1599, 12)
   fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
0            7.4              0.70         0.00             1.9      0.076   
1            7.8              0.88         0.00             2.6      0.098   
2            7.8              0.76         0.04             2.3      0.092   
3           11.2              0.28         0.56             1.9      0.075   
4            7.4              0.70         0.00             1.9      0.076   

   free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
0                 11.0                  34.0   0.9978  3.51       0.56   
1                 25.0                  67.0   0.9968  3.20       0.68   
2                 15.0                  54.0   0.9970  3.26       0.65   
3                 17.0                  60.0   0.9980  3.16       0.58   
4                 11.0                  34.0   0.9978  3.51       0.56   

   alcohol  quality  
0      9.4        5  
1      9.8      

In [4]:
# 3. Split the frame into the feature matrix (X) and the target vector (y)
X = data.drop(columns='quality')  # every column except the target
y = data.loc[:, 'quality']        # the target column on its own

# Note on `axis` in pandas:
#   axis=0 refers to the row index, axis=1 refers to the column labels.
#   `drop(columns='quality')` is the explicit spelling of
#   `drop('quality', axis=1)`: the label is looked up among the columns.

In [5]:
# 4. Initialize model
# An unfitted LinearRegression with default settings; it is handed to
# cross_val_score further down, which does the fitting per fold.
model = LinearRegression()

In [6]:
# 5. Define the K-Fold cross-validation splitter
kf = KFold(
    n_splits=5,       # number of folds
    shuffle=True,     # shuffle the rows before they are split into folds
    random_state=42,  # fixed seed so the shuffle is reproducible
)

In [7]:
# 6. Define scoring metric: negative MSE (not RMSE).
# mean_squared_error yields the mean squared error; greater_is_better=False
# makes the scorer flip its sign, so the cross-validation scores come out
# negative. Take np.sqrt of the negated scores to obtain the RMSE.
scoring = make_scorer(mean_squared_error, greater_is_better=False)

In [8]:
# Show the scorer's repr to confirm how it was configured.
scoring

make_scorer(mean_squared_error, greater_is_better=False)

In [9]:
# 7. Run the cross-validation: one score per fold produced by `kf`
cv_scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring=scoring,  # sign-flipped MSE, so every entry is <= 0
    cv=kf,
)

In [10]:
# Per-fold scores as returned by cross_val_score (negative MSE values).
cv_scores

array([-0.39002514, -0.46758029, -0.48341328, -0.45494133, -0.34785364])

In [11]:
# Manual check for the first fold: negate the (negative) score and take the
# square root. NOTE: the literal is copied by hand from the cv_scores output,
# so it will not follow if the data, model or seed changes.
np.sqrt(-(-0.39002514))

0.6245199276244113

In [12]:
# 8. Turn the negative MSE scores into positive RMSE values:
#    flip the sign back first, then take the element-wise square root.
rmse_scores = np.sqrt(np.negative(cv_scores))

In [13]:
# Per-fold RMSE values derived from cv_scores.
rmse_scores

array([0.62451993, 0.68379842, 0.69527929, 0.67449339, 0.58979118])

In [14]:
# 9. Report the per-fold RMSE together with its mean and spread
print("RMSE for each fold:", rmse_scores)
print("Average RMSE:", np.mean(rmse_scores))
# np.std uses its default ddof=0 here, i.e. the population standard deviation.
print("Standard Deviation of RMSE:", np.std(rmse_scores))

RMSE for each fold: [0.62451993 0.68379842 0.69527929 0.67449339 0.58979118]
Average RMSE: 0.6535764413508065
Standard Deviation of RMSE: 0.04000713519638171


In [15]:
# Rough cross-check: square root of the first fold's MSE rounded to 0.39.
np.sqrt(0.39)

0.6244997998398398