# Imports

In [26]:
# Cross-Validation with Scikit-learn

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Load data for regression

In [22]:
# Load regression dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target
X = StandardScaler().fit_transform(X)

In [3]:
print(housing['DESCR'])

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived from the 1990 U.S. census, using one row per ce

In [4]:
X

array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
          37.88      , -122.23      ],
       [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
          37.86      , -122.22      ],
       [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
          37.85      , -122.24      ],
       ...,
       [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
          39.43      , -121.22      ],
       [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
          39.43      , -121.32      ],
       [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
          39.37      , -121.24      ]], shape=(20640, 8))

In [5]:
y

array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894], shape=(20640,))

# Score model with cross-validation (regression example)

In [6]:
model = LinearRegression()

In [8]:
scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print("Cross-Validation Scores (R2):", scores)
print("Mean CV R2:", scores.mean())

Cross-Validation Scores (R2): [0.54866323 0.46820691 0.55078434 0.53698703 0.66051406]
Mean CV R2: 0.5530311140279217


In [9]:
model.fit(X, y)

In [10]:
sk.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'd2_absolute_error_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_max_error',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'neg_root_mean_squared_log_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 're

In [11]:
# if multiple metrics are desired
scores = cross_validate(model, X, y, cv=5, scoring=['r2', 'neg_mean_absolute_percentage_error'])
print(scores)

{'fit_time': array([0.02418208, 0.00217605, 0.00299215, 0.00224113, 0.00279403]), 'score_time': array([0.01104188, 0.0009141 , 0.00154972, 0.00066996, 0.00048089]), 'test_r2': array([0.54866323, 0.46820691, 0.55078434, 0.53698703, 0.66051406]), 'test_neg_mean_absolute_percentage_error': array([-0.47286294, -0.26825383, -0.27703699, -0.33184665, -0.29286055])}


In [12]:
print("R2\n", scores['test_r2'])
print("% error\n", scores['test_neg_mean_absolute_percentage_error'])

R2
 [0.54866323 0.46820691 0.55078434 0.53698703 0.66051406]
% error
 [-0.47286294 -0.26825383 -0.27703699 -0.33184665 -0.29286055]


# Select regression model using cross-validation

In [14]:
regression_models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso()
}

model_scores = {}
for name, reg in regression_models.items():
    score = cross_val_score(reg, X, y, cv=5, scoring='neg_mean_absolute_percentage_error').mean()
    model_scores[name] = score

print("\n2. Regression Model Selection:")
for name, score in model_scores.items():
    print(f"{name}: {score*-1:.4f}")


2. Regression Model Selection:
LinearRegression: 0.3286
Ridge: 0.3286
Lasso: 0.5348


# Select hyperparameters using cross-validation (Grid Search)

In [23]:
# 3. Select hyperparameters using cross-validation (Grid Search)
ridge_params = {
    'alpha': [0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False]
}

grid = GridSearchCV(Ridge(), ridge_params, cv=5, scoring='neg_mean_absolute_percentage_error')
grid.fit(X, y)
print("\n3. Best hyperparameters:", grid.best_params_)
print("Best cross-val score (Negative MAPE):", grid.best_score_)


3. Best hyperparameters: {'alpha': 10, 'fit_intercept': True}
Best cross-val score (Negative MAPE): -0.3285127396771008


In [16]:
grid.cv_results_

{'mean_fit_time': array([0.01214037, 0.0008688 , 0.00096703, 0.00110002, 0.0011982 ,
        0.00085888, 0.00092368, 0.00059872, 0.00132875, 0.00072064]),
 'std_fit_time': array([2.00401345e-02, 1.85679863e-04, 5.77112843e-05, 8.33514700e-04,
        3.32973807e-04, 6.91013549e-05, 1.18462359e-04, 2.66757896e-05,
        4.27596484e-04, 8.78435257e-05]),
 'mean_score_time': array([0.00113463, 0.00048299, 0.00049729, 0.00043659, 0.00051455,
        0.00039601, 0.00031099, 0.00031657, 0.00032258, 0.00039611]),
 'std_score_time': array([1.47587389e-03, 3.68174100e-04, 2.85267662e-04, 2.32469309e-04,
        2.75547744e-04, 1.77744721e-04, 8.14567953e-05, 1.50346688e-05,
        1.13638008e-04, 1.79125362e-04]),
 'param_alpha': masked_array(data=[0.01, 0.01, 0.1, 0.1, 1.0, 1.0, 10.0, 10.0, 100.0,
                    100.0],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value=1e+20),
 'param_fit_intercept': masked

# Select model and hyperparameters using cross-validation

In [24]:
# 4. Select model and hyperparameters using cross-validation
model_param_grid = {
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10, 100]
        }
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10, 100]
        }
    },
    'ElasticNet': {
        'model': ElasticNet(),
        'params': {
            'alpha': [0.01, 0.1, 1, 10, 100],
            'l1_ratio': [0.1, 0.3, 0.5, 0.8, 1]
        }
    }
}

best_model = None
best_score = float('-inf')
best_params = {}

print("\n4. Regression Model and Hyperparameter Selection:")
for name, mp in model_param_grid.items():
    gs = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='r2')
    gs.fit(X, y)
    print(f"{name} best R2 score: {gs.best_score_:.4f} with params {gs.best_params_}")
    if gs.best_score_ > best_score:
        best_score = gs.best_score_
        best_model = name
        best_params = gs.best_params_

print(f"\nBest regression model: {best_model} with R2 score {best_score:.4f} and params {best_params}")


4. Regression Model and Hyperparameter Selection:
Ridge best R2 score: 0.5531 with params {'alpha': 10}
Lasso best R2 score: 0.5498 with params {'alpha': 0.01}
ElasticNet best R2 score: 0.5520 with params {'alpha': 0.01, 'l1_ratio': 0.1}

Best regression model: Ridge with R2 score 0.5531 and params {'alpha': 10}


# Stratified Cross-Validation with Titanic Dataset

In [27]:

print("\n5. Stratified Cross-Validation with Titanic Dataset")
titanic_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic = pd.read_csv(titanic_url)

# Preprocessing
features = ['Pclass', 'Sex', 'Age', 'Fare']
titanic = titanic[features + ['Survived']]

# Handle missing values
imputer = SimpleImputer(strategy='mean')
titanic['Age'] = imputer.fit_transform(titanic[['Age']])

# Encode categorical variable
titanic['Sex'] = LabelEncoder().fit_transform(titanic['Sex'])

X_titanic = titanic[features]
y_titanic = titanic['Survived']

# Use StratifiedKFold
# KFold(n_splits=5, shuffle=True, random_state=41)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression()
scores = cross_val_score(model, X_titanic, y_titanic, cv=skf)

print("Stratified CV Accuracy Scores:", scores)
print("Mean Stratified CV Accuracy:", scores.mean())


5. Stratified Cross-Validation with Titanic Dataset
Stratified CV Accuracy Scores: [0.80446927 0.79213483 0.78651685 0.76404494 0.79775281]
Mean Stratified CV Accuracy: 0.7889837423890528
