# Imports

In [None]:
# Cross-Validation with Scikit-learn

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Load data for regression

In [None]:
# Fetch the California housing dataset and separate the feature matrix
# from the regression target.
housing = fetch_california_housing()
X = housing.data
y = housing.target

In [None]:
# Show the dataset's bundled description text
print(housing.DESCR)

# Score model with cross-validation (regression example)

In [None]:
# Unfitted linear regression estimator; the cross-validation calls in the
# following cells receive it as their estimator argument.
model = LinearRegression()

In [None]:
# 5-fold cross-validation of the model, scored with R2 (one value per fold)
scores = cross_val_score(estimator=model, X=X, y=y, scoring='r2', cv=5)
mean_r2 = scores.mean()
print("Cross-Validation Scores (R2):", scores)
print("Mean CV R2:", mean_r2)

In [None]:
# Look up the scorer names scikit-learn knows about (the strings usable as
# `scoring=` values). Left as a bare expression so the notebook displays it.
sk.metrics.get_scorer_names()

In [None]:
# cross_validate accepts a list of scorers and evaluates all of them on the
# same folds; the result is a dict keyed by "test_<scorer name>".
metric_names = ['r2', 'neg_mean_absolute_percentage_error']
scores = cross_validate(estimator=model, X=X, y=y, scoring=metric_names, cv=5)
print(scores)

In [None]:
# Per-fold results from the multi-metric cross_validate call above
print("R2\n", scores['test_r2'])
# scikit-learn reports MAPE negated (so that larger is better) and as a
# fraction, not a percentage. Flip the sign and scale by 100 so the printed
# numbers actually match the "% error" label.
print("% error\n", -scores['test_neg_mean_absolute_percentage_error'] * 100)

# Select regression model using cross-validation

In [None]:
# Candidate regressors, all with their default hyperparameters
regression_models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(),
    Lasso=Lasso(),
)

# Mean 5-fold negative MAPE per candidate (values closer to zero are better)
model_scores = {
    label: cross_val_score(
        estimator, X, y, cv=5, scoring='neg_mean_absolute_percentage_error'
    ).mean()
    for label, estimator in regression_models.items()
}

print("\n2. Regression Model Selection:")
for label, mean_score in model_scores.items():
    print(f"{label}: {mean_score:.4f}")

# Select hyperparameters using cross-validation (Grid Search)

In [None]:
# 3. Select hyperparameters using cross-validation (Grid Search)
# Every alpha below is evaluated with 5-fold CV; fit() returns the search
# object itself, so construction and fitting can be chained.
ridge_params = dict(alpha=[0.01, 0.1, 1, 10, 100])

grid = GridSearchCV(
    estimator=Ridge(),
    param_grid=ridge_params,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
).fit(X, y)

print("\n3. Best hyperparameters:", grid.best_params_)
print("Best cross-val score (Negative MAPE):", grid.best_score_)

In [None]:
# Full cross-validation record of the fitted grid search above. Left as a
# bare expression so the notebook displays it.
grid.cv_results_

# Select model and hyperparameters using cross-validation

In [None]:
# 4. Select model and hyperparameters using cross-validation
# Both candidates are tuned over the same set of regularisation strengths.
alpha_values = [0.01, 0.1, 1, 10, 100]
model_param_grid = {
    label: {'model': estimator, 'params': {'alpha': list(alpha_values)}}
    for label, estimator in (('Ridge', Ridge()), ('Lasso', Lasso()))
}

# Running "best so far" record, updated as each grid search finishes
best_model, best_score, best_params = None, float('-inf'), {}

print("\n4. Regression Model and Hyperparameter Selection:")
for label, spec in model_param_grid.items():
    search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        scoring='r2',
        cv=5,
    ).fit(X, y)
    print(f"{label} best R2 score: {search.best_score_:.4f} with params {search.best_params_}")
    # Strict comparison: on a tie the earlier candidate is kept
    if not search.best_score_ > best_score:
        continue
    best_model, best_score, best_params = label, search.best_score_, search.best_params_

print(f"\nBest regression model: {best_model} with R2 score {best_score:.4f} and params {best_params}")

# Stratified Cross-Validation with Titanic Dataset

In [None]:

# Imported here because only this cell needs it: the pipeline keeps the
# imputation step inside each cross-validation fold.
from sklearn.pipeline import make_pipeline

print("\n5. Stratified Cross-Validation with Titanic Dataset")
titanic_url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic = pd.read_csv(titanic_url)

# Preprocessing
features = ['Pclass', 'Sex', 'Age', 'Fare']
# .copy() gives an independent frame, so the column assignment below does
# not write into a slice of the raw data (avoids SettingWithCopyWarning).
titanic = titanic[features + ['Survived']].copy()

# Encode categorical variable
titanic['Sex'] = LabelEncoder().fit_transform(titanic['Sex'])

X_titanic = titanic[features]
y_titanic = titanic['Survived']

# Handle missing values inside the pipeline rather than on the whole frame:
# the imputer is then re-fitted on each training fold only, so the held-out
# fold never contributes to the mean used to fill its own gaps (no leakage).
# Columns without missing values pass through unchanged. Note that the
# `titanic` frame itself therefore keeps its original NaNs.
imputer = SimpleImputer(strategy='mean')
# Fixed random_state so the forest, like the splitter, is reproducible
model = make_pipeline(imputer, RandomForestClassifier(random_state=42))

# Use StratifiedKFold so each fold keeps the overall class proportions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X_titanic, y_titanic, cv=skf)

print("Stratified CV Accuracy Scores:", scores)
print("Mean Stratified CV Accuracy:", scores.mean())