### Grid Search & Cross-Validation from Scratch

Here I re-created the implementation of the **Grid Search with Cross Validation (CV)** seen in exercise `08_sklearn_grid_search` from scratch (without using Scikit-Learn).

In [1]:
import numpy as np
import pandas as pd
import itertools

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
diabetes = pd.read_csv('../datasets/diabetes.csv')

diabetes = diabetes.sample(frac=1, random_state=42).reset_index(drop=True)

selected_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                     'BMI', 'DiabetesPedigreeFunction', 'Age']
X = diabetes[selected_features].values
y = diabetes['Outcome'].values

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

logistic_model = LogisticRegression(random_state=42, solver='saga')

In [3]:
# Function to perform k-fold cross-validation manually
def cross_validate(model, X, y, k):
    fold_size = len(X) // k
    scores = []

    for i in range(k):
        # Creating train/validation splits
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]

        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]), axis=0)

        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        acc = accuracy_score(y_val, y_pred)
        scores.append(acc)
        print(f"  Fold {i+1}: Accuracy = {acc:.4f}")  # Print accuracy for each fold

    return np.mean(scores) # return average CV accuracy

In [4]:
# Define hyperparameter grid
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'tol': [0.1, 0.001]
}

# Generate all possible hyperparameter combinations
param_combinations = list(itertools.product(param_grid['C'], param_grid['penalty'], param_grid['tol']))
num_candidates = len(param_combinations)  # Total hyperparameter combinations
cv_folds = 5  # Number of cross-validation folds
total_fits = num_candidates * cv_folds  # Total model fits

# Perform grid search with CV from Scratch
best_score = -np.inf
best_params = None

print(f"Fitting {cv_folds} folds for each of {num_candidates} candidates, totalling {total_fits} fits.\n")
for idx, params in enumerate(param_combinations):
    C, penalty, tol = params

    # Define model with current hyperparameters configuration to test
    model = LogisticRegression(random_state=42, solver='saga', C=C, penalty=penalty, tol=tol)

    print(f"\n[{idx+1}/{num_candidates}] Testing Params: C={C}, penalty={penalty}, tol={tol}")
    
    # Perform cross-validation
    mean_cv_score = cross_validate(model, X_train_std, y_train, k=cv_folds)

    print(f"  Average CV Accuracy: {mean_cv_score:.4f}")

    # Update best model if the current one is better
    if mean_cv_score > best_score:
        best_score = mean_cv_score
        best_params = params

# Print best parameters
print("\nBest Parameters:", best_params)
print(f"Best Accuracy (training set): {best_score:.2f}")

Fitting 5 folds for each of 24 candidates, totalling 120 fits.


[1/24] Testing Params: C=0.001, penalty=l1, tol=0.1
  Fold 1: Accuracy = 0.6557
  Fold 2: Accuracy = 0.7213
  Fold 3: Accuracy = 0.6066
  Fold 4: Accuracy = 0.6639
  Fold 5: Accuracy = 0.6475
  Average CV Accuracy: 0.6590

[2/24] Testing Params: C=0.001, penalty=l1, tol=0.001
  Fold 1: Accuracy = 0.6557
  Fold 2: Accuracy = 0.7213
  Fold 3: Accuracy = 0.6066
  Fold 4: Accuracy = 0.6639
  Fold 5: Accuracy = 0.6475
  Average CV Accuracy: 0.6590

[3/24] Testing Params: C=0.001, penalty=l2, tol=0.1
  Fold 1: Accuracy = 0.6557
  Fold 2: Accuracy = 0.7213
  Fold 3: Accuracy = 0.6066
  Fold 4: Accuracy = 0.6639
  Fold 5: Accuracy = 0.6475
  Average CV Accuracy: 0.6590

[4/24] Testing Params: C=0.001, penalty=l2, tol=0.001
  Fold 1: Accuracy = 0.6557
  Fold 2: Accuracy = 0.7213
  Fold 3: Accuracy = 0.6066
  Fold 4: Accuracy = 0.6639
  Fold 5: Accuracy = 0.6475
  Average CV Accuracy: 0.6590

[5/24] Testing Params: C=0.01, penalty=

In [5]:
# Train the best model on the full training set
best_model = LogisticRegression(random_state=42, solver='saga', C=best_params[0], 
                                penalty=best_params[1], tol=best_params[2], max_iter=1000)
best_model.fit(X_train_std, y_train)

# Make predictions on the standardized test data
y_pred = best_model.predict(X_test_std)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
classification_report_str = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print results
print("\nFinal Model Evaluation:")
print(f'Best Hyperparameters: {best_params}')
print(f'Accuracy with Best Model (test set): {accuracy:.2f}')
print('Classification Report:\n', classification_report_str)
print('Confusion Matrix:\n', conf_matrix)


Final Model Evaluation:
Best Hyperparameters: (1, 'l1', 0.1)
Accuracy with Best Model (test set): 0.77
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.88      0.83        96
           1       0.74      0.60      0.67        58

    accuracy                           0.77       154
   macro avg       0.76      0.74      0.75       154
weighted avg       0.77      0.77      0.77       154

Confusion Matrix:
 [[84 12]
 [23 35]]
