# Logistic regression model

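This notebook tunes a logistic regression pipeline (IQR clipping and scaling of numerical features, ordinal and one-hot encoding of categorical features, polynomial features and PCA) with a randomized hyperparameter search scored by ROC-AUC, then retrains the best configuration on the full training set and predicts `diagnosed_diabetes` for the test set.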

## Notebook 

### Imports

In [1]:
# Standard library imports
import time
from pathlib import Path

# Third party imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import uniform, loguniform
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import RandomizedSearchCV


### Run configuration

In [None]:
# Set to True when running on Kaggle, False when running locally
KAGGLE = False

# Set to True to run runtime experiment, False to load from disk
MEASURE_RUNTIME = True

# Optimization runtime
RUNTIME_LIMIT = 60  # minutes

# Number of training rows to sample for faster development runs;
# set to None to use the full training dataset
SAMPLE = 10000

# Random state for reproducibility
RANDOM_STATE = 315

### Data loading

In [3]:
# Choose the data source that matches the current environment
if not KAGGLE:
    # Local run: read the CSVs from the course GitHub repository
    train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'
    test_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_test.csv'

else:
    # Kaggle run: the competition data lives under /kaggle/input/
    train_df_path = '/kaggle/input/playground-series-s5e12/train.csv'
    test_df_path = '/kaggle/input/playground-series-s5e12/test.csv'

# Read the training and testing splits into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_df_path, test_df_path)
)

# Preview the first rows, transposed so each feature is a row
train_df.head().T

Unnamed: 0,0,1,2,3,4
id,0,1,2,3,4
age,31,50,32,54,54
alcohol_consumption_per_week,1,2,3,3,1
physical_activity_minutes_per_week,45,73,158,77,55
diet_score,7.7,5.7,8.5,4.6,5.7
sleep_hours_per_day,6.8,6.5,7.4,7.0,6.2
screen_time_hours_per_day,6.1,5.8,9.1,9.2,5.1
bmi,33.4,23.8,24.1,26.6,28.8
waist_to_hip_ratio,0.93,0.83,0.83,0.83,0.9
systolic_bp,112,120,95,121,108


In [4]:
# Print a summary of the training frame: column names, dtypes and non-null counts
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700000 entries, 0 to 699999
Data columns (total 26 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   id                                  700000 non-null  int64  
 1   age                                 700000 non-null  int64  
 2   alcohol_consumption_per_week        700000 non-null  int64  
 3   physical_activity_minutes_per_week  700000 non-null  int64  
 4   diet_score                          700000 non-null  float64
 5   sleep_hours_per_day                 700000 non-null  float64
 6   screen_time_hours_per_day           700000 non-null  float64
 7   bmi                                 700000 non-null  float64
 8   waist_to_hip_ratio                  700000 non-null  float64
 9   systolic_bp                         700000 non-null  int64  
 10  diastolic_bp                        700000 non-null  int64  
 11  heart_rate                

## 1. Data preprocessing

In [5]:
# Name of the target column; used later to split features from labels
label = 'diagnosed_diabetes'

In [6]:
# Drop the ID column. Reassigning (instead of mutating in place) and ignoring
# an already-missing column keeps this cell safe to re-run.
train_df = train_df.drop(columns=['id'], errors='ignore')

In [7]:
# Optionally shrink the training data to speed up experimentation
if SAMPLE is None:
    print(f"Using full dataset: {len(train_df)} rows")

else:
    train_df = train_df.sample(n=SAMPLE, random_state=RANDOM_STATE)
    print(f"Using sample of {SAMPLE} rows")


Using sample of 10000 rows


### 1.1. Clean numerical features

In [8]:
# Columns routed through the numerical pipeline (IQR clipping, then scaling)
numerical_features = [
    'age',
    'alcohol_consumption_per_week',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
]

# Define IQR clipping transformer
class IQRClipper(BaseEstimator, TransformerMixin):
    """Clips features to a multiple of the interquartile range (IQR).

    For each column, ``fit`` computes the 25th and 75th percentiles (Q1, Q3)
    and ``transform`` limits values to
    ``[Q1 - iqr_multiplier * IQR, Q3 + iqr_multiplier * IQR]``.

    Columns whose IQR is zero are passed through unclipped. Without this, a
    column where at least 75% of the values are identical (for example a
    binary flag with a rare positive class) would get identical lower and
    upper bounds and be flattened into a constant.

    Parameters
    ----------
    iqr_multiplier : float, default=2.0
        How many IQRs beyond Q1/Q3 a value may lie before it is clipped.

    Attributes
    ----------
    lower_bounds_ : ndarray
        Per-column lower clipping bound, set by ``fit`` (``-inf`` where the
        column's IQR is zero).
    upper_bounds_ : ndarray
        Per-column upper clipping bound, set by ``fit`` (``+inf`` where the
        column's IQR is zero).
    """

    def __init__(self, iqr_multiplier=2.0):
        # Only hyperparameters are stored here. Fitted attributes (trailing
        # underscore) are created in fit() so the estimator does not look
        # fitted before fit() has run.
        self.iqr_multiplier = iqr_multiplier

    def fit(self, X, y=None):
        """Calculate the per-column clipping bounds based on the IQR.

        Parameters
        ----------
        X : array-like
            Training data; percentiles are taken along axis 0.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : IQRClipper
            The fitted transformer.
        """

        q1 = np.percentile(X, 25, axis=0)
        q3 = np.percentile(X, 75, axis=0)
        iqr = q3 - q1

        # A zero IQR says nothing about spread, so such columns are not clipped
        has_spread = iqr > 0
        self.lower_bounds_ = np.where(has_spread, q1 - self.iqr_multiplier * iqr, -np.inf)
        self.upper_bounds_ = np.where(has_spread, q3 + self.iqr_multiplier * iqr, np.inf)
        return self

    def transform(self, X):
        """Apply the bounds learned in ``fit`` to the data.

        Parameters
        ----------
        X : array-like
            Data with the same column layout as the data passed to ``fit``.

        Returns
        -------
        Same container type that ``np.clip`` returns for ``X``, with values
        limited to the fitted bounds.
        """

        return np.clip(X, self.lower_bounds_, self.upper_bounds_)

### 1.2. Encode categorical features

In [9]:
# Categorical columns whose values have a natural ranking
ordinal_features = ['education_level', 'income_level']

# Ranked category lists, one inner list per ordinal column
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]

# Build the ordinal encoder; categories not listed above are encoded as -1
ordinal_encoder = OrdinalEncoder(
    categories=[*education_categories, *income_categories],
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)

In [10]:
# Unordered categorical columns to expand into indicator columns
nominal_features = ['gender', 'ethnicity', 'smoking_status', 'employment_status']

# One-hot encoder: categories are learned during fit, the first level of each
# column is dropped, and the output is a dense array
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')


## 2. Feature engineering

### 2.1. Polynomial features

In [11]:
# Create polynomial features transformer with default settings; degree,
# include_bias and interaction_only are set by the randomized search below
poly_transformer = PolynomialFeatures()

### 2.2. PCA

In [12]:
# Create PCA transformer with default settings; n_components is set by the
# randomized search below
pca = PCA()

## 3. Model optimization

### 3.1. Build model pipeline

In [13]:
# Numerical branch: clip outliers by IQR, then standardize
numerical_pipeline = Pipeline(steps=[
    ('clipper', IQRClipper()),
    ('scaler', StandardScaler()),
])

# Route each group of columns to its own transformation
column_transformers = [
    ('num', numerical_pipeline, numerical_features),
    ('ord', ordinal_encoder, ordinal_features),
    ('nom', onehot_encoder, nominal_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Full chain: preprocessing -> polynomial expansion -> PCA -> logistic regression
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('poly', poly_transformer),
    ('pca', pca),
    ('logit', LogisticRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

### 3.2. Optimize model pipeline

In [14]:
# Search space shared by both logistic regression variants
shared_search_space = {
    'preprocessor__num__clipper__iqr_multiplier': uniform(loc=1.25, scale=1.25),  # Uniform between 1.25 and 2.5
    'poly__degree': [1, 2, 3, 4],
    'poly__include_bias': [True, False],
    'poly__interaction_only': [False, True],
    'pca__n_components': uniform(loc=0.25, scale=0.75),  # Uniform between 0.25 and 1.0
    'logit__max_iter': [1000],
    'logit__class_weight': ['balanced'],
}

# Two sub-spaces: L2-regularized (with a tunable C) and unpenalized (no C)
param_distributions = [
    {
        **shared_search_space,
        'logit__C': loguniform(0.001, 100),  # Log-uniform between 0.001 and 100
        'logit__penalty': ['l2'],
    },
    {
        **shared_search_space,
        'logit__penalty': [None],
    },
]

#### 3.2.1. Runtime experiment

In [None]:
# Define experimental parameters
sample_sizes = [3000, 6000, 12000]
n_iters = [5, 10, 20]

# Define path for saving/loading results
data_dir = Path('../data')
runtime_results_path = data_dir / 'runtime_experiment_results.csv'

# Read the full training set once. Its row count is needed by the runtime
# predictions that follow whether or not the experiment is re-run, and reusing
# one frame avoids re-reading the CSV for every experiment.
runtime_source_df = pd.read_csv(train_df_path).drop(columns=['id'])
full_dataset_size = len(runtime_source_df)

if MEASURE_RUNTIME:
    # Store results
    results = []

    print("Running runtime experiments...")
    print("=" * 60)

    # Run experiments
    for sample_size in sample_sizes:

        # The sample depends only on sample_size (fixed seed), so draw it once
        # per size instead of once per (size, n_iter) pair
        train_sample = runtime_source_df.sample(
            n=sample_size,
            random_state=RANDOM_STATE
        )

        X_sample = train_sample.drop(columns=[label])
        y_sample = train_sample[label]

        for n_iter in n_iters:

            # Create a simple search
            simple_search = RandomizedSearchCV(
                estimator=pipeline,
                param_distributions=param_distributions,
                n_iter=n_iter,
                scoring='roc_auc',
                n_jobs=-1,
                cv=3,
                random_state=RANDOM_STATE,
                verbose=0
            )

            # Time the fitting with a monotonic clock so system clock
            # adjustments cannot distort the measurement
            start_time = time.perf_counter()
            simple_search.fit(X_sample, y_sample)
            elapsed_time = time.perf_counter() - start_time

            results.append({
                'sample_size': sample_size,
                'n_iter': n_iter,
                'runtime_seconds': elapsed_time
            })

            print(f"Sample size: {sample_size:5d} | Iterations: {n_iter:3d} | Time: {elapsed_time:6.2f}s")

    print("\n" + "=" * 60)
    print("Experiment complete!")

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save results to disk
    data_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(runtime_results_path, index=False)
    print(f"\nResults saved to: {runtime_results_path}")

else:
    # Load results from disk
    print("Loading runtime experiment results from disk...")
    results_df = pd.read_csv(runtime_results_path)
    print(f"Loaded {len(results_df)} results from: {runtime_results_path}")
    print("=" * 60)

# Display results
results_df.head(10)

Running runtime experiments...
Sample size:  3000 | Iterations:   5 | Time:  87.83s
Sample size:  3000 | Iterations:   5 | Time:  87.83s
Sample size:  3000 | Iterations:  10 | Time: 201.43s
Sample size:  3000 | Iterations:  10 | Time: 201.43s
Sample size:  3000 | Iterations:  20 | Time: 386.04s


In [None]:
# Model search runtime as a linear function of the experiment settings
#   inputs: sample_size, n_iter
#   output: runtime_seconds
runtime_feature_columns = ['sample_size', 'n_iter']
X_train_runtime = results_df[runtime_feature_columns].values
y_train_runtime = results_df['runtime_seconds'].values

# Fit the linear model on the measured runtimes
runtime_model = LinearRegression()
runtime_model.fit(X_train_runtime, y_train_runtime)

# Report the fitted coefficients
sample_size_coef, iteration_coef = runtime_model.coef_
print("Runtime Model Coefficients:")
print("=" * 60)
print(f"Intercept: {runtime_model.intercept_:.4f} seconds")
print(f"Sample size coefficient: {sample_size_coef:.6f} seconds per sample")
print(f"Iteration coefficient: {iteration_coef:.4f} seconds per iteration")
print()

# Goodness of fit on the experiment data itself
r2_score = runtime_model.score(X_train_runtime, y_train_runtime)
print(f"R² score: {r2_score:.4f}")
print()

# Attach fitted values and residuals next to the measurements
fitted_runtime = runtime_model.predict(X_train_runtime)
results_df['predicted_runtime'] = fitted_runtime
results_df['residual'] = results_df['runtime_seconds'] - results_df['predicted_runtime']

print("\nActual vs Predicted Runtimes:")
print(results_df.to_string(index=False))

In [None]:
# Predicted runtime on the full dataset for a range of iteration counts
iteration_range = np.arange(10, 501, 10)
full_size_predictions = [
    runtime_model.predict([[full_dataset_size, candidate_n_iter]])[0]
    for candidate_n_iter in iteration_range
]

# Set up the figure
fig, ax = plt.subplots(figsize=(12, 6))

# Predicted runtime curve
ax.plot(iteration_range, full_size_predictions, 'b-', linewidth=2, label='Predicted Runtime')

# Reference lines at one and two hours
ax.axhline(y=3600, color='r', linestyle='--', linewidth=1.5, label='1 hour', alpha=0.7)
ax.axhline(y=7200, color='orange', linestyle='--', linewidth=1.5, label='2 hours', alpha=0.7)

# Labels, title, grid and legend
ax.set_xlabel('Number of Iterations', fontsize=12)
ax.set_ylabel('Predicted Runtime (seconds)', fontsize=12)
ax.set_title(
    f'Predicted Runtime vs Iterations\n(Full Dataset: {full_dataset_size:,} samples)',
    fontsize=14,
    fontweight='bold',
)
ax.grid(True, alpha=0.3)
ax.legend(fontsize=10)

# Right-hand axis showing the same values in minutes
ax2 = ax.secondary_yaxis('right', functions=(lambda x: x/60, lambda x: x*60))
ax2.set_ylabel('Runtime (minutes)', fontsize=12)

plt.tight_layout()
plt.show()

# Spot-check a few iteration counts in text form
print("\nPredicted runtimes for full dataset:")
print("=" * 60)

for n_iter in (50, 100, 200, 300, 500):
    pred_time = runtime_model.predict([[full_dataset_size, n_iter]])[0]
    print(f"{n_iter:3d} iterations: {pred_time:7.1f} seconds ({pred_time/60:5.1f} minutes)")

In [None]:
# Calculate optimal number of iterations for full dataset based on RUNTIME_LIMIT
# Runtime model: runtime = intercept + coef[0] * sample_size + coef[1] * n_iter
# Solving for n_iter: n_iter = (target_runtime - intercept - coef[0] * sample_size) / coef[1]

target_runtime_seconds = RUNTIME_LIMIT * 60  # Convert minutes to seconds
full_dataset_size = len(pd.read_csv(train_df_path))

# The inversion only makes sense when each extra iteration costs time
seconds_per_iteration = runtime_model.coef_[1]
if seconds_per_iteration <= 0:
    raise ValueError(
        "Runtime model has a non-positive per-iteration coefficient "
        f"({seconds_per_iteration:.4f}); cannot derive an iteration budget from it."
    )

# Calculate optimal iterations
optimal_n_iter = (target_runtime_seconds - runtime_model.intercept_ -
                  runtime_model.coef_[0] * full_dataset_size) / seconds_per_iteration

# Round down to be conservative and ensure we stay within the time limit
optimal_n_iter = int(np.floor(optimal_n_iter))

# RandomizedSearchCV needs at least one iteration. If the model says even one
# iteration exceeds the limit, fall back to 1 and say so rather than passing
# an invalid n_iter to the search.
if optimal_n_iter < 1:
    print(f"Warning: runtime model yields {optimal_n_iter} iterations within the limit; using 1 instead.")
    optimal_n_iter = 1

# Predict actual runtime for this number of iterations
predicted_runtime = runtime_model.predict([[full_dataset_size, optimal_n_iter]])[0]

print("Optimization Parameter Calculation:")
print("=" * 60)
print(f"Runtime limit: {RUNTIME_LIMIT} minutes ({target_runtime_seconds:.0f} seconds)")
print(f"Full dataset size: {full_dataset_size:,} samples")
print(f"\nOptimal number of iterations: {optimal_n_iter}")
print(f"Predicted runtime: {predicted_runtime:.1f} seconds ({predicted_runtime/60:.1f} minutes)")
print(f"Time margin: {(target_runtime_seconds - predicted_runtime)/60:.1f} minutes")
print("=" * 60)

#### 3.2.2. Run randomized search

In [None]:
# Configure the randomized search with the iteration budget computed above
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=optimal_n_iter,
    cv=3,
    scoring='roc_auc',
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
)

# Split the training frame into features and target
X_search = train_df.drop(columns=[label])
y_search = train_df[label]

divider = "=" * 60

# Announce the run
print(f"Starting randomized search optimization with {optimal_n_iter} iterations...")
print(f"Expected runtime: ~{predicted_runtime/60:.1f} minutes")
print(divider)

# Run the search and record how long it takes
start_optimization = time.time()
random_search.fit(X_search, y_search)
optimization_time = time.time() - start_optimization

# Report runtime, best score and best parameters
print("\n" + divider)
print("Randomized Search Results")
print(divider)
print(f"Actual runtime: {optimization_time:.1f} seconds ({optimization_time/60:.1f} minutes)")
print(f"Best cross-validation ROC-AUC score: {random_search.best_score_:.4f}")
print("\nBest parameters:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

## 4. Submission

In [None]:
# Load the complete training set again, without the ID column
train_df_full = pd.read_csv(train_df_path).drop(columns=['id'])

# Split into feature matrix and target vector
y_train_full = train_df_full[label]
X_train_full = train_df_full.drop(columns=[label])

# Start from the search's best estimator (already configured with the best parameters)
best_model = random_search.best_estimator_

# Fit that estimator on every training row
print("Retraining model on full dataset with best parameters...")
best_model.fit(X_train_full, y_train_full)

print(f"Model retrained on {len(X_train_full)} samples")
print("\nBest parameters used:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

In [None]:
# Prepare test data
test_ids = test_df['id'].copy()
X_test = test_df.drop(columns=['id'])

# Make predictions. The pipeline was tuned with scoring='roc_auc', a ranking
# metric, so the submission carries the predicted probability of the positive
# class rather than hard labels from predict(), which would discard the ranking.
# Column 1 of predict_proba is the larger class label (assumed to be the
# positive class for a 0/1 target).
print("Making predictions on test set...")
y_pred = best_model.predict_proba(X_test)[:, 1]

# Create submission dataframe
submission_df = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': y_pred
})

print(f"\nPredictions completed for {len(submission_df)} test samples")
print(f"\nSubmission dataframe shape: {submission_df.shape}")
print(f"\nFirst few predictions:")
print(submission_df.head())
