# 1. Setup

## 1.A Summary

## 1.B Libraries Import

In [112]:
# === STANDARD LIBRARY ===
import json
import os
import pickle
import time
from datetime import datetime as dt, timezone as tz

# === THIRD‑PARTY LIBRARIES ===
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from skorch import NeuralNetClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# === PROJECT UTILITIES ===
from tools import Tools

## 1.C Invoke Classes

In [113]:
tools = Tools()

## 1.D Load Configuration

In [114]:
config = tools.load_toml_file("config.toml")
tools.print_message('success', 'Loaded configuration', format_dict={'number of keys': len(config)})

## 1.E Load the dataset

In [115]:
# Open dataset
# Realinho, V., Martins, M.V., Machado, J. and Baptista, L.M.T., 2021. Predict Students' Dropout and Academic Success. UCI Machine Learning Repository. Available at: https://doi.org/10.24432/C5MC89 [Accessed 31 May 2025].
df_dataset = tools.load_dataset(file_name='dataset_raw.csv')
df_dataset.head()

Unnamed: 0,marital_status,application_mode,application_order,course,daytime_evening_attendance,previous_qualification,previous_qualification_grade,nationality,mothers_qualification,fathers_qualification,...,curricular_units_2nd_sem_credited,curricular_units_2nd_sem_enrolled,curricular_units_2nd_sem_evaluations,curricular_units_2nd_sem_approved,curricular_units_2nd_sem_grade,curricular_units_2nd_sem_without_evaluations,unemployment_rate,inflation_rate,gdp,target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


## 1.F Apply Target Binary Transformation

In [116]:
# Add a new target column with renamed values for one vs rest classification
df_dataset['target_binary'] = df_dataset['target'].map({'Dropout': 0, 'Graduate': 1, 'Enrolled': 1})
df_dataset['target_binary'].value_counts()

target_binary
1    3003
0    1421
Name: count, dtype: int64

## 1.G Data Shape Check

In [117]:
shape = df_dataset.shape
tools.print_message('success', 'Dataset loaded', format_dict={'rows': shape[0], 'columns': shape[1]})

## 1.H Create Directory to Save Models

In [118]:
models_dir = os.path.join(os.getcwd(), 'models')
models_performance = os.path.join(os.getcwd(), 'models_performance')
dirs_to_create = [models_dir, models_performance]
for d in dirs_to_create:
    if not os.path.exists(d):
        os.makedirs(d)


# 2. Feature Selection

## 2.A Summary

### <span style="color: #e74c3c;">**Feature Selection for Logistic Regression**</span>

This analysis prepared the dataset for Logistic Regression by removing problematic features and encoding categorical variables to create stable, interpretable predictors suitable for linear classification.

### <span style="color: #2E86AB;">**1. Feature Removal Strategy**</span>

**Removed 13 features** from original 36:
- **Data leakage**: 5 second semester features (students who withdraw early show zeros)
- **Severely imbalanced**: Nationality, educational special needs, international status (>97% in one category)
- **Zero information**: Daytime/evening attendance, displaced status
- **Weak predictors**: Economic indicators, previous qualification grade (correlation <0.10)

**Result**: 23 meaningful features suitable for linear classification.

### <span style="color: #2E86AB;">**2. High Cardinality Feature Engineering**</span>

**Parents' qualifications and occupations** (150+ total categories) were reduced to **2 binary indicators**:
- `parental_higher_education` - at least one parent with higher education
- `parental_professional_occupation` - at least one parent in professional role

**Benefits**: Captures family background whilst avoiding **coefficient inflation** from too many categories.

### <span style="color: #2E86AB;">**3. Withdrawal Rate Encoding**</span>

**Medium cardinality features** were replaced with their historical withdrawal rates:
- `application_mode` (18 categories) → `application_mode_withdrawal_rate`
- `course` (17 categories) → `course_withdrawal_rate` 
- `previous_qualification` (17 categories) → `previous_qualification_withdrawal_rate`

**Advantage**: Creates continuous predictors where higher values = higher risk, avoiding **sparse matrices** that harm model convergence.

### <span style="color: #2E86AB;">**4. One-Hot Encoding**</span>

**Low cardinality features** (2-8 categories) used **one-hot encoding** with `drop_first=True`:
- Marital status, application order, gender, scholarship holder, tuition fees status, debtor status
- Creates ~15 binary features
- **Drop-first prevents multicollinearity** - avoids perfect correlation between category indicators

### <span style="color: #e74c3c;">**Impact on Logistic Regression**</span>

**Model stability**: Strategic encoding prevents **multicollinearity** and **overfitting** whilst maintaining interpretable coefficients. **Early intervention capability**: Uses only first semester data for timely withdrawal prediction. **Regularisation ready**: 25 well-encoded features work effectively with **L1/L2 regularisation** techniques.

This feature selection balances predictive power with model stability, creating optimal conditions for Logistic Regression deployment in student retention systems.

## 2.B Features to Remove

In [119]:
# Data leakage - 2nd semester features
data_leakage_features = [
    'curricular_units_2nd_sem_credited',
    'curricular_units_2nd_sem_enrolled', 
    'curricular_units_2nd_sem_evaluations',
    'curricular_units_2nd_sem_approved',
    'curricular_units_2nd_sem_grade'
]

# Severely imbalanced features (>97% in one category)
imbalanced_features = [
    'nationality',
    'educational_special_needs',
    'international'
]

# Zero information value features
zero_info_features = [
    'daytime_evening_attendance',
    'displaced'
]

# Weak predictors (correlation < 0.10)
weak_predictors = [
    'unemployment_rate',
    'inflation_rate',
    'gdp',
    'previous_qualification_grade'
]

# Old target column
target_column = ['target']

# Combine all features to drop
features_to_drop = (data_leakage_features + 
                   imbalanced_features + 
                   zero_info_features + 
                   weak_predictors + 
                   target_column)

df_dataset.drop(columns=features_to_drop, inplace=True, errors='ignore')
tools.print_message('success', 'Dropped features', format_dict={'number of features': len(features_to_drop)})

## 2.C Reduce High Cardinality Features

In [120]:
# Check if parental features still exist in dataset
parental_features = ['mothers_qualification', 'fathers_qualification', 'mothers_occupation', 'fathers_occupation']
existing_features = [f for f in parental_features if f in df_dataset.columns]
print(f"Remaining parental features: {existing_features}")

Remaining parental features: ['mothers_qualification', 'fathers_qualification', 'mothers_occupation', 'fathers_occupation']


In [121]:
# To reduce the number of categories in the parental qualification and occupation features, we will group them into broader categories.
def create_parental_higher_ed(df):
    """
    Creates binary indicator for parental higher education.
    Returns 1 if at least one parent has higher education, 0 otherwise.
    """
    higher_ed_codes = [2, 3, 4, 5, 6, 39, 40, 41, 42, 43, 44]
    
    mother_higher_ed = df['mothers_qualification'].isin(higher_ed_codes)
    father_higher_ed = df['fathers_qualification'].isin(higher_ed_codes)
    
    # At least one parent has higher education
    df['parental_higher_education'] = (mother_higher_ed | father_higher_ed).astype(int)
    df = df.drop(columns=['mothers_qualification', 'fathers_qualification'])
    
    return df

# Usage:
df_dataset = create_parental_higher_ed(df_dataset)
df_dataset.parental_higher_education.value_counts()

parental_higher_education
0    3616
1     808
Name: count, dtype: int64

In [122]:
def create_parental_professional_occupation(df):
    """
    Creates binary indicator for parental professional occupation.
    Returns 1 if at least one parent has professional/managerial role, 0 otherwise.
    """
    professional_codes = [1, 2, 3, 101, 102, 112, 114, 121, 122, 123, 124, 
                          131, 132, 134, 135]
    
    mother_professional = df['mothers_occupation'].isin(professional_codes)
    father_professional = df['fathers_occupation'].isin(professional_codes)
    
    # At least one parent has professional occupation
    df['parental_professional_occupation'] = (mother_professional | father_professional).astype(int)
    df = df.drop(columns=['mothers_occupation', 'fathers_occupation'])
    
    return df

# Usage:
df_dataset = create_parental_professional_occupation(df_dataset)
df_dataset.parental_professional_occupation.value_counts()

parental_professional_occupation
0    3270
1    1154
Name: count, dtype: int64

In [123]:
print(f"Dataset shape after parental feature engineering: {df_dataset.shape}")
print(f"Remaining features: {df_dataset.columns.tolist()}")

Dataset shape after parental feature engineering: (4424, 21)
Remaining features: ['marital_status', 'application_mode', 'application_order', 'course', 'previous_qualification', 'admission_grade', 'debtor', 'tuition_fees_up_to_date', 'gender', 'scholarship_holder', 'age_at_enrollment', 'curricular_units_1st_sem_credited', 'curricular_units_1st_sem_enrolled', 'curricular_units_1st_sem_evaluations', 'curricular_units_1st_sem_approved', 'curricular_units_1st_sem_grade', 'curricular_units_1st_sem_without_evaluations', 'curricular_units_2nd_sem_without_evaluations', 'target_binary', 'parental_higher_education', 'parental_professional_occupation']


In [124]:
def encode_categorical_withdrawal_rate(df, cat_col, target_col='target_binary'):
    """
    Replace categorical column with withdrawal rate encoding.
    
    Parameters:
    df: pandas DataFrame
    cat_col: name of categorical column to encode
    target_col: name of target column where 0=withdrawn
    
    Returns:
    pandas DataFrame with categorical column replaced by withdrawal_rate
    """
    import pandas as pd
    
    df_encoded = df.copy()
    
    # Calculate withdrawal rate for each category
    withdrawal_rates = (df[target_col] == 0).groupby(df[cat_col]).mean()
    
    # Create new withdrawal rate column
    new_col_name = f'{cat_col.lower().replace(" ", "_")}_withdrawal_rate'
    df_encoded[new_col_name] = df[cat_col].map(withdrawal_rates)
    
    # Remove original column
    df_encoded = df_encoded.drop(columns=[cat_col])
    
    return df_encoded

# Usage:
df_dataset = encode_categorical_withdrawal_rate(df_dataset, 'application_mode')
df_dataset = encode_categorical_withdrawal_rate(df_dataset, 'course')
df_dataset = encode_categorical_withdrawal_rate(df_dataset, 'previous_qualification')
df_dataset.describe()

Unnamed: 0,marital_status,application_order,admission_grade,debtor,tuition_fees_up_to_date,gender,scholarship_holder,age_at_enrollment,curricular_units_1st_sem_credited,curricular_units_1st_sem_enrolled,...,curricular_units_1st_sem_approved,curricular_units_1st_sem_grade,curricular_units_1st_sem_without_evaluations,curricular_units_2nd_sem_without_evaluations,target_binary,parental_higher_education,parental_professional_occupation,application_mode_withdrawal_rate,course_withdrawal_rate,previous_qualification_withdrawal_rate
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,...,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0
mean,1.178571,1.727848,126.978119,0.113698,0.880651,0.351718,0.248418,23.265145,0.709991,6.27057,...,4.7066,10.640822,0.137658,0.150316,0.678797,0.18264,0.26085,0.321203,0.321203,0.321203
std,0.605747,1.313793,14.482001,0.31748,0.324235,0.47756,0.432144,7.587816,2.360507,2.480178,...,3.094238,4.843663,0.69088,0.753774,0.466991,0.386415,0.439148,0.140266,0.121256,0.09994
min,1.0,0.0,95.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154047,0.166667
25%,1.0,1.0,117.9,0.0,1.0,0.0,0.0,19.0,0.0,5.0,...,3.0,11.0,0.0,0.0,0.0,0.0,0.0,0.201991,0.183099,0.290019
50%,1.0,1.0,126.1,0.0,1.0,0.0,0.0,20.0,0.0,6.0,...,5.0,12.285714,0.0,0.0,1.0,0.0,0.0,0.293578,0.330233,0.290019
75%,1.0,2.0,134.8,0.0,1.0,1.0,0.0,25.0,0.0,7.0,...,6.0,13.4,0.0,0.0,1.0,0.0,1.0,0.36859,0.381395,0.290019
max,6.0,9.0,190.0,1.0,1.0,1.0,1.0,70.0,20.0,26.0,...,26.0,18.875,12.0,12.0,1.0,1.0,1.0,1.0,0.666667,1.0


In [125]:
def one_hot_encode_features(df, categorical_columns):
    """
    One-hot encode specified categorical columns.
    
    Parameters:
    df: pandas DataFrame
    categorical_columns: list of column names to encode
    
    Returns:
    pandas DataFrame with categorical columns one-hot encoded
    """
    import pandas as pd
    
    # Check which features actually exist in the dataset
    existing_categorical = [col for col in categorical_columns if col in df.columns]
    missing_categorical = [col for col in categorical_columns if col not in df.columns]
    
    if missing_categorical:
        print(f"Warning: These columns not found in dataset: {missing_categorical}")
    
    print(f"Categorical features to encode: {existing_categorical}")
    
    # One-hot encode with drop_first=True to avoid multicollinearity
    df_encoded = pd.get_dummies(df, 
                               columns=existing_categorical, 
                               drop_first=True, 
                               dtype=int)
    
    print(f"Shape before encoding: {df.shape}")
    print(f"Shape after encoding: {df_encoded.shape}")
    print(f"Features added: {df_encoded.shape[1] - df.shape[1]}")
    
    # Show new encoded column names
    new_columns = [col for col in df_encoded.columns if any(cat in col for cat in existing_categorical)]
    print(f"New encoded features: {new_columns}")
    
    return df_encoded

# Usage:
remaining_categorical = [
    'marital_status',
    'application_order',  
    'gender',
    'scholarship_holder',
    'tuition_fees_up_to_date',
    'debtor'
]

df_dataset = one_hot_encode_features(df_dataset, remaining_categorical)

Categorical features to encode: ['marital_status', 'application_order', 'gender', 'scholarship_holder', 'tuition_fees_up_to_date', 'debtor']
Shape before encoding: (4424, 21)
Shape after encoding: (4424, 31)
Features added: 10
New encoded features: ['marital_status_2', 'marital_status_3', 'marital_status_4', 'marital_status_5', 'marital_status_6', 'application_order_1', 'application_order_2', 'application_order_3', 'application_order_4', 'application_order_5', 'application_order_6', 'application_order_9', 'gender_1', 'scholarship_holder_1', 'tuition_fees_up_to_date_1', 'debtor_1']


# 3. Processing Pipeline

## 3.A Summary

### <span style="color: #e74c3c;">**Logistic Regression Processing Pipeline Summary**</span>

This section implemented a modular processing pipeline for logistic regression, separating train/test splitting and feature scaling into distinct functions to prevent data leakage and ensure proper workflow.

### <span style="color: #2E86AB;">**1. Two-Function Design**</span>

**Modular approach** improved maintainability:
- `create_train_test_split_logistic()` - handles stratified 80/20 data splitting
- `scale_features_logistic()` - applies configurable feature scaling

**Benefits**: **Single responsibility principle** ensures each function performs one task, **reusable components** work with different configurations, **flexible parameters** enable easy adjustments.

### <span style="color: #2E86AB;">**2. Critical Workflow Order**</span>

**Split first, scale second** prevents **data leakage**. **Data leakage** occurs when test set information influences preprocessing, creating artificially optimistic performance estimates.

**Correct sequence:**
1. Split data (training/test)
2. Fit scaler on training data only  
3. Apply same scaler to test data

**Why this matters**: Scaling before splitting would use test set statistics, compromising model evaluation reliability.

### <span style="color: #2E86AB;">**3. Results Achieved**</span>

**Perfect stratification**: Both training (3,539 samples) and test (885 samples) maintain identical 67.9%/32.1% class distributions.

**Configurable scaling** via `scaler_type` parameter enables easy comparison testing between `standard`, `minmax`, and `none` options. **StandardScaler preferred for logistic regression** because linear models like logistic regression initialise weights to 0 or small random values, making standardised features (mean=0, std=1) easier to learn. StandardScaler is particularly important when using regularisation, as it ensures fair penalty application across all features, preventing features with larger scales from dominating the model coefficients.

### <span style="color: #e74c3c;">**Pipeline Outcome**</span>

The pipeline produces **deployment-ready data** (X_train_scaled, X_test_scaled, y_train, y_test, scaler) whilst preventing data leakage and ensuring reliable model evaluation. **Modular design** enables easy debugging and configuration adjustments for optimal logistic regression performance.

## 3.B Train Test Split

In [126]:
def create_train_test_split_logistic(X_features, df_dataset, logistic_config):
    """
    Create stratified train/test split for logistic regression.
    
    Parameters:
    X_features: Feature matrix (unscaled)
    df_dataset: DataFrame containing target variable
    logistic_config: Logistic regression configuration dictionary
    
    Returns:
    tuple: X_train, X_test, y_train, y_test
    """
    
    # Define target variable y
    y = df_dataset['target_binary']

    # Get required parameters with defaults
    train_size = logistic_config.get('train_size', 0.8)
    use_stratify = logistic_config.get('stratify', True)

    # Validate parameter values
    if not 0 < train_size < 1:
        raise ValueError(f'train_size must be between 0 and 1, got {train_size}')

    if not isinstance(use_stratify, bool):
        raise ValueError(f'stratify must be true or false, got {use_stratify}')

    # Set stratify parameter
    stratify_param = y if use_stratify else None

    # Perform train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_features,
        y,
        train_size=train_size,
        random_state=42,
        stratify=stratify_param
    )

    # Print split information
    print(f'Training set: {len(X_train)} samples ({train_size*100:.0f}%)')
    print(f'Test set: {len(X_test)} samples ({(1-train_size)*100:.0f}%)')

    if use_stratify:
        print(f'Training class distribution: {y_train.value_counts(normalize=True).round(3).to_dict()}')
        print(f'Test class distribution: {y_test.value_counts(normalize=True).round(3).to_dict()}')

    return X_train, X_test, y_train, y_test

logistic_config = config.get('logistic_regression_model')
X_features = df_dataset.drop('target_binary', axis=1)

# Split the data
X_train, X_test, y_train, y_test = create_train_test_split_logistic(
    X_features, df_dataset, logistic_config
)


Training set: 3539 samples (80%)
Test set: 885 samples (20%)
Training class distribution: {1: 0.679, 0: 0.321}
Test class distribution: {1: 0.679, 0: 0.321}


In [127]:
def scale_features_logistic(X_train, X_test, logistic_config):
    """
    Scale features using specified scaling method.
    
    Parameters:
    X_train: Training feature matrix
    X_test: Test feature matrix
    logistic_config: Logistic regression configuration dictionary
    
    Returns:
    tuple: X_train_scaled, X_test_scaled, scaler
    """
    
    # Get scaling method
    scaler_type = logistic_config.get('scaler_type', 'standard')
    
    # Validate scaler type
    if scaler_type not in ['standard', 'minmax', 'none']:
        raise ValueError(f'scaler_type must be "standard", "minmax", or "none", got {scaler_type}')

    # Create scaler
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    else:  # scaler_type == 'none'
        scaler = None

    # Apply scaling
    if scaler is not None:
        # Fit scaler on training data only
        X_train_scaled = scaler.fit_transform(X_train)
        # Apply same scaler to test data (no re-fitting)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test

    print(f'Scaling method applied: {scaler_type}')
    
    return X_train_scaled, X_test_scaled, scaler

# Scale the features
X_train_scaled, X_test_scaled, scaler = scale_features_logistic(
    X_train, X_test, logistic_config
)

Scaling method applied: standard


## 4. Model Training

## 4.A Summary

In [128]:
def save_model_performance_to_csv(
    model_name, 
    training_time, 
    best_cv_score, 
    best_params, 
    y_test, 
    y_pred, 
    models_performance_dir,
    additional_metrics=None
):
    """
    Save model performance metrics to CSV file.
    Creates CSV if it doesn't exist, appends if it does.
    
    Parameters:
    - model_name: str, name of the model
    - training_time: float, training time in seconds
    - best_cv_score: float, best cross-validation F1 score
    - best_params: dict, best hyperparameters
    - y_test: array, true test labels
    - y_pred: array, predicted test labels
    - models_performance_dir: str, directory to save CSV
    - additional_metrics: dict, any additional metrics to include
    """
    
    # Calculate test set metrics
    test_accuracy = accuracy_score(y_test, y_pred)
    test_precision = precision_score(y_test, y_pred)
    test_recall = recall_score(y_test, y_pred)
    test_f1 = f1_score(y_test, y_pred)
    
    # Calculate specificity (True Negative Rate)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    test_specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    
    # Create metrics dictionary
    metrics = {
        'model_name': model_name,
        'timestamp': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
        'training_time_seconds': round(training_time, 2),
        'best_cv_f1_score': round(best_cv_score, 4),
        'test_accuracy': round(test_accuracy, 4),
        'test_precision': round(test_precision, 4),
        'test_recall': round(test_recall, 4),
        'test_f1_score': round(test_f1, 4),
        'test_specificity': round(test_specificity, 4),
        'true_positives': int(tp),
        'false_positives': int(fp),
        'true_negatives': int(tn),
        'false_negatives': int(fn),
        'best_parameters': json.dumps(best_params)
    }
    
    # Add any additional metrics
    if additional_metrics:
        metrics.update(additional_metrics)
    
    # Convert to DataFrame
    new_row = pd.DataFrame([metrics])
    
    # CSV file path
    csv_path = os.path.join(models_performance_dir, 'model_performance_log.csv')
    
    # Check if CSV exists
    if os.path.exists(csv_path):
        # Append to existing CSV
        new_row.to_csv(csv_path, mode='a', header=False, index=False)
        print(f"📈 Performance metrics appended to {csv_path}")
    else:
        # Create new CSV
        new_row.to_csv(csv_path, mode='w', header=True, index=False)
        print(f"📊 New performance log created: {csv_path}")
    
    # Print summary
    print(f"✅ Logged performance for {model_name}")
    print(f"   🎯 Test F1: {test_f1:.4f}")
    print(f"   ⏱️  Training time: {training_time:.1f}s")
    print(f"   🔍 Specificity: {test_specificity:.4f}")
    
    return csv_path

In [129]:
# Global class definition (put this at the top of your file)
class LogisticRegressionNet(torch.nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.linear = torch.nn.Linear(input_dim, 1)
    
    def forward(self, x):
        return torch.sigmoid(self.linear(x)).squeeze()


In [130]:

def setup_pytorch_logistic_regression_from_config(config, X_train, device='cuda', param_profile='pytorch_logistic_regression_testing'):
    """
    Setup PyTorch logistic regression with parameters from config.
    """
    # Get config section
    pytorch_config = config[param_profile]
    input_dim = X_train.shape[1]
    
    # Check GPU availability
    if device == 'cuda' and not torch.cuda.is_available():
        print("⚠️ CUDA not available, using CPU")
        device = 'cpu'
    else:
        print(f"✅ Using device: {device}")
        if device == 'cuda':
            print(f"🚀 GPU: {torch.cuda.get_device_name(0)}")
    
    # Create skorch model
    model = NeuralNetClassifier(
        LogisticRegressionNet,
        module__input_dim=input_dim,
        criterion=nn.BCELoss,
        optimizer=torch.optim.Adam,
        lr=0.01,  # Will be tuned by grid search
        max_epochs=50,  # Will be tuned by grid search
        batch_size=32,  # Will be tuned by grid search
        device=device,
        train_split=None,  # Don't split training data
        verbose=0,
    )
    
    # Build parameter grid from config
    param_grid = {
        'lr': pytorch_config['lr_values'],
        'max_epochs': pytorch_config['max_epochs_values'],
        'optimizer__weight_decay': pytorch_config['weight_decay_values'],
        'batch_size': pytorch_config['batch_size_values']
    }
    
    # Cross-validation strategy
    cv_strategy = StratifiedKFold(
        n_splits=pytorch_config['cv_folds'],
        shuffle=True,
        random_state=pytorch_config['random_state']
    )
    
    # Create grid search
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv_strategy,
        scoring=pytorch_config['scoring_metric'],
        n_jobs=1,  # Must be 1 for GPU training
        verbose=1
    )
    
    print(f"✅ Model configured for {input_dim} features")
    print(f"🔧 Parameter grid: {len(param_grid['lr']) * len(param_grid['max_epochs']) * len(param_grid['optimizer__weight_decay']) * len(param_grid['batch_size'])} combinations")
    
    return grid_search

In [131]:
def save_pytorch_model_with_timestamp(grid_search, models_dir, scaler, feature_columns, logistic_config):
    """Save PyTorch model with complete preprocessing pipeline"""
    utc_time = dt.now(tz=tz.utc)
    timestamp = utc_time.strftime("%y%m%d-%H%M%S")
    f1_score = grid_search.best_score_
    f1_formatted = f"F1-{f1_score:.4f}".replace(".", "-")
    
    # Create model name
    model_name = f"LR_model_{timestamp}_{f1_formatted}"
    
    # Create subdirectory for this model
    model_dir = os.path.join(models_dir, model_name)
    os.makedirs(model_dir, exist_ok=True)
    
    # Save model weights
    model_path = os.path.join(model_dir, f"{model_name}.pth")
    torch.save(grid_search.best_estimator_.module_.state_dict(), model_path)
    
    # Save scaler
    scaler_path = os.path.join(model_dir, "scaler.pkl")
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)
    
    # Save feature columns order
    features_path = os.path.join(model_dir, "feature_columns.pkl")
    with open(features_path, 'wb') as f:
        pickle.dump(feature_columns, f)
    
    # Save comprehensive metadata
    metadata = {
        'model_name': model_name,
        'best_params': grid_search.best_params_,
        'best_score': grid_search.best_score_,
        'input_dim': grid_search.best_estimator_.module_.linear.in_features,
        'timestamp': timestamp,
        'f1_score': f1_score,
        'scaler_type': logistic_config.get('scaler_type', 'standard'),
        'num_features': len(feature_columns),
        'feature_names': list(feature_columns)  # Store as list for JSON serialization
    }
    
    metadata_path = os.path.join(model_dir, "metadata.json")
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=2)
    
    print(f"✅ Model saved in directory: {model_name}/")
    print(f"📁 Model file: {model_name}.pth")
    print(f"🔧 Scaler: scaler.pkl")
    print(f"📋 Features: feature_columns.pkl")
    print(f"📋 Metadata: metadata.json")
    
    return model_path, metadata_path

In [132]:
def load_best_pytorch_model(models_dir, device='cuda'):
    """Load PyTorch model with complete preprocessing pipeline"""
    
    # Find all model subdirectories
    model_dirs = [d for d in os.listdir(models_dir) 
                  if os.path.isdir(os.path.join(models_dir, d)) 
                  and d.startswith("LR_model_")]
    
    if not model_dirs:
        print("❌ No model directories found!")
        return None
    
    # Find best F1 score across all models
    best_f1 = 0
    best_metadata = None
    best_model_dir = None
    
    for model_dir_name in model_dirs:
        metadata_path = os.path.join(models_dir, model_dir_name, "metadata.json")
        
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as f:
                metadata = json.load(f)
                if metadata['f1_score'] > best_f1:
                    best_f1 = metadata['f1_score']
                    best_metadata = metadata
                    best_model_dir = model_dir_name
    
    if best_metadata is None:
        print("❌ No valid models found!")
        return None
    
    # Construct paths to best model components
    best_dir_path = os.path.join(models_dir, best_model_dir)
    model_file = os.path.join(best_dir_path, f"{best_metadata['model_name']}.pth")
    scaler_file = os.path.join(best_dir_path, "scaler.pkl")
    features_file = os.path.join(best_dir_path, "feature_columns.pkl")
    
    # Check all required files exist
    missing_files = []
    if not os.path.exists(model_file):
        missing_files.append("model weights")
    if not os.path.exists(scaler_file):
        missing_files.append("scaler")
    if not os.path.exists(features_file):
        missing_files.append("feature columns")
    
    if missing_files:
        print(f"❌ Missing files: {', '.join(missing_files)}")
        return None
    
    # Load scaler (CRITICAL FIX)
    with open(scaler_file, 'rb') as f:
        scaler = pickle.load(f)
    
    # Load feature columns (CRITICAL FIX)
    with open(features_file, 'rb') as f:
        feature_columns = pickle.load(f)
    
    # Recreate the model
    skorch_model = NeuralNetClassifier(
        LogisticRegressionNet,
        module__input_dim=best_metadata['input_dim'],
        criterion=torch.nn.BCELoss,
        optimizer=torch.optim.Adam,
        device=device,
        train_split=None,
        verbose=0,
        **best_metadata['best_params']
    )
    
    # Load the weights
    skorch_model.initialize()
    skorch_model.module_.load_state_dict(torch.load(model_file, map_location=device))
    
    print(f"📂 Loaded model from directory: {best_model_dir}/")
    print(f"🏆 F1 Score: {best_f1:.4f}")
    print(f"⚙️ Parameters: {best_metadata['best_params']}")
    print(f"🔧 Scaler type: {best_metadata.get('scaler_type', 'unknown')}")
    print(f"📊 Features: {len(feature_columns)}")
    
    return skorch_model, best_metadata, scaler, feature_columns

In [133]:
def preprocess_test_data_for_loaded_model(X_test_raw, scaler, expected_feature_columns):
    """
    Apply the same preprocessing pipeline to test data that was used during training.
    
    Parameters:
    X_test_raw: Raw test features (after feature engineering but before scaling)
    scaler: Fitted scaler from training
    expected_feature_columns: Feature column order from training
    
    Returns:
    X_test_processed: Scaled and correctly ordered features ready for prediction
    """
    
    # Ensure we have a DataFrame
    if isinstance(X_test_raw, np.ndarray):
        print("❌ Test data must be DataFrame to ensure correct feature alignment")
        return None
    
    # Check feature alignment
    missing_features = set(expected_feature_columns) - set(X_test_raw.columns)
    extra_features = set(X_test_raw.columns) - set(expected_feature_columns)
    
    if missing_features:
        print(f"❌ Missing features in test data: {missing_features}")
        return None
    
    if extra_features:
        print(f"⚠️ Extra features in test data (will be dropped): {extra_features}")
    
    # Reorder features to match training order
    X_test_ordered = X_test_raw[expected_feature_columns]
    
    # Apply scaling using fitted scaler
    X_test_scaled = scaler.transform(X_test_ordered)
    
    # Convert to format expected by PyTorch
    X_test_processed = np.array(X_test_scaled, dtype=np.float32)
    
    print(f"✅ Test data preprocessed: {X_test_processed.shape}")
    print(f"📊 Feature order verified: {len(expected_feature_columns)} features")
    
    return X_test_processed

In [134]:
def predict_with_loaded_model(model, scaler, feature_columns, X_test_engineered):
    """
    Make predictions using loaded model with proper preprocessing.
    
    Parameters:
    model: Loaded PyTorch model
    scaler: Fitted scaler from training
    feature_columns: Expected feature order from training
    X_test_engineered: Test data after feature engineering (DataFrame)
    
    Returns:
    y_pred: Binary predictions
    y_pred_proba: Prediction probabilities
    """
    
    # Apply preprocessing
    X_test_processed = preprocess_test_data_for_loaded_model(
        X_test_engineered, scaler, feature_columns
    )
    
    if X_test_processed is None:
        print("❌ Preprocessing failed")
        return None, None
    
    # Make predictions
    y_pred = model.predict(X_test_processed)
    y_pred_proba = model.predict_proba(X_test_processed)[:, 1]
    
    print(f"✅ Predictions completed: {len(y_pred)} samples")
    
    return y_pred, y_pred_proba

In [135]:
# Configuration flag
train_new_model = True  # Set to True to train new model, False to load best existing
param_profile = 'pytorch_logistic_regression_quick'  # Profile to use for training

# Convert data to required format
X_train_clean = np.array(X_train_scaled, dtype=np.float32)
y_train_clean = np.array(y_train, dtype=np.float32)
X_test_clean = np.array(X_test_scaled, dtype=np.float32)
y_test_clean = np.array(y_test, dtype=np.float32)

# FIXED: Store feature columns for saving with model
feature_columns = X_train.columns.tolist()  # Before scaling, get column names

# Main logic with performance tracking
if train_new_model:
    print("🚀 Training new model with performance tracking...")
    
    # Record start time
    start_time = time.time()
    
    # Setup and train model
    grid_search = setup_pytorch_logistic_regression_from_config(
        config, X_train_clean, device='cuda', param_profile=param_profile
    )
    grid_search.fit(X_train_clean, y_train_clean)
    
    # Calculate training time
    training_time = time.time() - start_time
    
    # FIXED: Save model with complete preprocessing pipeline
    model_path, metadata_path = save_pytorch_model_with_timestamp(
        grid_search, models_dir, scaler, feature_columns, logistic_config
    )
    
    # Get model name from path
    model_name = os.path.basename(os.path.dirname(model_path))
    
    # Make predictions
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_clean)
    y_pred_proba = best_model.predict_proba(X_test_clean)[:, 1]
    
    # Log performance to CSV
    csv_path = save_model_performance_to_csv(
        model_name=model_name,
        training_time=training_time,
        best_cv_score=grid_search.best_score_,
        best_params=grid_search.best_params_,
        y_test=y_test_clean,
        y_pred=y_pred,
        models_performance_dir=models_performance,
        additional_metrics={'param_profile': param_profile}
    )
    
    f1_score = grid_search.best_score_
    
else:
    print("📂 Loading best existing model...")
    result = load_best_pytorch_model(models_dir)
    
    if result is None:
        print("🚀 No models found, training new one...")
        # Fallback to training with logging
        start_time = time.time()
        grid_search = setup_pytorch_logistic_regression_from_config(
            config, X_train_clean, device='cuda', param_profile=param_profile
        )
        grid_search.fit(X_train_clean, y_train_clean)
        training_time = time.time() - start_time
        
        # FIXED: Save with complete pipeline
        model_path, metadata_path = save_pytorch_model_with_timestamp(
            grid_search, models_dir, scaler, feature_columns, logistic_config
        )
        model_name = os.path.basename(os.path.dirname(model_path))
        
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_clean)
        
        save_model_performance_to_csv(
            model_name=model_name,
            training_time=training_time,
            best_cv_score=grid_search.best_score_,
            best_params=grid_search.best_params_,
            y_test=y_test_clean,
            y_pred=y_pred,
            models_performance_dir=models_performance,
            additional_metrics={'param_profile': param_profile}
        )
        
        f1_score = grid_search.best_score_
        best_metadata = None
        loaded_scaler = scaler
        loaded_feature_columns = feature_columns
    else:
        # FIXED: Unpack all components from loaded model
        best_model, best_metadata, loaded_scaler, loaded_feature_columns = result
        f1_score = best_metadata['f1_score']
        
        # FIXED: Use proper preprocessing for loaded model
        print("🔧 Applying loaded model preprocessing...")
        y_pred, y_pred_proba = predict_with_loaded_model(
            best_model, loaded_scaler, loaded_feature_columns, X_test
        )

print(f"✅ Ready to use model with F1: {f1_score:.4f}")

# FIXED: Performance evaluation with properly preprocessed data
if train_new_model or (not train_new_model and result is None):
    # For newly trained models, use existing predictions
    print(f"🏆 Best parameters: {grid_search.best_params_}")
    print(f"📈 Best CV F1-score: {grid_search.best_score_:.4f}")
    best_model = grid_search.best_estimator_
    
    # Predictions already made above
    if 'y_pred' not in locals():
        y_pred = best_model.predict(X_test_clean)
        y_pred_proba = best_model.predict_proba(X_test_clean)[:, 1]
        
else:
    # For loaded models, predictions made with proper preprocessing
    print(f"🏆 Best parameters: {best_metadata['best_params']}")
    print(f"📈 Best CV F1-score: {best_metadata['f1_score']:.4f}")

# Performance metrics
from sklearn.metrics import classification_report, confusion_matrix
print("📊 Test Set Performance:")
print(classification_report(y_test_clean, y_pred))
print("\n🔢 Confusion Matrix:")
print(confusion_matrix(y_test_clean, y_pred))

🚀 Training new model with performance tracking...
✅ Using device: cuda
🚀 GPU: NVIDIA GeForce RTX 3080
✅ Model configured for 30 features
🔧 Parameter grid: 36 combinations
Fitting 3 folds for each of 36 candidates, totalling 108 fits


✅ Model saved in directory: LR_model_250604-220510_F1-0-8986/
📁 Model file: LR_model_250604-220510_F1-0-8986.pth
🔧 Scaler: scaler.pkl
📋 Features: feature_columns.pkl
📋 Metadata: metadata.json
📈 Performance metrics appended to c:\Users\Craig\Documents\Python\msc_ai_module\models_performance\model_performance_log.csv
✅ Logged performance for LR_model_250604-220510_F1-0-8986
   🎯 Test F1: 0.9002
   ⏱️  Training time: 611.7s
   🔍 Specificity: 0.6725
✅ Ready to use model with F1: 0.8986
🏆 Best parameters: {'batch_size': 64, 'lr': 0.01, 'max_epochs': 50, 'optimizer__weight_decay': 0.001}
📈 Best CV F1-score: 0.8986
📊 Test Set Performance:
              precision    recall  f1-score   support

         0.0       0.85      0.67      0.75       284
         1.0       0.86      0.95      0.90       601

    accuracy                           0.86       885
   macro avg       0.86      0.81      0.83       885
weighted avg       0.86      0.86      0.85       885


🔢 Confusion Matrix:
[[191  93]
 

In [136]:
# Test script to verify the model loading fix
print("🧪 Testing Model Loading Fix...")

# Test 1: Load model and check components
print("\n1️⃣ Testing model loading...")
result = load_best_pytorch_model(models_dir)

if result is None:
    print("❌ No model found - need to train first")
else:
    model, metadata, scaler, feature_columns = result
    print(f"✅ Model loaded successfully")
    print(f"📊 Features expected: {len(feature_columns)}")
    print(f"🔧 Scaler type: {type(scaler).__name__}")

# Test 2: Check feature alignment
print("\n2️⃣ Testing feature alignment...")
if result is not None:
    print(f"Training features: {len(feature_columns)}")
    print(f"Test features: {len(X_test.columns)}")
    
    missing = set(feature_columns) - set(X_test.columns)
    extra = set(X_test.columns) - set(feature_columns)
    
    if missing:
        print(f"❌ Missing features: {missing}")
    if extra:
        print(f"⚠️ Extra features: {extra}")
    if not missing and not extra:
        print("✅ Perfect feature alignment")

# Test 3: Test preprocessing
print("\n3️⃣ Testing preprocessing pipeline...")
if result is not None:
    try:
        X_test_processed = preprocess_test_data_for_loaded_model(
            X_test, scaler, feature_columns
        )
        if X_test_processed is not None:
            print(f"✅ Preprocessing successful: {X_test_processed.shape}")
            print(f"📊 Data type: {X_test_processed.dtype}")
            print(f"📊 Value range: [{X_test_processed.min():.3f}, {X_test_processed.max():.3f}]")
        else:
            print("❌ Preprocessing failed")
    except Exception as e:
        print(f"❌ Preprocessing error: {e}")

# Test 4: Test predictions
print("\n4️⃣ Testing predictions...")
if result is not None:
    try:
        y_pred_test, y_proba_test = predict_with_loaded_model(
            model, scaler, feature_columns, X_test
        )
        
        if y_pred_test is not None:
            print(f"✅ Predictions successful: {len(y_pred_test)} samples")
            print(f"📊 Prediction distribution: {np.unique(y_pred_test, return_counts=True)}")
            print(f"📊 Probability range: [{y_proba_test.min():.3f}, {y_proba_test.max():.3f}]")
            
            # Quick performance check
            from sklearn.metrics import classification_report
            print("\n📈 Quick Performance Check:")
            print(classification_report(y_test, y_pred_test))
        else:
            print("❌ Predictions failed")
    except Exception as e:
        print(f"❌ Prediction error: {e}")

print("\n🏁 Test completed!")

🧪 Testing Model Loading Fix...

1️⃣ Testing model loading...
📂 Loaded model from directory: LR_model_250604-220510_F1-0-8986/
🏆 F1 Score: 0.8986
⚙️ Parameters: {'batch_size': 64, 'lr': 0.01, 'max_epochs': 50, 'optimizer__weight_decay': 0.001}
🔧 Scaler type: standard
📊 Features: 30
✅ Model loaded successfully
📊 Features expected: 30
🔧 Scaler type: StandardScaler

2️⃣ Testing feature alignment...
Training features: 30
Test features: 30
✅ Perfect feature alignment

3️⃣ Testing preprocessing pipeline...
✅ Test data preprocessed: (885, 30)
📊 Feature order verified: 30 features
✅ Preprocessing successful: (885, 30)
📊 Data type: float32
📊 Value range: [-2.725, 26.586]

4️⃣ Testing predictions...
✅ Test data preprocessed: (885, 30)
📊 Feature order verified: 30 features
✅ Predictions completed: 885 samples
✅ Predictions successful: 885 samples
📊 Prediction distribution: (array([0, 1]), array([224, 661]))
📊 Probability range: [0.001, 0.990]

📈 Quick Performance Check:
              precision    

In [137]:
# Convert test data to same format
# X_test_clean = np.array(X_test, dtype=np.float32)
y_test_clean = np.array(y_test, dtype=np.float32)

In [138]:
# For LOADED models, use metadata
if not train_new_model and result is not None:
    best_model, metadata, loaded_scaler, loaded_feature_columns = result
    print(f"🏆 Best parameters: {metadata['best_params']}")
    print(f"📈 Best CV F1-score: {metadata['f1_score']:.4f}")

# For TRAINED models, use grid_search
elif train_new_model or (not train_new_model and result is None):
    print(f"🏆 Best parameters: {grid_search.best_params_}")
    print(f"📈 Best CV F1-score: {grid_search.best_score_:.4f}")
    best_model = grid_search.best_estimator_

🏆 Best parameters: {'batch_size': 64, 'lr': 0.01, 'max_epochs': 50, 'optimizer__weight_decay': 0.001}
📈 Best CV F1-score: 0.8986


In [139]:
# FIXED: Use correct preprocessing based on model source
if not train_new_model and result is not None:
    # For LOADED models - use proper preprocessing pipeline
    best_model, metadata, loaded_scaler, loaded_feature_columns = result
    print(f"🏆 Best parameters: {metadata['best_params']}")
    print(f"📈 Best CV F1-score: {metadata['f1_score']:.4f}")
    
    # Use loaded model's preprocessing (THIS GIVES GOOD PERFORMANCE)
    print("🔧 Using loaded model preprocessing...")
    y_pred, y_pred_proba = predict_with_loaded_model(
        best_model, loaded_scaler, loaded_feature_columns, X_test
    )
    
else:
    # For NEWLY TRAINED models - use training data
    print(f"🏆 Best parameters: {grid_search.best_params_}")
    print(f"📈 Best CV F1-score: {grid_search.best_score_:.4f}")
    best_model = grid_search.best_estimator_
    
    # Use training session data
    y_pred = best_model.predict(X_test_clean)
    y_pred_proba = best_model.predict_proba(X_test_clean)[:, 1]

# Performance metrics (works for both cases)
from sklearn.metrics import classification_report, confusion_matrix
print("📊 Test Set Performance:")
print(classification_report(y_test_clean, y_pred))
print("\n🔢 Confusion Matrix:")
print(confusion_matrix(y_test_clean, y_pred))

🏆 Best parameters: {'batch_size': 64, 'lr': 0.01, 'max_epochs': 50, 'optimizer__weight_decay': 0.001}
📈 Best CV F1-score: 0.8986
📊 Test Set Performance:
              precision    recall  f1-score   support

         0.0       0.85      0.67      0.75       284
         1.0       0.86      0.95      0.90       601

    accuracy                           0.86       885
   macro avg       0.86      0.81      0.83       885
weighted avg       0.86      0.86      0.85       885


🔢 Confusion Matrix:
[[191  93]
 [ 33 568]]
