# 3 - Classification methods on Imputed Data

In this notebook we evaluate the performance of different classification methods on imputed datasets.


## Classification Methods:
1. **Random Forest** 
2. **Neural Network** 
3. **Support Vector Machine (SVM)** 

## Evaluation Metrics:
- **F1-Score** 
- **Accuracy** 

In [41]:
# libraries
import pandas as pd
import numpy as np
import os
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance

# reproducibility
# RANDOM_SEED is also passed explicitly as random_state to the estimators and
# to permutation_importance inside the helper functions of this notebook.
RANDOM_SEED = 57
# seed NumPy's global random number generator
np.random.seed(RANDOM_SEED)

# create directories
# Prediction and summary CSVs are written into this folder; exist_ok=True
# makes re-running the cell harmless.
os.makedirs('results/classification', exist_ok=True)

# Functions for data pre- and post-processing

In [None]:
# function to load datasets
def load_datasets(imputation_method, prefix="data/imputed/iteration_1",
                  complete_path="data/complete/iteration_1"):
    """
    Load the complete train/test CSVs and, unless only the complete data is
    requested, the low/high-missingness CSVs written by one imputation method.

    Parameters
    ----------
    imputation_method : str
        One of "Complete", "KNN", "MICE" or "Simple".
    prefix : str
        Directory holding the imputed files, named
        ``{train,test}_{low,high}_{knn,mice,simple}.csv``.
    complete_path : str
        Directory holding ``train_complete.csv`` and ``test_complete.csv``.

    Returns
    -------
    dict of str -> pandas.DataFrame
        For "Complete": keys ``train_complete`` and ``test_complete`` only.
        Otherwise: ``train_low``, ``test_low``, ``train_high``, ``test_high``
        followed by the two complete frames.

    Raises
    ------
    ValueError
        If ``imputation_method`` is not one of the supported names. The check
        runs before any file is read.
    """
    suffix_by_method = {
        "KNN": "_knn.csv",
        "MICE": "_mice.csv",
        "Simple": "_simple.csv",
    }

    # Fail early with a readable message instead of hitting an unbound
    # `file_suffix` after the complete files have already been read.
    if imputation_method != "Complete" and imputation_method not in suffix_by_method:
        supported = ["Complete"] + list(suffix_by_method)
        raise ValueError(
            f"Unknown imputation_method {imputation_method!r}; expected one of {supported}"
        )

    # Complete datasets
    complete_frames = {
        "train_complete": pd.read_csv(f"{complete_path}/train_complete.csv"),
        "test_complete": pd.read_csv(f"{complete_path}/test_complete.csv"),
    }

    if imputation_method == "Complete":
        return complete_frames

    # Imputed datasets: the same four splits for every method, only the suffix differs
    file_suffix = suffix_by_method[imputation_method]
    imputed_frames = {
        split: pd.read_csv(f"{prefix}/{split}{file_suffix}")
        for split in ("train_low", "test_low", "train_high", "test_high")
    }

    # Imputed splits first, complete frames last (same key order as before)
    return {**imputed_frames, **complete_frames}

In [43]:
# function to split data by features and target
def prepare_data(dataset, target_column="target"):
    """
    Separate a DataFrame into a feature matrix and a target column.

    Returns a tuple ``(X, y, features)`` where ``features`` lists every column
    name except ``target_column`` (in the frame's column order), ``X`` is the
    frame restricted to those columns and ``y`` is the target column.
    """
    features = []
    for column_name in dataset.columns:
        if column_name != target_column:
            features.append(column_name)

    return dataset[features], dataset[target_column], features

In [44]:
# function to evaluate the model
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """
    Score an already-fitted model on the training and the test split.

    Returns a dict with the predictions, accuracy and F1-score of each split
    under the keys ``train_pred``, ``train_accuracy``, ``train_f1``,
    ``test_pred``, ``test_accuracy`` and ``test_f1``.

    NOTE(review): f1_score is called with its default settings, so the target
    is presumably binary - confirm against the data.
    """
    def _score_split(X, y):
        # predict once, then derive both metrics from the same predictions
        predictions = model.predict(X)
        return predictions, accuracy_score(y, predictions), f1_score(y, predictions)

    report = {}

    # train
    report["train_pred"], report["train_accuracy"], report["train_f1"] = _score_split(X_train, y_train)

    # test
    report["test_pred"], report["test_accuracy"], report["test_f1"] = _score_split(X_test, y_test)

    return report

In [45]:
# function to save predictions
def save_predictions(actual, predicted, imputation_method, dataset_type, classifier):
    """
    Write actual vs. predicted labels to a CSV file and return its path.

    The file is ``results/classification/predictions_{dataset_type}_
    {imputation_method}_{classifier}.csv`` with the columns ``actual`` and
    ``predicted`` and no index column.

    Parameters
    ----------
    actual, predicted : array-like
        Label sequences of equal length. They are paired by position.
    imputation_method, dataset_type, classifier : str
        Labels used only to build the file name.

    Returns
    -------
    str
        The path of the written CSV file.
    """
    # Convert to plain arrays so rows are paired by position; two pandas
    # objects with different indexes would otherwise be aligned by label and
    # padded with NaN.
    predictions_df = pd.DataFrame({
        "actual": np.asarray(actual),
        "predicted": np.asarray(predicted)
    })

    filename = f"results/classification/predictions_{dataset_type}_{imputation_method}_{classifier}.csv"
    # Do not depend on the setup cell having created the output folder
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    predictions_df.to_csv(filename, index=False)
    print(f"Predictions saved to {filename}")

    return filename


In [46]:
# function to get feature importance
def get_feature_importance(model, classifier_type, features, X_test=None, y_test=None):
    """
    Build a feature-importance table, sorted from most to least important.

    - "SVM" with a model exposing ``coef_``: first row of ``coef_`` plus its
      absolute value, sorted by absolute value.
    - "RandomForest" with ``feature_importances_``: those importances.
    - "NeuralNetwork" when both X_test and y_test are given: mean permutation
      importance on the test data (10 repeats, accuracy scoring).
    - Anything else (including an SVM without ``coef_``): an unsorted table
      of zeros, one row per feature.

    Returns a DataFrame with a ``feature_name`` column and either
    ``coefficient``/``abs_coefficient`` (SVM) or ``importance``.
    """
    # SVM
    if classifier_type == "SVM" and hasattr(model, "coef_"):
        weights = model.coef_[0]
        svm_table = pd.DataFrame({
            'feature_name': features,
            'coefficient': weights,
            'abs_coefficient': np.abs(weights)
        })
        return svm_table.sort_values('abs_coefficient', ascending=False)

    # Random Forest
    if classifier_type == "RandomForest" and hasattr(model, "feature_importances_"):
        forest_table = pd.DataFrame({
            'feature_name': features,
            'importance': model.feature_importances_
        })
        return forest_table.sort_values('importance', ascending=False)

    # Neural Network
    has_test_data = X_test is not None and y_test is not None
    if classifier_type == "NeuralNetwork" and has_test_data:
        permutation_result = permutation_importance(
            model, X_test, y_test,
            n_repeats=10, random_state=RANDOM_SEED, scoring='accuracy'
        )
        network_table = pd.DataFrame({
            'feature_name': features,
            'importance': permutation_result.importances_mean
        })
        return network_table.sort_values('importance', ascending=False)

    # No importance available: placeholder of zeros
    return pd.DataFrame({
        'feature_name': features,
        'importance': np.zeros(len(features))
    })
        

# Classifiers

In [47]:
# SVM fine tuning
def train_svm_model(X_train, y_train, param_grid, cv=10, verbose=1):
    """
    Grid-search an SVC over ``param_grid`` with ``cv``-fold cross-validation.

    Candidates are ranked by accuracy and fitted in parallel (n_jobs=-1).
    Prints the best parameter combination and its cross-validation accuracy,
    then returns the fitted GridSearchCV object.
    """
    # grid search
    base_svm = SVC(random_state=RANDOM_SEED)
    search_options = dict(cv=cv, scoring='accuracy', n_jobs=-1, verbose=verbose)
    svm_grid = GridSearchCV(base_svm, param_grid, **search_options)

    svm_grid.fit(X_train, y_train)

    # results
    print(f"\nBest parameters: {svm_grid.best_params_}")
    print(f"Best cross-validation Accuracy: {svm_grid.best_score_:.4f}")

    return svm_grid


In [48]:
# Random Forest fine tuning
def train_random_forest_model(X_train, y_train, param_grid, cv=10, verbose=1):
    """
    Grid-search a RandomForestClassifier over ``param_grid`` with ``cv``-fold
    cross-validation.

    Candidates are ranked by accuracy and fitted in parallel (n_jobs=-1).
    Prints the best parameter combination and its cross-validation accuracy,
    then returns the fitted GridSearchCV object.
    """
    # grid search
    base_forest = RandomForestClassifier(random_state=RANDOM_SEED)
    search_options = dict(cv=cv, scoring='accuracy', n_jobs=-1, verbose=verbose)
    rf_grid = GridSearchCV(base_forest, param_grid, **search_options)

    rf_grid.fit(X_train, y_train)

    # results
    print(f"\nBest parameters: {rf_grid.best_params_}")
    print(f"Best cross-validation Accuracy: {rf_grid.best_score_:.4f}")

    return rf_grid

In [49]:
# Neural Network fine tuning
def train_neural_network_model(X_train, y_train, param_grid, cv=5, verbose=1):
    """
    Grid-search an MLPClassifier over ``param_grid`` with ``cv``-fold
    cross-validation (default 5 folds, unlike the SVM and Random Forest
    helpers, which default to 10).

    Candidates are ranked by accuracy and fitted in parallel (n_jobs=-1).
    Prints the best parameter combination and its cross-validation accuracy,
    then returns the fitted GridSearchCV object.
    """
    # grid search
    base_network = MLPClassifier(random_state=RANDOM_SEED)
    search_options = dict(cv=cv, scoring='accuracy', n_jobs=-1, verbose=verbose)
    nn_grid = GridSearchCV(base_network, param_grid, **search_options)

    nn_grid.fit(X_train, y_train)

    # results
    print(f"\nBest parameters: {nn_grid.best_params_}")
    print(f"Best cross-validation Accuracy: {nn_grid.best_score_:.4f}")

    return nn_grid

In [50]:
# train and test model performance
def analyze_model_performance(model_name, model, X_train, y_train, X_test, y_test, features,
                              dataset_type, imputation_method):
    """
    Evaluate a fitted model, print its report and save its test predictions.

    Parameters
    ----------
    model_name : str
        "SVM", "RandomForest" or "NeuralNetwork". Any other value still works
        and yields a zero-importance placeholder table.
    model : fitted estimator exposing ``predict``.
    X_train, y_train, X_test, y_test : training and test splits.
    features : list of str
        Feature names, in the column order of ``X_train``.
    dataset_type, imputation_method : str
        Labels used in the printed report and the predictions file name.

    Returns
    -------
    dict
        ``results`` (output of evaluate_model), ``feature_importance``
        (DataFrame) and ``predictions_file`` (path of the saved CSV).
    """
    # evaluate performance
    results = evaluate_model(model, X_train, y_train, X_test, y_test)

    print(f"\nTraining performance of {imputation_method} imputed {dataset_type} {model_name}:")
    print(f"Accuracy: {results['train_accuracy']:.4f}, F1-Score: {results['train_f1']:.4f}")

    print(f"\nTest performance of {imputation_method} imputed {dataset_type} {model_name}:")
    print(f"Accuracy: {results['test_accuracy']:.4f}, F1-Score: {results['test_f1']:.4f}")

    # get feature importance
    # Always delegate: get_feature_importance picks the branch from model_name
    # and falls back to zeros for unknown names, so `feature_importance` is
    # never left unbound. The SVM and Random Forest branches ignore the test
    # data, so passing it unconditionally does not change their result.
    feature_importance = get_feature_importance(model, model_name, features, X_test, y_test)

    print(f"\nFeature Importance for {dataset_type.capitalize()} Missing Data {model_name}:")
    print(feature_importance.head(10))

    # save predictions
    save_file = save_predictions(
        y_test, results["test_pred"],
        imputation_method, dataset_type, model_name
    )

    return {
        "results": results,
        "feature_importance": feature_importance,
        "predictions_file": save_file
    }
    

# 1. Complete dataset

In [71]:
# load complete datasets
# "Complete" returns only the train_complete / test_complete frames
datasets = load_datasets("Complete")

## 1.1 SVM

In [72]:
# Split the complete train/test frames into feature matrix and target;
# `features` keeps the feature column names for the importance tables.
X_train_complete, y_train_complete, features = prepare_data(datasets["train_complete"])
X_test_complete, y_test_complete, _ = prepare_data(datasets["test_complete"])

# SVM parameters for complete dataset
# 5 C values x 2 kernels x 2 gamma settings = 20 candidates
param_grid_svm_complete = {
    'C': [10, 20, 30, 40, 50],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# train model
# cv=5 overrides the helper's default of 10 folds
svm_grid_complete = train_svm_model(X_train_complete, y_train_complete, param_grid_svm_complete, cv=5)
best_svm_complete = svm_grid_complete.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.9614


In [73]:
# SVM performance
# "complete" and "None" are labels only: they appear in the printed report
# and in the name of the saved predictions file.
svm_complete_results = analyze_model_performance(
    "SVM", best_svm_complete, X_train_complete, y_train_complete, X_test_complete, y_test_complete,
    features, "complete", "None"
)


Training performance of None imputed complete SVM:
Accuracy: 0.9843, F1-Score: 0.9846

Test performance of None imputed complete SVM:
Accuracy: 0.9767, F1-Score: 0.9773

Feature Importance for Complete Missing Data SVM:
   feature_name  coefficient  abs_coefficient
18       disc_3     4.620519         4.620519
20    cat_0_Yes     4.286011         4.286011
22    cat_2_Yes     4.213657         4.213657
7        cont_7     4.066811         4.066811
16       disc_1     3.695325         3.695325
3        cont_3     3.489271         3.489271
0        cont_0     3.370466         3.370466
1        cont_1     1.606070         1.606070
21    cat_1_Yes     1.570190         1.570190
15       disc_0     1.458330         1.458330
Predictions saved to results/classification/predictions_complete_None_SVM.csv


## 1.2 Random Forest

In [74]:
# Random Forest parameters for complete dataset
# 2 x 3 x 3 x 3 x 2 = 108 candidates; bootstrap and oob_score are held at True
param_grid_rf_complete = {
    'n_estimators': [50, 100],
    'max_depth': [2, 3, 4],
    'min_samples_split': [50, 100, 150],
    'min_samples_leaf': [20, 30, 40],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True],                         
    'oob_score': [True]
}

# train model
# reuses X_train_complete / y_train_complete prepared in the SVM cell above
rf_grid_complete = train_random_forest_model(X_train_complete, y_train_complete, param_grid_rf_complete, cv=10)
best_rf_complete = rf_grid_complete.best_estimator_


Fitting 10 folds for each of 108 candidates, totalling 1080 fits

Best parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 30, 'min_samples_split': 50, 'n_estimators': 100, 'oob_score': True}
Best cross-validation Accuracy: 0.8243


In [75]:
# Random Forest performance
# Evaluates the best grid-search estimator on the complete train/test split
# and saves its test predictions.
rf_complete_results = analyze_model_performance(
    "RandomForest", best_rf_complete, X_train_complete, y_train_complete, X_test_complete, y_test_complete,
    features, "complete", "None"
)


Training performance of None imputed complete RandomForest:
Accuracy: 0.8686, F1-Score: 0.8757

Test performance of None imputed complete RandomForest:
Accuracy: 0.7900, F1-Score: 0.8037

Feature Importance for Complete Missing Data RandomForest:
   feature_name  importance
7        cont_7    0.269249
3        cont_3    0.163734
18       disc_3    0.141353
0        cont_0    0.092660
9        cont_9    0.070920
16       disc_1    0.062508
1        cont_1    0.054330
11      cont_11    0.033219
13      cont_13    0.033022
20    cat_0_Yes    0.015484
Predictions saved to results/classification/predictions_complete_None_RandomForest.csv


## 1.3 Neural Network

In [54]:
# Neural Network parameters for complete dataset
# 4 architectures x 3 activations x 3 alphas x 3 learning rates x 2 max_iter
# values = 216 candidates
param_grid_nn_complete = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  
    'activation': ['relu', 'tanh', 'logistic'],                  
    'alpha': [0.0001, 0.001, 0.01],                             
    'learning_rate_init': [0.001, 0.01, 0.1],                   
    'max_iter': [2000, 3000]                                     
}   

# train model
# reuses X_train_complete / y_train_complete prepared in the SVM cell above
nn_grid_complete = train_neural_network_model(X_train_complete, y_train_complete, param_grid_nn_complete, cv=5)
best_nn_complete = nn_grid_complete.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.01, 'max_iter': 2000}
Best cross-validation Accuracy: 0.9614


In [55]:
# Neural Network performance
# For "NeuralNetwork" the importance table comes from permutation importance
# computed on the test split passed here.
nn_complete_results = analyze_model_performance(
    "NeuralNetwork", best_nn_complete, X_train_complete, y_train_complete, X_test_complete, y_test_complete,
    features, "complete", "None"
)


Training performance of None imputed complete NeuralNetwork:
Accuracy: 1.0000, F1-Score: 1.0000

Test performance of None imputed complete NeuralNetwork:
Accuracy: 0.9733, F1-Score: 0.9742

Feature Importance for Complete Missing Data NeuralNetwork:
   feature_name  importance
18       disc_3    0.162333
7        cont_7    0.155333
16       disc_1    0.133333
3        cont_3    0.131000
0        cont_0    0.127000
20    cat_0_Yes    0.057000
9        cont_9    0.047667
22    cat_2_Yes    0.043000
13      cont_13    0.041333
11      cont_11    0.039667
Predictions saved to results/classification/predictions_complete_None_NeuralNetwork.csv


# 2. KNN imputation

In [56]:
# load KNN datasets
# Rebinds `datasets`: it now holds the KNN-imputed low/high splits plus the
# complete frames.
datasets = load_datasets("KNN")

# 2.1 SVM 

## Low dataset

In [57]:
# low dataset
# X_train_low / y_train_low / X_test_low / y_test_low defined here are reused
# by the Random Forest and Neural Network cells of this section.
X_train_low, y_train_low, features = prepare_data(datasets["train_low"])
X_test_low, y_test_low, _ = prepare_data(datasets["test_low"])

# fine tune
# 4 C values x 2 kernels x 2 gamma settings = 16 candidates
param_grid_svm_low = {
    'C': [10, 15, 20, 25],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# train
svm_grid_low = train_svm_model(X_train_low, y_train_low, param_grid_svm_low, cv=5)
best_svm_low = svm_grid_low.best_estimator_


Fitting 5 folds for each of 16 candidates, totalling 80 fits

Best parameters: {'C': 15, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.9157


In [58]:
# SVM performance
# Scores the tuned SVM on the KNN-imputed low-missingness split and saves
# its test predictions.
svm_low_results = analyze_model_performance(
    "SVM", best_svm_low, X_train_low, y_train_low, X_test_low, y_test_low,
    features, "low", "KNN"
)


Training performance of KNN imputed low SVM:
Accuracy: 0.9371, F1-Score: 0.9387

Test performance of KNN imputed low SVM:
Accuracy: 0.9433, F1-Score: 0.9453

Feature Importance for Low Missing Data SVM:
   feature_name  coefficient  abs_coefficient
18       disc_3     2.217808         2.217808
20    cat_0_Yes     2.158231         2.158231
22    cat_2_Yes     2.057448         2.057448
7        cont_7     2.020510         2.020510
16       disc_1     1.938888         1.938888
3        cont_3     1.795299         1.795299
0        cont_0     1.786422         1.786422
21    cat_1_Yes     1.086582         1.086582
1        cont_1     0.776289         0.776289
9        cont_9     0.760927         0.760927
Predictions saved to results/classification/predictions_low_KNN_SVM.csv


## High dataset

In [59]:
# high dataset
# NOTE: this rebinds the global `features` to the columns of the high
# training frame; later cells that pass `features` see this value.
X_train_high, y_train_high, features = prepare_data(datasets["train_high"])
X_test_high, y_test_high, _ = prepare_data(datasets["test_high"])

# fine tune
# 5 C values x 2 kernels x 2 gamma settings = 20 candidates
param_grid_svm_high = {
    'C': [1, 2, 3, 4, 5],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# train
svm_grid_high = train_svm_model(X_train_high, y_train_high, param_grid_svm_high, cv=5)
best_svm_high = svm_grid_high.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.8357


In [60]:
# SVM performance
# Scores the tuned SVM on the KNN-imputed high-missingness split and saves
# its test predictions.
svm_high_results = analyze_model_performance(
    "SVM", best_svm_high, X_train_high, y_train_high, X_test_high, y_test_high,
    features, "high", "KNN"
)


Training performance of KNN imputed high SVM:
Accuracy: 0.8543, F1-Score: 0.8555

Test performance of KNN imputed high SVM:
Accuracy: 0.8600, F1-Score: 0.8720

Feature Importance for High Missing Data SVM:
   feature_name  coefficient  abs_coefficient
18       disc_3     1.009584         1.009584
7        cont_7     1.004171         1.004171
3        cont_3     0.989985         0.989985
20    cat_0_Yes     0.914793         0.914793
16       disc_1     0.893182         0.893182
22    cat_2_Yes     0.865192         0.865192
0        cont_0     0.741537         0.741537
15       disc_0     0.593023         0.593023
9        cont_9     0.464895         0.464895
23    cat_3_Yes     0.434374         0.434374
Predictions saved to results/classification/predictions_high_KNN_SVM.csv


# 2.2 Random Forest  

## Low data

In [61]:
# low data
# 2 x 3 x 3 x 3 x 2 = 108 candidates; bootstrap and oob_score are held at True
param_grid_rf_low = {
    'n_estimators': [50, 100],
    'max_depth': [2, 3, 4],
    'min_samples_split': [50, 100, 150],
    'min_samples_leaf': [20, 30, 40],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True],                         
    'oob_score': [True]
}

# train
# relies on X_train_low / y_train_low created in the SVM low-dataset cell
rf_grid_low = train_random_forest_model(X_train_low, y_train_low, param_grid_rf_low, cv=10)
best_rf_low = rf_grid_low.best_estimator_

Fitting 10 folds for each of 108 candidates, totalling 1080 fits

Best parameters: {'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 100, 'n_estimators': 100, 'oob_score': True}
Best cross-validation Accuracy: 0.8429


In [62]:
# Random Forest performance
# Feature labels are read from X_train_low itself. The global `features` was
# last rebound by the high-dataset cell, so passing it here would label the
# importances correctly only if the low and high files share the same columns
# in the same order.
rf_low_results = analyze_model_performance(
    "RandomForest", best_rf_low, X_train_low, y_train_low, X_test_low, y_test_low,
    list(X_train_low.columns), "low", "KNN"
)


Training performance of KNN imputed low RandomForest:
Accuracy: 0.8829, F1-Score: 0.8871

Test performance of KNN imputed low RandomForest:
Accuracy: 0.7600, F1-Score: 0.7616

Feature Importance for Low Missing Data RandomForest:
   feature_name  importance
7        cont_7    0.218401
3        cont_3    0.173709
0        cont_0    0.137404
16       disc_1    0.110196
18       disc_3    0.100170
1        cont_1    0.080287
9        cont_9    0.043076
13      cont_13    0.027967
11      cont_11    0.023298
8        cont_8    0.010865
Predictions saved to results/classification/predictions_low_KNN_RandomForest.csv


## High data

In [63]:
# High data
# 2 x 2 x 3 x 3 x 2 = 72 candidates. Unlike the low-data grid, this one uses
# shallower trees, larger split/leaf minimums and class_weight='balanced'.
param_grid_rf_high = {
    'n_estimators': [100, 150],
    'max_depth': [1, 2],
    'min_samples_split': [150, 200, 250],
    'min_samples_leaf': [60, 70, 80],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True],                         
    'oob_score': [True],
    'class_weight': ['balanced']    
}

# train Random forest
# relies on X_train_high / y_train_high created in the SVM high-dataset cell
rf_grid_high = train_random_forest_model(X_train_high, y_train_high, param_grid_rf_high, cv=10)
best_rf_high = rf_grid_high.best_estimator_

Fitting 10 folds for each of 72 candidates, totalling 720 fits

Best parameters: {'bootstrap': True, 'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 60, 'min_samples_split': 150, 'n_estimators': 150, 'oob_score': True}
Best cross-validation Accuracy: 0.8514


In [64]:
# Random Forest performance
# Scores the tuned forest on the KNN-imputed high-missingness split and
# saves its test predictions.
rf_high_results = analyze_model_performance(
    "RandomForest", best_rf_high, X_train_high, y_train_high, X_test_high, y_test_high,
    features, "high", "KNN"
)


Training performance of KNN imputed high RandomForest:
Accuracy: 0.8814, F1-Score: 0.8836

Test performance of KNN imputed high RandomForest:
Accuracy: 0.6667, F1-Score: 0.6988

Feature Importance for High Missing Data RandomForest:
   feature_name  importance
16       disc_1    0.197776
7        cont_7    0.163061
19       disc_4    0.128257
3        cont_3    0.124208
15       disc_0    0.110245
5        cont_5    0.044456
1        cont_1    0.043118
18       disc_3    0.042804
0        cont_0    0.036214
9        cont_9    0.029671
Predictions saved to results/classification/predictions_high_KNN_RandomForest.csv


## 2.3 Neural Network

## Low Data

In [65]:
# low data
# 4 architectures x 3 activations x 3 alphas x 3 learning rates x 2 max_iter
# values = 216 candidates
param_grid_nn_low = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  
    'activation': ['relu', 'tanh', 'logistic'],                               
    'alpha': [0.0001, 0.001, 0.01],                             
    'learning_rate_init': [0.001, 0.01, 0.1],                        
    'max_iter': [2000, 3000]                                                
}

# train Neural Network 
# relies on X_train_low / y_train_low created in the SVM low-dataset cell
nn_grid_low = train_neural_network_model(X_train_low, y_train_low, param_grid_nn_low, cv=5)
best_nn_low = nn_grid_low.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.001, 'max_iter': 2000}
Best cross-validation Accuracy: 0.9071


In [66]:
# Neural Network performance
# Feature labels are read from X_train_low itself. The global `features` was
# last rebound by the high-dataset cell, so passing it here would label the
# importances correctly only if the low and high files share the same columns
# in the same order.
nn_low_results = analyze_model_performance(
    "NeuralNetwork", best_nn_low, X_train_low, y_train_low, X_test_low, y_test_low,
    list(X_train_low.columns), "low", "KNN"
)


Training performance of KNN imputed low NeuralNetwork:
Accuracy: 0.9343, F1-Score: 0.9356

Test performance of KNN imputed low NeuralNetwork:
Accuracy: 0.9300, F1-Score: 0.9325

Feature Importance for Low Missing Data NeuralNetwork:
   feature_name  importance
18       disc_3    0.147000
7        cont_7    0.127333
16       disc_1    0.114667
3        cont_3    0.099000
0        cont_0    0.091333
22    cat_2_Yes    0.040000
20    cat_0_Yes    0.037333
9        cont_9    0.032000
13      cont_13    0.026667
15       disc_0    0.025000
Predictions saved to results/classification/predictions_low_KNN_NeuralNetwork.csv


## KNN Neural Network High Data

In [67]:
# high data
# Same 216-candidate grid as for the low-missingness data
param_grid_nn_high = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],  
    'activation': ['relu', 'tanh', 'logistic'],                  
    'alpha': [0.0001, 0.001, 0.01],                             
    'learning_rate_init': [0.001, 0.01, 0.1],                   
    'max_iter': [2000, 3000]                                     
}

# train Neural Network 
# relies on X_train_high / y_train_high created in the SVM high-dataset cell
nn_grid_high = train_neural_network_model(X_train_high, y_train_high, param_grid_nn_high, cv=5)
best_nn_high = nn_grid_high.best_estimator_


Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate_init': 0.001, 'max_iter': 2000}
Best cross-validation Accuracy: 0.8357


In [68]:
# Neural Network performance
# Scores the tuned network on the KNN-imputed high-missingness split and
# saves its test predictions.
nn_high_results = analyze_model_performance(
    "NeuralNetwork", best_nn_high, X_train_high, y_train_high, X_test_high, y_test_high,
    features, "high", "KNN"
)


Training performance of KNN imputed high NeuralNetwork:
Accuracy: 0.8557, F1-Score: 0.8583

Test performance of KNN imputed high NeuralNetwork:
Accuracy: 0.8467, F1-Score: 0.8623

Feature Importance for High Missing Data NeuralNetwork:
   feature_name  importance
18       disc_3    0.124333
7        cont_7    0.113333
3        cont_3    0.049333
16       disc_1    0.040667
9        cont_9    0.032667
0        cont_0    0.025667
13      cont_13    0.020000
22    cat_2_Yes    0.013000
20    cat_0_Yes    0.012667
15       disc_0    0.007000
Predictions saved to results/classification/predictions_high_KNN_NeuralNetwork.csv


In [78]:
# Summary of the KNN-imputation runs, with the complete-data runs as baseline.
# NOTE: this cell reads svm_low_results, rf_low_results, ... which the MICE
# section rebinds under the same names, so it must run before that section.
print("\n" + "=" * 80)
print("KNN imputation classification summary")
print("\n" + "=" * 80)

# (dataset label, classifier label, analyze_model_performance output),
# listed in the order the rows appear in the report
_knn_summary_sources = (
    # SVM
    ('KNN_Low_Missing', 'SVM', svm_low_results),
    ('KNN_High_Missing', 'SVM', svm_high_results),
    # Random Forest
    ('KNN_Low_Missing', 'Random_Forest', rf_low_results),
    ('KNN_High_Missing', 'Random_Forest', rf_high_results),
    # Neural Network
    ('KNN_Low_Missing', 'Neural_Network', nn_low_results),
    ('KNN_High_Missing', 'Neural_Network', nn_high_results),
    # Complete
    ('Complete_Dataset', 'SVM', svm_complete_results),
    ('Complete_Dataset', 'Random_Forest', rf_complete_results),
    ('Complete_Dataset', 'Neural_Network', nn_complete_results),
)

# one record per (dataset, classifier) pair
knn_results_summary = [
    {
        'Dataset': dataset_label,
        'Classifier': classifier_label,
        'Train_Accuracy': outcome['results']['train_accuracy'],
        'Train_F1': outcome['results']['train_f1'],
        'Test_Accuracy': outcome['results']['test_accuracy'],
        'Test_F1': outcome['results']['test_f1']
    }
    for dataset_label, classifier_label, outcome in _knn_summary_sources
]

knn_results_df = pd.DataFrame(knn_results_summary).round(4)

print("\nKNN Imputation Classification performance summary:")
print(knn_results_df.to_string(index=False))

# pivot tables: datasets as rows, classifiers as columns
print("\n" + "=" * 80)
print("Test Accuracy by Dataset and Classifier")
print("\n" + "=" * 80)
pivot_accuracy = knn_results_df.pivot(
    index='Dataset', columns='Classifier', values='Test_Accuracy'
)
print(pivot_accuracy.round(4))

print("\n" + "=" * 80)
print("Test F1 Score by Dataset and Classifier")
print("\n" + "=" * 80)
pivot_f1 = knn_results_df.pivot(
    index='Dataset', columns='Classifier', values='Test_F1'
)
print(pivot_f1.round(4))

# save results
knn_results_df.to_csv('results/classification/knn_classification_results.csv', index=False)
pivot_accuracy.to_csv('results/classification/knn_accuracy_pivot.csv')
pivot_f1.to_csv('results/classification/knn_f1_pivot.csv')


KNN imputation classification summary


KNN Imputation Classification performance summary:
         Dataset     Classifier  Train_Accuracy  Train_F1  Test_Accuracy  Test_F1
 KNN_Low_Missing            SVM          0.9371    0.9387         0.9433   0.9453
KNN_High_Missing            SVM          0.8543    0.8555         0.8600   0.8720
 KNN_Low_Missing  Random_Forest          0.8829    0.8871         0.7600   0.7616
KNN_High_Missing  Random_Forest          0.8814    0.8836         0.6667   0.6988
 KNN_Low_Missing Neural_Network          0.9343    0.9356         0.9300   0.9325
KNN_High_Missing Neural_Network          0.8557    0.8583         0.8467   0.8623
Complete_Dataset            SVM          0.9843    0.9846         0.9767   0.9773
Complete_Dataset  Random_Forest          0.8686    0.8757         0.7900   0.8037
Complete_Dataset Neural_Network          1.0000    1.0000         0.9733   0.9742

Test Accuracy by Dataset and Classifier

Classifier        Neural_Network  Random_Fores

# 3. MICE imputation


In [79]:
# load MICE datasets
# Rebinds `datasets` again: from here on it holds the MICE-imputed splits
# (plus the complete frames).
datasets = load_datasets("MICE")

# 3.1 SVM Classifier

## Low dataset

In [80]:
# Rebinds the *_low variables (and `features`) to the MICE-imputed
# low-missingness data, replacing the KNN versions.
X_train_low, y_train_low, features = prepare_data(datasets["train_low"])
X_test_low, y_test_low, _ = prepare_data(datasets["test_low"])

# fine tune
# 3 C values x 2 kernels x 2 gamma settings = 12 candidates
param_grid_svm_low = {
    'C': [2, 5, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# train
# cv=10 here, whereas the KNN SVM cells used cv=5
svm_grid_low = train_svm_model(X_train_low, y_train_low, param_grid_svm_low, cv=10)
best_svm_low = svm_grid_low.best_estimator_

Fitting 10 folds for each of 12 candidates, totalling 120 fits

Best parameters: {'C': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.9357


In [81]:
# SVM performance
# NOTE: overwrites svm_low_results from the KNN section; the KNN summary cell
# reads that name, so re-running it after this cell would report MICE numbers.
svm_low_results = analyze_model_performance(
    "SVM", best_svm_low, X_train_low, y_train_low, X_test_low, y_test_low,
    features, "low", "MICE"
)


Training performance of MICE imputed low SVM:
Accuracy: 0.9514, F1-Score: 0.9529

Test performance of MICE imputed low SVM:
Accuracy: 0.9500, F1-Score: 0.9518

Feature Importance for Low Missing Data SVM:
   feature_name  coefficient  abs_coefficient
18       disc_3     2.077493         2.077493
20    cat_0_Yes     2.064850         2.064850
7        cont_7     1.915327         1.915327
22    cat_2_Yes     1.866358         1.866358
0        cont_0     1.724938         1.724938
16       disc_1     1.716302         1.716302
3        cont_3     1.693588         1.693588
21    cat_1_Yes     0.849451         0.849451
1        cont_1     0.724008         0.724008
13      cont_13     0.700471         0.700471
Predictions saved to results/classification/predictions_low_MICE_SVM.csv


## High dataset

In [91]:
# Rebinds the *_high variables (and `features`) to the MICE-imputed
# high-missingness data, replacing the KNN versions.
X_train_high, y_train_high, features = prepare_data(datasets["train_high"])
X_test_high, y_test_high, _ = prepare_data(datasets["test_high"])

# fine tuning
# 5 C values x 2 kernels x 2 gamma settings = 20 candidates
param_grid_svm_high = {
    'C': [1, 2, 3, 4, 5],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# train SVM
svm_grid_high = train_svm_model(X_train_high, y_train_high, param_grid_svm_high, cv=10)
best_svm_high = svm_grid_high.best_estimator_

Fitting 10 folds for each of 20 candidates, totalling 200 fits

Best parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.9400


In [92]:
# SVM performance
# Overwrites svm_high_results from the KNN section with the MICE results.
svm_high_results = analyze_model_performance(
    "SVM", best_svm_high, X_train_high, y_train_high, X_test_high, y_test_high,
    features, "high", "MICE"
)


Training performance of MICE imputed high SVM:
Accuracy: 0.9600, F1-Score: 0.9614

Test performance of MICE imputed high SVM:
Accuracy: 0.8200, F1-Score: 0.8269

Feature Importance for High Missing Data SVM:
   feature_name  coefficient  abs_coefficient
3        cont_3     2.045080         2.045080
16       disc_1     1.872073         1.872073
7        cont_7     1.598041         1.598041
20    cat_0_Yes     1.504586         1.504586
0        cont_0     1.408046         1.408046
18       disc_3     1.402328         1.402328
17       disc_2    -1.215791         1.215791
22    cat_2_Yes     1.192336         1.192336
15       disc_0     1.096440         1.096440
9        cont_9     0.940989         0.940989
Predictions saved to results/classification/predictions_high_MICE_SVM.csv


# 3.2 Random Forest  

## Low data

In [90]:
# Random Forest low data
# 4 x 2 x 3 x 3 x 2 = 144 candidates; deeper trees (max_depth 5/10) than the
# grids used in the KNN section
param_grid_rf_low = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10],
    'min_samples_split': [50, 100, 150],
    'min_samples_leaf': [30, 40, 50],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True],                         
    'oob_score': [True]
}

# train Random Forest 
# relies on the MICE X_train_low / y_train_low from the SVM low-dataset cell
rf_grid_low = train_random_forest_model(X_train_low, y_train_low, param_grid_rf_low, cv=10)
best_rf_low = rf_grid_low.best_estimator_


Fitting 10 folds for each of 144 candidates, totalling 1440 fits

Best parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 30, 'min_samples_split': 50, 'n_estimators': 100, 'oob_score': True}
Best cross-validation Accuracy: 0.8043


In [93]:
# Evaluate the tuned Random Forest on the MICE-imputed low-missingness split.
# NOTE(review): `features` was last assigned by the high-split prepare_data call above;
# this presumably relies on both splits exposing the same feature columns - confirm.
rf_low_results = analyze_model_performance(
    "RandomForest",
    best_rf_low,
    X_train_low,
    y_train_low,
    X_test_low,
    y_test_low,
    features,
    "low",
    "MICE",
)


Training performance of MICE imputed low RandomForest:
Accuracy: 0.8743, F1-Score: 0.8820

Test performance of MICE imputed low RandomForest:
Accuracy: 0.8033, F1-Score: 0.8162

Feature Importance for Low Missing Data RandomForest:
   feature_name  importance
7        cont_7    0.250234
18       disc_3    0.139848
3        cont_3    0.137174
0        cont_0    0.121853
9        cont_9    0.059699
16       disc_1    0.056162
13      cont_13    0.046129
1        cont_1    0.043085
11      cont_11    0.033467
8        cont_8    0.018916
Predictions saved to results/classification/predictions_low_MICE_RandomForest.csv


## High data

In [94]:
# Random Forest candidates for the high-missingness split.
# bootstrap / oob_score hold a single value each, so they do not enlarge the grid
# (4 x 2 x 3 x 3 x 2 = 144 combinations).
param_grid_rf_high = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[5, 10],
    min_samples_split=[50, 100, 150],
    min_samples_leaf=[30, 40, 50],
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
    oob_score=[True],
)

# run the search with cv=10 and keep the best forest
rf_grid_high = train_random_forest_model(
    X_train_high,
    y_train_high,
    param_grid_rf_high,
    cv=10,
)
best_rf_high = rf_grid_high.best_estimator_

Fitting 10 folds for each of 144 candidates, totalling 1440 fits

Best parameters: {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 40, 'min_samples_split': 50, 'n_estimators': 150, 'oob_score': True}
Best cross-validation Accuracy: 0.8086


In [None]:
# Evaluate the tuned Random Forest on the MICE-imputed high-missingness split
rf_high_results = analyze_model_performance(
    "RandomForest",
    best_rf_high,
    X_train_high,
    y_train_high,
    X_test_high,
    y_test_high,
    features,
    "high",
    "MICE",
)


Training performance of MICE imputed high RandomForest:
Accuracy: 0.8714, F1-Score: 0.8767

Test performance of MICE imputed high RandomForest:
Accuracy: 0.8033, F1-Score: 0.8127

Feature Importance for High Missing Data RandomForest:
   feature_name  importance
7        cont_7    0.262234
18       disc_3    0.138783
3        cont_3    0.109310
1        cont_1    0.071751
0        cont_0    0.070087
9        cont_9    0.064326
16       disc_1    0.051261
15       disc_0    0.043541
11      cont_11    0.041286
13      cont_13    0.033040
Predictions saved to results/classification/predictions_high_MICE_RandomForest.csv


# 3.3 Neural Network

## Low Data

In [96]:
# Neural Network candidates for the low-missingness split
# (4 x 3 x 3 x 3 x 2 = 216 combinations)
param_grid_nn_low = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50)],
    activation=['relu', 'tanh', 'logistic'],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate_init=[0.001, 0.01, 0.1],
    max_iter=[2000, 3000],
)

# run the search with cv=5 and keep the best network
nn_grid_low = train_neural_network_model(
    X_train_low,
    y_train_low,
    param_grid_nn_low,
    cv=5,
)
best_nn_low = nn_grid_low.best_estimator_

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters: {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.01, 'max_iter': 2000}
Best cross-validation Accuracy: 0.9314


In [98]:
# Evaluate the tuned Neural Network on the MICE-imputed low-missingness split.
# NOTE(review): `features` was last assigned by the high-split prepare_data call above;
# this presumably relies on both splits exposing the same feature columns - confirm.
nn_low_results = analyze_model_performance(
    "NeuralNetwork",
    best_nn_low,
    X_train_low,
    y_train_low,
    X_test_low,
    y_test_low,
    features,
    "low",
    "MICE",
)


Training performance of MICE imputed low NeuralNetwork:
Accuracy: 1.0000, F1-Score: 1.0000

Test performance of MICE imputed low NeuralNetwork:
Accuracy: 0.9500, F1-Score: 0.9511

Feature Importance for Low Missing Data NeuralNetwork:
   feature_name  importance
18       disc_3    0.141000
7        cont_7    0.125333
0        cont_0    0.105000
3        cont_3    0.101333
16       disc_1    0.100000
20    cat_0_Yes    0.046667
22    cat_2_Yes    0.037000
1        cont_1    0.029667
11      cont_11    0.022333
5        cont_5    0.020000
Predictions saved to results/classification/predictions_low_MICE_NeuralNetwork.csv


## High Data

In [105]:
# Neural Network candidates for the high-missingness split: small single hidden
# layers and large alpha values (2 x 3 x 3 x 2 x 2 = 72 combinations)
param_grid_nn_high = dict(
    hidden_layer_sizes=[(10,), (15,)],
    activation=['relu', 'tanh', 'logistic'],
    alpha=[1.0, 2.0, 5.0],
    learning_rate_init=[0.001, 0.01],
    max_iter=[2000, 3000],
)

# run the search with cv=5 and keep the best network
nn_grid_high = train_neural_network_model(
    X_train_high,
    y_train_high,
    param_grid_nn_high,
    cv=5,
)
best_nn_high = nn_grid_high.best_estimator_


Fitting 5 folds for each of 72 candidates, totalling 360 fits

Best parameters: {'activation': 'relu', 'alpha': 2.0, 'hidden_layer_sizes': (10,), 'learning_rate_init': 0.01, 'max_iter': 2000}
Best cross-validation Accuracy: 0.9400


In [106]:
# Evaluate the tuned Neural Network on the MICE-imputed high-missingness split
nn_high_results = analyze_model_performance(
    "NeuralNetwork",
    best_nn_high,
    X_train_high,
    y_train_high,
    X_test_high,
    y_test_high,
    features,
    "high",
    "MICE",
)


Training performance of MICE imputed high NeuralNetwork:
Accuracy: 0.9557, F1-Score: 0.9570

Test performance of MICE imputed high NeuralNetwork:
Accuracy: 0.8233, F1-Score: 0.8307

Feature Importance for High Missing Data NeuralNetwork:
   feature_name  importance
16       disc_1    0.082000
18       disc_3    0.078000
0        cont_0    0.074000
3        cont_3    0.069667
7        cont_7    0.064667
15       disc_0    0.029333
9        cont_9    0.027333
22    cat_2_Yes    0.013333
20    cat_0_Yes    0.012667
13      cont_13    0.004667
Predictions saved to results/classification/predictions_high_MICE_NeuralNetwork.csv


# MICE imputation results for all classification methods

In [108]:
print("=" * 80, "MICE IMPUTATION RESULTS SUMMARY", "=" * 80, sep="\n")

# One row per (dataset, classifier) pair, in report order.
# The *_low_results / *_high_results names are reused by every imputation section,
# so this cell must run right after the MICE model cells to pick up MICE results.
mice_results_summary = [
    {
        'Dataset': dataset_label,
        'Classifier': classifier_label,
        'Train_Accuracy': analysis['results']['train_accuracy'],
        'Train_F1': analysis['results']['train_f1'],
        'Test_Accuracy': analysis['results']['test_accuracy'],
        'Test_F1': analysis['results']['test_f1'],
    }
    for dataset_label, classifier_label, analysis in [
        # SVM
        ('MICE_Low_Missing', 'SVM', svm_low_results),
        ('MICE_High_Missing', 'SVM', svm_high_results),
        # Random Forest
        ('MICE_Low_Missing', 'Random_Forest', rf_low_results),
        ('MICE_High_Missing', 'Random_Forest', rf_high_results),
        # Neural Network
        ('MICE_Low_Missing', 'Neural_Network', nn_low_results),
        ('MICE_High_Missing', 'Neural_Network', nn_high_results),
        # Complete dataset baseline
        ('Complete_Dataset', 'SVM', svm_complete_results),
        ('Complete_Dataset', 'Random_Forest', rf_complete_results),
        ('Complete_Dataset', 'Neural_Network', nn_complete_results),
    ]
]

mice_results_df = pd.DataFrame(mice_results_summary).round(4)

print("\nMICE Imputation Classification performance summary:")
print(mice_results_df.to_string(index=False))

# pivot tables: one row per dataset, one column per classifier
print("=" * 80, "Test Accuracy by Dataset and Classifier", "=" * 80, sep="\n")
pivot_accuracy = mice_results_df.pivot(
    index='Dataset',
    columns='Classifier',
    values='Test_Accuracy',
)
print(pivot_accuracy.round(4))

print("=" * 80, "Test F1 Score by Dataset and Classifier", "=" * 80, sep="\n")
pivot_f1 = mice_results_df.pivot(
    index='Dataset',
    columns='Classifier',
    values='Test_F1',
)
print(pivot_f1.round(4))

# persist the long table and both pivots
mice_results_df.to_csv('results/classification/mice_classification_results.csv', index=False)
pivot_accuracy.to_csv('results/classification/mice_accuracy_pivot.csv')
pivot_f1.to_csv('results/classification/mice_f1_pivot.csv')

MICE IMPUTATION RESULTS SUMMARY

MICE Imputation Classification performance summary:
          Dataset     Classifier  Train_Accuracy  Train_F1  Test_Accuracy  Test_F1
 MICE_Low_Missing            SVM          0.9514    0.9529         0.9500   0.9518
MICE_High_Missing            SVM          0.9600    0.9614         0.8200   0.8269
 MICE_Low_Missing  Random_Forest          0.8743    0.8820         0.8033   0.8162
MICE_High_Missing  Random_Forest          0.8714    0.8767         0.8033   0.8127
 MICE_Low_Missing Neural_Network          1.0000    1.0000         0.9500   0.9511
MICE_High_Missing Neural_Network          0.9557    0.9570         0.8233   0.8307
 Complete_Dataset            SVM          0.9843    0.9846         0.9767   0.9773
 Complete_Dataset  Random_Forest          0.8686    0.8757         0.7900   0.8037
 Complete_Dataset Neural_Network          1.0000    1.0000         0.9733   0.9742
Test Accuracy by Dataset and Classifier
Classifier         Neural_Network  Random_For

# 4. Simple Imputer

In [109]:
# Switch `datasets` over to the Simple-imputed frames (replaces the previously loaded set)
datasets = load_datasets(imputation_method="Simple")

# 4.1 SVM

## Low data

In [111]:
# Low-missingness split: build the model inputs from the loaded frames.
# `features` comes from the training frame; the third value of the test call is discarded.
X_train_low, y_train_low, features = prepare_data(datasets["train_low"])
X_test_low, y_test_low, _ = prepare_data(datasets["test_low"])

# SVM hyper-parameter candidates (6 x 2 x 2 = 24 combinations)
param_grid_svm_low = dict(
    C=[2, 5, 10, 15, 20, 25],
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# run the search with cv=5 and keep the best estimator it found
svm_grid_low = train_svm_model(
    X_train_low,
    y_train_low,
    param_grid_svm_low,
    cv=5,
)
best_svm_low = svm_grid_low.best_estimator_


Fitting 5 folds for each of 24 candidates, totalling 120 fits

Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.8814


In [112]:
# Evaluate the tuned SVM on the Simple-imputed low-missingness split
svm_low_results = analyze_model_performance(
    "SVM",
    best_svm_low,
    X_train_low,
    y_train_low,
    X_test_low,
    y_test_low,
    features,
    "low",
    "Simple",
)


Training performance of Simple imputed low SVM:
Accuracy: 0.9071, F1-Score: 0.9096

Test performance of Simple imputed low SVM:
Accuracy: 0.9500, F1-Score: 0.9515

Feature Importance for Low Missing Data SVM:
   feature_name  coefficient  abs_coefficient
18       disc_3     1.616319         1.616319
7        cont_7     1.580065         1.580065
20    cat_0_Yes     1.467887         1.467887
16       disc_1     1.457456         1.457456
22    cat_2_Yes     1.453295         1.453295
0        cont_0     1.324782         1.324782
3        cont_3     1.300425         1.300425
21    cat_1_Yes     0.646626         0.646626
13      cont_13     0.620832         0.620832
9        cont_9     0.617603         0.617603
Predictions saved to results/classification/predictions_low_Simple_SVM.csv


## High data

In [113]:
# High-missingness split: build the model inputs from the loaded frames.
# `features` comes from the training frame; the third value of the test call is discarded.
X_train_high, y_train_high, features = prepare_data(datasets["train_high"])
X_test_high, y_test_high, _ = prepare_data(datasets["test_high"])

# SVM hyper-parameter candidates (5 x 2 x 2 = 20 combinations)
param_grid_svm_high = dict(
    C=list(range(1, 6)),
    kernel=['linear', 'rbf'],
    gamma=['scale', 'auto'],
)

# run the search with cv=5 and keep the best estimator it found
svm_grid_high = train_svm_model(
    X_train_high,
    y_train_high,
    param_grid_svm_high,
    cv=5,
)
best_svm_high = svm_grid_high.best_estimator_


Fitting 5 folds for each of 20 candidates, totalling 100 fits

Best parameters: {'C': 3, 'gamma': 'scale', 'kernel': 'linear'}
Best cross-validation Accuracy: 0.8157


In [114]:
# Evaluate the tuned SVM on the Simple-imputed high-missingness split
svm_high_results = analyze_model_performance(
    "SVM",
    best_svm_high,
    X_train_high,
    y_train_high,
    X_test_high,
    y_test_high,
    features,
    "high",
    "Simple",
)


Training performance of Simple imputed high SVM:
Accuracy: 0.8386, F1-Score: 0.8388

Test performance of Simple imputed high SVM:
Accuracy: 0.8800, F1-Score: 0.8882

Feature Importance for High Missing Data SVM:
   feature_name  coefficient  abs_coefficient
18       disc_3     1.115167         1.115167
7        cont_7     1.108995         1.108995
20    cat_0_Yes     1.083222         1.083222
16       disc_1     0.917040         0.917040
3        cont_3     0.895048         0.895048
0        cont_0     0.887821         0.887821
22    cat_2_Yes     0.815311         0.815311
9        cont_9     0.514801         0.514801
15       disc_0     0.485869         0.485869
23    cat_3_Yes     0.439827         0.439827
Predictions saved to results/classification/predictions_high_Simple_SVM.csv


# 4.2 Random Forest

## Low data

In [115]:
# Random Forest candidates for the low-missingness split.
# bootstrap / oob_score hold a single value each, so they do not enlarge the grid
# (2 x 3 x 3 x 3 x 2 = 108 combinations).
param_grid_rf_low = dict(
    n_estimators=[50, 100],
    max_depth=[2, 3, 4],
    min_samples_split=[50, 100, 150],
    min_samples_leaf=[20, 30, 40],
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
    oob_score=[True],
)

# run the search with cv=10 and keep the best forest
rf_grid_low = train_random_forest_model(
    X_train_low,
    y_train_low,
    param_grid_rf_low,
    cv=10,
)
best_rf_low = rf_grid_low.best_estimator_


Fitting 10 folds for each of 108 candidates, totalling 1080 fits

Best parameters: {'bootstrap': True, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 20, 'min_samples_split': 100, 'n_estimators': 100, 'oob_score': True}
Best cross-validation Accuracy: 0.7914


In [116]:
# Evaluate the tuned Random Forest on the Simple-imputed low-missingness split.
# NOTE(review): `features` was last assigned by the high-split prepare_data call above;
# this presumably relies on both splits exposing the same feature columns - confirm.
rf_low_results = analyze_model_performance(
    "RandomForest",
    best_rf_low,
    X_train_low,
    y_train_low,
    X_test_low,
    y_test_low,
    features,
    "low",
    "Simple",
)


Training performance of Simple imputed low RandomForest:
Accuracy: 0.8386, F1-Score: 0.8487

Test performance of Simple imputed low RandomForest:
Accuracy: 0.7800, F1-Score: 0.7885

Feature Importance for Low Missing Data RandomForest:
   feature_name  importance
7        cont_7    0.290323
18       disc_3    0.158698
3        cont_3    0.121800
9        cont_9    0.082002
0        cont_0    0.079184
13      cont_13    0.049370
16       disc_1    0.047804
1        cont_1    0.038454
11      cont_11    0.037351
8        cont_8    0.016305
Predictions saved to results/classification/predictions_low_Simple_RandomForest.csv


## High data

In [117]:
# Random Forest candidates for the high-missingness split: very shallow trees with
# large split/leaf minimums. bootstrap / oob_score hold a single value each, so they
# do not enlarge the grid (2 x 2 x 3 x 3 x 2 = 72 combinations).
param_grid_rf_high = dict(
    n_estimators=[100, 150],
    max_depth=[1, 2],
    min_samples_split=[150, 200, 250],
    min_samples_leaf=[60, 70, 80],
    max_features=['sqrt', 'log2'],
    bootstrap=[True],
    oob_score=[True],
)

# run the search with cv=10 and keep the best forest
rf_grid_high = train_random_forest_model(
    X_train_high,
    y_train_high,
    param_grid_rf_high,
    cv=10,
)
best_rf_high = rf_grid_high.best_estimator_


Fitting 10 folds for each of 72 candidates, totalling 720 fits

Best parameters: {'bootstrap': True, 'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 60, 'min_samples_split': 250, 'n_estimators': 150, 'oob_score': True}
Best cross-validation Accuracy: 0.7600


In [118]:
# Evaluate the tuned Random Forest on the Simple-imputed high-missingness split
rf_high_results = analyze_model_performance(
    "RandomForest",
    best_rf_high,
    X_train_high,
    y_train_high,
    X_test_high,
    y_test_high,
    features,
    "high",
    "Simple",
)


Training performance of Simple imputed high RandomForest:
Accuracy: 0.7743, F1-Score: 0.7927

Test performance of Simple imputed high RandomForest:
Accuracy: 0.7000, F1-Score: 0.7273

Feature Importance for High Missing Data RandomForest:
   feature_name  importance
7        cont_7    0.213331
9        cont_9    0.132970
18       disc_3    0.124307
11      cont_11    0.106349
3        cont_3    0.073656
0        cont_0    0.070218
13      cont_13    0.063492
16       disc_1    0.058489
8        cont_8    0.027258
6        cont_6    0.023280
Predictions saved to results/classification/predictions_high_Simple_RandomForest.csv


# 4.3 Neural Network

## Low data

In [119]:
# Neural Network candidates for the low-missingness split
# (4 x 3 x 3 x 3 x 2 = 216 combinations)
param_grid_nn_low = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50)],
    activation=['relu', 'tanh', 'logistic'],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate_init=[0.001, 0.01, 0.1],
    max_iter=[2000, 3000],
)

# run the search with cv=5 and keep the best network
nn_grid_low = train_neural_network_model(
    X_train_low,
    y_train_low,
    param_grid_nn_low,
    cv=5,
)
best_nn_low = nn_grid_low.best_estimator_

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate_init': 0.001, 'max_iter': 2000}
Best cross-validation Accuracy: 0.8914


In [120]:
# Evaluate the tuned Neural Network on the Simple-imputed low-missingness split.
# NOTE(review): `features` was last assigned by the high-split prepare_data call above;
# this presumably relies on both splits exposing the same feature columns - confirm.
nn_low_results = analyze_model_performance(
    "NeuralNetwork",
    best_nn_low,
    X_train_low,
    y_train_low,
    X_test_low,
    y_test_low,
    features,
    "low",
    "Simple",
)


Training performance of Simple imputed low NeuralNetwork:
Accuracy: 0.9157, F1-Score: 0.9184

Test performance of Simple imputed low NeuralNetwork:
Accuracy: 0.9433, F1-Score: 0.9450

Feature Importance for Low Missing Data NeuralNetwork:
   feature_name  importance
18       disc_3    0.159667
7        cont_7    0.139333
16       disc_1    0.121000
3        cont_3    0.106333
0        cont_0    0.099333
20    cat_0_Yes    0.046000
9        cont_9    0.043667
22    cat_2_Yes    0.041667
11      cont_11    0.039000
13      cont_13    0.033000
Predictions saved to results/classification/predictions_low_Simple_NeuralNetwork.csv


## High data

In [121]:
# Neural Network candidates for the high-missingness split
# (4 x 3 x 3 x 3 x 2 = 216 combinations)
param_grid_nn_high = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50), (100, 50)],
    activation=['relu', 'tanh', 'logistic'],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate_init=[0.001, 0.01, 0.1],
    max_iter=[2000, 3000],
)

# run the search with cv=5 and keep the best network
nn_grid_high = train_neural_network_model(
    X_train_high,
    y_train_high,
    param_grid_nn_high,
    cv=5,
)
best_nn_high = nn_grid_high.best_estimator_

Fitting 5 folds for each of 216 candidates, totalling 1080 fits

Best parameters: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001, 'max_iter': 2000}
Best cross-validation Accuracy: 0.8186


In [122]:
# Evaluate the tuned Neural Network on the Simple-imputed high-missingness split
nn_high_results = analyze_model_performance(
    "NeuralNetwork",
    best_nn_high,
    X_train_high,
    y_train_high,
    X_test_high,
    y_test_high,
    features,
    "high",
    "Simple",
)


Training performance of Simple imputed high NeuralNetwork:
Accuracy: 0.8400, F1-Score: 0.8427

Test performance of Simple imputed high NeuralNetwork:
Accuracy: 0.8667, F1-Score: 0.8750

Feature Importance for High Missing Data NeuralNetwork:
   feature_name  importance
18       disc_3    0.136333
7        cont_7    0.131333
16       disc_1    0.059667
3        cont_3    0.056000
0        cont_0    0.045667
13      cont_13    0.040333
9        cont_9    0.037667
22    cat_2_Yes    0.024000
20    cat_0_Yes    0.021667
11      cont_11    0.014667
Predictions saved to results/classification/predictions_high_Simple_NeuralNetwork.csv


# Simple summary

In [123]:
print("=" * 80)
print("SIMPLE IMPUTATION RESULTS SUMMARY")
print("=" * 80)

# One row per (dataset, classifier) pair, in report order.
# The *_low_results / *_high_results names are reused by every imputation section,
# so this cell must run right after the Simple model cells to pick up Simple results.
simple_results_summary = [
    {
        'Dataset': dataset_label,
        'Classifier': classifier_label,
        'Train_Accuracy': analysis['results']['train_accuracy'],
        'Train_F1': analysis['results']['train_f1'],
        'Test_Accuracy': analysis['results']['test_accuracy'],
        'Test_F1': analysis['results']['test_f1'],
    }
    for dataset_label, classifier_label, analysis in [
        # SVM
        ('Simple_Low_Missing', 'SVM', svm_low_results),
        ('Simple_High_Missing', 'SVM', svm_high_results),
        # Random Forest
        ('Simple_Low_Missing', 'Random_Forest', rf_low_results),
        ('Simple_High_Missing', 'Random_Forest', rf_high_results),
        # Neural Network
        ('Simple_Low_Missing', 'Neural_Network', nn_low_results),
        ('Simple_High_Missing', 'Neural_Network', nn_high_results),
        # Complete dataset baseline
        ('Complete_Dataset', 'SVM', svm_complete_results),
        ('Complete_Dataset', 'Random_Forest', rf_complete_results),
        ('Complete_Dataset', 'Neural_Network', nn_complete_results),
    ]
]

simple_results_df = pd.DataFrame(simple_results_summary).round(4)

print("\nSimple Imputation Classification performance summary:")
print(simple_results_df.to_string(index=False))

# pivot tables: one row per dataset, one column per classifier
print("=" * 80)
print("Test Accuracy by Dataset and Classifier")
print("=" * 80)
pivot_accuracy = simple_results_df.pivot(index='Dataset', columns='Classifier', values='Test_Accuracy')
print(pivot_accuracy.round(4))

print("=" * 80)
print("Test F1 Score by Dataset and Classifier")
print("=" * 80)
pivot_f1 = simple_results_df.pivot(index='Dataset', columns='Classifier', values='Test_F1')
print(pivot_f1.round(4))

# save results
simple_results_df.to_csv('results/classification/simple_classification_results.csv', index=False)
pivot_accuracy.to_csv('results/classification/simple_accuracy_pivot.csv')
# The F1 pivot was computed and printed but never written; save it alongside the
# accuracy pivot, mirroring the MICE summary's mice_f1_pivot.csv.
pivot_f1.to_csv('results/classification/simple_f1_pivot.csv')

SIMPLE IMPUTATION RESULTS SUMMARY

Simple Imputation Classification performance summary:
            Dataset     Classifier  Train_Accuracy  Train_F1  Test_Accuracy  Test_F1
 Simple_Low_Missing            SVM          0.9071    0.9096         0.9500   0.9515
Simple_High_Missing            SVM          0.8386    0.8388         0.8800   0.8882
 Simple_Low_Missing  Random_Forest          0.8386    0.8487         0.7800   0.7885
Simple_High_Missing  Random_Forest          0.7743    0.7927         0.7000   0.7273
 Simple_Low_Missing Neural_Network          0.9157    0.9184         0.9433   0.9450
Simple_High_Missing Neural_Network          0.8400    0.8427         0.8667   0.8750
   Complete_Dataset            SVM          0.9843    0.9846         0.9767   0.9773
   Complete_Dataset  Random_Forest          0.8686    0.8757         0.7900   0.8037
   Complete_Dataset Neural_Network          1.0000    1.0000         0.9733   0.9742
Test Accuracy by Dataset and Classifier
Classifier           

# Master comparison table

In [None]:
# master comparison for all imputation methods
print("=" * 80)
print("MASTER COMPARISON - ALL IMPUTATION METHODS")
print("=" * 80)


def _tag_from_dataset(dataset_name, tags):
    """Map a Dataset label to 'None' (complete data), the first tag it contains, or 'Unknown'."""
    if 'Complete' in dataset_name:
        return 'None'
    return next((tag for tag in tags if tag in dataset_name), 'Unknown')


def _test_summary_for_level(results, level):
    """Return (rows for `level` plus the complete-data baseline, their Test_* pivot rounded to 4 dp)."""
    subset = results[results['Missing_Level'].isin([level, 'None'])]
    summary = subset.pivot_table(
        index='Imputation',
        columns='Classifier',
        values=['Test_Accuracy', 'Test_F1']
    ).round(4)
    return subset, summary


# The MICE and Simple frames both carry the same Complete_Dataset baseline rows, so
# stack everything and keep one copy of each row. ignore_index / reset_index give the
# combined frame a unique index instead of overlapping labels from each input frame.
all_results = (
    pd.concat([knn_results_df, mice_results_df, simple_results_df], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# derive the grouping columns from the Dataset label
all_results['Imputation'] = all_results['Dataset'].apply(
    lambda name: _tag_from_dataset(name, ('KNN', 'MICE', 'Simple'))
)
all_results['Missing_Level'] = all_results['Dataset'].apply(
    lambda name: _tag_from_dataset(name, ('Low', 'High'))
)

# 1. test results low (complete-data rows are included as the 'None' baseline)
print("\nTEST RESULTS - LOW MISSING DATA")
print("-" * 50)
low_results, low_summary = _test_summary_for_level(all_results, 'Low')

print(low_summary)
low_summary.to_csv('results/classification/low_missing_results.csv')

# 2. test results high (same baseline rows)
print("\nTEST RESULTS - HIGH MISSING DATA")
print("-" * 50)
high_results, high_summary = _test_summary_for_level(all_results, 'High')

print(high_summary)
high_summary.to_csv('results/classification/high_missing_results.csv')



MASTER COMPARISON - ALL IMPUTATION METHODS

TEST RESULTS - LOW MISSING DATA
--------------------------------------------------
            Test_Accuracy                              Test_F1                \
Classifier Neural_Network Random_Forest     SVM Neural_Network Random_Forest   
Imputation                                                                     
KNN                0.9300        0.7600  0.9433         0.9325        0.7616   
MICE               0.9500        0.8033  0.9500         0.9511        0.8162   
None               0.9733        0.7900  0.9767         0.9742        0.8037   
Simple             0.9433        0.7800  0.9500         0.9450        0.7885   

                    
Classifier     SVM  
Imputation          
KNN         0.9453  
MICE        0.9518  
None        0.9773  
Simple      0.9515  

TEST RESULTS - HIGH MISSING DATA
--------------------------------------------------
            Test_Accuracy                              Test_F1                \
