In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBRegressor

from missforest.missforest import MissForest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import pickle as pkl

from impute import impute

/Users/francescobraicovich/Documents/Personale/default_prediction_bemacs/src/models
True


In [2]:
# open the data
train_df = pd.read_csv('../../../data/raw/train.csv')
test_df = pd.read_csv('../../../data/raw/test.csv')

# save the length of the train data
ntrain = train_df.shape[0]

# concatenate the data
data = pd.concat([train_df, test_df], ignore_index=True)
del train_df, test_df

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.0,0.034949,59.0,0.0,0.004933,7959.688894,5.0,0.0,0.0,0.0,0.0
1,1,0.0,0.155308,47.0,0.0,881.0,,6.0,0.0,1.0,0.0,0.0
2,2,0.0,0.165166,62.0,1.0,0.020327,2851.722407,8.0,0.0,0.0,0.0,0.0
3,3,0.0,0.010886,61.0,0.0,0.642979,1115.657341,6.0,0.0,1.0,0.0,0.0
4,4,0.0,0.000717,49.0,0.0,3603.0,,15.0,0.0,3.0,0.0,0.0


In [4]:
data.isna().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                        37500
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [5]:
# count the unique values for each column
data.nunique()

Unnamed: 0                              112500
SeriousDlqin2yrs                             2
RevolvingUtilizationOfUnsecuredLines    125728
age                                         89
NumberOfTime30-59DaysPastDueNotWorse        16
DebtRatio                               114194
MonthlyIncome                           118636
NumberOfOpenCreditLinesAndLoans             58
NumberOfTimes90DaysLate                     19
NumberRealEstateLoansOrLines                28
NumberOfTime60-89DaysPastDueNotWorse        13
NumberOfDependents                          13
dtype: int64

In [6]:
data_to_impute = data.copy()

# remove the columns
columns_to_drop = ['Unnamed: 0', 'SeriousDlqin2yrs']
data_to_impute.drop(columns_to_drop, axis=1, inplace=True)
length_of_dataset = data_to_impute.shape[0]

# calculate number of missing values by column
missing_values_by_column = data_to_impute.isna().sum()
prc_missing_values_by_column = missing_values_by_column / length_of_dataset

# drop missing values
data_to_impute.dropna(inplace=True)

# split the data into train and test
X_train, X_test = train_test_split(data_to_impute, test_size=0.2, random_state=0) # 80% train, 20% test both with no missing values
X_train, X_test = X_train.values, X_test.values # convert to numpy arrays
y_train, y_test = np.copy(X_train), np.copy(X_test) # copy the test data to use as the ground truth

# assert there are np missing values in any of the datasets
assert np.isnan(X_train).sum() == 0
assert np.isnan(X_test).sum() == 0
assert np.isnan(y_train).sum() == 0
assert np.isnan(y_test).sum() == 0

del data_to_impute

In [7]:
def create_missing_values(X, prc_missing_values_by_column, seed=0):
    
    np.random.seed(seed)
    # create a boolean array of the size of the data
    missing_value_mask = np.zeros(X.shape, dtype=bool)

    for i in range(X_test.shape[1]):
        prc_missing = prc_missing_values_by_column[i]

        # calculate how many missing values to create
        n_to_nan = int(prc_missing * X_test.shape[0])

        # randomly select n_missing indexes for the column
        idx = np.random.choice(X_test.shape[0], n_to_nan, replace=False)
        missing_value_mask[idx, i] = True

    # create the missing values
    X[missing_value_mask] = np.nan

    return X, missing_value_mask

# create missing values in the train data
X_train, missing_value_mask_train = create_missing_values(X_train.copy(), prc_missing_values_by_column)

# create missing values in the test data
X_test, missing_value_mask_test = create_missing_values(X_test.copy(), prc_missing_values_by_column)


In [8]:
def update_metrics(metrics_df, X_test_imputed, y, mask, model_name):
    """
    Update the metrics dataframe with Mean Absolute Error (MAE) and Mean Squared Error (MSE) for each missing index.

    Parameters:
    - metrics_df (pandas.DataFrame): The metrics dataframe to update.
    - X_test_imputed (numpy.ndarray): The imputed test data.
    - y (numpy.ndarray): The ground truth test data.
    - mask (numpy.ndarray): The mask indicating missing values.
    - model_name (str): The name of the model.

    Returns:
    - metrics_df (pandas.DataFrame): The updated metrics dataframe.
    """
    missing_idx = np.unique(np.where(mask)[1])
    for i in missing_idx:
        mae = np.mean(np.abs(X_test_imputed[mask[:, i], i] - y[mask[:, i], i]))
        mse = np.mean((X_test_imputed[mask[:, i], i] - y[mask[:, i], i])**2)

        metrics_df.loc[model_name, f'mae_{i}'] = mae
        metrics_df.loc[model_name, f'mse_{i}'] = mse

    return metrics_df

# Simple Imputer

In [9]:
# impute the data with the simple imputer
imputer = SimpleImputer(strategy='mean')

# fit the imputer
imputer.fit(X_train.copy())

# create a dataframe to store the metrics
missing_idx = np.unique(np.where(missing_value_mask_test)[1])
columns_of_df = [f'mae_{i}' for i in missing_idx] + [f'mse_{i}' for i in missing_idx]
metrics = pd.DataFrame(columns=columns_of_df)

# impute the data
X_test_imputed = imputer.transform(X_test.copy())

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'SimpleImputer')
print(metrics)

                     mae_4     mae_9          mse_4     mse_9
SimpleImputer  3307.973839  0.956714  27552228.6627  1.409101


# Iterative Imputer

In [10]:
def neg_prc_mae(estimator, X, y, parameters):

    y_pred = estimator.set_params(**parameters).transform(X.copy())

    negative_value_mask = y_pred < 0
    sum_of_negative_values = np.sum(y_pred[negative_value_mask])

    # calculate the mae for each column
    mae = np.mean((y_pred - y))

    # calculate prc_mae for each column
    prc_mae = mae / np.mean(np.abs(y))

    # calculate r squared
    r_squared = 1 - np.sum((y - y_pred)**2) / np.sum((y - np.mean(y))**2)

    # sum the prc_mae for each column
    prc_mae_sum = np.sum(prc_mae)

    return - prc_mae_sum + r_squared + sum_of_negative_values

In [11]:
# define the path to save the model
path_to_save = '../../../models/impute/iterative_imputer.pkl'

model = IterativeImputer(random_state=0)

parameter_space = {
    'max_iter': [25, 50, 75],
    'n_nearest_features': [5, 10, 15],
    'initial_strategy': ['mean', 'median', 'most_frequent'],
}

retrain = True

# scoring function, additional variables added only for compatibility with the scoring function
scoring = lambda estimator, X, y, **param: neg_prc_mae(estimator, X, y, param)

# impute the data with the iterative imputer
X_test_imputed = impute(X_train.copy(), X_test.copy(), y_train.copy(), model,
                        param_space=parameter_space, scoring=scoring, 
                        path_to_save=path_to_save, normalize=False, retrain_if_exists=retrain)

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'IterativeImputer')
print(metrics)

Fine tuning the imputer...
Fitting 3 folds for each of 27 candidates, totalling 81 fits

Best Score: [1m0.6372152976490068[0m
Best Hyperparameters:
initial_strategy: mean
max_iter: 25
n_nearest_features: 5

Fine tuning completed.
Imputer saved.
Imputing the test data...
Imputation completed.
                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3307.973839  0.956714    27552228.6627  1.409101
IterativeImputer  2952.872716  0.906348  24813904.064018  1.302942


# KNN Imputer

In [12]:
# impute the data with knn imputer
path = '../../../models/impute/knn_imputer.pkl'
path_data = '../../../data/processed/impute_test/knn_imputed.pkl'

model = KNNImputer()

parameter_space = {
    'n_neighbors': [3, 5, 6,  7],
    'weights': ['uniform', 'distance'],
    'metric': ['nan_euclidean'],
}

# scoring function, additional variables added only for compatibility with the scoring function
scoring = lambda estimator, X, y, **param: neg_prc_mae(estimator, X, y.copy(), param)

retrain = True

# impute the data with the knn imputer
X_test_imputed = impute(X_train.copy(), X_test.copy(), y_train.copy(), model,
                        param_space=parameter_space, scoring=scoring, 
                        path_to_save=path, path_data=path_data, normalize=False, retrain_if_exists=retrain)

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'KNNImputer')
print(metrics)

Fine tuning the imputer...
Fitting 3 folds for each of 8 candidates, totalling 24 fits

Best Score: [1m0.6530581891447848[0m
Best Hyperparameters:
metric: nan_euclidean
n_neighbors: 7
weights: distance

Fine tuning completed.
Imputer saved.
Imputing the test data...
Imputation completed.
                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3307.973839  0.956714    27552228.6627  1.409101
IterativeImputer  2952.872716  0.906348  24813904.064018  1.302942
KNNImputer        2863.905205   0.88298  25430505.059991  1.429329


# XGBoost

In [13]:
path = '../../../models/impute/xgb_imputer'

model = XGBRegressor()

parameter_space = {
    'n_estimators': [100, 150, 200],
    'max_depth': [2, 3, 5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
    'colsample_bylevel': [0.6, 0.8, 1],
}

retrain = True

# define new train and test data because xgboost is not an imputer
X_train_xgb = y_train.copy()
X_test_xgb = y_test.copy()

X_test_imputed = y_test.copy()

scoring = lambda y_pred, y_true: np.mean((y_pred - y_true)**2) 

for idx in missing_idx:
    print('Working on column:', idx)
    new_path = path + f'_{idx}.pkl'

    # select the column to predict
    y_train_xgb = X_train_xgb[:, idx]

    # drop the column to predict
    X_train_xgb_i = np.delete(np.copy(X_train_xgb), idx, axis=1)
    X_test_xgb_i = np.delete(np.copy(X_test_xgb), idx, axis=1)
    y_train_xgb_i = np.copy(X_train_xgb[:, idx])

    # train the model
    col_to_insert = impute(X_train_xgb_i, X_test_xgb_i, y_train_xgb_i, model, 
                           param_space=parameter_space, 
                           scoring='neg_mean_squared_error', 
                           path_to_save=new_path, normalize=False, 
                           retrain_if_exists=retrain, refit='neg_mean_squared_error')
    
    X_test_imputed[:, idx] = col_to_insert

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'XGBRegressor')

print(metrics)

Working on column: 4
Fine tuning the imputer...
Fitting 3 folds for each of 486 candidates, totalling 1458 fits

Best Score: [1m-170787283.44281635[0m
Best Hyperparameters:
colsample_bylevel: 0.6
colsample_bytree: 1
learning_rate: 0.1
max_depth: 3
n_estimators: 100
subsample: 1

Fine tuning completed.
Imputer saved.
Imputing the test data...
Imputation completed.
Working on column: 9
Fine tuning the imputer...
Fitting 3 folds for each of 486 candidates, totalling 1458 fits

Best Score: [1m-1.0601627292866216[0m
Best Hyperparameters:
colsample_bylevel: 0.6
colsample_bytree: 1
learning_rate: 0.1
max_depth: 5
n_estimators: 100
subsample: 0.8

Fine tuning completed.
Imputer saved.
Imputing the test data...
Imputation completed.
                        mae_4     mae_9             mse_4     mse_9
SimpleImputer     3307.973839  0.956714     27552228.6627  1.409101
IterativeImputer  2952.872716  0.906348   24813904.064018  1.302942
KNNImputer        2863.905205   0.88298   25430505.059991 

# MissForest

In [14]:
"""path = '../../../models/impute/miss_forest_imputer.pkl'
clf = RandomForestClassifier(n_jobs=-1)
rgr = RandomForestRegressor(n_jobs=-1)

model = lambda params: MissForest(clf(**params), rgr(**params))

parameter_space = {
    'n_estimators': [15, 25, 35],
    'max_depth': [2, 3, 5, 7],
}

retrain = True

# impute the data with the miss forest imputer
X_test_imputed = impute(X_train.copy(), X_test.copy(), y_train.copy(), model,
                        param_space=parameter_space, scoring='neg_mean_squared_error', 
                        path_to_save=path, normalize=False, retrain_if_exists=retrain)

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'MissForest')

print(metrics)"""

"path = '../../../models/impute/miss_forest_imputer.pkl'\nclf = RandomForestClassifier(n_jobs=-1)\nrgr = RandomForestRegressor(n_jobs=-1)\n\nmodel = lambda params: MissForest(clf(**params), rgr(**params))\n\nparameter_space = {\n    'n_estimators': [15, 25, 35],\n    'max_depth': [2, 3, 5, 7],\n}\n\nretrain = True\n\n# impute the data with the miss forest imputer\nX_test_imputed = impute(X_train.copy(), X_test.copy(), y_train.copy(), model,\n                        param_space=parameter_space, scoring='neg_mean_squared_error', \n                        path_to_save=path, normalize=False, retrain_if_exists=retrain)\n\n# calculate mae and mse for the imputed data for each column in the missing_idx\nmetrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'MissForest')\n\nprint(metrics)"

In [15]:
try:
    # try load the miss forest imputer with pickle
    with open('../../../models/impute/miss_forest.pkl', 'rb') as f:
        imputer = pkl.load(f)

except:
    # impute the data with the miss forest imputer
    params = {
        'n_estimators': 15,
        'max_depth': 3 
        }

    clf = RandomForestClassifier(n_jobs=-1, **params)
    rgr = RandomForestRegressor(n_jobs=-1, **params)
    imputer = MissForest(clf, rgr)

    # fit the imputer
    X_train_df = pd.DataFrame(X_train) # convert to dataframe for MissForest
    imputer.fit(X_train_df)

    # save the imputer with pickle
    with open('../../../models/impute/miss_forest.pkl', 'wb') as f:
        pkl.dump(imputer, f)

X_test_df = pd.DataFrame(X_test) # convert to dataframe for MissForest
X_test_imputed = imputer.transform(X_test_df.copy())
X_test_imputed = X_test_imputed.values # convert to numpy array

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'MissForest')
print(metrics)

                        mae_4     mae_9             mse_4     mse_9
SimpleImputer     3307.973839  0.956714     27552228.6627  1.409101
IterativeImputer  2952.872716  0.906348   24813904.064018  1.302942
KNNImputer        2863.905205   0.88298   25430505.059991  1.429329
XGBRegressor      2614.571848  0.801925  215684985.646753  1.116717
MissForest        2466.242003  0.856916   20861725.111218  1.264165


# Impute the real data

In [16]:
# scale the columns of the metrics dataframe
scaler = StandardScaler()
metrics_scaled = scaler.fit_transform(metrics)

# sum along each row to get the total error
metrics_scaled = np.sum(metrics_scaled, axis=1)

# find the indices of the n best imputers
n = 2 # optimal number of imputers
best_imputers_idx = metrics_scaled.argsort()[:n]

# find the names of the best imputers
best_imputers = metrics.index[best_imputers_idx]

In [17]:
print(f'Metrics scaled: {metrics_scaled}')
print(f'Best imputers: {best_imputers}')

Metrics scaled: [ 3.54603455  0.36664074  0.7364284  -1.98465299 -2.6644507 ]
Best imputers: Index(['MissForest', 'XGBRegressor'], dtype='object')


## Test the ensmble

In [20]:
main_path = '../../../models/impute/'
imputers_path = {
    'SimpleImputer': main_path + 'simple_imputer.pkl',
    'IterativeImputer': main_path + 'iterative_imputer.pkl',
    'KNNImputer': main_path + 'knn_imputer.pkl',
    'XGBRegressor': main_path + 'xgb_imputer',
    'MissForest': main_path + 'miss_forest.pkl'
}

# try opnening the ensembled imputed data
try:
    with open('../../../data/processed/impute_test/ensembled_imputed.pkl', 'rb') as f:
        X_test_imputed = pkl.load(f)
except:
    # create the ensembled imputed data
    X_test_imputed = np.zeros(X_test.shape)

    for imputer_name in best_imputers:
        print('Working on imputer:', imputer_name)
        
        if imputer_name != 'XGBRegressor':
            with open(imputers_path[imputer_name], 'rb') as f:
                imputer = pkl.load(f)

            if imputer_name == 'MissForest':
                X_test_df = pd.DataFrame(X_test)
                imputed = imputer.transform(X_test_df).values

            else:
                imputed = imputer.transform(X_test.copy())

            # set values to 0 if they are negative
            imputed[imputed < 0] = 0

            X_test_imputed += imputed

        else:
            for idx in missing_idx:
                with open(imputers_path[imputer_name] + f'_{idx}.pkl', 'rb') as f:
                    imputer = pkl.load(f)

                y_test_i = np.copy(y_test[:, idx])
                X_test_i = np.delete(np.copy(X_test), idx, axis=1)

                imputed = imputer.predict(X_test_i)
                imputed[imputed < 0] = 0
                X_test_imputed[:, idx] += imputed

    X_test_imputed /= n

    # save the ensembled imputed data
    with open('../../../data/processed/impute_test/ensembled_imputed.pkl', 'wb') as f:
        pkl.dump(X_test_imputed, f)

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, X_test_imputed, y_test, missing_value_mask_test, 'Ensembled')
print(metrics)

Working on imputer: MissForest
Working on imputer: XGBRegressor
                        mae_4     mae_9             mse_4     mse_9
SimpleImputer     3307.973839  0.956714     27552228.6627  1.409101
IterativeImputer  2952.872716  0.906348   24813904.064018  1.302942
KNNImputer        2863.905205   0.88298   25430505.059991  1.429329
XGBRegressor      2614.571848  0.801925  215684985.646753  1.116717
MissForest        2466.242003  0.856916   20861725.111218  1.264165
Ensembled         2454.672494  0.837318   67573048.722437  1.191029


In [21]:
try:
    # try read the data
    train_df = pd.read_csv('../../../data/proessed/imputed_data/train_imputed.csv')
    test_df = pd.read_csv('../../../data/processed/imputed_data/test_imputed.csv')

except:
    data_to_impute = data.copy()

    # remove the columns
    columns_to_drop = ['Unnamed: 0', 'SeriousDlqin2yrs']
    data_to_impute.drop(columns_to_drop, axis=1, inplace=True)
    columns_of_data = data_to_impute.columns

    # create an array to store the imputed data
    imputed_data = np.copy(data_to_impute.values)
    missing_value_mask = (data_to_impute).isna()
    missing_value_mask = missing_value_mask.values

    # set all values to zero
    imputed_data[missing_value_mask] = 0

    # impute the data with the best imputers
    for imputer_name in best_imputers:
        print('Working on imputer:', imputer_name)
        
        if imputer_name != 'XGBRegressor':
            with open(imputers_path[imputer_name], 'rb') as f:
                imputer = pkl.load(f)

            if imputer_name == 'MissForest':
                data_to_impute = pd.DataFrame(np.copy(data_to_impute))
                imputed = imputer.transform(data_to_impute).values

            else:
                imputed = imputer.transform(data_to_impute.copy())

            # set values to 0 if they are negative
            imputed[imputed < 0] = 0

            imputed_data[missing_value_mask] += imputed[missing_value_mask]

        else:
            for idx in missing_idx:
                with open(imputers_path[imputer_name] + f'_{idx}.pkl', 'rb') as f:
                    imputer = pkl.load(f)
                
                X = np.delete(np.copy(data_to_impute), idx, axis=1)
                imputed = imputer.predict(X)
                imputed[imputed < 0] = 0
                xgb_mask_i = missing_value_mask[:, idx]
                xgb_mask = missing_value_mask & (np.arange(data_to_impute.shape[1]) == idx)
                imputed_data[xgb_mask] += imputed[xgb_mask_i]

    imputed_data[missing_value_mask] /= n

    # make the imputed data a dataframe with the same columns as the original data
    imputed_data = pd.DataFrame(imputed_data, columns=columns_of_data)

    # add back the columns that were dropped
    imputed_data['Unnamed: 0'] = data['Unnamed: 0']
    imputed_data['SeriousDlqin2yrs'] = data['SeriousDlqin2yrs']

    # split back into train and test
    train_df = imputed_data.iloc[:ntrain, :]
    test_df = imputed_data.iloc[ntrain:, :]

    # save the data
    train_df.to_csv('../../../data/processed/imputed_data/train_imputed.csv', index=False)
    test_df.to_csv('../../../data/processed/imputed_data/test_imputed.csv', index=False)

Working on imputer: MissForest
Working on imputer: XGBRegressor
