In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBRegressor

from missforest.missforest import MissForest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

import pickle as pkl

In [2]:
# open the data
train_df = pd.read_csv('../../data/raw/train.csv')
test_df = pd.read_csv('../../data/raw/test.csv')

# save the length of the train data
ntrain = train_df.shape[0]

# concatenate the data
data = pd.concat([train_df, test_df], ignore_index=True)
del train_df, test_df

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.0,0.034949,59.0,0.0,0.004933,7959.688894,5.0,0.0,0.0,0.0,0.0
1,1,0.0,0.155308,47.0,0.0,881.0,,6.0,0.0,1.0,0.0,0.0
2,2,0.0,0.165166,62.0,1.0,0.020327,2851.722407,8.0,0.0,0.0,0.0,0.0
3,3,0.0,0.010886,61.0,0.0,0.642979,1115.657341,6.0,0.0,1.0,0.0,0.0
4,4,0.0,0.000717,49.0,0.0,3603.0,,15.0,0.0,3.0,0.0,0.0


In [4]:
data.isna().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                        37500
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [5]:
# count the unique values for each column
data.nunique()

Unnamed: 0                              112500
SeriousDlqin2yrs                             2
RevolvingUtilizationOfUnsecuredLines    125728
age                                         89
NumberOfTime30-59DaysPastDueNotWorse        16
DebtRatio                               114194
MonthlyIncome                           118636
NumberOfOpenCreditLinesAndLoans             58
NumberOfTimes90DaysLate                     19
NumberRealEstateLoansOrLines                28
NumberOfTime60-89DaysPastDueNotWorse        13
NumberOfDependents                          13
dtype: int64

In [6]:
data_to_impute = data.copy()

# remove the columns
columns_to_drop = ['Unnamed: 0', 'SeriousDlqin2yrs']
data_to_impute.drop(columns_to_drop, axis=1, inplace=True)

# calculate number of missing values by column
missing = data_to_impute.isna().sum() 

# drop missing values
data_to_impute.dropna(inplace=True)

# split the data into train and test
X_train, X_test = train_test_split(data_to_impute, test_size=0.2, random_state=42)

# save the indexes
train_idx = X_train.index
test_idx = X_test.index

# assert there are no missing values in the train set
assert X_train.isna().sum().sum() == 0

X_train, X_test = X_train.values, X_test.values
y = np.copy(X_test)

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# create a boolean array of the size of the data
mask = np.zeros(X_test.shape, dtype=bool)

for i in range(X_test.shape[1]):
    n_missing = missing.iloc[i]
    prc_missing = 0.85 if n_missing != 0 else 0 # percentage of missing values
    n_missing = int(prc_missing * X_test.shape[0])

    # randomly select n_missing indexes for the column
    idx = np.random.choice(X_test.shape[0], n_missing, replace=False)
    mask[idx, i] = True

# create the missing values
X_test[mask] = np.nan

X_train shape: (96215, 10)
X_test shape: (24054, 10)


In [7]:
def update_metrics(metrics_df, missing_idx, X_test_imputed, y, mask, model_name):
    for i in missing_idx:
        mae = np.mean(np.abs(X_test_imputed[mask[:, i], i] - y[mask[:, i], i]))
        mse = np.mean((X_test_imputed[mask[:, i], i] - y[mask[:, i], i])**2)

        metrics_df.loc[model_name, f'mae_{i}'] = mae
        metrics_df.loc[model_name, f'mse_{i}'] = mse

    return metrics_df

# Simple Imputer

In [8]:
# find the indices of the columns with missing values
missing_idx = np.where(np.sum(mask, axis=0) > 0)[0]

In [9]:
# impute the data with the simple imputer
imputer = SimpleImputer(strategy='mean')

# fit the imputer
imputer.fit(X_train.copy())

# create a dataframe to store the metrics
columns_of_df = [f'mae_{i}' for i in missing_idx] + [f'mse_{i}' for i in missing_idx]
metrics = pd.DataFrame(columns=columns_of_df)

# impute the data
X_test_imputed = imputer.transform(X_test.copy())

# calculate mae and mse for the imputed data for each column in the missing_idx
metrics = update_metrics(metrics, missing_idx, X_test_imputed, y, mask, 'SimpleImputer')
print(metrics)

                     mae_4    mae_9            mse_4     mse_9
SimpleImputer  3381.682334  0.93371  93671886.902909  1.338178


# Iterative Imputer

In [10]:
try:
    # try load the iterative imputer with pickle
    with open('../../models/impute/iterative_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)

except:
    # impute the data with the iterative imputer
    imputer = IterativeImputer(max_iter=1000, random_state=0)

    # fit the imputer
    imputer.fit(X_train.copy())

    # save the imputer with pickle
    with open('../../models/impute/iterative_imputer.pkl', 'wb') as f:
        pkl.dump(imputer, f)

X_test_imputed = imputer.transform(X_test.copy())

# calculate mae and mse for the imputed data for each column in the missing_idx
update_metrics(metrics, missing_idx, X_test_imputed, y, mask, 'IterativeImputer')
print(metrics)

                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3381.682334   0.93371  93671886.902909  1.338178
IterativeImputer  3051.184182  0.890809  90441212.917151  1.260874


# KNN Imputer

In [11]:
try:
    # try load the knn imputer with pickle
    with open('../../models/impute/knn_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)

    # try load the imputed data
    with open('../../data/processed/impute/knn_imputed.pkl', 'rb') as f:
        X_test_imputed = pkl.load(f)
except:
    # impute the data with the knn imputer
    imputer = KNNImputer(n_neighbors=5)

    # fit the imputer
    imputer.fit(X_train.copy())

    # impute the data
    X_test_imputed = imputer.transform(X_test.copy())

    # save the imputer with pickle
    with open('../../models/impute/knn_imputer.pkl', 'wb') as f:
        pkl.dump(imputer, f)

    # save the imputed data
    with open('../../data/processed/impute/knn_imputed.pkl', 'wb') as f:
        pkl.dump(X_test_imputed, f)

# calculate mae and mse for the imputed data for each column in the missing_idx
update_metrics(metrics, missing_idx, X_test_imputed, y, mask, 'KNNImputer')
print(metrics)

                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3381.682334   0.93371  93671886.902909  1.338178
IterativeImputer  3051.184182  0.890809  90441212.917151  1.260874
KNNImputer          2485.7553  0.730379  91527493.288134  1.134163


In [12]:
print(metrics)

                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3381.682334   0.93371  93671886.902909  1.338178
IterativeImputer  3051.184182  0.890809  90441212.917151  1.260874
KNNImputer          2485.7553  0.730379  91527493.288134  1.134163


# XGBoost

In [13]:
def tune(X, y, space, scoring, 
         model, modeltype='clf', search_type='grid', n_iter_random=100,
         n_splits=5, n_repeats=3, random_state=1,
         verbose=True, display_plots=False):
    
    # define evaluation
    cv = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_state)
  
    if verbose:
        verbosity = 1

    # define search
    if search_type == 'grid':
        search = GridSearchCV(model, space, scoring=scoring, n_jobs=-1, cv=cv, verbose=verbosity)
    elif search_type == 'random':
        search = RandomizedSearchCV(model, space, scoring=scoring, n_jobs=-1, cv=cv, n_iter=n_iter_random, verbose=verbosity)
    
    # execute search
    result = search.fit(X, y)
    
    # plot results
    if display_plots:
        results_df = pd.DataFrame(result.cv_results_)
        for key, values in space.items():
            
            # group the results by the hyperparameter
            param_means = []
            param_stds = []
            for value in values:
                mask = results_df['param_' + key] == value
                param_means.append(np.mean(results_df[mask]['mean_test_score']))
                param_stds.append(np.std(results_df[mask]['mean_test_score']))
            
            # create plot with two subplots side by side
            fig, ax = plt.subplots(1, 2, figsize=(12, 5))
            fig.suptitle(key)
            ax[0].plot(values, param_means)
            ax[0].set_title('Mean test scores')
            ax[0].set_xlabel(key)
            ax[0].set_ylabel('mean scores')
            padding = 0.1
            ax[0].set_ylim(max(0, min(param_means) - padding), min(1, max(param_means) + padding))

            ax[1].plot(values, param_stds)
            ax[1].set_title('Mean score std')
            ax[1].set_xlabel(key)
            ax[1].set_ylabel('score std')
            padding = 0.05
            ax[1].set_ylim(max(0, min(param_stds) - padding), min(1, max(param_stds) + padding))

            plt.show()

    # summarize result
    if verbose:
        print('Best Score: %s' % result.best_score_)
        print('Best Hyperparameters:')
        for k, v in result.best_params_.items():
            print('%s: %s' % (k, v))

    # best model
    best_model = result.best_estimator_

    return result.best_params_, best_model

In [14]:
# find indices of the columns with missing values
missing_idx = np.where(np.sum(mask, axis=0) > 0)[0]

# define the space of hyperparameters
param_space = {
    'n_estimators': [150, 250],
    'max_depth': [2, 3, 5],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 0.8, 1],
    'colsample_bytree': [0.6, 0.8, 1],
    'colsample_bylevel': [0.6, 0.8, 1]
}

X_test_imputed = np.copy(X_test)

for idx in missing_idx:
    print(f'Imputing column {idx}...')
    X_train_xgboost = np.copy(X_train)
    X_train_xgboost = np.delete(X_train, idx, axis=1)
    y_train_xgboost = np.copy(X_train[:, idx])

    X_test_xgboost = np.copy(X_test)
    X_test_xgboost = np.delete(X_test, idx, axis=1)
    y_test_xgboost = np.copy(X_test[:, idx])

    # find the the indices of the missing values in the column
    nan_mask = np.isnan(y_test_xgboost)

    try:
        # try load the model with pickle
        with open(f'../../models/impute/xgboost_{idx}.pkl', 'rb') as f:
            best_model = pkl.load(f)
    except:
        # tune the model
        best_params, best_model = tune(X=X_train_xgboost, y=y_train_xgboost, space=param_space, 
                                       scoring='neg_mean_squared_error', model=XGBRegressor(), 
                                       modeltype='reg', search_type='grid', n_iter_random=100, 
                                       n_splits=3, n_repeats=1, random_state=1, verbose=True, display_plots=False)

        # save the model with pickle
        with open(f'../../models/impute/xgboost_{idx}.pkl', 'wb') as f:
            pkl.dump(best_model, f)

    # impute the data with the xgboost imputer
    y_pred = best_model.predict(X_test_xgboost)

    # replace the missing values
    X_test_imputed[nan_mask, idx] = y_pred[nan_mask]

    print('')

# for each column in the missing_idx calculate the mae and mse
update_metrics(metrics, missing_idx, X_test_imputed, y, mask, 'XGBoost')
print(metrics)

Imputing column 4...
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Score: -175761413.3339593
Best Hyperparameters:
colsample_bylevel: 1
colsample_bytree: 1
learning_rate: 0.01
max_depth: 3
n_estimators: 250
subsample: 0.8

Imputing column 9...
Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Score: -1.0585339250269108
Best Hyperparameters:
colsample_bylevel: 0.8
colsample_bytree: 0.8
learning_rate: 0.1
max_depth: 3
n_estimators: 250
subsample: 0.8

                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3381.682334   0.93371  93671886.902909  1.338178
IterativeImputer  3051.184182  0.890809  90441212.917151  1.260874
KNNImputer          2485.7553  0.730379  91527493.288134  1.134163
XGBoost           2793.073859  1.001222  71740878.555235  1.482481


# MissForest

In [15]:
# try the miss forest imputer

try:
    # try load the miss forest imputer with pickle
    with open('../../models/impute/miss_forest.pkl', 'rb') as f:
        imputer = pkl.load(f)

except:
    # impute the data with the miss forest imputer
    clf = RandomForestClassifier(n_jobs=-1)
    rgr = RandomForestRegressor(n_jobs=-1)
    imputer = MissForest(clf, rgr, max_iter=1000000)

    # fit the imputer
    imputer.fit(X_train.copy())

    # save the imputer with pickle
    with open('../../models/impute/miss_forest.pkl', 'wb') as f:
        pkl.dump(imputer, f)

X_test_df = pd.DataFrame(X_test)
X_test_imputed = imputer.transform(X_test_df.copy())
X_test_imputed = X_test_imputed.values

# calculate mae and mse for the imputed data for each column in the missing_idx
update_metrics(metrics, missing_idx, X_test_imputed, y, mask, 'MissForest')
print(metrics)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x[c].fillna(initial_imputations[c], inplace=True)


                        mae_4     mae_9            mse_4     mse_9
SimpleImputer     3381.682334   0.93371  93671886.902909  1.338178
IterativeImputer  3051.184182  0.890809  90441212.917151  1.260874
KNNImputer          2485.7553  0.730379  91527493.288134  1.134163
XGBoost           2793.073859  1.001222  71740878.555235  1.482481
MissForest        2721.477326  0.851409  84767098.263874  1.237515


# Impute the real data

In [16]:
# scale the columns of the metrics dataframe
scaler = StandardScaler()
metrics_scaled = scaler.fit_transform(metrics)

# sum along each row to get the total error
metrics_scaled = np.sum(metrics_scaled, axis=1)

# find the indices of the n best imputers
n = 3
best_imputers_idx = metrics_scaled.argsort()[:n]

# find the names of the best imputers
best_imputers = metrics.index[best_imputers_idx]

In [18]:
print(f'Metrics scaled: {metrics_scaled}')
print(f'Best imputers: {best_imputers}')

Metrics scaled: [ 3.51899877  0.89052029 -3.68520821  0.8160262  -1.54033705]
Best imputers: Index(['KNNImputer', 'MissForest', 'XGBoost'], dtype='object')


In [17]:
imputers_dict = {
    'SimpleImputer': SimpleImputer(strategy='mean'),
    'IterativeImputer': IterativeImputer(max_iter=1000, random_state=0),
    'KNNImputer': KNNImputer(n_neighbors=5),
    'MissForest': MissForest(clf, rgr, max_iter=1000000)
}

try:
    # try load the ensembled imputed data
    with open('../../data/processed/impute/ensemble_imputed.pkl', 'rb') as f:
        X_test_imputed = pkl.load(f)
except:
    X_test_imputed = np.copy(X_test)
    mask = np.isnan(X_test)
    X_test_imputed[mask] = 0

    for imputer_name in best_imputers:
        print(f'Imputing with {imputer_name}...')
        
        imputer = imputers_dict[imputer_name]

        # fit the imputer
        imputer.fit(X_train.copy())
        print('Imputer fitted...')

        # impute the data
        try:
            data_to_add = imputer.transform(np.copy(X_test))
        except:
            data_to_add = imputer.transform(pd.DataFrame(np.copy(X_test)))
            data_to_add = data_to_add.values
        print('Data imputed...')

        # add the imputed data to the imputed_X_test
        X_test_imputed[mask] += data_to_add[mask]
        print('')

    # average the imputed data
    X_test_imputed[mask] /= n

    # save the imputed data as a pickle
    with open('../../data/processed/impute/ensemble_imputed.pkl', 'wb') as f:
        pkl.dump(X_test_imputed, f)

# calculate mae and mse for the imputed data for each column in the missing_idx
update_metrics(metrics, missing_idx, X_test_imputed, y, mask, 'Ensemble')
print(metrics)

NameError: name 'clf' is not defined

In [None]:
try:
    # try read the data
    train_df = pd.read_csv('../../data/proessed/train_imputed.csv')
    test_df = pd.read_csv('../../data/processed/test_imputed.csv')
except:
    data_to_impute = data.copy()

    # remove the columns
    data_to_impute.drop(columns_to_drop, axis=1, inplace=True)
    mask = data_to_impute.isna()

    imputed_data = np.copy(data_to_impute.values)

    # set all values to zero
    imputed_data[mask] = 0

    # impute the data with the best imputers
    for imputer_name in best_imputers:
        print(f'Imputing with {imputer_name}...')
        
        imputer = imputers_dict[imputer_name]

        # fit the imputer
        imputer.fit(data_to_impute.copy())
        print('Imputer fitted...')

        # impute the data
        try:
            data_to_add = imputer.transform(data_to_impute.copy())
            data_to_add = data_to_add.values
        except:
            data_to_add = imputer.transform(data_to_impute.copy().values)
        print('Data imputed...')

        # add the imputed data to the imputed_X_test
        imputed_data[mask] += data_to_add[mask]
        print('')

    # average the imputed data
    imputed_data[mask] /= n

    # make the imputed data a dataframe with the same columns as the original data
    imputed_data = pd.DataFrame(imputed_data, columns=data_to_impute.columns)

    # add back the columns that were dropped
    imputed_data['Unnamed: 0'] = data['Unnamed: 0']
    imputed_data['SeriousDlqin2yrs'] = data['SeriousDlqin2yrs']

    # split back into train and test
    train_df = imputed_data.iloc[:ntrain, :]
    test_df = imputed_data.iloc[ntrain:, :]

    # save the data
    train_df.to_csv('../../data/processed/imputed_data/train_imputed.csv', index=False)
    test_df.to_csv('../../data/processed/imputed_data/test_imputed.csv', index=False)

Imputing with KNNImputer...
Imputer fitted...




Data imputed...

Imputing with MissForest...
Imputer fitted...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  x[c].fillna(initial_imputations[c], inplace=True)


Data imputed...

Imputing with IterativeImputer...
Imputer fitted...
Data imputed...



