In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBRegressor

from xgbimputer import XGBImputer

import pickle as pkl

In [43]:
# open the data
train_df = pd.read_csv('../../data/raw/train.csv')
test_df = pd.read_csv('../../data/raw/test.csv')

# save the length of the train data
ntrain = train_df.shape[0]

# concatenate the data
data = pd.concat([train_df, test_df], ignore_index=True)
del train_df, test_df

In [44]:
data.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0,0.0,0.034949,59.0,0.0,0.004933,7959.688894,5.0,0.0,0.0,0.0,0.0
1,1,0.0,0.155308,47.0,0.0,881.0,,6.0,0.0,1.0,0.0,0.0
2,2,0.0,0.165166,62.0,1.0,0.020327,2851.722407,8.0,0.0,0.0,0.0,0.0
3,3,0.0,0.010886,61.0,0.0,0.642979,1115.657341,6.0,0.0,1.0,0.0,0.0
4,4,0.0,0.000717,49.0,0.0,3603.0,,15.0,0.0,3.0,0.0,0.0


In [45]:
data.isna().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                        37500
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [46]:
data_to_impute = data.copy()

# remove the columns
columns = ['Unnamed: 0', 'SeriousDlqin2yrs']
data_to_impute.drop(columns, axis=1, inplace=True)

# calculate number of missing values by column
missing = data_to_impute.isna().sum() 

# drop missing values
data_to_impute.dropna(inplace=True)

# split the data into train and test
X_test = data_to_impute.copy().values

# create a boolean array of the size of the data
mask = np.zeros(X_test.shape, dtype=bool)

for i in range(X_test.shape[1]):
    n_missing = missing.iloc[i]

    # randomly select n_missing indexes for the column
    idx = np.random.choice(X_test.shape[0], n_missing, replace=False)
    mask[idx, i] = True

# create the train data
X_train = np.copy(X_test)
X_train[mask] = np.nan

del data_to_impute

# Simple Imputer

In [47]:
# find the indices of the columns with missing values
missing_idx = np.where(np.sum(mask, axis=0) > 0)[0]

In [48]:
# impute the data with the simple imputer
imputer = SimpleImputer(strategy='mean')

X_train_imputed = imputer.fit_transform(X_train.copy())

# create a dataframe to store the metrics
columns_of_df = [f'mae_{i}' for i in missing_idx] + [f'mse_{i}' for i in missing_idx]
metrics = pd.DataFrame(columns=columns_of_df)

# calculate mae and mse for the imputed data for each column in the missing_idx
for i in missing_idx:
    mae = np.mean(np.abs(X_train_imputed[mask[:, i], i] - X_test[mask[:, i], i]))
    mse = np.mean((X_train_imputed[mask[:, i], i] - X_test[mask[:, i], i])**2)

    metrics.loc['SimpleImputer', f'mae_{i}'] = mae
    metrics.loc['SimpleImputer', f'mse_{i}'] = mse

In [49]:
print(metrics)

                     mae_4    mae_9             mse_4     mse_9
SimpleImputer  3485.852782  0.91753  220723944.015543  1.274156


# Iterative Imputer

In [50]:
try:
    # try load the iterative imputer with pickle
    with open('../../models/impute/iterative_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)

except:
    # impute the data with the iterative imputer
    imputer = IterativeImputer(max_iter=10000, random_state=0)

    # fit the imputer
    imputer.fit(X_train.copy())

    # save the imputer with pickle
    with open('../../models/impute/iterative_imputer.pkl', 'wb') as f:
        pkl.dump(imputer, f)

X_train_imputed = imputer.transform(X_train.copy())

# for each column in the missing_idx calculate the mae and mse
for i in missing_idx:
    mae = np.mean(np.abs(X_train_imputed[mask[:, i], i] - X_test[mask[:, i], i]))
    mse = np.mean((X_train_imputed[mask[:, i], i] - X_test[mask[:, i], i])**2)

    metrics.loc['IterativeImputer', f'mae_{i}'] = mae
    metrics.loc['IterativeImputer', f'mse_{i}'] = mse

# KNN Imputer

In [51]:
try:
    # try load the knn imputer with pickle
    with open('../../models/impute/knn_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)

    # try load the imputed data with pickle
    with open('../../data/processed/knn_imputed.pkl', 'rb') as f:
        X_train_imputed = pkl.load(f)
except:
    # impute the data with the knn imputer
    imputer = KNNImputer(n_neighbors=5)

    # fit the imputer
    imputer.fit(X_train.copy())

    # save the imputer with pickle
    with open('../../models/impute/knn_imputer.pkl', 'wb') as f:
        pkl.dump(imputer, f)

    # save the imputed data
    with open('../../data/processed/knn_imputed.pkl', 'wb') as f:
        pkl.dump(X_train_imputed, f)

# for each column in the missing_idx calculate the mae and mse
for i in missing_idx:
    mae = np.mean(np.abs(X_train_imputed[mask[:, i], i] - X_test[mask[:, i], i]))
    mse = np.mean((X_train_imputed[mask[:, i], i] - X_test[mask[:, i], i])**2)

    metrics.loc['KNNImputer', f'mae_{i}'] = mae
    metrics.loc['KNNImputer', f'mse_{i}'] = mse

In [52]:
print(metrics)

                        mae_4     mae_9             mse_4     mse_9
SimpleImputer     3485.852782   0.91753  220723944.015543  1.274156
IterativeImputer  3090.039388  0.871028  216601837.981775  1.195354
KNNImputer         829.578368  0.021721  128680176.844073  0.023727


# XGBoost

In [53]:
def tune_xgboost(X, y, space, scoring, n_estimators=250, balance=False):
    
    # define evaluation
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

    # define the model
    model = XGBRegressor(n_estimators=n_estimators, device='cuda')

    # define search
    search = RandomizedSearchCV(model, space, scoring=scoring, n_jobs=-1, cv=cv, n_iter=100)
    
    # execute search
    result = search.fit(X, y)
    
    # plot results
    results_df = pd.DataFrame(result.cv_results_)
    for key, values in space.items():
        
        # group the results by the hyperparameter
        param_means = []
        param_stds = []
        for value in values:
            mask = results_df['param_' + key] == value
            param_means.append(np.mean(results_df[mask]['mean_test_score']))
            param_stds.append(np.std(results_df[mask]['mean_test_score']))
        """
        # create plot with two subplots side by side
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))
        fig.suptitle(key)
        ax[0].plot(values, param_means)
        ax[0].set_title('Mean test scores')
        ax[0].set_xlabel(key)
        ax[0].set_ylabel('mean scores')
        padding = 0.1
        ax[0].set_ylim(max(0, min(param_means) - padding), min(1, max(param_means) + padding))

        ax[1].plot(values, param_stds)
        ax[1].set_title('Mean score std')
        ax[1].set_xlabel(key)
        ax[1].set_ylabel('score std')
        padding = 0.05
        ax[1].set_ylim(max(0, min(param_stds) - padding), min(1, max(param_stds) + padding))

        plt.show()"""

    # summarize result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters:')
    for k, v in result.best_params_.items():
        print('%s: %s' % (k, v))

    # best model
    best_model = result.best_estimator_

    return result.best_params_, best_model

In [54]:
# find indices of the columns with missing values
missing_idx = np.where(np.sum(mask, axis=0) > 0)[0]

# define the space of hyperparameters
space = {
    'subsample': [0.5, 0.8, 1.0],
    'gamma': [0, 0.5, 2, 5],
    'reg_alpha': [0, 0.001, 0.01, 0.05],
    'reg_lambda': [0, 0.001, 0.01, 0.05],
}

X_imputed = np.copy(X_train)

for idx in missing_idx:
    X = np.copy(X_train)
    X = np.delete(X, idx, axis=1)
    y = np.copy(X_imputed[:, idx])

    # find the the indices of the missing values in the column
    nan_mask = np.isnan(y)

    X_train_xgboost = X[~nan_mask]
    y_train_xgboost = y[~nan_mask]
    X_test_xgboost = X[nan_mask]

    try:
        # try load the model with pickle
        with open(f'../../models/impute/xgboost_{idx}.pkl', 'rb') as f:
            best_model = pkl.load(f)
    except:
        # tune the model
        best_params, model = tune_xgboost(X_train_xgboost, y_train_xgboost, space, scoring='neg_mean_squared_error', n_estimators=400)

        # define new number of estimators
        n_estimators = 20000

        # add the number of estimators to the best params
        best_params['n_estimators'] = n_estimators

        # fit a new model with the best parameters
        best_model = XGBRegressor(**best_params)

        # fit the model
        best_model.fit(X_train_xgboost, y_train_xgboost)

        # save the model with pickle
        with open(f'../../models/impute/xgboost_{idx}.pkl', 'wb') as f:
            pkl.dump(best_model, f)

    # impute the data with the xgboost imputer
    y_pred = model.predict(X_test_xgboost)

    # replace the missing values
    X_imputed[nan_mask, idx] = y_pred

    print(np.isnan(X_imputed).sum())

    # for each column in the missing_idx calculate the mae and mse
    for i in missing_idx:
        mae = np.mean(np.abs(X_imputed[mask[:, i], i] - X_test[mask[:, i], i]))
        mse = np.mean((X_imputed[mask[:, i], i] - X_test[mask[:, i], i])**2)

        metrics.loc['XGBoostImputer', f'mae_{i}'] = mae
        metrics.loc['XGBoostImputer', f'mse_{i}'] = mse

Best Score: -184040234.49742836
Best Hyperparameters:
subsample: 0.5
reg_lambda: 0.05
reg_alpha: 0
gamma: 5
3924
Best Score: -1.07458968222369
Best Hyperparameters:
subsample: 1.0
reg_lambda: 0.05
reg_alpha: 0.001
gamma: 5
0


In [55]:
print(metrics)

                        mae_4     mae_9             mse_4     mse_9
SimpleImputer     3485.852782   0.91753  220723944.015543  1.274156
IterativeImputer  3090.039388  0.871028  216601837.981775  1.195354
KNNImputer         829.578368  0.021721  128680176.844073  0.023727
XGBoostImputer    2837.246798  0.789816  461646645.675941  1.039176


# XGB Imputer

In [None]:
try:
    # try load the xgb imputer with pickle
    with open('../../models/impute/xgb_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)

    # try load the imputed data with pickle
    with open('../../data/processed/xgb_imputed.pkl', 'rb') as f:
        X_train_imputed = pkl.load(f)

In [56]:
def dist_with_miss(a, b, l=0.0):
    """
    Calculate the distance between two arrays, taking into account missing values.

    Parameters:
    a (array-like): The first array.
    b (array-like): The second array.
    l (float, optional): The penalty for missing values. Defaults to 0.0.

    Returns:
    float: The distance between the two arrays.

    Raises:
    None

    """
    if len(a) != len(b):
        return np.inf
    ls = l * np.ones(len(a))
    msk = ~ (np.isnan(a) | np.isnan(b))
    res = np.sum((np.abs(a - b)[msk])) + np.sum((ls[~msk]))
    return res