In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from xgboost import XGBRegressor

import pickle as pkl

In [95]:
# open the data
train_df = pd.read_csv('../../data/raw/train.csv')
test_df = pd.read_csv('../../data/raw/test.csv')

# save the length of the train data
ntrain = train_df.shape[0]

# concatenate the data
data = pd.concat([train_df, test_df], ignore_index=True)
del train_df, test_df

In [96]:
data.isna().sum()

Unnamed: 0                                  0
SeriousDlqin2yrs                        37500
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [97]:
data_to_impute = data.copy()

# remove the columns
columns = ['Unnamed: 0', 'SeriousDlqin2yrs']
data_to_impute.drop(columns, axis=1, inplace=True)

# calculate number of missing values by column
missing = data_to_impute.isna().sum() 

# drop missing values
data_to_impute.dropna(inplace=True)

# split the data into train and test
X_test = data_to_impute.copy().values

# create a boolean array of the size of the data
mask = np.zeros(X_test.shape, dtype=bool)

for i in range(X_test.shape[1]):
    n_missing = missing.iloc[i]

    # randomly select n_missing indexes for the column
    idx = np.random.choice(X_test.shape[0], n_missing, replace=False)
    mask[idx, i] = True

# create the train data
X_train = np.copy(X_test)
X_train[mask] = np.nan

del data_to_impute

# Simple Imputer

In [83]:
# impute the data with the simple imputer
imputer = SimpleImputer(strategy='mean')

X_train_imputed = imputer.fit_transform(X_train.copy())

# create a dataframe to store the metrics
metrics = pd.DataFrame(columns=['mae', 'mse'])

# calculate mae and mse for the imputed data
mae = np.mean(np.abs(X_test[mask] - X_train_imputed[mask]))
mse = np.mean((X_test[mask] - X_train_imputed[mask])**2)

metrics.loc['SimpleImputer'] = [mae, mse]

# Iterative Imputer

In [84]:
try:
    # try load the iterative imputer with pickle
    with open('../../models/impute/iterative_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)

except:
    # impute the data with the iterative imputer
    imputer = IterativeImputer(max_iter=1000, random_state=0)

    # fit the imputer
    imputer.fit(X_train.copy())

    # save the imputer with pickle
    with open('../../models/impute/iterative_imputer.pkl', 'wb') as f:
        pkl.dump(imputer, f)

X_train_imputed = imputer.transform(X_train.copy())

# calculate mae and mse for the imputed data
mae = np.mean(np.abs(X_test[mask] - X_train_imputed[mask]))
mse = np.mean((X_test[mask] - X_train_imputed[mask])**2)

metrics.loc['IterativeImputer'] = [mae, mse]

# KNN Imputer

In [91]:
try:
    # try load the knn imputer with pickle
    with open('../../models/impute/knn_imputer.pkl', 'rb') as f:
        imputer = pkl.load(f)
except:
    # impute the data with the knn imputer
    imputer = KNNImputer(n_neighbors=5)

    # scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train.copy())

    # fit the imputer
    imputer.fit(X_train)

    # save the imputer with pickle
    with open('../../models/impute/knn_imputer.pkl', 'wb') as f:
        pkl.dump(imputer, f)

X_train_imputed = imputer.transform(X_train.copy())

# calculate mae and mse for the imputed data
mae = np.mean(np.abs(X_test[mask] - X_train_imputed[mask]))
mse = np.mean((X_test[mask] - X_train_imputed[mask])**2)

metrics.loc['KNNImputer'] = [mae, mse]

In [92]:
print(metrics)

                          mae           mse
SimpleImputer     3050.222905  1.009808e+08
IterativeImputer  2779.194077  9.781320e+07
KNNImputer        2918.852035  1.436448e+08


# XGBoost

In [None]:
# find the indices of the columns with missing values
missing_idx = np.where(np.sum(mask, axis=0) > 0)[0]

def tune_xgboost(X, y, space, scoring='mse', n_estimators=500, balance=False):
    
    # define evaluation
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

    # define the model
    model = XGBRegressor(n_estimators=n_estimators, device='cuda')

    # define search
    search = RandomizedSearchCV(model, space, scoring=scoring, n_jobs=-1, cv=cv, n_iter=50)
    
    # execute search
    result = search.fit(X, y)
    
    # plot results
    results_df = pd.DataFrame(result.cv_results_)
    for key, values in space.items():
        
        # group the results by the hyperparameter
        param_means = []
        param_stds = []
        for value in values:
            mask = results_df['param_' + key] == value
            param_means.append(np.mean(results_df[mask]['mean_test_score']))
            param_stds.append(np.std(results_df[mask]['mean_test_score']))

        # create plot with two subplots side by side
        fig, ax = plt.subplots(1, 2, figsize=(12, 5))
        fig.suptitle(key)
        ax[0].plot(values, param_means)
        ax[0].set_title('Mean test scores')
        ax[0].set_xlabel(key)
        ax[0].set_ylabel('mean scores')
        padding = 0.1
        ax[0].set_ylim(max(0, min(param_means) - padding), min(1, max(param_means) + padding))

        ax[1].plot(values, param_stds)
        ax[1].set_title('Mean score std')
        ax[1].set_xlabel(key)
        ax[1].set_ylabel('score std')
        padding = 0.05
        ax[1].set_ylim(max(0, min(param_stds) - padding), min(1, max(param_stds) + padding))

        plt.show()

    # summarize result
    print('Best Score: %s' % result.best_score_)
    print('Best Hyperparameters:')
    for k, v in result.best_params_.items():
        print('%s: %s' % (k, v))

    # best model
    best_model = result.best_estimator_

    return result.best_params_, best_model

In [100]:
# find indices of the columns with missing values
missing_idx = np.where(np.sum(mask, axis=0) > 0)[0]

# define the space of hyperparameters
space = {
    'subsample': [0.5, 0.8, 1.0],
    'gamma': [0, 0.5, 2, 5],
    'reg_alpha': [0, 0.001, 0.01, 0.05],
    'reg_lambda': [0, 0.001, 0.01, 0.05],
    'n_estimators': [500]
}

for idx in missing_idx:
    X = np.copy(X_train)
    X = np.delete(X, idx, axis=1)
    y = np.copy(X_train_imputed[:, idx])

    try:
        # try load the model with pickle
        with open(f'../../models/impute/xgboost_imputer_{idx}.pkl', 'rb') as f:
            model = pkl.load(f)

        



[4 9]


In [87]:
def dist_with_miss(a, b, l=0.0):
    """
    Calculate the distance between two arrays, taking into account missing values.

    Parameters:
    a (array-like): The first array.
    b (array-like): The second array.
    l (float, optional): The penalty for missing values. Defaults to 0.0.

    Returns:
    float: The distance between the two arrays.

    Raises:
    None

    """
    if len(a) != len(b):
        return np.inf
    ls = l * np.ones(len(a))
    msk = ~ (np.isnan(a) | np.isnan(b))
    res = np.sum((np.abs(a - b)[msk])) + np.sum((ls[~msk]))
    return res