# Imputation using XGB

Combine iterative imputation with XGB
- use XGB as base estimator for iterative imputation


In [20]:
from pathlib import Path

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer

from xgboost import XGBRegressor

from tqdm import tqdm

p = 1/55
RANDOM_STATE=42
INPUT_PATH = Path('./input')

In [2]:
dtypes = {'row_id' : 'int',
          'F_2_0' : 'int', 'F_2_1' : 'int', 'F_2_2' : 'int',
          'F_2_3' : 'int', 'F_2_4' : 'int', 'F_2_5' : 'int', 
          'F_2_6' : 'int', 'F_2_7' : 'int', 'F_2_8' : 'int',
          'F_2_9' : 'int', 'F_2_10' : 'int', 'F_2_11' : 'int',
          'F_2_12' : 'int', 'F_2_13' : 'int', 'F_2_14' : 'int',
          'F_2_15' : 'int', 'F_2_16' : 'int', 'F_2_17' : 'int',
          'F_2_18' : 'int', 'F_2_19' : 'int', 'F_2_20' : 'int',
          'F_2_21' : 'int', 'F_2_22' : 'int', 'F_2_23' : 'int',
          'F_2_24' : 'int'}

data = pd.read_csv(INPUT_PATH / 'data.csv', 
                   index_col='row_id',
                   dtype = dtypes)
submission = pd.read_csv(INPUT_PATH / 'sample_submission.csv', 
                         index_col='row-col')

In [3]:
def cols_by_prefix(columns, prefix):
    return [x for x in columns if x.startswith(prefix)]

cols_f1 = cols_by_prefix(data.columns, 'F_1')
cols_f2 = cols_by_prefix(data.columns, 'F_2')
cols_f3 = cols_by_prefix(data.columns, 'F_3')
cols_f4 = cols_by_prefix(data.columns, 'F_4')
cols_f134 = cols_f1 + cols_f3 + cols_f4
cols_f123 = cols_f1 + cols_f2 + cols_f3

data_f134 = data[cols_f134]
data_f1 = data[cols_f1]
data_f2 = data[cols_f2]
data_f3 = data[cols_f3]
data_f4 = data[cols_f4]

In [4]:
def make_training(df, n, p, random_state):
    """
    This makes training data from the records with no missing values. Produces a training set with
    synthetic missing values in the same proportion as the original dataset
    """
    # first find all rows with *no* NaN; sample n rows
    df = df[~df.isnull().any(axis=1)]
    if n > 0:
        df = df.sample(n=n, random_state=random_state)
    
    # random mask of NaN locations; only cols F_1*, F_3*, F_4*
    mask = np.random.random(df[cols_f134].shape) < p
    df_na = df[cols_f134].mask(mask)

    # put it back together with F_2*
    df_na = pd.concat([df_na[cols_f1], df[cols_f2], df_na[cols_f3], df_na[cols_f4]], axis=1)
    return df, df_na, df_na.isna().sum().sum()

def sse_cols(df1, df2):
    return ((df1 - df2).pow(2)).sum()

def rmse(df1, df2, n):
    return (sse_cols(df1, df2).sum()/n)**0.5

### Super simple baseline - mean imput all columns

In [5]:
%%time

train, train_na, na_count = make_training(data, -1, p, RANDOM_STATE)
imputer = SimpleImputer(strategy="mean")
train_na[:] = imputer.fit_transform(train_na)
print(f'RMSE={rmse(train, train_na, na_count)}')


RMSE=1.4215675334756572
CPU times: total: 5.89 s
Wall time: 5.35 s


### Whats the lowest possible RMSE score
- mean impute f1-f3 (this just is a no-op on f2)
- assume f4 is a perfect match

In [7]:
mean_imputer = SimpleImputer(strategy="mean")

imputer = ColumnTransformer(
    transformers=[
        ("mean1", mean_imputer, cols_f1),
        ("mean2", mean_imputer, cols_f2),
        ("mean3", mean_imputer, cols_f3),
    ],
    remainder='passthrough'
)

train, train_na, na_count = make_training(data, -1, p, RANDOM_STATE)
train_na[:] = imputer.fit_transform(train_na)

print(f'RMSE={rmse(train, train_na, na_count)}')


RMSE=0.8280211270947019


### Using IterativeImputer + XGB

- mean impute f1-f3 before imputing f_4
- just run 2 iterations

In [8]:
mean_imputer = SimpleImputer(strategy="mean")

imputer = ColumnTransformer(
    transformers=[
        ("mean1", mean_imputer, cols_f1),
        ("mean2", mean_imputer, cols_f2),
        ("mean3", mean_imputer, cols_f3),
    ],
    remainder='passthrough'
)

train, train_na, na_count = make_training(data, -1, p, RANDOM_STATE)
train_na[:] = imputer.fit_transform(train_na)


In [9]:
%%time

imputer = IterativeImputer(estimator=XGBRegressor(n_estimators=2000, 
                                                  tree_method='gpu_hist',
                                                  random_state=RANDOM_STATE), 
                                     max_iter=2,
                                     verbose=2,
                                    random_state=RANDOM_STATE)
train_na[cols_f4] = imputer.fit_transform(train_na[cols_f4])
print(f'RMSE={rmse(train, train_na, na_count)}')

[IterativeImputer] Completing matrix with shape (364774, 15)
[IterativeImputer] Ending imputation round 1/2, elapsed time 359.35
[IterativeImputer] Change: 21.422502074717578, scaled tolerance: 0.029931176 
[IterativeImputer] Ending imputation round 2/2, elapsed time 727.39
[IterativeImputer] Change: 6.377307265996933, scaled tolerance: 0.029931176 




RMSE=0.8949962823803645
CPU times: total: 13min 13s
Wall time: 12min 8s


### IterativeImputer on everything

In [11]:
%%time

mean_imputer = SimpleImputer(strategy="mean")
imputer = ColumnTransformer(
    transformers=[
        ("mean1", mean_imputer, cols_f1),
        ("mean2", mean_imputer, cols_f2),
        ("mean3", mean_imputer, cols_f3),
    ],
    remainder='passthrough'
)
train, train_na, na_count = make_training(data, -1, p, RANDOM_STATE)
train_na[:] = imputer.fit_transform(train_na)

imputer = IterativeImputer(verbose=2, max_iter=20, random_state=RANDOM_STATE)
train_na[:] = imputer.fit_transform(train_na)
print(f'RMSE={rmse(train, train_na, na_count)}')

[IterativeImputer] Completing matrix with shape (364774, 80)
[IterativeImputer] Ending imputation round 1/20, elapsed time 131.37
[IterativeImputer] Change: 0.0, scaled tolerance: 0.029931176 
[IterativeImputer] Early stopping criterion reached.
RMSE=0.8949962823803645
CPU times: total: 13min 4s
Wall time: 2min 13s


### XGB on full dataset for submission

In [12]:
mean_imputer = SimpleImputer(strategy="mean")

imputer = ColumnTransformer(
    transformers=[
        ("mean1", mean_imputer, cols_f1),
        ("mean2", mean_imputer, cols_f2),
        ("mean3", mean_imputer, cols_f3),
    ],
    remainder='passthrough'
)

data[:] = imputer.fit_transform(data)

In [13]:
%%time
imputer = IterativeImputer(estimator=XGBRegressor(n_estimators=2000, 
                                                  tree_method='gpu_hist',
                                                  random_state=RANDOM_STATE), 
                                     max_iter=2,
                                     verbose=2,
                                    random_state=RANDOM_STATE)
data[cols_f4] = imputer.fit_transform(data[cols_f4])


[IterativeImputer] Completing matrix with shape (1000000, 15)
[IterativeImputer] Ending imputation round 1/2, elapsed time 518.41
[IterativeImputer] Change: 24.4230605560112, scaled tolerance: 0.031229363 
[IterativeImputer] Ending imputation round 2/2, elapsed time 1049.60
[IterativeImputer] Change: 6.591387063264847, scaled tolerance: 0.031229363 




CPU times: total: 19min 45s
Wall time: 17min 32s


### Make a submission file

In [6]:
for i in tqdm(submission.index):
    row = int(i.split('-')[0])
    col = i.split('-')[1]
    submission.loc[i, 'value'] = data.loc[row, col]

submission.to_csv('submission.csv')

100%|█████████████████████████████████████████████████████████████████████| 1000000/1000000 [01:00<00:00, 16594.96it/s]
