# 2 - Missing Data Imputation Methods

KNN and MICE imputation were implemented in R.

# Simple Imputer

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, mean_absolute_error

In [3]:
# Load the complete (no-missing) train/test splits; later cells use them as
# the reference values when scoring imputations.
train_complete, test_complete = map(
    pd.read_csv,
    ('data/complete/train_complete.csv', 'data/complete/test_complete.csv'),
)

# Load the amputed train/test splits, "low" missingness variant (per file name).
train_low_missing, test_low_missing = map(
    pd.read_csv,
    ('data/amputed/train_low.csv', 'data/amputed/test_low.csv'),
)

# Load the amputed train/test splits, "high" missingness variant (per file name).
train_high_missing, test_high_missing = map(
    pd.read_csv,
    ('data/amputed/train_high.csv', 'data/amputed/test_high.csv'),
)

In [None]:
# Feature columns: every column of the low-missing train frame except 'target'.
features = [name for name in train_low_missing.columns if name != 'target']

# Column groups by naming convention: a 'cat_' prefix marks a categorical
# feature; every remaining feature is treated as numerical.
categorical_cols = [name for name in features if name.startswith('cat_')]
numerical_cols = [name for name in features if not name.startswith('cat_')]

# Low missing data

In [None]:
# Imputers for the low-missing datasets:
# - categorical features are filled with the most frequent value of the column
# - numerical features are filled with the column mean
categorical_imputer_low = SimpleImputer(strategy='most_frequent')
numerical_imputer_low = SimpleImputer(strategy='mean')

In [None]:
# Train split, low missing: impute, then score the imputed entries.
X_train_low = train_low_missing[features].copy()

# Record which cells are missing before anything is filled in.
train_low_mask = X_train_low.isna()
train_low_mask_np = train_low_mask.to_numpy()

# Fit each imputer on its own column group and write the filled values back
# (numerical group first, then categorical).
for col_group, group_imputer in (
    (numerical_cols, numerical_imputer_low),
    (categorical_cols, categorical_imputer_low),
):
    X_train_low[col_group] = group_imputer.fit_transform(X_train_low[col_group])

# Score only the originally-missing cells against the complete train data.
# NOTE(review): assumes train_complete rows are in the same order as
# train_low_missing -- confirm.
y_true_low = train_complete[features].to_numpy()[train_low_mask_np]
y_pred_low = X_train_low.to_numpy()[train_low_mask_np]

train_low_mae = mean_absolute_error(y_true_low, y_pred_low)
train_low_rmse = root_mean_squared_error(y_true_low, y_pred_low)
print(f"Train Low:  RMSE = {train_low_rmse:.4f}, MAE = {train_low_mae:.4f}")

Train Low:  RMSE = 0.8988, MAE = 0.6612


In [None]:
# Test split, low missing: impute with the train-fitted imputers, then score.
X_test_low = test_low_missing[features].copy()

# Record which cells are missing before anything is filled in.
test_low_mask = X_test_low.isna()
test_low_mask_np = test_low_mask.to_numpy()

# Only transform here: the imputers were already fitted on the train split,
# so no statistics are learned from the test data.
for col_group, group_imputer in (
    (numerical_cols, numerical_imputer_low),
    (categorical_cols, categorical_imputer_low),
):
    X_test_low[col_group] = group_imputer.transform(X_test_low[col_group])

# Score only the originally-missing cells against the complete test data.
# NOTE(review): assumes test_complete rows are in the same order as
# test_low_missing -- confirm.
y_true_low_test = test_complete[features].to_numpy()[test_low_mask_np]
y_pred_low_test = X_test_low.to_numpy()[test_low_mask_np]

test_low_mae = mean_absolute_error(y_true_low_test, y_pred_low_test)
test_low_rmse = root_mean_squared_error(y_true_low_test, y_pred_low_test)
print(f"Test Low:   RMSE = {test_low_rmse:.4f}, MAE = {test_low_mae:.4f}")

Test Low:   RMSE = 0.9091, MAE = 0.6724


# High missing data

In [None]:
# Imputers for the high-missing datasets:
# - categorical features are filled with the most frequent value of the column
# - numerical features are filled with the column mean
categorical_imputer_high = SimpleImputer(strategy='most_frequent')
numerical_imputer_high = SimpleImputer(strategy='mean')

In [None]:
# Train split, high missing: impute, then score the imputed entries.
X_train_high = train_high_missing[features].copy()

# Record which cells are missing before anything is filled in.
train_high_mask = X_train_high.isna()
train_high_mask_np = train_high_mask.to_numpy()

# Fit each imputer on its own column group and write the filled values back
# (numerical group first, then categorical).
for col_group, group_imputer in (
    (numerical_cols, numerical_imputer_high),
    (categorical_cols, categorical_imputer_high),
):
    X_train_high[col_group] = group_imputer.fit_transform(X_train_high[col_group])

# Score only the originally-missing cells against the complete train data.
# NOTE(review): assumes train_complete rows are in the same order as
# train_high_missing -- confirm.
y_true_high = train_complete[features].to_numpy()[train_high_mask_np]
y_pred_high = X_train_high.to_numpy()[train_high_mask_np]

train_high_mae = mean_absolute_error(y_true_high, y_pred_high)
train_high_rmse = root_mean_squared_error(y_true_high, y_pred_high)
print(f"Train High: RMSE = {train_high_rmse:.4f}, MAE = {train_high_mae:.4f}")

Train High: RMSE = 0.8908, MAE = 0.6559


In [None]:
# Test split, high missing: impute with the train-fitted imputers, then score.
X_test_high = test_high_missing[features].copy()

# Record which cells are missing before anything is filled in.
test_high_mask = X_test_high.isna()
test_high_mask_np = test_high_mask.to_numpy()

# Only transform here: the imputers were already fitted on the train split,
# so no statistics are learned from the test data.
for col_group, group_imputer in (
    (numerical_cols, numerical_imputer_high),
    (categorical_cols, categorical_imputer_high),
):
    X_test_high[col_group] = group_imputer.transform(X_test_high[col_group])

# Score only the originally-missing cells against the complete test data.
# NOTE(review): assumes test_complete rows are in the same order as
# test_high_missing -- confirm.
y_true_high_test = test_complete[features].to_numpy()[test_high_mask_np]
y_pred_high_test = X_test_high.to_numpy()[test_high_mask_np]

test_high_mae = mean_absolute_error(y_true_high_test, y_pred_high_test)
test_high_rmse = root_mean_squared_error(y_true_high_test, y_pred_high_test)
print(f"Test High:  RMSE = {test_high_rmse:.4f}, MAE = {test_high_mae:.4f}")

Test High:  RMSE = 0.9303, MAE = 0.6923


# Save datasets

In [None]:
# Re-attach the target column to each imputed feature frame.
# `.values` assigns the target positionally (no index alignment), so each
# imputed frame keeps the row order of the frame it was derived from.
train_low_simple = X_train_low.copy()
train_low_simple['target'] = train_low_missing['target'].values

test_low_simple = X_test_low.copy()
test_low_simple['target'] = test_low_missing['target'].values

train_high_simple = X_train_high.copy()
train_high_simple['target'] = train_high_missing['target'].values

test_high_simple = X_test_high.copy()
test_high_simple['target'] = test_high_missing['target'].values

# Make sure the output directory exists before writing: `to_csv` does not
# create missing parent directories and would otherwise fail on a fresh checkout.
imputed_dir = Path("data/imputed")
imputed_dir.mkdir(parents=True, exist_ok=True)

# Save the imputed datasets (row index is not written).
train_low_simple.to_csv(imputed_dir / "train_low_imputed_simple.csv", index=False)
test_low_simple.to_csv(imputed_dir / "test_low_imputed_simple.csv", index=False)
train_high_simple.to_csv(imputed_dir / "train_high_imputed_simple.csv", index=False)
test_high_simple.to_csv(imputed_dir / "test_high_imputed_simple.csv", index=False)