# Imputation Evaluation

Although we did not focus on our imputation technique in this project, we now run a short evaluation to check it gives somewhat sensible results.

Here we load the cleaned data set from Kaggle that contains no missing data. I then randomly created missingness (each cell fills null with probability 0.05) and use our imputation code (KNN imputation) to fill the null values. I then compare the difference between the original data and the imputed to evaluate the performance of our imputation. We got a RMSE of 0.33, which isn't great but better than random.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
data = pd.read_csv('heart_disease_train_nomissing (1).csv')
data = data.drop('HeartDiseaseorAttack',axis=1)
data.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,1.0,1.0,1.0,30.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,5.0,30.0,30.0,1.0,0.0,9.0,5.0,1.0
1,1.0,1.0,1.0,22.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,30.0,0.0,1.0,0.0,12.0,4.0,4.0
2,1.0,1.0,1.0,37.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,5.0,0.0,0.0,1.0,1.0,10.0,6.0,5.0
3,1.0,1.0,1.0,30.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,4.0,10.0,17.0,1.0,0.0,9.0,4.0,1.0
4,1.0,1.0,1.0,33.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,3.0,1.0,0.0,0.0,1.0,11.0,6.0,6.0


In [3]:
from sklearn.utils import resample #sample 10,000 from dataset
sample_data = resample(data,
             replace=True,
             n_samples=10000,
             random_state=42)

print(sample_data.shape)

(10000, 21)


In [4]:

test = resample(data, # take exact same sample and store in test (this will be what we compare imputation to)
             replace=True,
             n_samples=10000,
             random_state=42)

test.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
121958,0.0,1.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,11.0,5.0,6.0
146867,0.0,1.0,1.0,29.0,1.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,4.0,5.0,30.0,0.0,1.0,10.0,4.0,1.0
131932,0.0,1.0,1.0,24.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,8.0,6.0,7.0
103694,1.0,1.0,1.0,27.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,4.0,20.0,2.0,1.0,0.0,10.0,4.0,3.0
119879,1.0,0.0,1.0,34.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,10.0,0.0,0.0,1.0,5.0,6.0,8.0


In [5]:
#for each column, randomly replace approx 5% of rows with Nan (so )
for col in sample_data.columns:
    sample_data.loc[sample_data.sample(frac=0.05).index, col] = np.nan

In [6]:
sample_data.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
121958,0.0,1.0,1.0,23.0,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,11.0,5.0,6.0
146867,0.0,1.0,1.0,29.0,1.0,0.0,,,0.0,1.0,...,1.0,0.0,4.0,5.0,30.0,0.0,1.0,10.0,4.0,1.0
131932,0.0,1.0,1.0,24.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,8.0,6.0,7.0
103694,1.0,1.0,1.0,27.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,4.0,20.0,2.0,1.0,0.0,10.0,4.0,3.0
119879,1.0,0.0,1.0,34.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,2.0,10.0,0.0,0.0,1.0,5.0,6.0,8.0


In [7]:
number_nan = sample_data.isna().sum().sum()
print(number_nan)

10938


In [8]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler

In [9]:
scaler = MinMaxScaler()
sample_data = pd.DataFrame(scaler.fit_transform(sample_data), columns = sample_data.columns)

In [10]:
test = pd.DataFrame(scaler.fit_transform(test), columns = test.columns)

In [11]:
imputer = KNNImputer(n_neighbors=5)
df = pd.DataFrame(imputer.fit_transform(sample_data),columns = sample_data.columns)

In [12]:
np.nan in df.values #no more nan values

False

In [13]:
df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,0.25,0.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.8,0.714286
1,0.0,1.0,1.0,0.315217,1.0,0.0,0.0,0.4,0.0,1.0,...,1.0,0.0,0.75,0.166667,1.0,0.0,1.0,0.75,0.6,0.0
2,0.0,1.0,1.0,0.26087,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.583333,1.0,0.857143
3,1.0,1.0,1.0,0.293478,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.75,0.666667,0.066667,1.0,0.0,0.75,0.6,0.285714
4,1.0,0.0,1.0,0.369565,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.25,0.333333,0.0,0.0,1.0,0.333333,1.0,1.0


https://stefvanbuuren.name/fimd/sec-evaluation.html - How to evaluate imputation methods

Now df has our imputed scaled values, test has the original scaled. Time to compare

In [14]:
comparison_df = pd.DataFrame()
comparison_df


In [15]:
for col in df.columns:
    comparison_df[col]=np.where(df[col] == test[col], 0, (df[col] - test[col])**2) # store squared difference

In [16]:
comparison_df.shape

(10000, 21)

In [17]:
comparison_df.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,Diabetes,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
totals = list()
for col in df.columns:
    totals.append(comparison_df[col].sum())

In [19]:
totals

[132.0,
 131.24,
 25.64,
 3.4914272211720223,
 137.44000000000003,
 15.360000000000003,
 9.600000000000001,
 100.00000000000001,
 136.48000000000002,
 80.72000000000001,
 28.680000000000003,
 24.76,
 35.16000000000001,
 28.995,
 25.1528,
 30.801466666666663,
 50.48000000000001,
 132.76,
 30.97499999999999,
 18.337600000000013,
 42.34448979591838]

Using root mean square error

In [20]:
rmse = (sum(totals)/number_nan)**0.5

In [21]:
rmse

0.3340298697619632