### Impute missing data with kNN using fancyimpute
Note: this code must be run with Python 3.

In [1]:
import numpy as np
from fancyimpute import KNN

Using TensorFlow backend.


In [2]:
import pandas as pd
# Load the raw CSV with header=None: no row is used as column names, so the
# file's own header line ends up as data row 0 and columns are labelled 0..8.
s = pd.read_csv(
    'diabetes_data.csv',
    sep=',',
    header=None,
)
# Preview the first rows of the loaded table.
s.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1,6,148,72,35,0,33.6,0.627,50,1
2,1,85,66,29,0,26.6,0.351,31,0
3,8,183,64,0,0,23.3,0.672,32,1
4,1,89,66,23,94,28.1,0.167,21,0


In [3]:
# Turn the loaded table into a plain NumPy array. Row 0 still holds the
# column names (the CSV was read with header=None), so the slices below
# start at row 1.
s = np.array(s)

# Split the data rows into features (first 8 columns) and target (column 8),
# converting both to float.
X, y = s[1:, :8].astype(float), s[1:, 8].astype(float)

In [4]:
# Show the first five feature rows as a sanity check.
print(X[:5])

[[6.000e+00 1.480e+02 7.200e+01 3.500e+01 0.000e+00 3.360e+01 6.270e-01
  5.000e+01]
 [1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
  3.100e+01]
 [8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
  3.200e+01]
 [1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01]]


In [5]:
# Work on every column except the first (number of pregnancies): a zero there
# is a legitimate value, so it must not be treated as missing.
# Take an explicit copy: a plain slice is only a view into X, so replacing
# zeros with NaN in X1 afterwards would silently overwrite X as well.
X1 = X[:,1:].copy()

In [6]:
# Zeros in these columns stand for missing measurements; overwrite them
# in place with NaN so the imputer can recognise them.
np.putmask(X1, X1 == 0, np.nan)

In [7]:
# Fill the NaN entries with fancyimpute's KNN imputer, using k=20.
# NOTE(review): the columns are passed in unscaled; if the imputer's distance
# is scale-sensitive, large-valued columns may dominate the neighbour search
# -- confirm whether the features should be standardised first.
knn_imputer = KNN(k=20)
X1 = knn_imputer.fit_transform(X1)

Imputing row 1/768 with 1 missing, elapsed time: 0.165
Imputing row 101/768 with 2 missing, elapsed time: 0.168
Imputing row 201/768 with 1 missing, elapsed time: 0.170
Imputing row 301/768 with 3 missing, elapsed time: 0.173
Imputing row 401/768 with 2 missing, elapsed time: 0.175
Imputing row 501/768 with 0 missing, elapsed time: 0.178
Imputing row 601/768 with 1 missing, elapsed time: 0.181
Imputing row 701/768 with 0 missing, elapsed time: 0.183


In [8]:
# Inspect the first five rows after imputation.
X1[:5]

array([[1.48000000e+02, 7.20000000e+01, 3.50000000e+01, 1.89419613e+02,
        3.36000000e+01, 6.27000000e-01, 5.00000000e+01],
       [8.50000000e+01, 6.60000000e+01, 2.90000000e+01, 6.32566607e+01,
        2.66000000e+01, 3.51000000e-01, 3.10000000e+01],
       [1.83000000e+02, 6.40000000e+01, 2.71197640e+01, 2.22401769e+02,
        2.33000000e+01, 6.72000000e-01, 3.20000000e+01],
       [8.90000000e+01, 6.60000000e+01, 2.30000000e+01, 9.40000000e+01,
        2.81000000e+01, 1.67000000e-01, 2.10000000e+01],
       [1.37000000e+02, 4.00000000e+01, 3.50000000e+01, 1.68000000e+02,
        4.31000000e+01, 2.28800000e+00, 3.30000000e+01]])

In [9]:
# Put the untouched first feature column back in front of the imputed columns.
X2 = np.hstack([X[:, :1], X1])

# Append the target as the last column so features and label are saved together.
Xy = np.hstack([X2, y.reshape(-1, 1)])

In [10]:
# Write the combined feature/target table to a text file. The '% f' format
# prints fixed-point floats with a leading space in place of a plus sign.
np.savetxt(
    'diabetes_data_imputed_knn20.txt',
    Xy,
    fmt='% f',
)