# Data Imputation - KNNImpute

In [1]:
#Python 3.x
import numpy as np
from numpy import genfromtxt
from fancyimpute import KNN
import os, sys
from os.path import isfile, join

# change current folder to parent folder
sys.path.append("..")
from imputation import Imputation

Using TensorFlow backend.


### Define knnImpute Modules

In [2]:
class knnImpute(Imputation):

    def preprocess(self, inputData):
        """
	    Reads a dataset (complete dataset without missing values) and introduces missingness in the dataset.
        :param inputData: 
			FilePath to the (complete) dataset
        :return:
            X_incomplete: numpy array with dropped entries
        """
        X = genfromtxt(inputData, delimiter=',')
        # X is a data matrix which we're going to randomly drop entries from
        missing_mask = np.random.rand(*X.shape) < 0.1
        X_incomplete = X.copy()
        # missing entries indicated with NaN
        X_incomplete[missing_mask] = np.nan
        return X_incomplete


    def train(self, train_data):
        # KNN is a lazy learning machine learning algorithm - no training is required
        pass


    def test(self, trained_model, test_data):
        # No testing
        pass
		
    def impute(self, trained_model, input):
        """
        Loads the input table and gives the imputed table
    
    	:param trained_model: trained model returned by train function - not used in our case
    	:param input: input table which needs to be imputed
    	:return:
    		X_filled_knn: imputed table as a numpy array
        """
        # Use 3 nearest rows which have a feature to fill in each row's missing features
        # will not use trained_model as training happens during imputation
        X_incomplete = input
        knnImpute = KNN(k=3)
        X_filled_knn = knnImpute.fit_transform(X_incomplete)
        return X_filled_knn
    

    def evaluate(self, trained_model, input, *args, **kwargs):
        """
        Loads the original dataset and calculates the performance on the imputed table through RMSE.

        :param trained_model: trained model returned by train function- not used in our case
        :param input: imputed table on which model needs to be evaluated
        :param kwargs:
            kwargs.inputData: FilePath to the (complete) dataset
        :return:
            knn_mse: rmse
        """
        inputData = kwargs['inputData']      
        X_filled_knn = input
        X = genfromtxt(inputData, delimiter=',')
        missing_mask = np.random.rand(*X.shape) < 0.1
        #take X, original table through args
        knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
        # normalize the RMSE
        knn_mse = knn_mse if knn_mse < 1 else knn_mse / 1000
        return knn_mse
    
    def save_model(self, file):
        # No models saved
        pass

    def load_model(self, file):
        # No models loaded
        pass

## Processing Code

In [3]:
# initialize with the dataset that needs to be imputed
input_file_path = 'wdbc.csv'
# switch to data directory which is outside our codebase and contains the dataset
input_file_path = join(os.pardir, "data", input_file_path)
#print a numpy array without scientific notation
np.set_printoptions(suppress=True)

### create an instance of the knnImpute class

In [4]:
knnimpute = knnImpute()

### preprocess the data - introduce missingness

In [5]:
preprocess = knnimpute.preprocess(input_file_path)
print("Incomplete Data:")
print(preprocess)

Incomplete Data:
[[  1.        1.       15.46    ...   0.1514    0.2837    0.08019]
 [  2.        2.       12.89    ...   0.05366   0.2309    0.06915]
 [  3.        2.       14.96    ...   0.1489    0.2962        nan]
 ...
 [567.            nan  27.42    ...       nan   0.2641    0.07427]
 [568.            nan       nan ...   0.08288   0.321     0.07863]
 [569.            nan  13.17    ...   0.1045    0.2235    0.06925]]


### impute the data

In [6]:
impute = knnimpute.impute(trained_model = '', input = preprocess)
print("Imputed Data:")
print(impute)

Imputing row 1/569 with 0 missing, elapsed time: 0.256
Imputing row 101/569 with 2 missing, elapsed time: 0.261
Imputing row 201/569 with 3 missing, elapsed time: 0.267
Imputing row 301/569 with 5 missing, elapsed time: 0.276
Imputing row 401/569 with 3 missing, elapsed time: 0.285
Imputing row 501/569 with 0 missing, elapsed time: 0.291
Imputed Data:
[[  1.           1.          15.46       ...   0.1514       0.2837
    0.08019   ]
 [  2.           2.          12.89       ...   0.05366      0.2309
    0.06915   ]
 [  3.           2.          14.96       ...   0.1489       0.2962
    0.08168809]
 ...
 [567.           1.27183173  27.42       ...   0.19671593   0.2641
    0.07427   ]
 [568.           2.00000004  11.75798509 ...   0.08288      0.321
    0.07863   ]
 [569.           1.99999997  13.17       ...   0.1045       0.2235
    0.06925   ]]


### evaluate the imputed data with RMS Error

In [7]:
evaluate = knnimpute.evaluate(trained_model = '', input = impute, inputData = input_file_path)
print("RMSE:")
print(evaluate)

RMSE:
0.2854681490733865


### save imputed data as a csv

In [8]:
output_file_path = "imputation_test_output.csv"
np.savetxt("imputation_test_output.csv", impute, delimiter=",")
# return file path of the imputed data (stored as a csv file)
#return output_file_path