# Data Imputation - MIKNNImpute

In [1]:
#Python 3.x
import numpy as np
from numpy import genfromtxt
from fancyimpute import BiScaler, SoftImpute
import os, sys
from os.path import isfile, join

# change current folder to parent folder
sys.path.append("..")
from imputation import Imputation

Using TensorFlow backend.


### Define miknnImpute Modules

In [2]:
class miknnImpute(Imputation):

    def preprocess(self, inputData):
        """
	    Reads a dataset (complete dataset without missing values) and introduces missingness in the dataset.
        :param inputData: 
			FilePath to the (complete) dataset
        :return:
            X_incomplete: numpy array with dropped entries
        """
        X = genfromtxt(inputData, delimiter=',')
        # X is a data matrix which we're going to randomly drop entries from
        missing_mask = np.random.rand(*X.shape) < 0.1
        X_incomplete = X.copy()
        # missing entries indicated with NaN
        X_incomplete[missing_mask] = np.nan
        return X_incomplete


    def train(self, train_data):
        # MIKNN - a variation of KNN is a lazy learning machine learning algorithm - no training is required
        pass


    def test(self, trained_model, test_data):
        # No testing
        pass
		
    def impute(self, trained_model, input):
        """
        Loads the input table and gives the imputed table
    
    	:param trained_model: trained model returned by train function - not used in our case
    	:param input: input table which needs to be imputed
    	:return:
    		X_filled_softimpute: imputed table as a numpy array
        """
        X_incomplete = input
        softImpute = SoftImpute()
        biscaler = BiScaler()
        X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
        X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
        X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)
        return X_filled_softimpute
    

    def evaluate(self, trained_model, input, *args, **kwargs):
        """
        Loads the original dataset and calculates the performance on the imputed table through RMSE.

        :param trained_model: trained model returned by train function- not used in our case
        :param input: imputed table on which model needs to be evaluated
        :param kwargs:
            kwargs.inputData: FilePath to the (complete) dataset
        :return:
            softImpute_mse: rmse
        """
        inputData = kwargs['inputData']
        X_filled_softimpute = input
        X = genfromtxt(inputData, delimiter=',')
        missing_mask = np.random.rand(*X.shape) < 0.1
        #take X, original table through args
        softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
        # normalize the RMSE
        softImpute_mse = softImpute_mse if softImpute_mse < 1 else softImpute_mse / 1000
        return softImpute_mse
    
    def save_model(self, file):
        # No models saved
        pass

    def load_model(self, file):
        # No models loaded
        pass

## Processing Code

In [3]:
# initialize with the dataset that needs to be imputed
input_file_path = 'wdbc.csv'
# switch to data directory which is outside our codebase and contains the dataset
input_file_path = join(os.pardir, "data", input_file_path)
#print a numpy array without scientific notation
np.set_printoptions(suppress=True)

### create an instance of the miknnImpute class

In [4]:
miknnimpute = miknnImpute()

### preprocess the data - introduce missingness

In [5]:
preprocess = miknnimpute.preprocess(input_file_path)
print("Incomplete Data:")
print(preprocess)

Incomplete Data:
[[      nan   1.       15.46    ...   0.1514    0.2837    0.08019]
 [  2.            nan  12.89    ...   0.05366   0.2309    0.06915]
 [  3.        2.       14.96    ...   0.1489    0.2962        nan]
 ...
 [567.        1.       27.42    ...   0.2625    0.2641        nan]
 [568.        2.       11.6     ...   0.08288       nan   0.07863]
 [569.        2.       13.17    ...   0.1045    0.2235    0.06925]]


### impute the data

In [6]:
impute = miknnimpute.impute(trained_model = '', input = preprocess)
print("Imputed Data:")
print(impute)

Imputed Data:
[[285.54897574   1.          15.46       ...   0.1514       0.2837
    0.08019   ]
 [  2.           2.0342342   12.89       ...   0.05366      0.2309
    0.06915   ]
 [  3.           2.          14.96       ...   0.1489       0.2962
    0.08443819]
 ...
 [567.           1.          27.42       ...   0.2625       0.2641
    0.09119885]
 [568.           2.          11.6        ...   0.08288      0.28030888
    0.07863   ]
 [569.           2.          13.17       ...   0.1045       0.2235
    0.06925   ]]


### evaluate the imputed data with RMS Error

In [7]:
evaluate = miknnimpute.evaluate(trained_model = '', input = impute, inputData = input_file_path)
print("RMSE:")
print(evaluate)

RMSE:
0.14860967309305653


### save imputed data as a csv

In [8]:
output_file_path = "imputation_test_output.csv"
np.savetxt("imputation_test_output.csv", impute, delimiter=",")

# return file path of the imputed data (stored as a csv file)
#return output_file_path