# Using KNNImputer

In [None]:
import pandas as pd

# Horse-colic CSV: the file has no header row and marks missing entries with '?'
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = pd.read_csv(
    url,
    header=None,    # no header row -> columns are labelled 0, 1, 2, ...
    na_values='?',  # parse '?' as NaN
)
# preview the first rows
dataframe.head()

In [None]:
# print a summary of the loaded frame (per-column dtype and non-null count)
dataframe.info()

In [None]:
# summarize the number of rows with missing values for each column

n_rows = dataframe.shape[0]
for i in range(dataframe.shape[1]):
    # Select the i-th column as a Series so that the NaN count is a scalar.
    # Selecting with a list (dataframe[[i]]) gives a one-column DataFrame whose
    # .sum() is a length-1 Series; formatting that with %d / %.1f relies on
    # implicit int()/float() coercion of a Series, which pandas has deprecated.
    n_miss = dataframe.iloc[:, i].isnull().sum()
    perc = n_miss / n_rows * 100
    print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))

In [None]:
# build the k-nearest-neighbours imputer
from sklearn.impute import KNNImputer

# n_neighbors=5 is scikit-learn's documented default, spelled out for the reader
imputer = KNNImputer(n_neighbors=5)

In [None]:
# learn from the dataset, then fill its missing entries;
# fit() returns the fitted imputer, so transform() can be chained onto it
Xtrans = imputer.fit(dataframe).transform(dataframe)

In [None]:
# fit and transform in a single call; bind the result to `Xtrans` (capital X),
# the name the following cell reads, instead of an unused lowercase `xtrans`
Xtrans = imputer.fit_transform(dataframe)

In [None]:
# wrap the imputed values in a DataFrame
Xtrans = pd.DataFrame(Xtrans)
# total number of NaN cells left after imputation (per-column counts, summed)
Xtrans.isna().sum().sum()

# Using SimpleImputer

In [None]:
import numpy as np

# Importing the SimpleImputer class
from sklearn.impute import SimpleImputer

# Toy 3x3 input with exactly one missing value (NaN) in every row
data = [
    [12, np.nan, 34],
    [10, 32, np.nan],
    [np.nan, 11, 20],
]
print("Original Data : \n", data)

# Build an imputer that treats NaN as missing and uses the 'mean' strategy,
# then fit it on the data (fit() returns the fitted imputer itself)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(data)

# Fill in the missing entries
data = imputer.transform(data)
print("Imputed Data : \n", data)


# Using fancyimpute

fancyimpute is a library of missing-data imputation algorithms. It uses machine learning algorithms to impute missing values, and it uses all the columns to do so. There are two ways missing data can be imputed using fancyimpute:

- KNN or K-Nearest Neighbor
- MICE or Multiple Imputation by Chained Equation

### K-Nearest Neighbor

To fill in the missing values, KNN finds similar data points across all the features. It then takes the average of those points to fill in the missing values.

In [2]:
# One-time setup, if the package is missing:
#   !pip install fancyimpute
#   easy_install fancyimpute             (run on the conda prompt)
#   pip install numpy --upgrade --user   (run on the conda prompt)

import pandas as pd
import numpy as np
# KNN imputer from the fancyimpute library
from fancyimpute import KNN

# small example frame; NaN marks a missing cell
cols = list('ABCD')
rows = [
    [np.nan, 2, np.nan, 0],
    [3, 4, np.nan, 1],
    [np.nan, np.nan, np.nan, 5],
    [np.nan, 3, np.nan, 4],
    [5, 7, 8, 2],
    [2, 5, 7, 9],
]
df = pd.DataFrame(rows, columns=cols)

# show the frame before imputation
print(df)

# instantiate the KNN imputer with its defaults and fill the missing values
knn_imputer = KNN()
df = knn_imputer.fit_transform(df)

# put the column labels back on the imputed values and show the result
print(pd.DataFrame(df, columns=cols))


     A    B    C  D
0  NaN  2.0  NaN  0
1  3.0  4.0  NaN  1
2  NaN  NaN  NaN  5
3  NaN  3.0  NaN  4
4  5.0  7.0  8.0  2
5  2.0  5.0  7.0  9
Imputing row 1/6 with 2 missing, elapsed time: 0.001
          A         B         C    D
0  3.235569  2.000000  7.756303  0.0
1  3.000000  4.000000  7.825000  1.0
2  3.676471  3.463866  7.640000  5.0
3  3.355140  3.000000  7.591837  4.0
4  5.000000  7.000000  8.000000  2.0
5  2.000000  5.000000  7.000000  9.0


### Multiple Imputation by Chained Equation:

MICE uses multiple imputation instead of single imputation, so that the statistical uncertainty of the imputed values is taken into account. MICE performs multiple regressions over the sample data and takes the average of the results.

In [3]:
import pandas as pd
import numpy as np

# scikit-learn requires this import before its experimental IterativeImputer can be used
from sklearn.experimental import enable_iterative_imputer
# IterativeImputer (MICE) as exposed by the fancyimpute library
# NOTE(review): fancyimpute appears to re-export scikit-learn's IterativeImputer - confirm
from fancyimpute import IterativeImputer

# small example frame; NaN marks a missing cell
cols = list('ABCD')
rows = [
    [np.nan, 2, np.nan, 0],
    [3, 4, np.nan, 1],
    [np.nan, np.nan, np.nan, 5],
    [np.nan, 3, np.nan, 4],
    [5, 7, 8, 2],
    [2, 5, 7, 9],
]
df = pd.DataFrame(rows, columns=cols)

# show the frame before imputation
print(df)

# instantiate the MICE imputer with its defaults and fill the missing values
mice_imputer = IterativeImputer()
df = mice_imputer.fit_transform(df)

# put the column labels back on the imputed values and show the result
print(pd.DataFrame(df, columns=cols))


     A    B    C  D
0  NaN  2.0  NaN  0
1  3.0  4.0  NaN  1
2  NaN  NaN  NaN  5
3  NaN  3.0  NaN  4
4  5.0  7.0  8.0  2
5  2.0  5.0  7.0  9
          A         B         C    D
0  1.739135  2.000000  7.906731  0.0
1  3.000000  4.000000  7.919353  1.0
2  2.304308  4.235237  7.441668  5.0
3  1.608699  3.000000  7.481066  4.0
4  5.000000  7.000000  8.000000  2.0
5  2.000000  5.000000  7.000000  9.0


# Using autoimpute

### Main Features
- Utility functions to examine patterns in missing data and decide on relevant features for imputation
- Missingness classifier and automatic missing data test set generator
- Native handling for categorical variables (as predictors and targets of imputation)
- Single and multiple imputation classes for pandas DataFrames
- Custom visualization support for utility functions and imputation methods
- Analysis methods and pooled parameter inference using multiply imputed datasets
- Numerous imputation methods, as specified in the table below:

In [4]:
# pip install autoimpute

from autoimpute.imputations import SingleImputer, MultipleImputer, MiceImputer

# Three imputers, each created with default settings:
#   si   - pass through data once
#   mi   - pass through data multiple times
#   mice - pass through data multiple times and iteratively optimize
#          imputations in each column
si, mi, mice = SingleImputer(), MultipleImputer(), MiceImputer()



https://kearnz.github.io/autoimpute-tutorials/