<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/Imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [2]:
# Toy housing table: 'Bed' and 'Room' each contain one missing value,
# while 'Bath' is complete.
housing = dict(
    Bed=[1, 1, 2, 10, np.nan],
    Bath=[1, 1, 2, 2, 3],
    Room=[np.nan, 5, 6, 7, 7],
)

# Rows are labelled 'A' through 'E'.
df = pd.DataFrame(housing, index=list('ABCDE'))
df

Unnamed: 0,Bed,Bath,Room
A,1.0,1,
B,1.0,1,5.0
C,2.0,2,6.0
D,10.0,2,7.0
E,,3,7.0


In [3]:
# Names of the columns that hold at least one missing value.
columns_with_NaN = [name for name in df if df[name].isna().any()]
columns_with_NaN

['Bed', 'Room']

In [4]:
# Option 1: drop the columns listed in columns_with_NaN (returns a new frame).
df.drop(columns=columns_with_NaN)

Unnamed: 0,Bath
A,1
B,1
C,2
D,2
E,3


In [5]:
# Same result in one call: dropna removes every column containing a NaN.
df.dropna(axis=1)

Unnamed: 0,Bath
A,1
B,1
C,2
D,2
E,3


In [0]:
from sklearn.impute import SimpleImputer

def imputation(extension=False, verbose=False, data=None, columns=None):
    """Impute missing values with three ``SimpleImputer`` strategies.

    One copy of the input frame is made per strategy ('mean', 'median',
    'most_frequent'). In each copy the selected columns are overwritten with
    the output of that strategy's imputer. The input frame is never modified.

    Parameters
    ----------
    extension : bool, default False
        If True, add one boolean ``<col>_was_missing`` column per imputed
        column, marking the rows whose value was missing in the input.
    verbose : bool, default False
        If True, print every frame after imputation and, when ``extension``
        is True, print it again after the indicator columns are added.
    data : pandas.DataFrame, optional
        Frame to impute. When omitted, the notebook-level ``df`` is used.
    columns : iterable of column labels, optional
        Columns to impute. When omitted together with ``data``, the
        notebook-level ``columns_with_NaN`` is used. When only ``data`` is
        given, the columns of ``data`` that contain a missing value are used.

    Returns
    -------
    list of (SimpleImputer, pandas.DataFrame)
        One ``(imputer, imputed_frame)`` pair per strategy, in the order
        mean, median, most_frequent. If there are no columns to impute the
        imputers are returned unfitted and the frames are plain copies.
    """
    if data is None:
        # Backward-compatible path: fall back to the notebook globals.
        data = df
        if columns is None:
            columns = columns_with_NaN
    elif columns is None:
        # An explicit frame was passed: derive its NaN columns instead of
        # trusting a global list that may describe a different frame.
        columns = [col for col in data.columns if data[col].isnull().any()]
    columns = list(columns)

    strategies = ('mean', 'median', 'most_frequent')
    imputers = [
        (SimpleImputer(strategy=strategy), data.copy())
        for strategy in strategies
    ]

    for _imputer, _df in imputers:
        # fit_transform rejects a frame with zero columns, so skip the call
        # when there is nothing to impute.
        if columns:
            _df[columns] = _imputer.fit_transform(data[columns])
        if verbose:
            print(_df)

    if extension:
        for _, _df in imputers:
            for col in columns:
                # The mask is taken from the untouched input, not the copy.
                _df[col + '_was_missing'] = data[col].isnull()
            if verbose:
                print(f'{_df}')
    return imputers


In [7]:
# Run the three strategies with the defaults, then show each imputer
# followed by the frame it produced.
imputers_lst = imputation()

for _imputer, _df in imputers_lst:
    print(_imputer, _df, sep='\n')

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)
    Bed  Bath  Room
A   1.0     1  6.25
B   1.0     1  5.00
C   2.0     2  6.00
D  10.0     2  7.00
E   3.5     3  7.00
SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)
    Bed  Bath  Room
A   1.0     1   6.5
B   1.0     1   5.0
C   2.0     2   6.0
D  10.0     2   7.0
E   1.5     3   7.0
SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)
    Bed  Bath  Room
A   1.0     1   7.0
B   1.0     1   5.0
C   2.0     2   6.0
D  10.0     2   7.0
E   1.0     3   7.0


In [8]:
# Run again with extension=True so each frame also carries the
# '<col>_was_missing' indicator columns.
imputers_lst = imputation(extension=True)

for _imputer, _df in imputers_lst:
    print(_imputer, _df, sep='\n')

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)
    Bed  Bath  Room  Bed_was_missing  Room_was_missing
A   1.0     1  6.25            False              True
B   1.0     1  5.00            False             False
C   2.0     2  6.00            False             False
D  10.0     2  7.00            False             False
E   3.5     3  7.00             True             False
SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='median', verbose=0)
    Bed  Bath  Room  Bed_was_missing  Room_was_missing
A   1.0     1   6.5            False              True
B   1.0     1   5.0            False             False
C   2.0     2   6.0            False             False
D  10.0     2   7.0            False             False
E   1.5     3   7.0             True             False
SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_val

In [9]:
# With verbose=True the function prints the frames itself: once after
# imputing, and once more after adding the indicator columns.
imputers_lst = imputation(
    extension=True,
    verbose=True,
)


    Bed  Bath  Room
A   1.0     1  6.25
B   1.0     1  5.00
C   2.0     2  6.00
D  10.0     2  7.00
E   3.5     3  7.00
    Bed  Bath  Room
A   1.0     1   6.5
B   1.0     1   5.0
C   2.0     2   6.0
D  10.0     2   7.0
E   1.5     3   7.0
    Bed  Bath  Room
A   1.0     1   7.0
B   1.0     1   5.0
C   2.0     2   6.0
D  10.0     2   7.0
E   1.0     3   7.0
    Bed  Bath  Room  Bed_was_missing  Room_was_missing
A   1.0     1  6.25            False              True
B   1.0     1  5.00            False             False
C   2.0     2  6.00            False             False
D  10.0     2  7.00            False             False
E   3.5     3  7.00             True             False
    Bed  Bath  Room  Bed_was_missing  Room_was_missing
A   1.0     1   6.5            False              True
B   1.0     1   5.0            False             False
C   2.0     2   6.0            False             False
D  10.0     2   7.0            False             False
E   1.5     3   7.0             Tru