<a href="https://colab.research.google.com/github/chrismarkella/Kaggle-access-from-Google-Colab/blob/master/SimpleImputerTutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SimpleImputer tutorial

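This tutorial was inspired by Anne Bonner's Medium article [The complete beginner's guide to data cleaning and preprocessing](https://towardsdatascience.com/the-complete-beginners-guide-to-data-cleaning-and-preprocessing-2070b7d4c6d). It compares the fill strategies of scikit-learn's `SimpleImputer` (mean, median, most frequent, and constant) on a small DataFrame that contains a single missing value.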



In [0]:
import numpy as np
import pandas as pd

In [33]:
# Toy dataset written row by row, one tuple per animal, so it reads like the
# table displayed below. The single np.nan in 'worth' is the value the
# imputers are asked to fill in.
column_names = ['animal', 'age', 'worth', 'friendly']
animal_rows = [
    ('Cat', 4, 30000, 'No'),
    ('Dog', 17, 20000, 'Yes'),
    ('Moose', 6, 40000, 'No'),
    ('Dog', 8, 20000, 'No'),
    ('Moose', 4, np.nan, 'Yes'),
    ('Moose', 5, 10000, 'Yes'),
]
df = pd.DataFrame(animal_rows, columns=column_names)
df

Unnamed: 0,animal,age,worth,friendly
0,Cat,4,30000.0,No
1,Dog,17,20000.0,Yes
2,Moose,6,40000.0,No
3,Dog,8,20000.0,No
4,Moose,4,,Yes
5,Moose,5,10000.0,Yes


In [34]:
from sklearn.impute import SimpleImputer

# One imputer per fill strategy. Later cells refer to these objects by name.
imputer_mean = SimpleImputer(strategy='mean')
imputer_median = SimpleImputer(strategy='median')
imputer_most_frequent = SimpleImputer(strategy='most_frequent')
# 'constant' with no fill_value: in the run recorded further down, the missing
# 'worth' is filled with 0. The misspelled name ("defult") is kept on purpose
# because later cells reference it.
imputer_constant_defult = SimpleImputer(strategy='constant')
imputer_constant_one = SimpleImputer(strategy='constant', fill_value=1)

# Pick the numeric columns by label instead of by position, so the selection
# stays correct if columns are added to or reordered in `df`.
numeric_df = df[['age', 'worth']]

# fit_transform hands back an unlabeled array, so re-attach both the column
# labels and the original row index; otherwise the rows would be renumbered
# 0..n-1 and no longer line up with `df` when its index is not the default.
imputed_by_mean_df = pd.DataFrame(
    imputer_mean.fit_transform(numeric_df),
    columns=numeric_df.columns,
    index=numeric_df.index,
)
imputed_by_mean_df


Unnamed: 0,age,worth
0,4.0,30000.0
1,17.0,20000.0
2,6.0,40000.0
3,8.0,20000.0
4,4.0,24000.0
5,5.0,10000.0


In [35]:
# All imputers from the earlier cell, in the order their results are printed.
imputers = [
    imputer_mean, imputer_median, imputer_most_frequent,
    imputer_constant_defult, imputer_constant_one,
]

# The 'age' and 'worth' columns (positions 1 and 2) are the ones being imputed.
numeric_part = df.iloc[:, 1:3]

for imputer in imputers:
    filled_values = imputer.fit_transform(numeric_part)
    # Wrap the result in a DataFrame that carries the original column labels.
    imputed_df = pd.DataFrame(filled_values, columns=numeric_part.columns)
    print(imputed_df)

    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0  24000.0
5   5.0  10000.0
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0  20000.0
5   5.0  10000.0
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0  20000.0
5   5.0  10000.0
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0      0.0
5   5.0  10000.0
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0      1.0
5   5.0  10000.0


In [36]:
# (label, imputer) pairs; the label is printed as a heading above each result.
imputers_tup = list(zip(
    [
        'imputer_mean',
        'imputer_median',
        'imputer_most_frequent',
        'imputer_constant_defult',
        'imputer_constant_one',
    ],
    [
        imputer_mean,
        imputer_median,
        imputer_most_frequent,
        imputer_constant_defult,
        imputer_constant_one,
    ],
))

# The 'age' and 'worth' columns (positions 1 and 2) are the ones being imputed.
columns_to_impute = df.iloc[:, 1:3]

for name, imputer in imputers_tup:
    filled_values = imputer.fit_transform(columns_to_impute)
    # Wrap the result in a DataFrame that carries the original column labels.
    imputed_df = pd.DataFrame(filled_values, columns=columns_to_impute.columns)
    print(name)
    print(imputed_df)

imputer_mean
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0  24000.0
5   5.0  10000.0
imputer_median
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0  20000.0
5   5.0  10000.0
imputer_most_frequent
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0  20000.0
5   5.0  10000.0
imputer_constant_defult
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0      0.0
5   5.0  10000.0
imputer_constant_one
    age    worth
0   4.0  30000.0
1  17.0  20000.0
2   6.0  40000.0
3   8.0  20000.0
4   4.0      1.0
5   5.0  10000.0
