# Data Cleaning: Imputation

## Import Modules

In [21]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

## Prepare Data

In [25]:
# Toy dataset: six people, two of whom (Michael and Sabine) have no recorded age.
# The column name 'salery' is kept as spelled because later cells select it by
# that exact key.
data = dict(
    name=["Peter", "Paul", "Michael", "Tanja", "Erna", "Sabine"],
    age=[53, 27, np.nan, 42, 80, np.nan],
    salery=[53000, 27000, 50500, 42000, 80000, 50500],
    sex=["m", "m", "m", "f", "f", "f"],
)

df = pd.DataFrame(data)
df

Unnamed: 0,name,age,salery,sex
0,Peter,53.0,53000,m
1,Paul,27.0,27000,m
2,Michael,,50500,m
3,Tanja,42.0,42000,f
4,Erna,80.0,80000,f
5,Sabine,,50500,f


## Fill NaN with Pandas

In [26]:
# Replace every missing age with the mean of the known ages.
# `assign` builds a new frame, so `df` stays untouched and no in-place
# fillna (inplace=True) is needed.
overall_mean_age = df['age'].mean()
df_pd1 = df.assign(age=df['age'].fillna(overall_mean_age))
df_pd1

Unnamed: 0,name,age,salery,sex
0,Peter,53.0,53000,m
1,Paul,27.0,27000,m
2,Michael,50.5,50500,m
3,Tanja,42.0,42000,f
4,Erna,80.0,80000,f
5,Sabine,50.5,50500,f


## Fill NaN with Pandas, Conditioned on a Group

In [27]:
# Impute each missing age with the mean age of that row's own 'sex' group.
# transform('mean') returns one value per row (aligned to df's index), so it
# can be passed straight to fillna.
means = df.groupby('sex')['age'].transform('mean')
df_pd2 = df.assign(age=df['age'].fillna(means))
df_pd2

Unnamed: 0,name,age,salery,sex
0,Peter,53.0,53000,m
1,Paul,27.0,27000,m
2,Michael,40.0,50500,m
3,Tanja,42.0,42000,f
4,Erna,80.0,80000,f
5,Sabine,61.0,50500,f


## Fill NaN with scikit-learn SimpleImputer

In [28]:
# Mean imputation again, this time with scikit-learn's SimpleImputer.
imputer = SimpleImputer(strategy='mean')

# The imputer expects 2-D input, hence the double brackets (a one-column frame
# rather than a Series).
age_imputed = imputer.fit_transform(df[['age']])

df_sk1 = df.copy()
df_sk1['age'] = age_imputed
df_sk1

Unnamed: 0,name,age,salery,sex
0,Peter,53.0,53000,m
1,Paul,27.0,27000,m
2,Michael,50.5,50500,m
3,Tanja,42.0,42000,f
4,Erna,80.0,80000,f
5,Sabine,50.5,50500,f


## Fill NaN with scikit-learn Linear Regression

In [30]:
# Impute missing ages by regressing age on salary.
# In this toy dataset every known age is exactly salery / 1000, so the fit is
# perfect and both gaps are filled with 50.5.
df_sk2 = df.copy()

# Rows whose age is missing. The mask is built once from df_sk2 itself and
# reused for both the split and the write-back, so the rows that get predicted
# are guaranteed to be the rows that get filled (the mask no longer depends on
# the state of a different frame).
age_missing = df_sk2['age'].isna()

# Split into training rows (age known) and rows to predict (age missing)
train_data = df_sk2[~age_missing]
predict_data = df_sk2[age_missing]

# Feature (salary) and target (age) for the regression
X_train = train_data[['salery']]
y_train = train_data['age']

# Create and fit the linear regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the missing ages from the salary
X_predict = predict_data[['salery']]
predicted_age = regressor.predict(X_predict)

# Write the predictions back; predicted_age is in the same row order as
# predict_data, which was selected with the same mask.
df_sk2.loc[age_missing, 'age'] = predicted_age
df_sk2

Unnamed: 0,name,age,salery,sex
0,Peter,53.0,53000,m
1,Paul,27.0,27000,m
2,Michael,50.5,50500,m
3,Tanja,42.0,42000,f
4,Erna,80.0,80000,f
5,Sabine,50.5,50500,f
