**3. Analyse the above techniques**

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.datasets import load_iris

## Loading Data

In [3]:
# Seed a dedicated generator so the injected missing values -- and therefore
# every statistic computed from `df` -- are the same on each
# Restart & Run All.
SEED = 42
MISSING_FRACTION = 0.05  # probability that any single cell is blanked out
rng = np.random.default_rng(SEED)

data = load_iris()
df = pd.DataFrame(np.array(data.data), columns=data.feature_names)
df["class"] = data.target
# Add random NA values for analysis: each cell (the "class" column included)
# is independently replaced by NaN with probability MISSING_FRACTION.
df = df.mask(rng.random(df.shape) < MISSING_FRACTION)
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
count,147.0,142.0,142.0,141.0,141.0
mean,5.836735,3.052817,3.762676,1.180851,1.007092
std,0.827861,0.439002,1.760142,0.761757,0.815006
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [4]:
# Peek at the first five rows to see where NaN values were injected.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,,0.2,
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
# Number of missing entries in each column.
df.isna().sum(axis=0)

sepal length (cm)    3
sepal width (cm)     8
petal length (cm)    8
petal width (cm)     9
class                9
dtype: int64

## KNN Imputation

In [6]:
from sklearn.impute import KNNImputer

# Use the canonical `np.nan`; the upper-case `np.NAN` alias is not available
# in NumPy 2.x.
knn_imputer = KNNImputer(missing_values=np.nan)

# The imputer's result is wrapped in a DataFrame that reuses df's column
# names and index so it lines up with the original frame.
# NOTE: the "class" column is imputed together with the features, so
# filled-in labels may not be whole numbers.
knn_df = pd.DataFrame(
    knn_imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Sanity check: every column should report zero missing values.
print(knn_df.isna().sum())
knn_df.describe()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
class                0
dtype: int64


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
count,150.0,150.0,150.0,150.0,150.0
mean,5.8424,3.052133,3.7612,1.202667,0.993333
std,0.826114,0.432169,1.765635,0.767136,0.813119
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## Single Value Imputation (Mean)

In [7]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of its column.
# `np.nan` is used because the upper-case `np.NAN` alias is not available in
# NumPy 2.x.
simple_imputer = SimpleImputer(missing_values=np.nan, strategy="mean")

# Re-attach df's column names and index to the imputed values.
simple_df = pd.DataFrame(
    simple_imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Sanity check: every column should report zero missing values.
print(simple_df.isna().sum())
simple_df.describe()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
class                0
dtype: int64


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
count,150.0,150.0,150.0,150.0,150.0
mean,5.836735,3.052817,3.762676,1.180851,1.007092
std,0.819484,0.427054,1.712238,0.738392,0.790009
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.2,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## Most Frequent Value Replacement

In [8]:
# Replace every NaN with the most frequent value (mode) of its column.
# `np.nan` is used because the upper-case `np.NAN` alias is not available in
# NumPy 2.x.
mf_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")

# Re-attach df's column names and index to the imputed values.
mf_df = pd.DataFrame(
    mf_imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Sanity check: every column should report zero missing values.
print(mf_df.isna().sum())
mf_df.describe()

sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
class                0
dtype: int64


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
count,150.0,150.0,150.0,150.0,150.0
mean,5.82,3.05,3.642,1.122,1.006667
std,0.82787,0.42722,1.786612,0.774499,0.79001
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.5,0.2,0.0
50%,5.75,3.0,4.2,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


## Global Constant Replacement

In [9]:
# Replace missing class labels with one fixed constant. The value 2 is one of
# the labels already present in the "class" column (its observed max is 2).
# `np.nan` is used because the upper-case `np.NAN` alias is not available in
# NumPy 2.x.
gcr_imputer = SimpleImputer(missing_values=np.nan, strategy="constant", fill_value=2)

# Only the "class" column is imputed here; double brackets keep it as a
# one-column DataFrame, which is the 2-D input the imputer is given.
# The index is carried over so the result aligns with df, matching the other
# imputation cells.
gcr_df = pd.DataFrame(
    gcr_imputer.fit_transform(df[["class"]]),
    columns=["class"],
    index=df.index,
)

# Sanity check: the column should report zero missing values.
print(gcr_df.isna().sum())
gcr_df.describe()

class    0
dtype: int64


Unnamed: 0,class
count,150.0
mean,1.066667
std,0.824675
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,2.0
