# **Methods for Dealing with Missing Values.**

In [None]:
# Import Library.
import pandas as pd
import numpy as np
import warnings

# Silence all warnings so the rendered cell outputs stay uncluttered.
warnings.filterwarnings("ignore")

# Read the HMEQ CSV straight from its public URL and preview the first rows.
hmeq_url = "http://www.creditriskanalytics.net/uploads/1/9/5/1/19511601/hmeq.csv"
data = pd.read_csv(hmeq_url)
data.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1100,25860.0,39025.0,HomeImp,Other,10.5,0.0,0.0,94.366667,1.0,9.0,
1,1,1300,70053.0,68400.0,HomeImp,Other,7.0,0.0,2.0,121.833333,0.0,14.0,
2,1,1500,13500.0,16700.0,HomeImp,Other,4.0,0.0,0.0,149.466667,1.0,10.0,
3,1,1500,,,,,,,,,,,
4,0,1700,97800.0,112000.0,HomeImp,Office,3.0,0.0,0.0,93.333333,0.0,14.0,


In [None]:
# Keep a separate working copy so the deletions below never touch `data`.
df_copy = data.copy(deep=True)

In [None]:
# Count the missing entries in every column of the raw frame.
print(data.isna().sum())

BAD           0
LOAN          0
MORTDUE     518
VALUE       112
REASON      252
JOB         279
YOJ         515
DEROG       708
DELINQ      580
CLAGE       308
NINQ        510
CLNO        222
DEBTINC    1267
dtype: int64


### **Delete Rows (or Columns) with Missing Values.**

In [None]:
# Delete Entire Column (Feature) with Missing Values.
# Build the result from the untouched `data` frame instead of deleting the
# column in place: the cell can then be re-run without a KeyError on the
# already-removed "DEBTINC" column.
df_copy = data.drop(columns=["DEBTINC"])
print(df_copy.isnull().sum())
print(df_copy.shape)

BAD          0
LOAN         0
MORTDUE    518
VALUE      112
REASON     252
JOB        279
YOJ        515
DEROG      708
DELINQ     580
CLAGE      308
NINQ       510
CLNO       222
dtype: int64
(5960, 12)


In [None]:
# Keep only the rows that have a value in every remaining column.
df_copy = df_copy.dropna(axis=0, how="any")
print(df_copy.isna().sum())
print(df_copy.shape)

BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
dtype: int64
(4247, 12)


### **Impute Missing Values with Mean, Median, and Mode.**

In [None]:
""" Imputation Using the Mean Values. """

# Copy Dataframe.
df_mean_impute = data.copy()

# Replace missing values using Mean Imputation.
# Single-column variant:
#   df_mean_impute["CLNO"] = df_mean_impute["CLNO"].fillna(df_mean_impute["CLNO"].mean())
#
# A mean only exists for numeric columns. `numeric_only=True` is required on
# pandas >= 2.0, where DataFrame.mean() raises a TypeError when the frame
# holds string columns (REASON and JOB here). Those two columns are therefore
# left unfilled by this method.
numeric_means = df_mean_impute.mean(numeric_only=True)
df_mean_impute = df_mean_impute.fillna(numeric_means)
print(df_mean_impute.isnull().sum())
print(df_mean_impute.shape)

BAD          0
LOAN         0
MORTDUE      0
VALUE        0
REASON     252
JOB        279
YOJ          0
DEROG        0
DELINQ       0
CLAGE        0
NINQ         0
CLNO         0
DEBTINC      0
dtype: int64
(5960, 13)


In [None]:
""" Imputation Using the Median Values. """

# Copy Dataframe.
df_median_impute = data.copy()

# Replace missing values using Median Imputation.
# Single-column variant:
#   df_median_impute["CLNO"] = df_median_impute["CLNO"].fillna(df_median_impute["CLNO"].median())
#
# A median only exists for numeric columns. `numeric_only=True` is required on
# pandas >= 2.0, where DataFrame.median() raises a TypeError when the frame
# holds string columns (REASON and JOB here). Those two columns are therefore
# left unfilled by this method.
numeric_medians = df_median_impute.median(numeric_only=True)
df_median_impute = df_median_impute.fillna(numeric_medians)
print(df_median_impute.isnull().sum())
print(df_median_impute.shape)

BAD          0
LOAN         0
MORTDUE      0
VALUE        0
REASON     252
JOB        279
YOJ          0
DEROG        0
DELINQ       0
CLAGE        0
NINQ         0
CLNO         0
DEBTINC      0
dtype: int64
(5960, 13)


In [None]:
""" Imputation Using the Mode Values. """

# Copy Dataframe.
df_mode_impute = data.copy()

# Replace missing values using Mode Imputation.
# Single-column variant:
#   df_mode_impute["CLNO"] = df_mode_impute["CLNO"].fillna(df_mode_impute["CLNO"].mode()[0])
#
# DataFrame.mode() returns a DataFrame (one row per tied mode), and fillna()
# aligns a DataFrame argument on the row index, so passing it directly only
# fills the rows whose index happens to exist in the mode table. Taking the
# first row yields a Series keyed by column name, which fillna() applies to
# every row. Unlike mean/median, this also fills the string columns.
column_modes = df_mode_impute.mode().iloc[0]
df_mode_impute = df_mode_impute.fillna(column_modes)
print(df_mode_impute.isnull().sum())
print(df_mode_impute.shape)

BAD          0
LOAN         0
MORTDUE    518
VALUE      112
REASON     252
JOB        279
YOJ        515
DEROG      708
DELINQ     580
CLAGE      308
NINQ       510
CLNO       222
DEBTINC    226
dtype: int64
(5960, 13)


### **Imputation Method for Categorical Columns (Assigning a Unique Category).**

In [None]:
""" Imputation Using (Zero/Constant) Values. """

""" Replace missing values using Constant Imputation. """
# Single-column variant with a sentinel label:
#   df_constant_impute["REASON"].fillna("NA")

""" Replace missing values with a number. """
# Fill every gap in a fresh copy of the raw frame with the constant 0
# (numeric and string columns alike).
df_constant_impute = data.copy().fillna(value=0)
print(df_constant_impute.isna().sum())
print(df_constant_impute.shape)

BAD        0
LOAN       0
MORTDUE    0
VALUE      0
REASON     0
JOB        0
YOJ        0
DEROG      0
DELINQ     0
CLAGE      0
NINQ       0
CLNO       0
DEBTINC    0
dtype: int64
(5960, 13)


## **Using Algorithms that support Missing Values.**

In [None]:
from sklearn.impute import SimpleImputer, KNNImputer


def impute(X, method="none"):
    """Return ``X`` as a DataFrame after applying one missing-value strategy.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix. The ``"drop"`` method calls ``X.drop`` and therefore
        needs a DataFrame that has a ``"DEBTINC"`` column.
    method : str, default "none"
        - ``"none"``: wrap ``X`` in a DataFrame without changing any value.
        - ``"drop"``: remove the ``"DEBTINC"`` column (column labels are
          not kept in the result).
        - ``"constant"``, ``"mean"``, ``"median"``, ``"most_frequent"``:
          ``SimpleImputer`` with that strategy.
        - ``"knn"``: ``KNNImputer`` with 5 neighbours.

    Returns
    -------
    pandas.DataFrame
        The processed data. Except for ``"none"`` on a DataFrame input, the
        columns are labelled with default integer positions.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above.
    """
    if method == "none":
        return pd.DataFrame(X)
    if method == "drop":
        return pd.DataFrame(X.drop("DEBTINC", axis=1).values)

    # The method names for SimpleImputer are exactly its strategy names.
    simple_strategies = ("constant", "mean", "median", "most_frequent")
    if method in simple_strategies:
        imp = SimpleImputer(strategy=method)
    elif method == "knn":
        imp = KNNImputer(n_neighbors=5)
    else:
        # Fail with a clear message instead of an UnboundLocalError on `imp`.
        valid = ("none", "drop") + simple_strategies + ("knn",)
        raise ValueError(
            f"Unknown imputation method {method!r}; expected one of {valid}."
        )

    return pd.DataFrame(imp.fit_transform(X))

In [None]:
# Positions of the numeric predictor columns in `data` (this skips BAD, the
# string columns REASON and JOB, and DEBTINC).
slc = [1, 2, 3, 6, 7, 8, 9, 10, 11]

# Select the columns by position directly. Going through `data.values` first
# would coerce the whole mixed-type frame to an object array before slicing.
features = data.iloc[:, slc].values
features_impute = impute(features, "knn")

print(features_impute.isnull().sum())
print(features_impute.shape)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64
(5960, 9)


## **Imputation using Multivariate Imputation by Chained Equation (MICE).**

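MICE is a method for replacing missing values in a dataset via multiple imputations: each feature with missing values is modeled in turn from the other features, and the cycle is repeated for several rounds.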

In [None]:
# Import Library.
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Load Dataset.
# NOTE: this rebinds `data`, which held the HMEQ frame in the cells above.
# Re-run the HMEQ loading cell before re-running any of the earlier sections.
data = sns.load_dataset("titanic")

# Feature Engineering.
# Select the modelling columns and encode `sex` as 1 (male) / 0 (otherwise).
# `assign` returns a new frame, so nothing is written into a slice of the
# original (the chained-assignment pattern pandas warns about).
data = data[["survived", "pclass", "sex", "age", "sibsp", "parch", "fare"]].assign(
    sex=lambda frame: frame["sex"].eq("male").astype("int64")
)

# Handling Missing Values.
# The fixed random_state keeps the imputed values reproducible across runs.
imputer = IterativeImputer(
    imputation_order="ascending", max_iter=10, random_state=42, n_nearest_features=5
)
imputed_dataset = imputer.fit_transform(data)