# Load the dataset

In [None]:
import pandas as pd
# Read the raw dataset; '?' is registered as an extra marker for missing
# values so those cells are parsed as NaN.
data = pd.read_csv(
    filepath_or_buffer='Datasets/missing.csv',
    na_values=['?'],
)
# Preview the first five rows.
data.head(5)

---

# Check for missing values

In [None]:
# Number of missing entries in each column.
data.isna().sum()

---

# Single imputation: Fill with the mean, median, or mode of the column.

## 1. Fillna()

In [None]:
import numpy as np
from statistics import mode
# Work on an independent (deep) copy so the loaded `data` frame stays untouched.
df = data.copy(deep=True)

In [None]:
# Numerical values
# Series.mean() skips NaN by default, so the mean is taken over the observed
# CPR values only.
mean_of_CPR_column = df['CPR'].mean()
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['CPR']: that chained in-place call acts on an intermediate object, raises
# a FutureWarning in pandas 2.x and does not update `df` under Copy-on-Write.
df['CPR'] = df['CPR'].fillna(mean_of_CPR_column)

In [None]:
# Categorical values
# Use pandas' own mode, which ignores NaN (dropna=True). statistics.mode()
# treats NaN as an ordinary value, so missing entries could be counted and
# even returned as the "mode", turning the fill below into a no-op.
# Series.mode() returns all modes sorted; on a tie the smallest one is taken.
mode_of_Level_column = df['Level'].mode(dropna=True).iloc[0]
df['Level'] = df['Level'].fillna(value=mode_of_Level_column)

## 2. SimpleImputer (sklearn)

In [None]:
from sklearn.impute import SimpleImputer
# Fresh deep copy of the raw data for the SimpleImputer examples.
df2 = data.copy(deep=True)

In [None]:
# Numerical values: learn the mean of CPR, then write the imputed column back.
imputer_num = SimpleImputer(strategy='mean')
imputer_num.fit(df2[['CPR']])
df2[['CPR']] = imputer_num.transform(df2[['CPR']])

In [None]:
# Categorical values: learn the most frequent Level, then write the imputed
# column back.
imputer_cat = SimpleImputer(strategy='most_frequent')
imputer_cat.fit(df2[['Level']])
df2[['Level']] = imputer_cat.transform(df2[['Level']])

---

# Multiple imputation: Model each column that has missing values from the other columns, and fill the gaps with the model's predictions.

## 1. IterativeImputer (sklearn)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Fresh deep copy of the raw data for the IterativeImputer example.
df3 = data.copy(deep=True)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
# Impute the two columns jointly, using a linear regression as the estimator
# that predicts the missing entries.
itr = IterativeImputer(estimator=LinearRegression())
iter_cols = ['Units', 'Connectivity']
iter_filled = itr.fit_transform(df3[iter_cols])
df3[iter_cols] = iter_filled

---

# KNN (k-nearest neighbors): Fill each missing value using the values of the most similar examples.

## 1. KNNImputer (sklearn)

In [None]:
from sklearn.impute import KNNImputer
# Fresh deep copy of the raw data for the KNNImputer example.
df4 = data.copy(deep=True)

In [None]:
# Impute the two columns from each row's 7 nearest neighbours.
knn_imp = KNNImputer(n_neighbors=7)
knn_cols = ['Units', 'Connectivity']
knn_filled = knn_imp.fit_transform(df4[knn_cols])
df4[knn_cols] = knn_filled

---

# Delete the entire rows/columns

In [None]:
# Fresh deep copy of the raw data for the row-deletion example.
df5 = data.copy(deep=True)

In [None]:
# Total number of rows in df5.
df5.shape[0]

In [None]:
# Number of rows that would remain if every row with any missing value were
# dropped (df5 itself is not modified here).
df5.dropna().shape[0]

In [None]:
# Show the rows that have more than two missing values.
df5.loc[df5.isna().sum(axis=1) > 2]

In [None]:
# Drop, in place, every row that has more than two missing values.
missing_per_row = df5.isna().sum(axis=1)
idx = missing_per_row[missing_per_row > 2].index
df5.drop(index=idx, inplace=True)

In [None]:
# Number of rows currently in df5.
df5.shape[0]

---

# Good-bye