# Load the dataset

In [1]:
import pandas as pd
# Entries recorded as '?' in the file are parsed as missing values (NaN).
data = pd.read_csv(
    'Datasets/missing.csv',
    na_values=['?'],
)
data.head()

Unnamed: 0,Location_code,Level,Frequency,CPR,Units,Connectivity
0,LA,level 4,Monthly,76.0,,979.2
1,LA,level 1,Weekly,80.0,379.1,1125.2
2,,level 1,Daily,71.0,131.6,
3,TL,level 4,Monthly,32.0,118.1,
4,TL,level 2,Once,116.0,595.1,1503.2


# Check for missing values

In [2]:
# Number of missing entries in each column (isna is the same check as isnull).
data.isna().sum()

Location_code    143
Level             52
Frequency         65
CPR               29
Units            167
Connectivity     214
dtype: int64

---

# Single imputation: Fill with the mean, median, or mode of the column.

## 1. Fillna()

In [3]:
import numpy as np
from statistics import mode
# Work on an independent copy so the loaded frame stays untouched.
df = data.copy(deep=True)

In [4]:
# Numerical values
# Series.mean() skips NaN by default, so the mean is taken over the observed
# CPR values only.
mean_of_CPR_column = df['CPR'].mean()
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['CPR']: the in-place call operates on an intermediate Series (chained
# assignment), which pandas warns about and which does not update df when
# Copy-on-Write is enabled.
df['CPR'] = df['CPR'].fillna(mean_of_CPR_column)

In [5]:
# Categorical values
# Series.mode(dropna=True) ignores the missing entries, so NaN can never be
# picked as the fill value (statistics.mode counts every element it is given,
# missing ones included). Modes are returned sorted; on a tie the first
# (smallest) one is used.
mode_of_Level_column = df['Level'].mode(dropna=True).iloc[0]
df['Level'] = df['Level'].fillna(value=mode_of_Level_column)

## 2. SimpleImputer (sklearn)

In [6]:
from sklearn.impute import SimpleImputer
# Fresh copy of the loaded frame for the SimpleImputer examples.
df2 = data.copy(deep=True)

In [7]:
# Numerical values
# Learn the column mean first, then replace the missing CPR entries with it.
imputer_num = SimpleImputer(strategy='mean').fit(df2[['CPR']])
df2[['CPR']] = imputer_num.transform(df2[['CPR']])

In [8]:
# Categorical values
# Learn the most frequent Level first, then fill the missing entries with it.
imputer_cat = SimpleImputer(strategy='most_frequent').fit(df2[['Level']])
df2[['Level']] = imputer_cat.transform(df2[['Level']])

---

# Multiple imputation: Model each feature that has missing values from the other features and fill with what the model predicts.

## 1. IterativeImputer (sklearn)

In [9]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Fresh copy of the loaded frame for the IterativeImputer example.
df3 = data.copy(deep=True)

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
# Impute Units and Connectivity jointly, using a linear regression as the
# per-feature estimator. Only these two columns are passed to the imputer.
itr = IterativeImputer(
    estimator=LinearRegression(),
)
df3[['Units', 'Connectivity']] = itr.fit_transform(
    df3.loc[:, ['Units', 'Connectivity']]
)

---

# KNN (k-nearest neighbors): Fill missing values using the values of the most similar examples.

## 1. KNNImputer (sklearn)

In [11]:
from sklearn.impute import KNNImputer
# Fresh copy of the loaded frame for the KNNImputer example.
df4 = data.copy(deep=True)

In [12]:
# Fit on the two numeric columns, then fill their gaps from the 7 nearest rows.
knn_imp = KNNImputer(n_neighbors=7).fit(df4[['Units', 'Connectivity']])
df4[['Units', 'Connectivity']] = knn_imp.transform(
    df4[['Units', 'Connectivity']]
)

---

# Delete the entire rows/columns

In [13]:
# Fresh copy of the loaded frame for the row-deletion example.
df5 = data.copy(deep=True)

In [14]:
# Row count before any deletion.
len(df5.index)

1056

In [15]:
# Rows that would remain if every row with at least one missing value were dropped.
len(df5.dropna(axis=0, how='any'))

497

In [16]:
# Rows in which more than two fields are missing.
df5.loc[df5.isna().sum(axis=1).gt(2)]

Unnamed: 0,Location_code,Level,Frequency,CPR,Units,Connectivity
40,,,,45.0,523.1,
41,,,,96.0,244.1,746.2
179,,level 2,,54.0,410.6,
244,,level 4,,56.0,257.6,
285,,level 1,Daily,,491.6,
486,,,,24.0,730.1,1789.2
510,,level 3,Daily,,,1628.2
811,GK,,,66.0,460.1,
886,,,,100.0,451.1,


In [17]:
# Collect the labels of rows with more than two missing fields, then remove
# exactly those rows from df5 in place.
idx = df5.loc[df5.isna().sum(axis=1).gt(2)].index
df5.drop(index=idx, inplace=True)

In [18]:
# Row count after the sparse rows were removed.
len(df5.index)

1047

---

# Good-bye