# Get data

In [None]:
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Remote location of the Titanic passenger workbook (legacy .xls format).
# NOTE(review): plain-HTTP link to an external host; confirm it is still being served.
url = 'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls'

In [None]:
import pandas as pd
import missingno as mn

In [None]:
# Fetch the workbook at `url` and parse it into the raw DataFrame used by every later cell.
df = pd.read_excel(io=url)

In [None]:
# Share of missing entries per column, expressed as a percentage.
df.isna().mean().mul(100)

In [None]:
# Nullity matrix: one row per record, with a gap wherever a value is missing.
# The trailing semicolon hides the returned Axes repr so only the figure is shown.
mn.matrix(df);

In [None]:
# Nullity correlation heatmap: how strongly missingness in one column tracks
# missingness in another. vmin/vmax set the colour scale bounds to [-1, 1].
mn.heatmap(df, vmin=-1, vmax=1);

In [None]:
# Dendrogram: hierarchical clustering of the columns by how similar their
# missing-value patterns are.
mn.dendrogram(df);

In [None]:
# Bar chart of how complete each column is (count of non-missing values).
mn.bar(df);

## Drop missing data

The `cabin`, `body`, and `boat` columns each have more than 50% missing values and should probably be dropped.

In [None]:
# Preview the nullity matrix with the three mostly-empty columns removed.
# `drop` returns a new frame, so `df` itself is left unchanged.
mn.matrix(df.drop(['cabin', 'body', 'boat'], axis='columns'));

We can also drop rows in which every one of a chosen set of columns is missing (below: rows where both `age` and `home.dest` are missing).

In [None]:
# Discard a row only when both `age` and `home.dest` are missing, then plot
# the nullity matrix of what remains. `df` itself is not modified.
mn.matrix(
    df.dropna(how='all', subset=['age', 'home.dest'])
);

## Impute data

### Univariate imputation

In [None]:
# Labels of every numeric column in `df`; shown as the cell output for inspection.
num_cols = df.select_dtypes(include='number').columns
num_cols

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Univariate imputer: each missing entry is replaced by the mean of its own column
# ('mean' is the library default, written out here for the reader).
simple_imputer = SimpleImputer(strategy='mean')

In [None]:
# Work on an independent copy so the raw frame `df` stays untouched.
df1 = df.copy(deep=True)

In [None]:
# Fit the mean imputer on the numeric columns of the raw frame and write the
# filled values into the same columns of the copy.
df1.loc[:, num_cols] = simple_imputer.fit_transform(df.loc[:, num_cols])

In [None]:
# Re-plot the nullity matrix for the imputed copy. Only the numeric columns
# were filled, so gaps in any other columns are still expected.
mn.matrix(df1);

### Multivariate imputation

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Start again from a fresh, independent copy of the raw frame.
df1 = df.copy(deep=True)

In [None]:
# Multivariate imputer: estimates each feature's missing values from the other features.
# random_state is fixed so repeated runs give the same result.
imputer = IterativeImputer(random_state=0)

In [None]:
# Fit the iterative imputer on the numeric columns of the raw frame and write
# the filled values into the same columns of the copy.
df1.loc[:, num_cols] = imputer.fit_transform(df.loc[:, num_cols])

In [None]:
# Re-plot the nullity matrix for the imputed copy. Only the numeric columns
# were filled, so gaps in any other columns are still expected.
mn.matrix(df1);

### What is the difference?

In [None]:
import numpy as np

In [None]:
# Toy data: two columns that are equal wherever both are observed, followed by
# three rows with a value missing in the second, the first, and both columns.
x = np.array(
    [
        [10.0, 10.0],
        [1.0, 1.0],
        [2.0, 2.0],
        [10.0, 10.0],
        [10.0, np.nan],
        [np.nan, 10.0],
        [np.nan, np.nan],
    ],
    dtype=float,
)

In [None]:
# Re-fit the univariate imputer on the toy array and show the result.
# Each missing entry is filled from its own column only, ignoring the other column.
simple_imputer.fit_transform(x)

In [None]:
# Re-fit the multivariate imputer on the same toy array for comparison.
# Here a missing entry is estimated from the other column of the same row,
# so the output differs from the univariate result.
imputer.fit_transform(x)