# Multiple Imputation with Chained Equations


Demonstration of Multiple Imputation with Chained Equations (MICE) using a toy dataset (Iris)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

# Plot formatting: select seaborn's 'ticks' style for the figures in this notebook
sns.set(style='ticks')

In [2]:
# Parameters
# Probability with which each numeric value is replaced by NaN further below
ratio_missing_data = 0.1

# Random number generator with a fixed seed, so every run draws the same numbers.
# Drop the seed argument to obtain a different random draw on each run.
rng = np.random.default_rng(73)

In [3]:
# Checks on parameters: the ratio must lie in the half-open interval [0, 1),
# i.e. zero missing values is allowed but every value missing is not
assert 0 <= ratio_missing_data < 1, 'Invalid missing data ratio'

## Initialize sample dataset

In [4]:
# Load sample data: seaborn's 'iris' example dataset
df = sns.load_dataset('iris')
# Preview the first rows as the cell output
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Quick visualization of data: pairwise plots of the columns, colored by species.
# The trailing semicolon keeps the PairGrid object's repr out of the cell output,
# so only the figure is shown.
sns.pairplot(df, hue='species');

<seaborn.axisgrid.PairGrid at 0x7f6fca543cf8>

In [None]:
# Split the columns by dtype: numeric features vs. everything else
numeric_df = df.select_dtypes(include=np.number)
nonnumeric_df = df.select_dtypes(exclude=np.number)

# Randomly replace some of the numeric values with NaNs: draw one random
# number per cell and keep only the cells whose draw is at least
# ratio_missing_data (all other cells become NaN)
missing_df = numeric_df.where(rng.random(numeric_df.shape) >= ratio_missing_data)

print('Number of rows:', missing_df.shape[0])
print('Number of non-NaN values in each column:')
print(missing_df.count())

## Multiple Imputation with Chained Equations

#### Show versions for documentation

In [None]:
# Record the NumPy version this notebook was run with
print(np.__version__)

In [None]:
# Print the pandas version report (needs pandas available under the name `pd`)
pd.show_versions()