<h1> AWS and Machine Learning </h1>

> Data Cleaning - Strategies for Missing Values - Mean Imputation

GitHub Link: https://github.com/data-analytics-professionals

In [None]:
# Stretch the Jupyter Notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from IPython.display import Image

# Load the cover illustration from disk and render it in the cell output
cover_path = "../project_diabetes/photos/Strategies for Missing Values – Impute, Compare and Extensively Analyse.png"
cover = Image(filename=cover_path)
display(cover)

> Data Acquisition 

In [None]:
# Load the libraries (numpy, pandas) for data analysis and data wrangling
import numpy as np
import pandas as pd

# Load the diabetes dataset into the DataFrame `df`.
# NOTE(review): the file carries an .xls extension but is parsed as CSV text — confirm the file format.
df = pd.read_csv(filepath_or_buffer='data/pima_indians_diabetes.xls')

In [None]:
# Preview ten randomly chosen rows of the dataset
df.sample(10)

In [None]:
# Print a concise summary of the DataFrame: index dtype, column dtypes,
# non-null counts per column and memory usage.
df.info()

In [None]:
# Show descriptive statistics for the columns of df
df.describe()

> Analyzing Missingness Percentage

In [None]:
# Build the nullity mask df_nullity: True wherever a value in df is missing
df_nullity = df.isna()

In [None]:
# Show the first ten rows of the nullity mask
df_nullity.head(10)

In [None]:
# Count the missing values in each column
missing_totals = df_nullity.sum()
print('Total Missing Values:\n', missing_totals)

In [None]:
# Share of missing values per column, expressed in percent
df_nullity_percent = df_nullity.mean().mul(100)
print('Percentage of Missing Values:\n', df_nullity_percent)

> Visualize Missingness

In [None]:
# Import missingno as msno
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline

# List the matplotlib style sheet names that can be passed to plt.style.use
print(plt.style.available)


In [None]:
# Apply the 'Solarize_Light2' matplotlib style sheet to the plots that follow
plt.style.use('Solarize_Light2')

In [None]:
# Bar chart summarising the nullity of each column of df.
# The former `inline=False` argument is intentionally omitted: missingno
# deprecated the `inline` parameter and later removed it, so passing it raises
# a TypeError on current releases. Omitting it keeps the previous behaviour
# (the figure is rendered by plt.show() below).
msno.bar(
    df,
    figsize=(24, 10),
    fontsize=16,
    labels=None,
    log=False,
    color='limegreen',
    filter=None,
    n=0,
    p=0,
    sort=None,
    ax=None
)

plt.show()


> Plot the nullity matrix of df

In [None]:
# Nullity matrix of df, drawn in red with the row-completeness sparkline enabled.
# The former `inline=False` argument is intentionally omitted: missingno
# deprecated the `inline` parameter and later removed it, so passing it raises
# a TypeError on current releases. Omitting it keeps the previous behaviour
# (the figure is rendered by plt.show() below).
msno.matrix(
    df=df,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(25, 10),
    width_ratios=(15, 1),
    color=(1,0,0),
    fontsize=16,
    labels=None,
    sparkline=True,
    freq=None,
    ax=None,
)

plt.show()


In [None]:
# Sort the diabetes DataFrame df by 'Serum_Insulin' so that rows with similar
# (and missing) insulin values sit next to each other in the matrix below.
sorted_values = df.sort_values(by=['Serum_Insulin'])

# Nullity matrix of the sorted frame, drawn in green.
# The former `inline=False` argument is intentionally omitted: missingno
# deprecated the `inline` parameter and later removed it, so passing it raises
# a TypeError on current releases. Omitting it keeps the previous behaviour
# (the figure is rendered by plt.show() below).
msno.matrix(
    sorted_values,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(25, 10),
    width_ratios=(15, 1),
    color=(0,1,0),
    fontsize=16,
    labels=None,
    sparkline=True,
    freq=None,
    ax=None,
)

plt.show()

> Correlations Between Missing Data (Missingness)

> Remember: correlations between the missingness of different columns help us gain a deeper understanding of the type of missing data we are dealing with.
> They also point to suitable ways in which the missing values can be addressed.


In [None]:
# Heatmap of the nullity correlation between the columns of the diabetes
# DataFrame df (colour scale fixed to the range -1 .. 1).
# The former `inline=False` argument is intentionally omitted: missingno
# deprecated the `inline` parameter and later removed it, so passing it raises
# a TypeError on current releases. Omitting it keeps the previous behaviour
# (the figure is rendered by plt.show() below).
msno.heatmap(
    df,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(20, 12),
    fontsize=16,
    labels=True,
    cmap='gist_rainbow',
    vmin=-1,
    vmax=1,
    cbar=True,
    ax=None,
)

# Show plot
plt.show()

In [None]:
# Dendrogram that clusters the columns of the diabetes DataFrame df by their
# nullity pattern, using average linkage.
# The former `inline=False` argument is intentionally omitted: missingno
# deprecated the `inline` parameter and later removed it, so passing it raises
# a TypeError on current releases. Omitting it keeps the previous behaviour
# (the figure is rendered by plt.show() below).
msno.dendrogram(
    df,
    method='average',
    filter=None,
    n=0,
    p=0,
    orientation=None,
    figsize=None,
    fontsize=16,
    ax=None,
)

# Show plot
plt.show()

> More topics: 
    Writing useful functions

In [None]:
import numpy as np
from numpy.random import rand

# Write a function that automates creating dummy values for missing data
def fill_dummy_values(df, scaling_factor=0.075):
    """Return a copy of ``df`` whose missing values are replaced by dummy values.

    For every column that contains missing values, random placeholders are
    drawn so that they fall just below the column's observed minimum, in the
    band ``[min - 2 * scaling_factor * range, min - scaling_factor * range)``.
    This makes missing entries visible on a scatter plot without overlapping
    the real observations.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is deep-copied and never modified.
    scaling_factor : float, default 0.075
        Fraction of the column range that sets the width and offset of the
        dummy-value band.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with the missing entries replaced by dummy values.
        Columns without missing values are returned unchanged.
    """
    df_dummy = df.copy(deep=True)
    for col_name in df_dummy:
        col = df_dummy[col_name]
        col_null = col.isnull()
        # Number of missing values in this column
        num_nulls = col_null.sum()
        if num_nulls == 0:
            # Nothing to fill; this also avoids min/max arithmetic on complete
            # columns, which may not be numeric.
            continue

        col_min = col.min()
        col_range = col.max() - col_min
        # rand() yields values in [0, 1), so (rand - 2) lies in [-2, -1)
        dummy_values = rand(num_nulls) - 2
        # Scale by scaling_factor * range and shift towards the column minimum
        dummy_values = dummy_values * scaling_factor * col_range + col_min

        # Write through .loc on the frame itself. Assigning into the extracted
        # Series (col[col_null] = ...) is chained assignment, which does not
        # update df_dummy when pandas Copy-on-Write is active.
        df_dummy.loc[col_null, col_name] = dummy_values
    return df_dummy

> Generate scatter plot with missingness

In [None]:
# Build diabetes_dummy: a copy of df whose missing entries hold dummy values
diabetes_dummy = fill_dummy_values(df)

# Combine the nullity masks of Serum_Insulin and BMI (used to colour the points)
nullity_Serum_Insulin_BMI = df['Serum_Insulin'].isnull() + df['BMI'].isnull()

# Scatter plot of Serum_Insulin against BMI, coloured by the combined
# nullity of Serum_Insulin and BMI
diabetes_dummy.plot.scatter(
    x='Serum_Insulin',
    y='BMI',
    alpha=0.5,
    c=nullity_Serum_Insulin_BMI,
    colormap='gist_rainbow',
    title='Serum Insulin Vs BMI',
)

plt.show()

> Excellent! From the plot above, let's take a close look at how the missing values of Serum_Insulin and BMI interact with each other. We don't see any specific correlation between the missingness of Serum_Insulin and the missingness of BMI.

> Let's find the interaction between Skin_Fold and BMI

In [None]:
# Combine the nullity masks of Skin_Fold and BMI (used to colour the points)
nullity_Skin_Fold_BMI = df['Skin_Fold'].isnull() + df['BMI'].isnull()

# Scatter plot of Skin_Fold against BMI, coloured by the combined nullity
# of Skin_Fold and BMI
diabetes_dummy.plot.scatter(
    x='Skin_Fold',
    y='BMI',
    alpha=0.5,
    c=nullity_Skin_Fold_BMI,
    colormap='gist_rainbow',
    title='Skin_Fold Vs BMI',
)

plt.show()

Remember: We have already discussed and performed deletion tasks.
    To recall, there are two types of deletion:
    1. Pairwise deletion
    2. Listwise deletion (Complete Case Analysis)
    Note: deletion is used when the values are MCAR (missing completely at random)

"""

Listwise and pairwise deletion are the most common techniques for handling missing data.  
It is important to understand that, in the vast majority of cases, a key assumption behind either of these techniques is that the data is missing completely 
at random (MCAR).

In other words, analysts or researchers need to show that the probability of missing data on their dependent variable is unrelated to the other independent variables 
as well as to the dependent variable itself.

"""

> Mean Imputations

In [None]:
from sklearn.impute import SimpleImputer

# Work on a deep copy so the original df keeps its missing values
diabetes_mean = df.copy(deep=True)

# Imputer that fills each missing value using the 'mean' strategy
mean_imputer = SimpleImputer(strategy='mean')

# Fit on the copy and write the imputed values back into diabetes_mean
mean_filled = mean_imputer.fit_transform(diabetes_mean)
diabetes_mean.iloc[:, :] = mean_filled

> Median Imputations

In [None]:
# Work on a deep copy so the original df keeps its missing values
diabetes_median = df.copy(deep=True)

# Imputer that fills each missing value using the 'median' strategy
median_imputer = SimpleImputer(strategy='median')

# Fit on the copy and write the imputed values back into diabetes_median
median_filled = median_imputer.fit_transform(diabetes_median)
diabetes_median.iloc[:, :] = median_filled

> Mode Imputation


In [None]:
# Work on a deep copy so the original df keeps its missing values
diabetes_mode = df.copy(deep=True)

# Imputer that fills each missing value using the 'most_frequent' strategy
mode_imputer = SimpleImputer(strategy='most_frequent')

# Fit on the copy and write the imputed values back into diabetes_mode
mode_filled = mode_imputer.fit_transform(diabetes_mode)
diabetes_mode.iloc[:, :] = mode_filled

>Imputing a constant

In [None]:
# Work on a deep copy so the original df keeps its missing values
diabetes_constant = df.copy(deep=True)

# Imputer that replaces every missing value with the constant 0
constant_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Fit on the copy and write the imputed values back into diabetes_constant
constant_filled = constant_imputer.fit_transform(diabetes_constant)
diabetes_constant.iloc[:, :] = constant_filled

>Scatterplot of imputation

In [None]:
# Combine the nullity masks of Serum_Insulin and BMI from the original df
# (used to colour the points)
nullity = df['Serum_Insulin'].isnull() + df['BMI'].isnull()

# Scatter plot of Serum_Insulin against BMI taken from the mean-imputed frame
diabetes_mean.plot.scatter(
    x='Serum_Insulin',
    y='BMI',
    alpha=0.5,
    c=nullity,
    cmap='gist_rainbow',
    title='Mean Imputation',
)

plt.show()

>Comparison of Different Imputation Strategies and their Visual Analysis 

In [None]:
# 2 x 2 grid of axes, one panel per imputation strategy
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20,20))

# Combined nullity of Serum_Insulin and BMI in the original df (point colour)
nullity = df['Serum_Insulin'].isnull() + df['BMI'].isnull()

# Map each panel title to the DataFrame produced by that imputation strategy
imputations = {
    'Mean Imputation': diabetes_mean,
    'Median Imputation': diabetes_median,
    'Most Frequent Imputation': diabetes_mode,
    'Constant Imputation' : diabetes_constant
}

# Pair every axis with one (title, imputed frame) entry and draw its scatter plot
for ax, (panel_title, imputed_df) in zip(axes.flat, imputations.items()):
    imputed_df.plot.scatter(
        x='Serum_Insulin',
        y='BMI',
        alpha=0.5,
        c=nullity,
        cmap='gist_rainbow',
        ax=ax,
        colorbar=False,
        title=panel_title,
    )
    

> Summary:
> The non-missing values of the plotted features are clearly correlated, but none of our imputation strategies captures that correlation in the imputed values, and this creates a bias.
> These imputation strategies do preserve the basic statistical properties of each column, but they don't account for the correlations between columns.
   

> To deal with this real-world problem we need more robust imputation strategies, which we will learn in the next lecture.