<h1> AWS and Machine Learning </h1>

> Data Cleaning - Strategies for Missing Values – Detect Hidden Missing Values and Convert into NULL Values and Analyse

GitHub Link: https://github.com/data-analytics-professionals

In [None]:
# Set the Jupyter Notebook container width to 100%.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
from IPython.display import Image
# Load the cover image (path is relative to the notebook's working directory)
# and render it in the cell output.
cover = Image(filename="../project_diabetes/photos/Detect Hidden Missing Values.png")
display(cover)

> Data Acquisition 

In [None]:
# Load the libraries (numpy, pandas) for data analysis and data wrangling
import numpy as np
import pandas as pd

# Read the dataset 'diabetes.csv' into the DataFrame `df`
# (the path is relative to the notebook's working directory).
df = pd.read_csv('data/diabetes.csv')

In [None]:
# sample() returns a random sample of n rows from the DataFrame.
# No random_state is passed, so the rows shown differ from run to run.
df.sample(n=10)

In [None]:
# info() prints a concise summary of the DataFrame: index dtype, column dtypes,
# non-null counts and memory usage.
df.info()

In [None]:
# Show descriptive statistics of the data; a minimum of 0 in a column such as
# 'bmi' hints at hidden missing values (inspected in the following cells).
df.describe()

> Remember: BMI means body mass index

> The body mass index (BMI) is a measure that uses your height and weight to work out if your weight is healthy. 
> The BMI calculation divides an adult's weight in kilograms by their height in metres squared. 

> For example, a BMI of 25 means 25 kg/m².

In [None]:
# Collect the entries of column 'bmi' whose value is exactly 0
zero_bmi = df.loc[df['bmi'] == 0, 'bmi']

In [None]:
# Display zero_bmi (the 'bmi' entries equal to 0, with their row index)
zero_bmi

> Replacing Hidden Missing Values

In [None]:
# Set the 0 values of column 'bmi' to np.nan.
# A single .loc call selects rows and column in one step; the chained form
# `df.bmi[mask] = ...` assigns into an intermediate Series, which triggers
# SettingWithCopyWarning and does not modify `df` under pandas Copy-on-Write.
df.loc[df['bmi'] == 0, 'bmi'] = np.nan

In [None]:
# Show the rows of column 'bmi' that are now NaN
print(df.loc[df['bmi'].isna(), 'bmi'])

> Analyzing Missingness Percentage

In [None]:
# Build the nullity DataFrame df_nullity: True where a value is missing, False otherwise
df_nullity = df.isna()

In [None]:
# Display the first 10 rows of the nullity DataFrame
df_nullity.head(10)

In [None]:
# Calculate the total number of missing values per column
# (summing the boolean nullity frame counts each True as 1)
print('Total Missing Values:\n', df_nullity.sum())

In [None]:
# Percentage of missing values per column: the mean of a boolean column is the
# fraction of True entries, scaled here to percent
df_nullity_percent = 100 * df_nullity.mean()
print('Percentage of Missing Values:\n', df_nullity_percent)

> Visualize Missingness

In [None]:
# Import missingno as msno
import missingno as msno
import matplotlib.pyplot as plt
%matplotlib inline

# List the matplotlib style sheets available in this environment
print(plt.style.available)


In [None]:
# Set the Solarize_Light2 style for all subsequent matplotlib figures
plt.style.use('Solarize_Light2')

In [None]:
# Plot the amount of missingness per column of the DataFrame as a bar chart.
# The `inline` argument is deliberately not passed: it was deprecated and then
# removed from missingno, and False was its default, so omitting it keeps the
# same behaviour on old versions and avoids a TypeError on newer ones.
msno.bar(
    df,
    figsize=(24, 10),
    fontsize=16,
    labels=None,
    log=False,
    color='limegreen',
    filter=None,
    n=0,
    p=0,
    sort=None,
    ax=None
)

plt.show()


> Plot the nullity matrix of df

In [None]:
# Plot the nullity matrix of df (one row per record, gaps mark missing values).
# The `inline` argument is deliberately not passed: it was deprecated and then
# removed from missingno, and False was its default, so omitting it keeps the
# same behaviour on old versions and avoids a TypeError on newer ones.
msno.matrix(
    df=df,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(25, 10),
    width_ratios=(15, 1),
    color=(0,1,0),
    fontsize=16,
    labels=None,
    sparkline=True,
    freq=None,
    ax=None,
)

plt.show()


In [None]:
# Sort the diabetes DataFrame df on 'bmi' (sort_values places NaN last by default)
sorted_values = df.sort_values(by=['bmi'])

# Visualize the nullity matrix of the sorted data.
# The `inline` argument is deliberately not passed: it was deprecated and then
# removed from missingno, and False was its default, so omitting it keeps the
# same behaviour on old versions and avoids a TypeError on newer ones.
msno.matrix(
    sorted_values,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(25, 10),
    width_ratios=(15, 1),
    color=(0,1,0),
    fontsize=16,
    labels=None,
    sparkline=True,
    freq=None,
    ax=None,
)

plt.show()

> Correlations Between Missing Data

> Remember: finding correlations between the missing values of different columns helps us gain a deeper understanding of the type of missing data.
> It also points to suitable ways in which the missing values can be addressed.


In [None]:
# Plot the missingness heatmap of the diabetes data: nullity correlation
# between columns, on a colour scale from -1 to 1.
# The `inline` argument is deliberately not passed: it was deprecated and then
# removed from missingno, and False was its default, so omitting it keeps the
# same behaviour on old versions and avoids a TypeError on newer ones.
msno.heatmap(
    df,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(20, 12),
    fontsize=16,
    labels=True,
    cmap='RdBu',
    vmin=-1,
    vmax=1,
    cbar=True,
    ax=None,
)

# Show plot
plt.show()

In [None]:
# Plot the missingness dendrogram of the diabetes data (hierarchical clustering
# of columns by their nullity pattern, using average linkage).
# The `inline` argument is deliberately not passed: it was deprecated and then
# removed from missingno, and False was its default, so omitting it keeps the
# same behaviour on old versions and avoids a TypeError on newer ones.
msno.dendrogram(
    df,
    method='average',
    filter=None,
    n=0,
    p=0,
    orientation=None,
    figsize=None,
    fontsize=16,
    ax=None,
)

# Show plot
plt.show()

> More topics: 
    Writing useful functions

In [None]:
import numpy as np
from numpy.random import rand

# Write a function that automates creating dummy values for missing data
def fill_dummy_values(df, scaling_factor=0.075):
    """Return a copy of ``df`` with missing values replaced by random dummy values.

    For every column that contains missing values, each missing entry is
    replaced by a random value placed *below* the column's observed minimum,
    in the band ``[min - 2 * scaling_factor * range, min - scaling_factor * range]``
    where ``range = max - min`` of the column. This keeps rows with missing
    values visible in plots while separating them from the real data.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    scaling_factor : float, default 0.075
        Fraction of the column range used to size and offset the dummy band.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``df`` in which the missing values have been filled.
        Columns without missing values are returned unchanged.
    """
    df_dummy = df.copy(deep=True)
    for col_name in df_dummy:
        col = df_dummy[col_name]
        col_null = col.isnull()
        # Calculate number of missing values in column
        num_nulls = col_null.sum()
        if num_nulls == 0:
            # Nothing to fill. Skipping also keeps complete non-numeric
            # columns (e.g. strings) from failing on `max() - min()` below.
            continue
        # Calculate column range
        col_min = col.min()
        col_range = col.max() - col_min
        # rand() is in [0, 1), so (rand - 2) is in [-2, -1): the dummy values
        # are shifted below the column minimum by 1-2 times the scaled range.
        dummy_values = (rand(num_nulls) - 2) * scaling_factor * col_range + col_min
        # Assign through .loc on the frame itself. Writing into the extracted
        # Series (`col[col_null] = ...`) is chained assignment and does not
        # reach `df_dummy` when pandas Copy-on-Write is active.
        df_dummy.loc[col_null, col_name] = dummy_values
    return df_dummy

> Generate scatter plot with missingness - take a deep look at how the missing values of selected features interact

In [None]:
# Fill dummy values in diabetes_dummy
diabetes_dummy = fill_dummy_values(df)

# Flag the rows where 'age' or 'bmi' is missing. The element-wise OR of the two
# boolean masks is what `+` on boolean Series computes, written explicitly.
nullity = df['age'].isnull() | df['bmi'].isnull()

# Create a scatter plot of 'age' against 'bmi'
diabetes_dummy.plot(x='age', y='bmi', kind='scatter', alpha=0.5,

                    # Colour each point by whether 'age' or 'bmi' was missing
                    c=nullity,
                    cmap='nipy_spectral'
                   )

plt.show()

In [None]:
# Print the number of missing values in bmi
print(df['bmi'].isnull().sum())

# Drop rows where 'bmi' has a missing value.
# The result is rebound to `df` instead of using inplace=True, which makes the
# data flow explicit.
df = df.dropna(subset=['bmi'], how='any')

# Visualize the missingness of diabetes (df) after dropping missing values
# Plot nullity matrix of df.
# The `inline` argument is deliberately not passed: it was deprecated and then
# removed from missingno, and False was its default, so omitting it keeps the
# same behaviour on old versions and avoids a TypeError on newer ones.
msno.matrix(
    df=df,
    filter=None,
    n=0,
    p=0,
    sort=None,
    figsize=(25, 10),
    width_ratios=(15, 1),
    color=(0,1,0),
    fontsize=16,
    labels=None,
    sparkline=True,
    freq=None,
    ax=None,
)

plt.show()

> Awesome! Do you see how the bmi column has no missing values now?

"""

Will you delete?

Before deleting missing values completely, you must consider the factors for deletion. 
The simplest factor to consider is the size of the missing data. 

More complex reasons affecting missingness may require domain knowledge. 

I advise that you first identify the reason for the missingness and then perform the appropriate deletion.

1. Use msno.matrix() and msno.heatmap() to visualize missingness and the correlation between variables with missing data. 
2. Determine the pattern in the missingness. 
3. Lastly, if required, proceed to pairwise deletion or listwise deletion, depending on the type of missingness.

"""

> How to delete all missing data (NaNs) in the bmi feature, if we decide to do so

In [None]:
# Remove every row whose 'bmi' is missing. With a single column in `subset`,
# how='all' and how='any' select the same rows.
df = df.dropna(subset=["bmi"], how='all')