# 1. Data Overview & Visualization

In [1]:
import numpy as np
import pandas as pd

%matplotlib inline
import squarify
import seaborn as sns
import plotly.tools as tls
import plotly.offline as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import plotly.figure_factory as ff
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

py.init_notebook_mode(connected=True)


# Time
from contextlib import contextmanager
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label printed in front of the elapsed time.

    Yields
    ------
    None
        Control is handed to the body of the ``with`` statement.

    Notes
    -----
    The message ``"<title> - done in <seconds>s"`` (whole seconds) is printed
    only when the wrapped block finishes without raising.
    """
    # `time` is not among the notebook's visible imports; import it locally so
    # that using this context manager does not fail with NameError.
    import time

    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

from src.visualization import target_count, target_percent, missing_plot, correlation_plot

In [2]:
# Read the diabetes CSV from a path relative to the notebook's working directory.
data_path = 'data/diabetes.csv'
data = pd.read_csv(data_path)
# Print dtypes and non-null counts, then show the first rows as the cell result.
data.info()
data.head() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


The target plots show that the data is unbalanced: 500 patients are non-diabetic (Outcome = 0) and 268 are diabetic (Outcome = 1).

In [3]:
# Split the rows by target: D holds Outcome != 0, H holds Outcome == 0.
# NOTE(review): this split happens before the 0 -> NaN replacement and the
# median imputation applied to `data` in later cells, so D and H presumably
# keep the raw values — confirm that is intended wherever they are used.
D = data[(data['Outcome'] != 0)]
H = data[(data['Outcome'] == 0)]

# Target plots from src.visualization (implementation not visible in this notebook).
target_count(data)
target_percent(data)

We saw in `data.head()` that some features contain 0. A value of 0 does not make sense for these measurements, so it indicates a missing value.
Below we replace these 0 values with NaN:

In [4]:
# A 0 in these measurements is treated as a missing value and re-encoded as NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)

Now we can look at where the missing values are:

In [5]:
# Plot the missing values of `data` with the project helper from src.visualization;
# 'Outcome' is passed as the target column name.
missing_plot(data, 'Outcome')

OK, all missing values are now encoded as NaN.

**To fill these NaN values, the data distribution needs to be understood against the target.**

A **correlation matrix** is a table showing correlation coefficients between sets of variables. Each random variable (Xi) in the table is correlated with each of the other values in the table (Xj). This allows you to see which pairs have the highest correlation.

In [6]:
# Draw the correlation matrix of `data` with the project helper from src.visualization.
correlation_plot(data)

To replace missing values, we'll use median by target (Outcome)

# 2. Replace missing values and EDA 

In [7]:
def median_target(var):
    """Return the median of ``var`` for each value of ``Outcome``.

    Rows of the global ``data`` frame where ``var`` is null are left out
    before grouping.

    Parameters
    ----------
    var : str
        Name of the column of ``data`` to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per ``Outcome`` value, with the columns ``Outcome`` and ``var``.
    """
    observed = data.loc[data[var].notnull(), [var, 'Outcome']]
    return observed.groupby(['Outcome'])[[var]].median().reset_index()


def plot_distribution(data_select, size_bin):
    """Plot the distribution of one feature for diabetic and healthy patients.

    Parameters
    ----------
    data_select : str
        Name of the column of the global ``data`` frame to plot.
    size_bin : int or float
        Histogram bin size, forwarded to ``ff.create_distplot`` as ``bin_size``.

    Notes
    -----
    Values are read from the current ``data`` frame at call time, and rows
    where the feature is null are skipped. The ``D`` / ``H`` frames are not
    used: they are selected before the 0 -> NaN replacement, so they would put
    the 0 placeholders for missing measurements into the histogram and the
    density estimate.
    """
    # Keep only rows with an observed value for the feature.
    observed = data.loc[data[data_select].notnull(), [data_select, 'Outcome']]
    diabetic = observed.loc[observed['Outcome'] != 0, data_select]
    healthy = observed.loc[observed['Outcome'] == 0, data_select]

    fig = ff.create_distplot(
        [diabetic, healthy],
        ['diabetic', 'healthy'],
        colors=['#FFD700', '#7EC0EE'],
        show_hist=True,
        bin_size=size_bin,
        curve_type='kde',
    )
    fig['layout'].update(title=data_select)

    py.iplot(fig, filename='Density plot')

## 2.1. Insulin

* **Insulin** : 2-Hour serum insulin (mu U/ml)

In [8]:
# Distribution of Insulin, diabetic vs. healthy.
# NOTE(review): the bin size is passed as 0 — confirm how create_distplot handles it.
plot_distribution('Insulin', 0)

In [9]:
# Median of the non-null Insulin values for each Outcome.
median_target('Insulin')

Unnamed: 0,Outcome,Insulin
0,0,102.5
1,1,169.5


Insulin's medians differ markedly by target: 102.5 for a healthy person and 169.5 for a diabetic person.

In [10]:
# Fill each missing Insulin value with the median of the observed values that
# share its Outcome. The medians are computed from `data` rather than copied
# from a displayed table, so they cannot drift out of sync with the dataset.
data['Insulin'] = data['Insulin'].fillna(
    data.groupby('Outcome')['Insulin'].transform('median')
)

## 2.2. Glucose

* **Glucose** : Plasma glucose concentration at 2 hours in an oral glucose tolerance test

In [11]:
# Distribution of Glucose, diabetic vs. healthy.
# NOTE(review): the bin size is passed as 0 — confirm how create_distplot handles it.
plot_distribution('Glucose', 0)

In [12]:
# Median of the non-null Glucose values for each Outcome.
median_target('Glucose')

Unnamed: 0,Outcome,Glucose
0,0,107.0
1,1,140.0


In [13]:
# Fill each missing Glucose value with the median of the observed values that
# share its Outcome. The medians are computed from `data` rather than copied
# from a displayed table, so they cannot drift out of sync with the dataset.
data['Glucose'] = data['Glucose'].fillna(
    data.groupby('Outcome')['Glucose'].transform('median')
)

107 for a healthy person and 140 for a diabetic person

## 2.3. SkinThickness

* **SkinThickness** : Triceps skin fold thickness (mm)

In [14]:
# Distribution of SkinThickness, diabetic vs. healthy, with a bin size of 10.
plot_distribution('SkinThickness', 10)

In [15]:
# Median of the non-null SkinThickness values for each Outcome.
median_target('SkinThickness')

Unnamed: 0,Outcome,SkinThickness
0,0,27.0
1,1,32.0


In [16]:
# Fill each missing SkinThickness value with the median of the observed values
# that share its Outcome. The medians are computed from `data` rather than
# copied from a displayed table, so they cannot drift out of sync with the dataset.
data['SkinThickness'] = data['SkinThickness'].fillna(
    data.groupby('Outcome')['SkinThickness'].transform('median')
)

27 for a healthy person and 32 for a diabetic person

## 2.4. BloodPressure

* **BloodPressure** : Diastolic blood pressure (mm Hg)

In [17]:
# Distribution of BloodPressure, diabetic vs. healthy, with a bin size of 5.
plot_distribution('BloodPressure', 5)

In [18]:
# Median of the non-null BloodPressure values for each Outcome.
median_target('BloodPressure')

Unnamed: 0,Outcome,BloodPressure
0,0,70.0
1,1,74.5


In [19]:
# Fill each missing BloodPressure value with the median of the observed values
# that share its Outcome. The medians are computed from `data` rather than
# copied from a displayed table, so they cannot drift out of sync with the dataset.
data['BloodPressure'] = data['BloodPressure'].fillna(
    data.groupby('Outcome')['BloodPressure'].transform('median')
)

## 2.5. BMI

* **BMI** : Body mass index (weight in kg/(height in m)^2)

In [20]:
# Distribution of BMI, diabetic vs. healthy.
# NOTE(review): the bin size is passed as 0 — confirm how create_distplot handles it.
plot_distribution('BMI', 0)

In [21]:
# Median of the non-null BMI values for each Outcome.
median_target('BMI')

Unnamed: 0,Outcome,BMI
0,0,30.1
1,1,34.3


In [22]:
# Fill each missing BMI value with the median of the observed values that share
# its Outcome. The medians are computed from `data` rather than copied from a
# displayed table, so they cannot drift out of sync with the dataset.
data['BMI'] = data['BMI'].fillna(
    data.groupby('Outcome')['BMI'].transform('median')
)

* **Age** : Age (years)
* **DiabetesPedigreeFunction** : Diabetes pedigree function
* **Pregnancies** : Number of times pregnant

In [23]:
# Plot the diabetic vs. healthy distributions of the three features that were
# not part of the 0 -> NaN replacement.
# NOTE(review): the bin size is passed as 0 — confirm how create_distplot handles it.
plot_distribution('Age', 0)
plot_distribution('Pregnancies', 0)
plot_distribution('DiabetesPedigreeFunction', 0)

In [24]:
# Re-run the missing-value plot after the imputation cells to check what is still missing.
missing_plot(data, 'Outcome')