In [160]:
# import packages we need for exploratory data analysis (EDA)
import pandas as pd  # to store tabular data
import numpy as np  # to do some math
import matplotlib.pyplot as plt  # a popular data visualization tool
import seaborn as sns  # another popular data visualization tool
%matplotlib inline  
plt.style.use('fivethirtyeight')  # a popular data visualization theme

In [163]:
# https://machinelearningmastery.com/handle-missing-data-python/
pima_column_names = ['times_pregnant', 'plasma_glucose_concentration', 'diastolic_blood_pressure', 'triceps_thickness', 'serum_insulin', 'bmi', 'pedigree_function', 'age', 'onset_diabetes']

In [164]:
pima = pd.read_csv('../data/pima.data', names=pima_column_names)

In [165]:
pima.head()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,serum_insulin,bmi,pedigree_function,age,onset_diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [166]:
pima.shape

(768, 9)

In [167]:
pima.head(10)

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,serum_insulin,bmi,pedigree_function,age,onset_diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [168]:
pima.isnull().sum()  # oh good, we have no missing values, let's do some EDA

times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64

In [169]:
pima.describe()

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,serum_insulin,bmi,pedigree_function,age,onset_diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [170]:
# woah woah, notice that the min value of bmi is 0. That is physically impossible. 
# Perhaps 0 has been encoded as a missing value instead of None

In [171]:
# We see that a 0 appears as a min for 
# times_pregnant
# plasma_glucose_concentration
# diastolic_blood_pressure
# triceps_thickness
# serum_insulin
# bmi
# onset_diabetes

In [172]:
# because 0 is a class for onset_diabetes and 0 is a viable number for times_pregnant, we may conclude that 0
# is encoding missing values for 

# plasma_glucose_concentration
# diastolic_blood_pressure
# triceps_thickness
# serum_insulin
# bmi

In [173]:
# Let's replace the 0's in these columns with None so our fillna, and dropna functions may work properly
for col in ['serum_insulin', 'bmi', 'plasma_glucose_concentration', 'diastolic_blood_pressure', 'triceps_thickness']:
    pima[col].replace([0], [None], inplace=True)

In [174]:
pima.isnull().sum()  # this makes more sense now!

times_pregnant                    0
plasma_glucose_concentration      5
diastolic_blood_pressure         35
triceps_thickness               227
serum_insulin                   374
bmi                              11
pedigree_function                 0
age                               0
onset_diabetes                    0
dtype: int64

In [175]:
pima.head(10)

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,serum_insulin,bmi,pedigree_function,age,onset_diabetes
0,6,148,72.0,35.0,,33.6,0.627,50,1
1,1,85,66.0,29.0,,26.6,0.351,31,0
2,8,183,64.0,,,23.3,0.672,32,1
3,1,89,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116,74.0,,,25.6,0.201,30,0
6,3,78,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115,,,,35.3,0.134,29,0
8,2,197,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125,96.0,,,,0.232,54,1


In [186]:
pima.describe()  # doesn't include columns with missing values..

Unnamed: 0,times_pregnant,pedigree_function,age,onset_diabetes
count,768.0,768.0,768.0,768.0
mean,3.845052,0.471876,33.240885,0.348958
std,3.369578,0.331329,11.760232,0.476951
min,0.0,0.078,21.0,0.0
25%,1.0,0.24375,24.0,0.0
50%,3.0,0.3725,29.0,0.0
75%,6.0,0.62625,41.0,1.0
max,17.0,2.42,81.0,1.0


In [176]:
from sklearn.preprocessing import Imputer

In [177]:
imputer = Imputer(strategy='mean')

In [178]:
pima_imputed = imputer.fit_transform(pima)

In [179]:
type(pima_imputed)  # comes out as an array

numpy.ndarray

In [182]:
pima_imputed = pd.DataFrame(pima_imputed, columns=pima_column_names)

In [185]:
pima_imputed.head(10)  # notice for example the triceps_thickness missing values were replaced with 29.15342

Unnamed: 0,times_pregnant,plasma_glucose_concentration,diastolic_blood_pressure,triceps_thickness,serum_insulin,bmi,pedigree_function,age,onset_diabetes
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0
5,5.0,116.0,74.0,29.15342,155.548223,25.6,0.201,30.0,0.0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1.0
7,10.0,115.0,72.405184,29.15342,155.548223,35.3,0.134,29.0,0.0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1.0
9,8.0,125.0,96.0,29.15342,155.548223,32.457464,0.232,54.0,1.0


In [187]:
pima_imputed.isnull().sum()  # no missing values

times_pregnant                  0
plasma_glucose_concentration    0
diastolic_blood_pressure        0
triceps_thickness               0
serum_insulin                   0
bmi                             0
pedigree_function               0
age                             0
onset_diabetes                  0
dtype: int64

In [199]:
pima_dropped = pima.dropna()

In [227]:
print "retained {}% of rows".format(round(100*(pima.shape[0] - pima_dropped.shape[0])/float(pima.shape[0])))
# lost over half of the rows!

retained 49.0% of rows


In [None]:
# some EDA of the dataset before it was dropped and after

In [237]:
pima['onset_diabetes'].value_counts(normalize=True)

0    0.651042
1    0.348958
Name: onset_diabetes, dtype: float64

In [238]:
pima_dropped['onset_diabetes'].value_counts(normalize=True)  # the split of trues and falses stay relatively the same

0    0.668367
1    0.331633
Name: onset_diabetes, dtype: float64

In [252]:
print (pima_dropped.describe() - pima.describe())/pima.describe()


       times_pregnant  pedigree_function       age  onset_diabetes
count       -0.489583          -0.489583 -0.489583       -0.489583
mean        -0.141489           0.108439 -0.071481       -0.049650
std         -0.046936           0.042735 -0.132604       -0.011636
min               NaN           0.089744  0.000000             NaN
25%          0.000000           0.106667 -0.041667             NaN
50%         -0.333333           0.206711 -0.068966             NaN
75%         -0.166667           0.097006 -0.121951        0.000000
max          0.000000           0.000000  0.000000        0.000000


In [254]:
# the pedigree_fucntion average rose 10% after dropping missing values, which is a big increase!

# we can see how dropping values severely affects the shape of the data and we should try to retain as much as possible

In [None]:
# now lets do some machine learning

In [232]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV


X_dropped = pima_dropped.drop('onset_diabetes', axis=1)
y_droppped = pima_dropped['onset_diabetes']

knn_params = {'n_neighbors':[1, 2, 3, 4, 5, 6, 7]}
grid = GridSearchCV(knn, knn_params)
grid.fit(X_dropped, y_dropped)

print grid.best_score_, grid.best_params_

0.744897959184 {'n_neighbors': 7}


In [236]:
pima_zero = pima.fillna(0)

X_zero = pima_zero.drop('onset_diabetes', axis=1)
y_zero = pima_zero['onset_diabetes']


grid.fit(X_zero, y_zero)

print grid.best_score_, grid.best_params_  # if the values stayed at 0, our accuracy goes down

0.73046875 {'classify__n_neighbors': 6}


In [212]:
# TODO talk about how it is improper mathematically to impute the entire dataset and then go into machine learning
# it is better to set up a sklearn pipeline to do your steps

In [213]:
from sklearn.pipeline import Pipeline

In [233]:
knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7]}

mean_impute = Pipeline([('imputer', Imputer(strategy='mean')), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_

0.731770833333 {'classify__n_neighbors': 6}


In [234]:
knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7]}

median_impute = Pipeline([('imputer', Imputer(strategy='median')), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_

0.735677083333 {'classify__n_neighbors': 6}


In [None]:
# our accuries may be lower, but they were made on more than twice the size of the dataset with missing values!
# they were still better than leaving them all at 0

In [None]:
# TODO talk about standardization (column-based z-score normalization) vs normalization (row-based norm scaling)

# https://stackoverflow.com/questions/39120942/difference-between-standardscaler-and-normalizer-in-sklearn-preprocessing

# talk about how some models are affected by standardization

In [303]:
from sklearn.preprocessing import Normalizer, StandardScaler

In [306]:
knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7]}

mean_impute_normalize = Pipeline([('imputer', Imputer(strategy='mean')), ('normalize', Normalizer()), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_normalize, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_

0.682291666667 {'classify__n_neighbors': 6}


In [307]:
knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7]}

mean_impute_standardize = Pipeline([('imputer', Imputer(strategy='mean')), ('standardize', StandardScaler()), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(mean_impute_standardize, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_

0.735677083333 {'classify__n_neighbors': 7}


In [310]:
knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7]}

median_impute_normalize = Pipeline([('imputer', Imputer(strategy='median')), ('normalize', Normalizer()), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute_normalize, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_

0.6796875 {'classify__n_neighbors': 4}


In [311]:
knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7]}

median_impute_standardize = Pipeline([('imputer', Imputer(strategy='median')), ('standardize', StandardScaler()), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(median_impute_standardize, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_  # woah woah woah, this is the best accuracy we've gotten so far working with missing data

0.743489583333 {'classify__n_neighbors': 7}


In [312]:
# run a more comprehensive grid_search with imputing and standardization, comes up with the same result
# that median imputing is the best

knn_params = {'classify__n_neighbors':[1, 2, 3, 4, 5, 6, 7], 'imputer__strategy':['mean', 'median']}

impute_standardize = Pipeline([('imputer', Imputer()), ('standardize', StandardScaler()), ('classify', knn)])
X = pima.drop('onset_diabetes', axis=1)
y = pima['onset_diabetes']

grid = GridSearchCV(impute_standardize, knn_params)
grid.fit(X, y)

print grid.best_score_, grid.best_params_  # woah woah woah, this is the best accuracy we've gotten so far working with missing data

0.743489583333 {'imputer__strategy': 'median', 'classify__n_neighbors': 7}


In [None]:
# next chapter we will deal with categorical imputing 