In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the dataset from the CSV file in the working directory.
diabetes = pd.read_csv('pima-indians-diabetes.csv')

In [None]:
# Preview the first few rows.
diabetes.head()

In [None]:
# Summary statistics; include='all' asks for every column, not only the numeric ones.
diabetes.describe(include='all')

# let's normalize our data...

In [None]:
# List the column names; the columns to normalize are picked from these.
diabetes.columns

In [None]:
# Columns that get min-max scaled in the normalization cell.
cols_to_norm = [
    'Number_pregnant',
    'Glucose_concentration',
    'Blood_pressure',
    'Triceps',
    'Insulin',
    'BMI',
    'Pedigree',
]

## Great one-liner to normalize a set of columns

In [None]:
# Min-max scale every column in cols_to_norm to the [0, 1] range.
# A constant column has max == min, so its range is 0 and the division would
# turn the whole column into NaN (0/0). `or 1` swaps a zero range for 1, which
# leaves such a column at all zeros; every other column is scaled exactly as before.
diabetes[cols_to_norm] = diabetes[cols_to_norm].apply(lambda x: (x - x.min()) / ((x.max() - x.min()) or 1))

In [None]:
# Re-check the summary statistics now that the normalization cell has run.
diabetes.describe(include='all')

# Now determine feature columns

In [None]:
# Column names again, for reference while defining the feature columns.
diabetes.columns

In [None]:
# One numeric feature column per continuous input, each keyed by the DataFrame
# column it reads. Built in a single pass instead of eight near-identical calls.
(
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedrigree,
    age,
) = (
    tf.feature_column.numeric_column(column_name)
    for column_name in (
        'Number_pregnant',
        'Glucose_concentration',
        'Blood_pressure',
        'Triceps',
        'Insulin',
        'BMI',
        'Pedigree',
        'Age',
    )
)

## Now the categorical (non-continuous) features
We can define them either from a vocabulary list or with a hash bucket.

In [None]:
# Categorical column for 'Group', defined by listing every allowed value
# (a vocabulary file and an identity column are the other options).
# Spelling the vocabulary out works well when there are only a few defined items.
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Group',
    vocabulary_list=['A', 'B', 'C', 'D'],
)

In [None]:
# If there are too many categories to type out, use a hash bucket instead:
# rather than listing every value, you give the maximum number of categories you expect.
# e.g.:
# assigned_group = tf.feature_column.categorical_column_with_hash_bucket('Group', hash_bucket_size=10)

In [None]:
# now let's look at converting a continuous column to a categorical column
# In this case, Age
# This is known as "feature engineering"

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Let's look at how Age is distributed before choosing bucket boundaries.
# The title and axis labels let the figure stand on its own; the trailing ';'
# hides the repr of the return value so only the plot is shown.
diabetes['Age'].hist(bins=20).set(title='Age distribution', xlabel='Age', ylabel='Number of rows');

In [None]:
# Turn the continuous Age column into a categorical one by splitting it at
# every decade from 20 through 80.
age_buckets = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=[20, 30, 40, 50, 60, 70, 80],
)

In [None]:
# Feature columns for the linear model: the continuous inputs, the categorical
# group, and Age in its bucketized form.
feat_cols = [
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedrigree,
    assigned_group,
    age_buckets,
]

# now that we have our columns do train/test split

In [None]:
x_data = diabetes.drop('Class', axis=1)

In [None]:
x_data.head()

In [None]:
labels = diabetes['Class']
labels.head()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_data,labels, test_size=0.3, random_state=101)

In [None]:
X_train.describe()

# Begin S6L34

In [None]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train,
                                                batch_size=10, num_epochs=1000,
                                                shuffle=True)

In [None]:
model = tf.estimator.LinearClassifier(feature_columns=feat_cols, n_classes=2 ) # 2 b/c it's a binary classification

In [None]:
feat_cols

In [None]:
model.train(input_fn=input_func, steps=1000)

In [None]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False)

In [None]:
results = model.evaluate(eval_input_func)

In [None]:
results
# auc is Area Under Curve
# Accuracy = 74%, "not too bad" -lecture

In [None]:
# no y val because that's what we want to predict
pred_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, batch_size=10, num_epochs=1, shuffle=False)

In [None]:
predictions = model.predict(pred_input_func)

In [None]:
my_pred = list(predictions)

In [None]:
my_pred[:5]

In [None]:
# Now build a dense neural-network (DNN) classifier with three hidden layers of 10 units each.
# Note that feat_cols still holds the plain categorical 'Group' column; the next cell records what happens when this model is trained.
dnn_model = tf.estimator.DNNClassifier(hidden_units=[10,10,10], feature_columns=feat_cols, n_classes=2)

In [None]:
# Training this model the same way as the linear one raises an error:
## dnn_model.train(input_fn=input_func,steps=1000)
## Reason: a DNN only accepts dense columns, so a categorical column has to be wrapped in an "embedding column" (or indicator column) first.
## ValueError: Items of feature_columns must be a _DenseColumn. You can wrap a categorical column with an embedding_column or indicator_column. 
##   Given: _VocabularyListCategoricalColumn(key='Group', vocabulary_list=('A', 'B', 'C', 'D'), dtype=tf.string, default_value=-1, num_oov_buckets=0)

In [None]:
embedded_group_col = tf.feature_column.embedding_column(assigned_group, dimension=4)

In [None]:
feat_cols_for_dnn = [num_preg, plasma_gluc, dias_press, tricep, insulin, bmi, diabetes_pedrigree, embedded_group_col, age_buckets]

In [None]:
input_func = tf.estimator.inputs.pandas_input_fn(X_train, y_train, batch_size=10, num_epochs=1000, shuffle=True)

In [None]:
dnn_model = tf.estimator.DNNClassifier(hidden_units=[10,10,10], feature_columns=feat_cols_for_dnn, n_classes=2)

In [None]:
dnn_model.train(input_fn=input_func, steps=1000)

In [None]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False)

In [None]:
dnn_model.evaluate(eval_input_func)
# 78%  accuracy, auc=83.0 , a little better than the linear one

In [None]:
# Let's try it with more neurons -- the risk is overfitting.

In [None]:
embedded_group_col = tf.feature_column.embedding_column(assigned_group, dimension=4)

In [None]:
feat_cols_for_dnn = [num_preg, plasma_gluc, dias_press, tricep, insulin, bmi, diabetes_pedrigree, embedded_group_col, age_buckets]

In [None]:
input_func = tf.estimator.inputs.pandas_input_fn(X_train, y_train, batch_size=10, num_epochs=1000, shuffle=True)

In [None]:
dnn_model = tf.estimator.DNNClassifier(hidden_units=[10,20,20,20,10], feature_columns=feat_cols_for_dnn, n_classes=2)

In [None]:
dnn_model.train(input_fn=input_func, steps=1000)

In [None]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False)

In [None]:
dnn_model.evaluate(eval_input_func)
# [10,10,10]       = 78% accuracy, auc=83 , a little better than the linear one
# [10,20,20,20,10] = 77% accuracy, auc=82, so worse with more layers and nodes!