In [None]:
import pandas as pd

import tensorflow as tf

In [None]:
# Load the dataset; the relative path is resolved against the working directory.
csv_path = 'pima-indians-diabetes.csv'
diabetes = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
diabetes.head()

In [None]:
# Summary statistics for every column; include='all' also reports non-numeric columns.
diabetes.describe(include='all')

# let's normalize our data...

In [None]:
# List the column names so we can pick the ones to normalize.
diabetes.columns

In [None]:
# Columns to min-max scale. 'Age' is not in this list; it is bucketized further down.
cols_to_norm = [
    'Number_pregnant',
    'Glucose_concentration',
    'Blood_pressure',
    'Triceps',
    'Insulin',
    'BMI',
    'Pedigree',
]

## Great one-liner to normalize a set of columns

In [None]:
# Min-max scale each listed column: (value - min) / (max - min), written back in place.
# NOTE: a column whose max equals its min would divide by zero and become NaN.
diabetes[cols_to_norm] = diabetes[cols_to_norm].apply(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [None]:
# Re-check the summary after scaling: the columns in cols_to_norm should now span 0 to 1.
diabetes.describe(include='all')

# Now determine feature columns

In [None]:
# Column names again, for reference while defining the feature columns below.
diabetes.columns

In [None]:
# One numeric feature column per continuous input. Each key is the name of the
# matching column in the `diabetes` DataFrame.
numeric_column = tf.feature_column.numeric_column

num_preg = numeric_column('Number_pregnant')
plasma_gluc = numeric_column('Glucose_concentration')
dias_press = numeric_column('Blood_pressure')
tricep = numeric_column('Triceps')
insulin = numeric_column('Insulin')
bmi = numeric_column('BMI')
diabetes_pedrigree = numeric_column('Pedigree')
age = numeric_column('Age')
# There's got to be an easier way!!

## Now the categorical (non-continuous) features
We can define these from either a vocabulary list or a hash bucket.

In [None]:
# Categorical column built from an explicit vocabulary list, which works well when
# there are only a few known values. (Vocabulary-file and identity variants also exist.)
group_vocabulary = ['A', 'B', 'C', 'D']
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list(
    'Group', group_vocabulary
)

In [None]:
# If there are too many categories to type out, you can use a hash bucket instead.
# Rather than passing each value, you give the maximum number of categories you expect.
# e.g.:
# assigned_group = tf.feature_column.categorical_column_with_hash_bucket('Group', hash_bucket_size=10)

In [None]:
# Now let's look at converting a continuous column into a categorical column.
# In this case, Age.
# This is known as "feature engineering".

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# lets look at what the data looks like
diabetes['Age'].hist(bins=20)

In [None]:
# Bucket edges every 10 years: [20, 30, 40, 50, 60, 70, 80].
age_boundaries = list(range(20, 90, 10))
age_bucket = tf.feature_column.bucketized_column(age, boundaries=age_boundaries)

In [None]:
# Collect the feature columns defined above. Every entry must be a feature-column
# object: the pedigree entry is `diabetes_pedrigree`, not the `diabetes` DataFrame.
# `age_bucket` stands in for the raw numeric `age` column.
feat_cols = [
    num_preg,
    plasma_gluc,
    dias_press,
    tricep,
    insulin,
    bmi,
    diabetes_pedrigree,
    assigned_group,
    age_bucket,
]

# Now that we have our feature columns, do the train/test split

In [None]:
# Feature frame: every column except the 'Class' target.
x_data = diabetes.drop(columns=['Class'])

In [None]:
# Preview the feature frame to confirm 'Class' is no longer among the columns.
x_data.head()

In [None]:
# Target vector: the 'Class' column, kept separate from the feature frame.
labels = diabetes.loc[:, 'Class']
labels.head(5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    labels,
    test_size=0.3,
    random_state=101,
)

In [None]:
# Preview the training features with .head(), like the other preview cells,
# instead of dumping the whole frame.
X_train.head()

# Begin S6L34