In [None]:
# TensorFlow Linear Classifier Example

## Classification on the Pima Indians Diabetes Dataset


### Import the CSV file using `pandas` and view its columns.


In [31]:
import pandas as pd

# Load the Pima Indians Diabetes dataset from its CSV file into a DataFrame.
diabetes = pd.read_csv(filepath_or_buffer='../input/diabetes.csv')

In [32]:
# Preview the first rows of the freshly loaded dataset to inspect its columns.
diabetes.head()

### Here, the last column, `Outcome`, is the final output: it indicates whether or not the person is suffering from diabetes.

#### In this dataset, we can expect the `Age` column to help us classify our data, because it can be used as a continuous feature column. Let's plot a histogram to check this assumption.

In [33]:
import matplotlib.pyplot as plt
%matplotlib inline

In [34]:
# Histogram of the 'Age' column, split into 20 bins.
diabetes.loc[:, 'Age'].hist(bins=20)

## Now let's normalize this dataset.

* We won't normalize `Age` or `Outcome`: `Age` will be converted into a bucketized column later, and `Outcome` is the final result (the label).

In [35]:
# Names of the columns that the normalisation step rescales.
# 'Age' and 'Outcome' are intentionally left out of this list.
cols_to_norm = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
]

In [36]:
def _min_max_scale(column):
    """Rescale a column linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# Overwrite the selected columns of `diabetes` with their min-max scaled values.
diabetes[cols_to_norm] = diabetes[cols_to_norm].apply(_min_max_scale)

In [37]:
# Preview the first rows again to check the columns after normalisation.
diabetes.head()

## Convert this data into numeric feature columns using TensorFlow

In [38]:
import tensorflow as tf

In [39]:
# Build one numeric feature column per continuous input; each column is keyed
# by the name of the DataFrame column it reads from.
(
    pregnancies,
    glucose,
    blood_pressure,
    skin_thickness,
    insulin,
    bmi,
    diabetes_pedigree_fn,
    age,
) = (
    tf.feature_column.numeric_column(column_name)
    for column_name in (
        'Pregnancies',
        'Glucose',
        'BloodPressure',
        'SkinThickness',
        'Insulin',
        'BMI',
        'DiabetesPedigreeFunction',
        'Age',
    )
)

## Same for non-continuous features
* First we'll do this using a `vocabulary list`, and then using `hash buckets`.

In [40]:
# Categorical column over the 'Outcome' values, enumerated explicitly.
# The vocabulary uses integers rather than the strings '0'/'1': the same column
# is passed as the label to an `n_classes=2` classifier without a label
# vocabulary, so it is assumed to hold integer 0/1 values (TODO confirm against
# the CSV). A string vocabulary would make this column expect string input.
# NOTE(review): this column is never added to `feat_cols`, and 'Outcome' is
# dropped from the features -- confirm whether it is still needed.
assigned_group = tf.feature_column.categorical_column_with_vocabulary_list('Outcome', [0, 1])

In [41]:
# Alternative encoding, kept disabled: build the 'Outcome' categorical column
# from 2 hash buckets instead of an explicit vocabulary list.
# assigned_group = tf.feature_column.categorical_column_with_hash_bucket('Outcome', hash_bucket_size=2)

## Creating the age bucket

In [42]:
# Bucketize the numeric 'Age' column at every decade boundary from 20 to 80.
age_bucket = tf.feature_column.bucketized_column(
    source_column=age,
    boundaries=list(range(20, 90, 10)),
)

In [43]:
# Feature columns handed to the model: the seven numeric columns plus the
# bucketized age (the raw `age` column itself is not included).
feat_cols = [
    pregnancies,
    glucose,
    blood_pressure,
    skin_thickness,
    insulin,
    bmi,
    diabetes_pedigree_fn,
    age_bucket,
]

## Now we'll perform a `train-test-split` using `scikit-learn`
  
  
* We want the `Outcome` column to be in the Y data and everything else in the X data.

* We'll also use the `Outcome` column for the labels.

In [44]:
# Feature matrix: every column of the dataset except the 'Outcome' column.
x_data = diabetes.drop(labels=['Outcome'], axis='columns')

# Preview the first rows of the features.
x_data.head()

In [45]:
# Label vector: the 'Outcome' column on its own.
labels = diabetes.loc[:, 'Outcome']

# Preview the first labels.
labels.head()

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
# Hold out 30% of the rows as the test split; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    labels,
    test_size=0.3,
    random_state=101,
)

## Creating our Model

* Since we used `pandas` to import the CSV file, we'll use `pandas_input_fn` to build the input function.

In [48]:
# Training input function: shuffled batches of 10 rows drawn from the training
# split, for up to 1000 epochs.
input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_train,
    y=y_train,
    batch_size=10,
    num_epochs=1000,
    shuffle=True,
)

In [49]:
# Two-class linear classifier over the feature columns assembled in `feat_cols`.
model = tf.estimator.LinearClassifier(
    feature_columns=feat_cols,
    n_classes=2,
)

## Training the model

In [50]:
# Train the classifier for 1000 steps on the batches produced by `input_func`.
model.train(steps=1000, input_fn=input_func)

## Evaluating the model

In [51]:
# Evaluation input function: a single, unshuffled pass over the test split
# (features and labels) in batches of 10.
eval_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    y=y_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [52]:
# Evaluate the trained model on the test split and keep the returned output.
results = model.evaluate(input_fn=eval_input_func)

In [53]:
# Display the value returned by `model.evaluate`.
results

## Making Predictions

* Since we don't have separate data to predict on, we'll use the test split to predict the results.

In [54]:
# Prediction input function: a single, unshuffled pass over the test features
# only (no labels), in batches of 10, so the output order follows X_test.
pred_input_func = tf.estimator.inputs.pandas_input_fn(
    x=X_test,
    batch_size=10,
    num_epochs=1,
    shuffle=False,
)

In [55]:
# Run prediction over the test features and collect the results into a list so
# they can be indexed and sliced.
predictions = list(model.predict(input_fn=pred_input_func))

## Now let's compare these predictions with the original data: `y_test`

* First, print the first 5 elements of both `predictions` and `y_test`, then compare the outputs.

In [56]:
# Show the first five entries of the predictions list.
predictions[:5]

In [57]:
# Show the first test labels together with their original DataFrame index.
y_test.head()

* Here, the index of the first element in `y_test` is 766, and its Outcome is `1`.
* Also, the third element has index 42, and its Outcome is `0`.
* We'll compare these elements with our `predictions`.

In [58]:
# Compare the first prediction with the first test label.
# `y_test.iloc[0]` selects by position, so it always pairs with
# `predictions[0]`; a hard-coded index label (766) is only correct for the
# current random_state/test_size of the split.
print('Comparing First Element:', 'Predictions:', predictions[0]['class_ids'][0], '&& y_test: ', y_test.iloc[0])
# Largest value in this prediction's 'probabilities' array.
print('Probability', predictions[0]['probabilities'].max())

In [59]:
# Compare the third prediction with the third test label.
# `y_test.iloc[2]` selects by position, so it always pairs with
# `predictions[2]`; a hard-coded index label (42) is only correct for the
# current random_state/test_size of the split.
print('Comparing Third Element:', 'Predictions:', predictions[2]['class_ids'][0], '&& y_test: ', y_test.iloc[2])
# Largest value in this prediction's 'probabilities' array.
print('Probability', predictions[2]['probabilities'].max())