## Classifier using tf.estimator.DNNClassifier
https://www.tensorflow.org/tutorials/estimator/linear
https://towardsai.net/p/machine-learning/tf-estimator-a-tensorflow-high-level-api


In [0]:
import tensorflow as tf
import pandas as pd
import numpy as np
import tempfile

In [84]:
from sklearn.model_selection import train_test_split

In [85]:
import warnings
warnings.filterwarnings('ignore')

In [86]:
tf.get_logger().setLevel('ERROR')

## Get Training Data

In [87]:
dataset_url = 'http://storage.googleapis.com/download.tensorflow.org/data/petfinder-mini.zip'
csv_file = './datasets/petfinder-mini/petfinder-mini.csv'
dataset_path = tf.keras.utils.get_file('petfinder_mini.zip', dataset_url, extract=True, cache_dir='.')
df = pd.read_csv(csv_file)

In [88]:
df

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,Description,PhotoAmt,AdoptionSpeed
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,Nibble is a 3+ month old ball of cuteness. He ...,1,2
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,I just found it alone yesterday near my apartm...,2,0
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,Their pregnant mother was dumped by her irresp...,7,3
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,"Good guard dog, very alert, active, obedience ...",8,2
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,This handsome yet cute boy is up for adoption....,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11532,Dog,24,Poodle,Male,Brown,Golden,Medium,Medium,Not Sure,No,Healthy,0,been at my place for a while..am hoping to fin...,0,4
11533,Cat,1,Domestic Short Hair,Female,Cream,Gray,Medium,Short,No,No,Healthy,0,1 month old white + grey kitten for adoption n...,1,3
11534,Dog,6,Schnauzer,Female,Black,White,Small,Long,Yes,No,Healthy,0,ooooo,1,0
11535,Cat,9,Domestic Short Hair,Female,Yellow,White,Small,Short,Yes,Yes,Healthy,0,she is very shy..adventures and independent..s...,3,4


In [89]:
# In the original dataset "4" indicates the pet was not adopted.
# After modifying the label column, 0 will indicate the pet was not adopted, and 1 will indicate it was.
df['target'] = np.where(df['AdoptionSpeed']==4, 0, 1)

In [90]:
# Drop un-used columns.
df = df.drop(columns=['AdoptionSpeed', 'Description'])

In [91]:
df

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt,target
0,Cat,3,Tabby,Male,Black,White,Small,Short,No,No,Healthy,100,1,1
1,Cat,1,Domestic Medium Hair,Male,Black,Brown,Medium,Medium,Not Sure,Not Sure,Healthy,0,2,1
2,Dog,1,Mixed Breed,Male,Brown,White,Medium,Medium,Yes,No,Healthy,0,7,1
3,Dog,4,Mixed Breed,Female,Black,Brown,Medium,Short,Yes,No,Healthy,150,8,1
4,Dog,1,Mixed Breed,Male,Black,No Color,Medium,Short,No,No,Healthy,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11532,Dog,24,Poodle,Male,Brown,Golden,Medium,Medium,Not Sure,No,Healthy,0,0,0
11533,Cat,1,Domestic Short Hair,Female,Cream,Gray,Medium,Short,No,No,Healthy,0,1,1
11534,Dog,6,Schnauzer,Female,Black,White,Small,Long,Yes,No,Healthy,0,1,1
11535,Cat,9,Domestic Short Hair,Female,Yellow,White,Small,Short,Yes,Yes,Healthy,0,3,0


In [92]:
# Split the dataframe into train, validation, and test
df_train, df_test = train_test_split(df, test_size=0.2)
df_train, df_val = train_test_split(df_train, test_size=0.2)

print(len(df_train), 'train examples')
print(len(df_val), 'validation examples')
print(len(df_test), 'test examples')

7383 train examples
1846 validation examples
2308 test examples


In [93]:
# Get th features and predictor variables
y_train = df_train['target']
X_train = df_train.drop('target', axis=1)

y_test = df_test['target']
X_test = df_test.drop('target', axis=1)

y_val = df_val['target']
X_val = df_val.drop('target', axis=1)

## Step 1: Create an input function
An input function is a function that returns a tf.data.Dataset object which outputs the following two-element tuple:

features — A Python dictionary in which:
(a)Each key is the name of a feature.
(b)Each value is an array containing all of that feature’s values.

label — An array containing the values of the label for every example.
We’re using pandas for building input pipeline

In [94]:
def input_fn(df_features, df_labels, batch_size=256, training_mode=True):
    # Convert the inputs Dataframes to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices((dict(df_features), df_labels))
    # Shuffle and repeat if you are in training mode.
    if training_mode:
        dataset = dataset.shuffle(1000).repeat()

    return dataset.batch(batch_size)

## Step 2: Define the model’s feature columns
A feature column is an object describing how the model should use raw input data from the features dictionary. When you build an Estimator model, we pass it a list of feature columns that describe each of the features you want the model to use. The tf.feature_column module provides many options for representing data to the model.

For Iris, the 4 raw features are numeric values, so we’ll build a list of feature columns to tell the Estimator model to represent each of the four features as 32-bit floating-point values. Therefore, the code to create the feature column is:

In [95]:
# Feature columns describe how to use the input.
NUMERIC_COLUMNS = ['PhotoAmt','Fee']
CATEGORICAL_COLUMNS = ['Type','Gender','Color1','Color2','MaturitySize','FurLength','Vaccinated','Sterilized','Health']
EMBEDDING_COLUMNS = ['Breed1']
HASHED_COLUMNS = ['Breed1']
BUCKETIZED_COLUMNS = ['Age']


feature_columns = []
for feature_name in NUMERIC_COLUMNS:
    feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

for feature_name in CATEGORICAL_COLUMNS:
    vocabulary = df[feature_name].unique()
    categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(key=feature_name, vocabulary_list=vocabulary)
    # Map categorical column to numeric values - one-hot encoding/vector
    feature_columns.append(tf.feature_column.indicator_column(categorical_column))

for feature_name in EMBEDDING_COLUMNS:
    vocabulary = df[feature_name].unique()
    categorical_column = tf.feature_column.categorical_column_with_vocabulary_list(key=feature_name, vocabulary_list=vocabulary)
    # Map categorical column to numeric values - embedding
    feature_columns.append(tf.feature_column.embedding_column(categorical_column, dimension=8))

for feature_name in HASHED_COLUMNS:
    categorical_column = tf.feature_column.categorical_column_with_hash_bucket(key=feature_name, hash_bucket_size=10)
    # Map categorical column to numeric values - Hashing
    feature_columns.append(tf.feature_column.indicator_column(categorical_column))

for feature_name in BUCKETIZED_COLUMNS:
    numerical_column = tf.feature_column.numeric_column(feature_name)
    feature_columns.append(tf.feature_column.bucketized_column(numerical_column, boundaries=[1, 3, 5]))
# Buckets include the left boundary, and exclude the right boundary. Namely, boundaries=[1, 3, 5]
# generates buckets (-inf, 1), [1, 3), [3, 5), and [5, +inf).

In [96]:
# Demonstrate several types of feature columns
train_ds = input_fn(X_train, y_train)
example_batch = next(iter(train_ds))[0]

def demo(feature_column):
    feature_layer = tf.keras.layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())

In [97]:
print(feature_columns[13])

BucketizedColumn(source_column=NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None), boundaries=(1, 3, 5))


In [98]:
demo(feature_columns[13])
# Notice the one-hot values below describe which age range each row matches.

[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]]


## Step 3: Instantiate the Estimator
The Iris problem is a classic classification problem. Fortunately, TensorFlow provides several pre-made classifier Estimators, including:

a. tf.estimator.DNNClassifier for deep models that perform multi-class classification.
b. tf.estimator.DNNLinearCombinedClassifier for wide & deep models.
c. tf.estimator.LinearClassifier for classifiers based on linear models.

For the Iris problem, tf.estimator.DNNClassifier seems like the best choice. Here’s how we instantiated this Estimator:

In [99]:
# Build a DNN with 3 hidden layers with 30 nodes each.
classifier_dir = tempfile.mkdtemp()
classifier = tf.estimator.DNNClassifier(
    model_dir=classifier_dir,
    feature_columns=feature_columns,
    optimizer='Adagrad', # ('Adagrad', 'Adam', 'Ftrl', 'RMSProp', SGD')
    activation_fn=tf.nn.relu,
    loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE,
    # Three hidden layers of 30 nodes each.
    hidden_units=[30, 30, 30],
    # The model must choose between 3 classes.
    n_classes=3)

## Step 4: Train and  Evaluate

In [100]:
# Train the Model.
classifier.train(
    input_fn=lambda: input_fn(X_train, y_train, training_mode=True),
    steps=5000)

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x16c61ba00>

In [101]:
# Evaluates the accuracy of the trained model on the test data
eval_result = classifier.evaluate(input_fn=lambda: input_fn(X_test, y_test, training_mode=False))
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))
for key, value in eval_result.items():
    print(key, ":", value)


Test set accuracy: 0.731

accuracy : 0.73136914
average_loss : 0.5425948
loss : 0.54712623
global_step : 5000


## Step 5: Prediction

In [102]:
# Define Prediction input data function
def prediction_input_fn(features, batch_size=256):
    # Convert the inputs to a Dataset without labels.
    dataset = tf.data.Dataset.from_tensor_slices(dict(features))

    return dataset.batch(batch_size)

In [103]:
# Generate predictions from the model
predictions = classifier.predict(input_fn=lambda: prediction_input_fn(X_val))

In [104]:
# Get predictions and their probabilities
for pred_dict in predictions:
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]
    print('Prediction is "{}" ({:.1f}%)"'.format(class_id, 100 * probability))

Prediction is "1" (75.4%)"
Prediction is "1" (81.6%)"
Prediction is "1" (80.4%)"
Prediction is "1" (62.3%)"
Prediction is "1" (88.3%)"
Prediction is "1" (80.2%)"
Prediction is "1" (81.4%)"
Prediction is "1" (65.5%)"
Prediction is "1" (62.8%)"
Prediction is "1" (69.4%)"
Prediction is "1" (77.7%)"
Prediction is "1" (68.5%)"
Prediction is "1" (90.2%)"
Prediction is "1" (76.4%)"
Prediction is "1" (68.3%)"
Prediction is "1" (87.4%)"
Prediction is "1" (85.6%)"
Prediction is "1" (82.1%)"
Prediction is "1" (61.1%)"
Prediction is "1" (56.1%)"
Prediction is "1" (56.7%)"
Prediction is "1" (77.8%)"
Prediction is "1" (65.1%)"
Prediction is "1" (79.0%)"
Prediction is "1" (58.4%)"
Prediction is "1" (80.9%)"
Prediction is "1" (52.5%)"
Prediction is "1" (82.3%)"
Prediction is "1" (75.7%)"
Prediction is "1" (82.2%)"
Prediction is "0" (49.4%)"
Prediction is "1" (89.9%)"
Prediction is "1" (83.5%)"
Prediction is "1" (73.1%)"
Prediction is "1" (73.0%)"
Prediction is "1" (85.5%)"
Prediction is "1" (66.4%)"
P