# 1. Setup libraries for Machine Learning 

The first step is to import all the libaries that will be needed to parse and explore the data

In [1]:
# IMPORT LIBRARIES (must be installed via pip)
# ----------------------------------------------------------------------------

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import numpy as np              # linear algebra
import pandas as pd             # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns           # plotting
import tensorflow as tf         # machine learning
import sys                      # for facets stuff
import base64                   # for facets stuff

# generic_feature_statistics_generator is in the facets directory
sys.path.append('./facets/facets_overview/python')

# ML libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# calculate the feature statistics from the datasets and stringify it for use in facets overview
from generic_feature_statistics_generator import GenericFeatureStatisticsGenerator
from IPython.core.display import display, HTML

# tell jupyter notebook to diplay plotted data inline
%matplotlib inline 

# SET LOCAL VARIABLES
# -----------------------------------------------------------------------------

# define local file for training data with features and labels
csv_file = 'datasets/iris.csv'

# define feature columns for input feature data 
prediction_feature_columns = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']

# path to save the model for future training or tensorflow serving
model_path = 'models/temp/iris'

# verify version of tensorflow, should be 1.4+ to use new Estimators features
print('TENSORFLOW VERSION: ', tf.__version__)


('TENSORFLOW VERSION: ', '1.4.1')


# 2. Import Dataset from local CSV file

In [2]:
# get full dataset from local CSV file
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


# 3. Clean up the Dataset

In [3]:
# drop unnecessary 'Id' column from the dataset, doesnt help predict the label
df.drop('Id', inplace=True, axis=1)

In [4]:
# ALWAYS randomize sample
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.4,3.7,1.5,0.2,Iris-setosa
1,4.9,2.4,3.3,1.0,Iris-versicolor
2,5.8,2.7,5.1,1.9,Iris-virginica
3,6.3,2.5,5.0,1.9,Iris-virginica
4,5.6,2.8,4.9,2.0,Iris-virginica


In [5]:
# generate statistics for template
gfsg = GenericFeatureStatisticsGenerator()

# load data
proto = gfsg.ProtoFromDataFrames([{'name': 'train', 'table': df}])

# create base64 string for html template
protostr = base64.b64encode(proto.SerializeToString()).decode("utf-8")

# Display the facets overview visualization for this data
overview_template = """<link rel="import" href="/nbextensions/facets-dist/facets-jupyter.html" >
        <facets-overview id="overview"></facets-overview>
        <script>
          document.querySelector("#overview").protoInput = "{protostr}";
        </script>"""

# load data into template
overview_html = overview_template.format(protostr=protostr)

# display template
display(HTML(overview_html))

In [6]:
# convert dataframe to json string to be used by the html template
jsonstr = df.to_json(orient='records')

# Display the Dive visualization for this data
deepdive_template = """<link rel="import" href="/nbextensions/facets-dist/facets-jupyter.html">
        <facets-dive sprite-image-width="32" sprite-image-height="32" id="deepdive" height="600" height="1000"></facets-dive>
        <script>
          document.querySelector("#deepdive").data = {jsonstr};
        </script>"""

# load data into template
deepdive_html = deepdive_template.format(jsonstr=jsonstr)

#display template
display(HTML(deepdive_html))

In [7]:
# label encode the string label: Species
# it should be a numeric value, instead of a string value
# to do: use one-hot encoder instead of label encoder
le = LabelEncoder()
df.Species = le.fit_transform(df.Species)

# show the cleaned dataset
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.4,3.7,1.5,0.2,0
1,4.9,2.4,3.3,1.0,1
2,5.8,2.7,5.1,1.9,2
3,6.3,2.5,5.0,1.9,2
4,5.6,2.8,4.9,2.0,2


In [8]:
# set just the label column to y for the train_test_split()
# *** need to shuffle the dataset first?
data_label = df.Species
data_label.head()

0    0
1    1
2    2
3    2
4    2
Name: Species, dtype: int64

In [9]:
# remove the species column so that you can scale the rest of the values
# the values are already stored in y variable
# *** convert this to X values here, for clarity?
data_features = df.drop('Species', inplace=False, axis=1)
data_features.head()


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.4,3.7,1.5,0.2
1,4.9,2.4,3.3,1.0
2,5.8,2.7,5.1,1.9
3,6.3,2.5,5.0,1.9
4,5.6,2.8,4.9,2.0


In [10]:
# scale data for better results
# so that columns with larger values dont get weighed more
# example: age change of 40 years (significant) vs income change of $40 (not significant)

sc = StandardScaler(copy=True, with_mean=True, with_std=True)
scaled_features = pd.DataFrame(sc.fit_transform(data_features), columns=[data_features.columns], index=data_features.index)

# show scaled X values 
scaled_features.head() 
# THIS IS A DATAFRAME

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.537178,1.494863,-1.284407,-1.312977
1,-1.143017,-1.513375,-0.260824,-0.261193
2,-0.052506,-0.819166,0.762759,0.922064
3,0.553333,-1.281972,0.705893,0.922064
4,-0.294842,-0.587764,0.649027,1.053537


In [11]:
# show labels (y values in equasion)
data_label.head()
# THIS IS A SERIES


0    0
1    1
2    2
3    2
4    2
Name: Species, dtype: int64

In [12]:
# do a 70/30 split of the data into a train_x and train_y (70% for training) 
# and a test_x and test_y (30% for validation) 
# X is a DATAFRAME of scaled features
# y is a SERIES of labels
train_x, test_x, train_y, test_y = train_test_split(scaled_features, data_label, test_size=0.30, random_state=101)
 

In [13]:
test_x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45 entries, 33 to 66
Data columns (total 4 columns):
SepalLengthCm    45 non-null float64
SepalWidthCm     45 non-null float64
PetalLengthCm    45 non-null float64
PetalWidthCm     45 non-null float64
dtypes: float64(4)
memory usage: 1.8 KB


# 4. Create Feature Columns List for Classifier

In [14]:
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
my_feature_columns = []

# numeric coulmns
my_feature_columns.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='PetalWidthCm'));   

# other column types
# hashed strings or categorical features
# example:
# state = tf.feature_column.categorical_column_with_hash_bucket('state',100) 
# pick the right size for your data

    

# 5. Create Classifier from Feature Column list

In [15]:
# Build 2 hidden layer DNN with 10, 10 units respectively
# uses feature column types defined in the previous step
classifier = tf.estimator.DNNClassifier(
    # feature columns defined in the prev step
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10, 10],
    # The model must choose between 3 classes.
    n_classes=data_label.nunique(),
    # save the model to a folder when run, reuse the same way
    model_dir=model_path
)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1218546d0>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'models/temp/iris', '_save_summary_steps': 100}


In [16]:
# setup the train input function using the train_x and train_y datasource
train_input_fn = tf.estimator.inputs.pandas_input_fn(train_x, y=train_y, num_epochs=5000, shuffle=True)


# 6. Train Classifier

In [17]:
# this is where training actually occurs, this will be a long running process on large datasets
classifier.train(input_fn=train_input_fn)


INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from models/temp/iris/model.ckpt-8204
INFO:tensorflow:Saving checkpoints for 8205 into models/temp/iris/model.ckpt.
INFO:tensorflow:loss = 77.341034, step = 8205
INFO:tensorflow:global_step/sec: 389.734
INFO:tensorflow:loss = 0.47247326, step = 8305 (0.261 sec)
INFO:tensorflow:global_step/sec: 417.599
INFO:tensorflow:loss = 0.33949822, step = 8405 (0.238 sec)
INFO:tensorflow:global_step/sec: 387.514
INFO:tensorflow:loss = 0.31105655, step = 8505 (0.258 sec)
INFO:tensorflow:global_step/sec: 406.666
INFO:tensorflow:loss = 0.22861943, step = 8605 (0.246 sec)
INFO:tensorflow:global_step/sec: 433.129
INFO:tensorflow:loss = 0.1562928, step = 8705 (0.231 sec)
INFO:tensorflow:global_step/sec: 423.243
INFO:tensorflow:loss = 0.13484488, step = 8805 (0.239 sec)
INFO:tensorflow:global_step/sec: 408.345
INFO:tensorflow:loss = 0.124146976, step = 8905 (0.245 sec)
INFO:tensorflow:global_step/sec: 420.523
INFO:tensorflow:

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x1218bb050>

# 7. Test classifier with the 30% split data

In [18]:
# validate the model using the x_test values to get a prediction for each one
# then you can compare the predicted values against the known values in test_y
# which will give you an accuracy score in percentage
test_input_fn = tf.estimator.inputs.pandas_input_fn(test_x, num_epochs=1, shuffle=False)


In [19]:
test_result = list(classifier.predict(input_fn=test_input_fn))

print(test_result)

INFO:tensorflow:Restoring parameters from models/temp/iris/model.ckpt-12306
[{'probabilities': array([1.1489269e-08, 1.8662671e-05, 9.9998128e-01], dtype=float32), 'logits': array([-9.509177 , -2.1163101,  8.772656 ], dtype=float32), 'classes': array(['2'], dtype=object), 'class_ids': array([2])}, {'probabilities': array([9.9999547e-01, 4.5274141e-06, 2.2725922e-17], dtype=float32), 'logits': array([ 16.233528 ,   3.9281735, -22.089495 ], dtype=float32), 'classes': array(['0'], dtype=object), 'class_ids': array([0])}, {'probabilities': array([1.0641408e-05, 9.0658122e-01, 9.3408152e-02], dtype=float32), 'logits': array([-8.405992  ,  2.9466913 ,  0.67398924], dtype=float32), 'classes': array(['1'], dtype=object), 'class_ids': array([1])}, {'probabilities': array([8.2058724e-09, 1.5196923e-05, 9.9998474e-01], dtype=float32), 'logits': array([-9.683038 , -2.1590407,  8.935362 ], dtype=float32), 'classes': array(['2'], dtype=object), 'class_ids': array([2])}, {'probabilities': array([8.22

In [20]:
# build an array with just the predicted species (label encoded values) 
# filtering out all other prediction response data
predicted_y = []
for each in test_result:
    predicted_y.append(each['class_ids'][0])

In [21]:
# compare the predicted scores on the test data against the actual test_y values
# with both a confusion matrix and an accuracy score
print('OVERALL ACCURACY:')
print(accuracy_score(test_y, predicted_y))

print('')

print('CONFUSION MATRIX:')
print(confusion_matrix(test_y, predicted_y))


OVERALL ACCURACY:
0.9555555555555556

CONFUSION MATRIX:
[[14  0  0]
 [ 0 18  1]
 [ 0  1 11]]


# 8. Predict with new data records 

In [22]:
# test using single records (real world values)
# one Iris-setosa, one Iris-versicolor
new_records = pd.DataFrame([
    [5.1, 3.5, 1.4, 0.2],  # first value to predict
    [5.7, 2.8, 4.1, 1.3]], # second value to predict
    columns=prediction_feature_columns)
 
# these values must be scaled, to match what is used in the model
scaled_new_records = pd.DataFrame(sc.transform(new_records), columns=[new_records.columns], index=new_records.index)

#SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
#-1.870024, -0.124958, -1.511870, -1.444450 : 0
# 1.522676, -0.124958, 1.217684, 1.185010 : 1


In [23]:
# predict using the model to get a prediction for each item passed
prediction_input_fn = tf.estimator.inputs.pandas_input_fn(scaled_new_records, num_epochs=1, shuffle=False)

In [24]:
prediction_result = list(classifier.predict(input_fn=prediction_input_fn))

prediction_result

INFO:tensorflow:Restoring parameters from models/temp/iris/model.ckpt-12306


[{'class_ids': array([0]),
  'classes': array(['0'], dtype=object),
  'logits': array([ 17.42134 ,   3.224774, -21.743052], dtype=float32),
  'probabilities': array([9.9999928e-01, 6.8313847e-07, 9.7976293e-18], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logits': array([ -5.7947717,  19.335667 , -27.04612  ], dtype=float32),
  'probabilities': array([1.2189602e-11, 1.0000000e+00, 7.1886301e-21], dtype=float32)}]

# 9. Decode new predictions

In [25]:
# for each of the results
i = 0
for each in prediction_result:
    # print the values used in the prediction
    print(new_records.iloc[[i]])
    
    # print the transformed label-encoded value
    print('*** PREDICTION: ' + str(each['class_ids'][0]) + ' (Label Decoded: ' + str(le.inverse_transform(each['class_ids'][0]))+ ')')
    print('')
    
    # go to next row
    i = i + 1
    
#-1.870024, -0.124958, -1.511870, -1.444450 : 0
# 1.522676, -0.124958, 1.217684, 1.185010 : 1

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            5.1           3.5            1.4           0.2
*** PREDICTION: 0 (Label Decoded: Iris-setosa)

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
1            5.7           2.8            4.1           1.3
*** PREDICTION: 1 (Label Decoded: Iris-versicolor)



  if diff:
  if diff:


# 10. Save Model for Tensorflow Serving

In [26]:
export_dir="serving/models/iris/"

feature_spec = tf.feature_column.make_parse_example_spec(my_feature_columns)

input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

classifier.export_savedmodel(export_dir, input_receiver_fn, as_text=False) 

INFO:tensorflow:Restoring parameters from models/temp/iris/model.ckpt-12306
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: serving/models/iris/temp-1523814296/saved_model.pb


'serving/models/iris/1523814296'

# 11. Make predictions from the saved DNNClassifier model

https://www.kaggle.com/jeffcarp/example-save-and-load-a-tensorflow-model

In [27]:
# this should load the model from the last saved in the model_dir
# so everything it needs should be included below

# input column names
columns_names = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']

# where the model was last saved 
model_path = 'models/temp/iris' 

# define columns in model using the column_names list
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
hardcoded_feature_columns = []

# numeric coulmns
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='PetalWidthCm'));   

#hardcoded_feature_columns = []
#for key in columns_names:
#    hardcoded_feature_columns.append(tf.feature_column.numeric_column(key=key))
#    print(key)

# create a classifier using the defined columns and the pre-defined model
saved_estimator = tf.estimator.DNNClassifier(
    # feature column array of tf.feature_column types, defined in the prev step
    feature_columns=hardcoded_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10, 10],
    # The model must choose between 3 classes.
    n_classes=3,
    # use saved model from folder
    model_dir=model_path
)

# test using single records (2 real world values)
# one Iris-setosa, one Iris-versicolor
records_to_predict = pd.DataFrame([
    [5.1, 3.5, 1.4, 0.2], # first records features
    [5.7, 2.8, 4.1, 1.3]],# second record features
    columns=columns_names)# defined column names
 
# these values must be scaled, to match what is used in the model
scaled_records_to_predict = pd.DataFrame(sc.transform(records_to_predict), columns=[records_to_predict.columns], index=records_to_predict.index)

# predict using the model to get a prediction for each item passed
saved_prediction_input_fn = tf.estimator.inputs.pandas_input_fn(scaled_records_to_predict, num_epochs=1, shuffle=False)

# make predictions and store as a list
saved_prediction_result = list(saved_estimator.predict(input_fn=saved_prediction_input_fn))

# output prediction values
saved_prediction_result

#SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
#-1.870024, -0.124958, -1.511870, -1.444450 : 0
# 1.522676, -0.124958, 1.217684, 1.185010 : 1
#estimator_from_file = load_estimator_model('models/iris/1520662093')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_task_type': 'worker', '_is_chief': True, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x1239a6750>, '_save_checkpoints_steps': None, '_keep_checkpoint_every_n_hours': 10000, '_service': None, '_num_ps_replicas': 0, '_tf_random_seed': None, '_master': '', '_num_worker_replicas': 1, '_task_id': 0, '_log_step_count_steps': 100, '_model_dir': 'models/temp/iris', '_save_summary_steps': 100}
INFO:tensorflow:Restoring parameters from models/temp/iris/model.ckpt-12306


[{'class_ids': array([0]),
  'classes': array(['0'], dtype=object),
  'logits': array([ 17.42134 ,   3.224774, -21.743052], dtype=float32),
  'probabilities': array([9.9999928e-01, 6.8313847e-07, 9.7976293e-18], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array(['1'], dtype=object),
  'logits': array([ -5.7947717,  19.335667 , -27.04612  ], dtype=float32),
  'probabilities': array([1.2189602e-11, 1.0000000e+00, 7.1886301e-21], dtype=float32)}]

# END

Just testing features and note below this line, not part of main execution

In [None]:
df.head()