[View in Colaboratory](https://colab.research.google.com/github/contractorwolf/tensorflow-irisdataset/blob/master/Colab_Iris_Estimator_Tensorflow_Basics.ipynb)

# 1. Setup libraries for Machine Learning 

The first step is to import all the libaries that will be needed to parse and explore the data

In [1]:
# IMPORT LIBRARIES (must be installed via pip)
# ----------------------------------------------------------------------------

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
import numpy as np              # linear algebra
import pandas as pd             # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
import seaborn as sns           # plotting
import tensorflow as tf         # machine learning


# ML libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

# tell jupyter notebook to diplay plotted data inline
%matplotlib inline 

# SET LOCAL VARIABLES
# -----------------------------------------------------------------------------

# define local file for training data with features and labels
csv_file = 'iris.csv'

# define feature columns for input feature data 
prediction_feature_columns = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']

# path to save the model for future training or tensorflow serving
#model_path = 'models/temp/iris'

# verify version of tensorflow, should be 1.4+ to use new Estimators features
print('TENSORFLOW VERSION: ' + tf.__version__)


TENSORFLOW VERSION: 1.9.0-rc1


In [3]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving iris.csv to iris.csv
User uploaded file "iris.csv" with length 5107 bytes


# 2. Import Dataset from local CSV file

You will first need to load your raw data from a local CSV file, and you should attempt to output the columns after each step for verification.  

In [4]:
# get full dataset from local CSV file
df = pd.read_csv(csv_file)
df.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [0]:
#df['SepalLengthCm'].hist()

In [0]:
#df.plot(kind="scatter", x="SepalLengthCm",y="SepalWidthCm", c=['blue','green','red'], s=100, alpha=.8)

In [0]:
#df.plot(kind="scatter", x="PetalLengthCm",y="PetalWidthCm", c=['blue','green','red'], s=100, alpha=.8)

# 3. Clean up the Dataset

1. eliminate noise in data (remove columns not tied to the label)
2. make values numeric (remove strings with numeric or hash values)
3. randomize order (removes order bias)
4. scale data (scaled according to the data, for training and prediction)

In [5]:
# drop unnecessary 'Id' column from the dataset, doesnt help predict the label
df.drop('Id', inplace=True, axis=1)

In [6]:
# ALWAYS randomize sample
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,4.5,2.3,1.3,0.3,Iris-setosa
1,6.5,3.0,5.2,2.0,Iris-virginica
2,7.7,2.8,6.7,2.0,Iris-virginica
3,5.8,2.7,5.1,1.9,Iris-virginica
4,7.3,2.9,6.3,1.8,Iris-virginica


In [7]:
# label encode the string label: Species
# it should be a numeric value, instead of a string value
# to do: use one-hot encoder instead of label encoder
le = LabelEncoder()
df.Species = le.fit_transform(df.Species)

# show the cleaned dataset
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,4.5,2.3,1.3,0.3,0
1,6.5,3.0,5.2,2.0,2
2,7.7,2.8,6.7,2.0,2
3,5.8,2.7,5.1,1.9,2
4,7.3,2.9,6.3,1.8,2


In [8]:
# set just the label column to y for the train_test_split()
data_label = df.Species
data_label.head()

0    0
1    2
2    2
3    2
4    2
Name: Species, dtype: int64

In [9]:
# remove the species column so that you can scale the rest of the values
# the values are already stored in y variable
# *** convert this to X values here, for clarity?
data_features = df.drop('Species', inplace=False, axis=1)
data_features.head()


Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,4.5,2.3,1.3,0.3
1,6.5,3.0,5.2,2.0
2,7.7,2.8,6.7,2.0
3,5.8,2.7,5.1,1.9
4,7.3,2.9,6.3,1.8


In [10]:
# scale data for better results
# so that columns with larger values dont get weighed more
# example: age change of 40 years (significant) vs income change of $40 (not significant)

sc = StandardScaler(copy=True, with_mean=True, with_std=True)
scaled_features=pd.DataFrame(sc.fit_transform(data_features), columns=list(data_features.columns))
scaled_features.head() 
# THIS IS A DATAFRAME

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-1.627688,-1.744778,-1.398138,-1.181504
1,0.795669,-0.124958,0.819624,1.053537
2,2.249683,-0.587764,1.67261,1.053537
3,-0.052506,-0.819166,0.762759,0.922064
4,1.765012,-0.356361,1.445147,0.790591


In [11]:
# do a 70/30 split of the data into a train_x and train_y (70% for training) 
# and a test_x and test_y (30% for validation) 
# X is a DATAFRAME of scaled features
# y is a SERIES of labels
train_x, test_x, train_y, test_y = train_test_split(scaled_features, data_label, test_size=0.30, random_state=42)
 

# 4. Create Feature Columns List for Classifier

In [12]:
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
my_feature_columns = []

# numeric coulmns
my_feature_columns.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='PetalWidthCm')); 

# other column types
# hashed strings or categorical features

# example:
# state = tf.feature_column.categorical_column_with_hash_bucket('state',100) 
# pick the right size for your data

list(my_feature_columns)

[_NumericColumn(key='SepalLengthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SepalWidthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalLengthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalWidthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

# 5. Create Classifier from Feature Column list

In [13]:
# Build 3 hidden layer DNN with 10, 10, 10 units respectively
# uses feature column types defined in the previous step
classifier = tf.estimator.DNNClassifier(
    # feature columns defined in the prev step
    feature_columns=my_feature_columns,
    # three hidden layers of 10 nodes each.
    hidden_units=[10, 10, 10],
    # model must choose between the 3 classes in dataset
    n_classes=data_label.nunique()#,
    # save the model to a folder when run, reuse the same way
    #model_dir=model_path
)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp7mc5935r', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9c9ad0d0b8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [14]:
# setup the train input function using the train_x and train_y datasource
train_input_fn = tf.estimator.inputs.pandas_input_fn(train_x, y=pd.DataFrame(train_y), num_epochs=500, shuffle=True)


# 6. Train Classifier

In [15]:
# this is where training actually occurs, this will be a long running process on large datasets
classifier.train(input_fn=train_input_fn)


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp7mc5935r/model.ckpt.
INFO:tensorflow:loss = 130.38316, step = 1
INFO:tensorflow:global_step/sec: 311.778
INFO:tensorflow:loss = 3.4718587, step = 101 (0.326 sec)
INFO:tensorflow:global_step/sec: 384.122
INFO:tensorflow:loss = 3.4746926, step = 201 (0.260 sec)
INFO:tensorflow:global_step/sec: 384.73
INFO:tensorflow:loss = 2.7547812, step = 301 (0.258 sec)
INFO:tensorflow:global_step/sec: 376.359
INFO:tensorflow:loss = 1.9434569, step = 401 (0.265 sec)
INFO:tensorflow:Saving checkpoints for 411 into /tmp/tmp7mc5935r/model.ckpt.
INFO:tensorflow:Loss for final step: 0.15746832.


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f9c9ad6aac8>

# 7. Test classifier with the 30% split data

In [16]:
# validate the model using the x_test values to get a prediction for each one
# then you can compare the predicted values against the known values in test_y
# which will give you an accuracy score in percentage

# load 30% test data from split
test_input_fn = tf.estimator.inputs.pandas_input_fn(test_x, num_epochs=1, shuffle=False)

# make predictions
test_result = list(classifier.predict(input_fn=test_input_fn))

# display raw results
# print(test_result)

# build an array with just the predicted species (label encoded values) 
# filtering out all other prediction response data
predicted_y = []
for each in test_result:
    predicted_y.append(each['class_ids'][0])

print('')
    
# compare the predicted scores on the test data against the actual test_y values
# with both a confusion matrix and an accuracy score
print('OVERALL ACCURACY:')
print(accuracy_score(test_y, predicted_y))

print('')

print('CONFUSION MATRIX:')
print(confusion_matrix(test_y, predicted_y))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp7mc5935r/model.ckpt-411
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

OVERALL ACCURACY:
0.9555555555555556

CONFUSION MATRIX:
[[12  0  0]
 [ 0 13  0]
 [ 0  2 18]]


# 8. Predict with new data records 

In [17]:
# test using single records (real world values)
# one Iris-setosa, one Iris-versicolor
new_records = pd.DataFrame([
    [5.1, 3.5, 1.4, 0.2],  # first value to predict
    [5.7, 2.8, 4.1, 1.3]], # second value to predict
    columns=prediction_feature_columns)
    #SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
    # 5.1, 3.5, 1.4, 0.2 : 0 (Iris-setosa)
    # 5.7, 2.8, 4.1, 1.3 : 1 (Iris-versicolor)


# show dataframe for new predictions, prior to scaling
print(new_records.columns)


print('')

# these values must be scaled, to match what is used in the model
scaled_new_records = pd.DataFrame(sc.transform(new_records),columns=list(new_records.columns))


# predict using the model to get a prediction for each item passed
prediction_input_fn = tf.estimator.inputs.pandas_input_fn(scaled_new_records, num_epochs=1, shuffle=False)

prediction_result = list(classifier.predict(input_fn=prediction_input_fn))
print('')

print(prediction_result)


# for each of the results
i = 0
for each in prediction_result:
    # print the values used in the prediction
    print(new_records.iloc[[i]])
    
    # print the transformed label-encoded value
    print('*** PREDICTION: ' + str(each['class_ids'][0]) + ' (Label Decoded: ' + str(le.inverse_transform(each['class_ids'][0]))+ ')')
    print('')
    
    # go to next row
    i = i + 1
    
#-1.870024, -0.124958, -1.511870, -1.444450 : 0
# 1.522676, -0.124958, 1.217684, 1.185010 : 1

Index(['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], dtype='object')

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp7mc5935r/model.ckpt-411
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

[{'logits': array([ 20.921272,   8.266466, -14.795893], dtype=float32), 'probabilities': array([9.9999678e-01, 3.1921709e-06, 3.0777389e-16], dtype=float32), 'class_ids': array([0]), 'classes': array([b'0'], dtype=object)}, {'logits': array([-3.651794 ,  4.5749416, -3.8476481], dtype=float32), 'probabilities': array([2.6727750e-04, 9.9951303e-01, 2.1973740e-04], dtype=float32), 'class_ids': array([1]), 'classes': array([b'1'], dtype=object)}]
   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0            5.1           3.5            1.4           0.2
*** PREDICTION: 0 (Label Decoded: Iris-setosa)

   SepalLengthCm  SepalWid

  if diff:
  if diff:


# 9. Save Model for Tensorflow Serving

In [18]:
export_dir="serving/models/iris/"

feature_spec = tf.feature_column.make_parse_example_spec(my_feature_columns)

input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

classifier.export_savedmodel(export_dir, input_receiver_fn, as_text=False) 

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: ['serving_default', 'classification']
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Restoring parameters from /tmp/tmp7mc5935r/model.ckpt-411
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: serving/models/iris/temp-b'1530333560'/saved_model.pb


b'serving/models/iris/1530333560'

# 10. Make predictions from the saved DNNClassifier model

https://www.kaggle.com/jeffcarp/example-save-and-load-a-tensorflow-model

In [19]:
# this should load the model from the last saved in the model_dir
# so everything it needs should be included below

# input column names
columns_names = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']

# where the model was last saved 
model_path = 'models/temp/iris' 

# define columns in model using the column_names list
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
hardcoded_feature_columns = []

# numeric coulmns
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
hardcoded_feature_columns.append(tf.feature_column.numeric_column(key='PetalWidthCm'));   

#hardcoded_feature_columns = []
#for key in columns_names:
#    hardcoded_feature_columns.append(tf.feature_column.numeric_column(key=key))
#    print(key)

# create a classifier using the defined columns and the pre-defined model
saved_estimator = tf.estimator.DNNClassifier(
    # feature column array of tf.feature_column types, defined in the prev step
    feature_columns=hardcoded_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10, 10],
    # The model must choose between 3 classes.
    n_classes=3,
    # use saved model from folder
    model_dir=model_path
)

# test using single records (2 real world values)
# one Iris-setosa, one Iris-versicolor
records_to_predict = pd.DataFrame([
    [5.1, 3.5, 1.4, 0.2], # first records features
    [5.7, 2.8, 4.1, 1.3]],# second record features
    columns=columns_names)# defined column names
 
# these values must be scaled, to match what is used in the model
scaled_records_to_predict = pd.DataFrame(sc.transform(records_to_predict), columns=list(records_to_predict.columns))

# predict using the model to get a prediction for each item passed
saved_prediction_input_fn = tf.estimator.inputs.pandas_input_fn(scaled_records_to_predict, num_epochs=1, shuffle=False)

# make predictions and store as a list
saved_prediction_result = list(saved_estimator.predict(input_fn=saved_prediction_input_fn))

# output prediction values
saved_prediction_result

#SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm
#-1.870024, -0.124958, -1.511870, -1.444450 : 0
# 1.522676, -0.124958, 1.217684, 1.185010 : 1
#estimator_from_file = load_estimator_model('models/iris/1520662093')

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'models/temp/iris', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9c97c2a358>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Could not find trained model in model_dir: models/temp/iris, running initialization to predict.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'logits': array([-0.02224503,  0.1289943 , -0.04613251], dtype=float32),
  'probabilities': array([0.31850475, 0.37050864, 0.31098664], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logits': array([0.10730875, 0.08063169, 0.02800731], dtype=float32),
  'probabilities': array([0.3451326 , 0.33604717, 0.31882018], dtype=float32)}]

# END

Just testing features and note below this line, not part of main execution