[View in Colaboratory](https://colab.research.google.com/github/contractorwolf/tensorflow-irisdataset/blob/master/Iris2.ipynb)

Converted from the TensorFlow `premade_estimator.py` getting-started example:
https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py


In [1]:
from google.colab import files

# Open the Colab upload widget. The result is keyed by uploaded file name and
# each value is that file's content (its length is reported in bytes below).
uploaded = files.upload()

# Report the name and size of every file that was received.
upload_report = 'User uploaded file "{name}" with length {length} bytes'
for fn, content in uploaded.items():
  print(upload_report.format(name=fn, length=len(content)))

Saving iris.csv to iris (3).csv
User uploaded file "iris.csv" with length 5107 bytes


In [2]:
import pandas as pd
import tensorflow as tf


# ML libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score



# Show which TensorFlow version is installed; the Estimator features used in
# this notebook need 1.4 or newer.
print('TENSORFLOW VERSION: {}'.format(tf.__version__))

TENSORFLOW VERSION: 1.7.0


In [0]:
# number of examples per batch; passed to the train, eval and predict input functions
BATCH_SIZE = 50

# number of training steps passed to classifier.train()
TRAIN_STEPS = 500

# define local file for training data with features and labels
csv_file = 'iris.csv'

In [4]:
# get full dataset from local CSV file
# NOTE(review): the upload log above shows Colab saving a re-uploaded
# 'iris.csv' under a different name ('iris (3).csv'); if so, this reads an
# older copy of the file rather than the one just uploaded -- TODO confirm.
df = pd.read_csv(csv_file)
# preview the first rows as the cell output
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
# ALWAYS randomize the row order before splitting the data.
# frac=1 returns every row in shuffled order. random_state pins the shuffle so
# that Restart & Run All reproduces the same ordering (and therefore the same
# train/test split and metrics); 42 matches the seed used by train_test_split.
# reset_index(drop=True) renumbers the shuffled rows 0..n-1.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,119,7.7,2.6,6.9,2.3,Iris-virginica
1,74,6.1,2.8,4.7,1.2,Iris-versicolor
2,12,4.8,3.4,1.6,0.2,Iris-setosa
3,27,5.0,3.4,1.6,0.4,Iris-setosa
4,80,5.7,2.6,3.5,1.0,Iris-versicolor


In [0]:
# drop unnecessary 'Id' column from the dataset, it doesn't help predict the label.
# Re-assign instead of mutating in place and ignore a missing column, so the
# cell can be re-run without raising KeyError once 'Id' is already gone.
df = df.drop('Id', axis=1, errors='ignore')

In [10]:
# Species is a string label, but the classifier needs a numeric class id, so
# map every distinct species name to an integer.
# The fitted encoder stays in `le` so predicted ids can be decoded back to
# species names later on.
# to do: use one-hot encoder instead of label encoder
le = LabelEncoder()
df['Species'] = le.fit_transform(df['Species'])

# show the cleaned dataset
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,7.7,2.6,6.9,2.3,2
1,6.1,2.8,4.7,1.2,1
2,4.8,3.4,1.6,0.2,0
3,5.0,3.4,1.6,0.4,0
4,5.7,2.6,3.5,1.0,1


In [11]:
# the label (y) handed to train_test_split() is the encoded Species column on its own
data_label = df['Species']
data_label.head()

0    2
1    1
2    0
3    0
4    1
Name: Species, dtype: int64

In [12]:
# Build the feature frame (X): every column except the Species label, so the
# remaining values can be scaled. The labels are already held in data_label.
# drop() returns a new frame here; df itself is left unchanged.
# *** TODO: name these X / y for clarity?
data_features = df.drop(labels=['Species'], axis='columns')
data_features.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,7.7,2.6,6.9,2.3
1,6.1,2.8,4.7,1.2
2,4.8,3.4,1.6,0.2
3,5.0,3.4,1.6,0.4
4,5.7,2.6,3.5,1.0


In [85]:
# WARNING (originally: "DO NOT USE, BREAKS Classifier prob because of the number size?")
#----------------------------------------------------------------
# Once this cell has run, the model is trained on standardized values, so any
# record later passed to classifier.predict() must first go through the same
# fitted scaler (sc.transform). Feeding raw measurements to a model trained on
# scaled data is a likely cause of the broken probabilities noted above
# -- TODO confirm.

# scale data for better results
# so that columns with larger values dont get weighed more
# example: age change of 40 years (significant) vs income change of $40 (not significant)

sc = StandardScaler(copy=True, with_mean=True, with_std=True)

# Pass the column Index itself, not a list wrapping it: pandas interprets a
# list that contains an Index as the levels of a MultiIndex, which turns every
# column key into a 1-tuple that no longer matches the feature-column keys.
# The original row index is kept so rows stay aligned with data_label.
data_features = pd.DataFrame(
    sc.fit_transform(data_features),
    columns=data_features.columns,
    index=data_features.index)

# show scaled X values 
data_features.head() 
# THIS IS A DATAFRAME

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,1.28034,0.337848,1.103953,1.447956
1,-1.021849,1.032057,-1.398138,-1.181504
2,0.553333,-0.819166,0.649027,0.790591
3,1.038005,0.106445,0.535296,0.396172
4,-0.173674,-1.281972,0.705893,1.053537


In [0]:
# Hold out 30% of the rows for validation and train on the remaining 70%.
#   train_x / test_x : DATAFRAMES of features (X)
#   train_y / test_y : SERIES of labels (y)
# random_state fixes the partition for a given input row order.
train_x, test_x, train_y, test_y = train_test_split(
    data_features,
    data_label,
    test_size=0.30,
    random_state=42,
)

In [0]:
# Feature columns describe how to use the input: one entry per column of the
# training data, giving the column's name (key) and its type.
# All four iris measurements are plain numeric columns.
my_feature_columns = [
    tf.feature_column.numeric_column(key=column_name)
    for column_name in (
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
    )
]

# other column types
# hashed strings or categorical features
# example:
# state = tf.feature_column.categorical_column_with_hash_bucket('state',100) 
# pick the right size for your data


In [0]:


def train_input_fn(features, labels, batch_size):
    """Build the tf.data pipeline that feeds training batches.

    Args:
        features: column name -> values mapping (anything dict() accepts,
            e.g. a DataFrame).
        labels: label values, sliced row by row alongside the features.
        batch_size: number of examples per emitted batch.

    Returns:
        A dataset of (features, labels) slices that is shuffled with a
        buffer of 1000, repeated, and then batched.
    """
    # Pair the column-name -> values mapping with the labels.
    feature_map = dict(features)
    examples = tf.data.Dataset.from_tensor_slices((feature_map, labels))

    # Shuffle first, then repeat, then cut into batches.
    shuffled = examples.shuffle(1000)
    repeated = shuffled.repeat()
    return repeated.batch(batch_size)


def eval_input_fn(features, labels, batch_size):
    """Build the tf.data pipeline used for evaluation or prediction.

    Args:
        features: column name -> values mapping (anything dict() accepts,
            e.g. a DataFrame).
        labels: label values for evaluation, or None when predicting.
        batch_size: number of examples per emitted batch; must not be None.

    Returns:
        A batched dataset of (features, labels) slices, or of features
        alone when labels is None.

    Raises:
        AssertionError: if batch_size is None.
    """
    feature_map = dict(features)

    # With labels the dataset yields (features, labels) pairs; without them
    # (prediction) it yields the features only.
    inputs = feature_map if labels is None else (feature_map, labels)
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    assert batch_size is not None, "batch_size must not be None"
    return dataset.batch(batch_size)


In [16]:
# Network shape: two hidden layers of 10 nodes each.
hidden_layer_sizes = [10, 10]

# The model must choose between one class per distinct encoded label
# (3 species for this dataset).
species_count = data_label.nunique()

# Build the DNN classifier over the feature columns defined earlier.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    hidden_units=hidden_layer_sizes,
    n_classes=species_count)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp4xa_nf9q', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f9b52e84ef0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
# Train the Model.
def _training_batches():
    """Zero-argument input function: training batches built from the training split."""
    return train_input_fn(train_x, train_y, BATCH_SIZE)


# Last expression of the cell, so the value returned by train() is displayed.
classifier.train(input_fn=_training_batches, steps=TRAIN_STEPS)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/tmp4xa_nf9q/model.ckpt.
INFO:tensorflow:loss = 105.92941, step = 1
INFO:tensorflow:global_step/sec: 763.831
INFO:tensorflow:loss = 10.26494, step = 101 (0.133 sec)
INFO:tensorflow:global_step/sec: 1014.51
INFO:tensorflow:loss = 4.9950366, step = 201 (0.102 sec)
INFO:tensorflow:global_step/sec: 1047.61
INFO:tensorflow:loss = 2.4895365, step = 301 (0.093 sec)
INFO:tensorflow:global_step/sec: 1071.59
INFO:tensorflow:loss = 4.3592453, step = 401 (0.093 sec)
INFO:tensorflow:Saving checkpoints for 500 into /tmp/tmp4xa_nf9q/model.ckpt.
INFO:tensorflow:Loss for final step: 2.088737.


<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f9b5f2ea860>

In [18]:
# Evaluate the model on the held-out test split.
def _evaluation_batches():
    """Zero-argument input function: batches of the test features and labels."""
    return eval_input_fn(test_x, test_y, BATCH_SIZE)


eval_result = classifier.evaluate(input_fn=_evaluation_batches)

# Headline number: accuracy, formatted to three decimals.
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(accuracy=eval_result['accuracy']))


# Full metrics dict as the cell output.
eval_result

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-19-20:10:33
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp4xa_nf9q/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-19-20:10:33
INFO:tensorflow:Saving dict for global step 500: accuracy = 0.95555556, average_loss = 0.10954611, global_step = 500, loss = 4.929575

Test set accuracy: 0.956



{'accuracy': 0.95555556,
 'average_loss': 0.10954611,
 'global_step': 500,
 'loss': 4.929575}

In [0]:
# Generate predictions from the model

# Species each record below is expected to be classified as (taken from
# previously identified records), in the same order as new_records.
expected = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']

# Unlabelled measurements to classify, one row per flower.
new_records = [
    [5.1, 3.3, 1.7, 0.5],  # first record, expected 'Iris-setosa'
    [5.9, 3.0, 4.2, 1.5],  # second record, expected 'Iris-versicolor'
    [6.9, 3.1, 5.4, 2.1],  # third record, expected 'Iris-virginica'
]

# NOTE(review): these are raw measurements. If the StandardScaler cell was run
# before training, they presumably need the same scaling applied before being
# handed to the classifier -- TODO confirm.
predict_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
predict_x = pd.DataFrame(data=new_records, columns=predict_columns)

In [21]:
def _prediction_batches():
    """Zero-argument input function: batches of predict_x with no labels."""
    return eval_input_fn(predict_x, labels=None, batch_size=BATCH_SIZE)


# Collect everything predict() produces into a list so the next cell can
# iterate over the per-record results.
prediction_result = list(classifier.predict(input_fn=_prediction_batches))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp4xa_nf9q/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [22]:
# Report every prediction next to the record it was made for and the species
# that record was expected to be. The three lists are parallel, so walk them
# together instead of maintaining a manual row counter.
for record_values, expected_value, result in zip(new_records, expected, prediction_result):
    # label-encoded id of the predicted class for this record
    prediction_class = result['class_ids'][0]

    # inverse_transform works on a 1-D array-like of ids (newer scikit-learn
    # rejects a bare scalar), so wrap the id and unwrap the decoded label
    transformed_class = le.inverse_transform([prediction_class])[0]

    # probability assigned to the predicted class, as a percentage
    probability_value = 100 * result['probabilities'][prediction_class]
    probability = '{:.1f}%'.format(probability_value)

    # print the transformed label-encoded value
    print('VALUES: ' + str(record_values)
          + ' PREDICTED CLASS: (' + str(prediction_class)
          + ') PROBABILITY: ' + probability
          + ' (Decoded: ' + str(transformed_class)
          + ') expected: ' + expected_value)

    # sanity check: the prediction actually contains a class id
    print(result['class_ids'].size > 0)


VALUES: [5.1, 3.3, 1.7, 0.5] PREDICTED CLASS: (0) PROBABILITY: 99.8% (Decoded: Iris-setosa) expected: Iris-setosa
True
VALUES: [5.9, 3.0, 4.2, 1.5] PREDICTED CLASS: (1) PROBABILITY: 94.6% (Decoded: Iris-versicolor) expected: Iris-versicolor
True
VALUES: [6.9, 3.1, 5.4, 2.1] PREDICTED CLASS: (2) PROBABILITY: 93.6% (Decoded: Iris-virginica) expected: Iris-virginica
True


  if diff:
  if diff:
  if diff:
