[View in Colaboratory](https://colab.research.google.com/github/contractorwolf/tensorflow-irisdataset/blob/master/Iris2.ipynb)

converted from example:
https://github.com/tensorflow/models/blob/master/samples/core/get_started/premade_estimator.py


In [3]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving iris.csv to iris (2).csv
User uploaded file "iris.csv" with length 5107 bytes


In [4]:
import pandas as pd
import tensorflow as tf


# ML libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score



# verify version of tensorflow, should be 1.4+ to use new Estimators features
print('TENSORFLOW VERSION: ' + tf.__version__)

TENSORFLOW VERSION: 1.7.0


In [0]:
#SPECIES = ['Setosa', 'Versicolor', 'Virginica']

BATCH_SIZE = 50

TRAIN_STEPS = 500

# define local file for training data with features and labels
csv_file = 'iris.csv'

In [18]:
# get full dataset from local CSV file
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [21]:
# ALWAYS randomize sample
df = df.sample(frac=1).reset_index(drop=True)
df.head()


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,73,6.3,2.5,4.9,1.5,Iris-versicolor
1,57,6.3,3.3,4.7,1.6,Iris-versicolor
2,123,7.7,2.8,6.7,2.0,Iris-virginica
3,39,4.4,3.0,1.3,0.2,Iris-setosa
4,75,6.4,2.9,4.3,1.3,Iris-versicolor


In [0]:
# drop unnecessary 'Id' column from the dataset, doesnt help predict the label
df.drop('Id', inplace=True, axis=1)

In [23]:
# label encode the string label: Species
# it should be a numeric value, instead of a string value
# to do: use one-hot encoder instead of label encoder
le = LabelEncoder()
df.Species = le.fit_transform(df.Species)

# show the cleaned dataset
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,6.3,2.5,4.9,1.5,1
1,6.3,3.3,4.7,1.6,1
2,7.7,2.8,6.7,2.0,2
3,4.4,3.0,1.3,0.2,0
4,6.4,2.9,4.3,1.3,1


In [24]:
# set just the label column to y for the train_test_split()
data_label = df.Species
data_label.head()

0    1
1    1
2    2
3    0
4    1
Name: Species, dtype: int64

In [25]:
# remove the species column so that you can scale the rest of the values
# the values are already stored in y variable
# *** convert this to X values here, for clarity?
data_features = df.drop('Species', inplace=False, axis=1)
data_features.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,6.3,2.5,4.9,1.5
1,6.3,3.3,4.7,1.6
2,7.7,2.8,6.7,2.0
3,4.4,3.0,1.3,0.2
4,6.4,2.9,4.3,1.3


In [59]:
# DO NOT USE, BREAKS Classifier prob because of the number size? 
#----------------------------------------------------------------

# scale data for better results
# so that columns with larger values dont get weighed more
# example: age change of 40 years (significant) vs income change of $40 (not significant)

sc = StandardScaler(copy=True, with_mean=True, with_std=True)
data_features = pd.DataFrame(sc.fit_transform(data_features), columns=[data_features.columns], index=data_features.index)

# show scaled X values 
data_features.head() 
# THIS IS A DATAFRAME

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,-0.900681,1.032057,-1.341272,-1.312977
1,-1.143017,-0.124958,-1.341272,-1.312977
2,-1.385353,0.337848,-1.398138,-1.312977
3,-1.506521,0.106445,-1.284407,-1.312977
4,-1.021849,1.26346,-1.341272,-1.312977


In [0]:
# do a 70/30 split of the data into a train_x and train_y (70% for training) 
# and a test_x and test_y (30% for validation) 
# X is a DATAFRAME of scaled features
# y is a SERIES of labels
train_x, test_x, train_y, test_y = train_test_split(data_features, data_label, test_size=0.30, random_state=42)

In [0]:
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
my_feature_columns = []

# numeric coulmn
my_feature_columns.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
my_feature_columns.append(tf.feature_column.numeric_column(key='PetalWidthCm'));   

# other column types
# hashed strings or categorical features
# example:
# state = tf.feature_column.categorical_column_with_hash_bucket('state',100) 
# pick the right size for your data


In [0]:


def train_input_fn(features, labels, batch_size):
    # Convert the inputs to a Dataset.
    
    print(features)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset


def eval_input_fn(features, labels, batch_size):
    # An input function for evaluation or prediction
    features=dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset


In [30]:
# Build 2 hidden layer DNN with 10, 10 units respectively.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model must choose between 3 classes.
    n_classes=3)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp4f4vnerx', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f3563940cc0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [31]:
# Train the Model.


#print(train_x)
classifier.train(input_fn=lambda:train_input_fn(train_x, train_y, BATCH_SIZE), steps=TRAIN_STEPS)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
81             6.7           3.1            4.4           1.4
133            6.0           2.7            5.1           1.6
137            6.1           2.6            5.6           1.4
75             6.8           2.8            4.8           1.4
109            5.5           2.4            3.7           1.0
96             5.5           4.2            1.4           0.2
105            6.4           2.8            5.6           2.1
66             6.5           3.0            5.2           2.0
0              6.3           2.5            4.9           1.5
122            6.7           3.0            5.2           2.3
67             5.8           2.6            4.0           1.2
28             6.4           2.7            5.3           1.9
40             5.7           2.8            4.5           1.3
44             5.6           2.5            3.9           1.1
60             5.1           3.8            1.6           0.2
123     

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f3563940dd8>

In [32]:
# Evaluate the model.
eval_result = classifier.evaluate(input_fn=lambda:eval_input_fn(test_x, test_y, BATCH_SIZE))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-19-17:57:14
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp4f4vnerx/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-19-17:57:14
INFO:tensorflow:Saving dict for global step 500: accuracy = 0.9777778, average_loss = 0.06361537, global_step = 500, loss = 2.8626914

Test set accuracy: 0.978



In [0]:
# Generate predictions from the model

# set the expected values (from previously identified records) to compare
expected = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']


new_records = [
    [5.1, 3.3, 1.7, 0.5], # first value to predict 'Iris-setosa'
    [5.9, 3.0, 4.2, 1.5], # second value to predict 'Iris-versicolor'
    [6.9, 3.1, 5.4, 2.1]  # third values to predict 'Iris-virginica'
]


predict_x = pd.DataFrame(new_records, columns=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm'])

In [66]:
prediction_result = list(classifier.predict(input_fn=lambda:eval_input_fn(predict_x, labels=None, batch_size=BATCH_SIZE)))


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp4f4vnerx/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [73]:
# for each of the results
i = 0
for each in prediction_result:
    #if each['class_ids'].size > 0
      record_values = new_records[i]


      prediction_class = each['class_ids'][0]
      transformed_class = le.inverse_transform(prediction_class)
      expected_value = expected[i]

      # print the transformed label-encoded value
      print('INCOMING VALUES: ' + str(record_values) + ' PREDICTED CLASS ID: ' + str(prediction_class) + ' (Decoded: ' + str(transformed_class)+ ') expected: ' + expected_value)
      print('')
    
    # go to next row
      i = i + 1


INCOMING VALUES: [5.1, 3.3, 1.7, 0.5] PREDICTED CLASS ID: 0 (Decoded: Iris-setosa) expected: Iris-setosa

INCOMING VALUES: [5.9, 3.0, 4.2, 1.5] PREDICTED CLASS ID: 1 (Decoded: Iris-versicolor) expected: Iris-versicolor

INCOMING VALUES: [6.9, 3.1, 5.4, 2.1] PREDICTED CLASS ID: 2 (Decoded: Iris-virginica) expected: Iris-virginica



  if diff:
  if diff:
  if diff:
