[View in Colaboratory](https://colab.research.google.com/github/contractorwolf/tensorflow-irisdataset/blob/master/Iris2.ipynb)

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving iris.csv to iris (1).csv
User uploaded file "iris.csv" with length 5107 bytes


In [19]:
import pandas as pd
import tensorflow as tf


# ML libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score



# verify version of tensorflow, should be 1.4+ to use new Estimators features
print('TENSORFLOW VERSION: ' + tf.__version__)

TENSORFLOW VERSION: 1.7.0


In [0]:
# THIS SHOULD BE ONE FILE NOT TWO *********************
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL  = "http://download.tensorflow.org/data/iris_test.csv"

CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']

SPECIES = ['Setosa', 'Versicolor', 'Virginica']


BATCH_SIZE = 50
TRAIN_STEPS = 500


# REMOVE ALL CSV PARSING *********************

# The remainder of this file contains a simple example of a csv parser,
#     implemented using the `Dataset` class.

# `tf.parse_csv` sets the types of the outputs to match the examples given in
#     the `record_defaults` argument.
CSV_TYPES = [[0.0], [0.0], [0.0], [0.0], [0]]

# define local file for training data with features and labels
csv_file = 'iris.csv'

#args.batch_size), steps=args.train_steps

In [21]:
# get full dataset from local CSV file
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [0]:
# drop unnecessary 'Id' column from the dataset, doesnt help predict the label
df.drop('Id', inplace=True, axis=1)

In [23]:
# ALWAYS randomize sample
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,6.0,2.2,5.0,1.5,Iris-virginica
1,5.1,3.3,1.7,0.5,Iris-setosa
2,4.4,3.0,1.3,0.2,Iris-setosa
3,5.5,2.5,4.0,1.3,Iris-versicolor
4,7.3,2.9,6.3,1.8,Iris-virginica


In [24]:
# label encode the string label: Species
# it should be a numeric value, instead of a string value
# to do: use one-hot encoder instead of label encoder
le = LabelEncoder()
df.Species = le.fit_transform(df.Species)

# show the cleaned dataset
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,6.0,2.2,5.0,1.5,2
1,5.1,3.3,1.7,0.5,0
2,4.4,3.0,1.3,0.2,0
3,5.5,2.5,4.0,1.3,1
4,7.3,2.9,6.3,1.8,2


In [25]:
# set just the label column to y for the train_test_split()
data_label = df.Species
data_label.head()

0    2
1    0
2    0
3    1
4    2
Name: Species, dtype: int64

In [26]:
# remove the species column so that you can scale the rest of the values
# the values are already stored in y variable
# *** convert this to X values here, for clarity?
data_features = df.drop('Species', inplace=False, axis=1)
data_features.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,6.0,2.2,5.0,1.5
1,5.1,3.3,1.7,0.5
2,4.4,3.0,1.3,0.2
3,5.5,2.5,4.0,1.3
4,7.3,2.9,6.3,1.8


In [0]:
# do a 70/30 split of the data into a train_x and train_y (70% for training) 
# and a test_x and test_y (30% for validation) 
# X is a DATAFRAME of scaled features
# y is a SERIES of labels
train_x2, test_x2, train_y2, test_y2 = train_test_split(data_features, data_label, test_size=0.30, random_state=42)

In [0]:
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
my_feature_columns2 = []

# numeric coulmns



#added indicator column
my_feature_columns2.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
my_feature_columns2.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
my_feature_columns2.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
my_feature_columns2.append(tf.feature_column.numeric_column(key='PetalWidthCm'));   

# other column types
# hashed strings or categorical features
# example:
# state = tf.feature_column.categorical_column_with_hash_bucket('state',100) 
# pick the right size for your data


In [0]:


def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    
    print(features)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset


In [30]:
# Build 2 hidden layer DNN with 10, 10 units respectively.
classifier2 = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns2,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model must choose between 3 classes.
    n_classes=3)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpjr1m_rwe', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f039de910f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [31]:
# Train the Model.
classifier2.train(input_fn=lambda:train_input_fn(train_x2, train_y2, BATCH_SIZE), steps=TRAIN_STEPS)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
81             6.4           3.2            4.5           1.5
133            7.2           3.0            5.8           1.6
137            5.5           2.6            4.4           1.2
75             4.8           3.4            1.6           0.2
109            4.8           3.0            1.4           0.3
96             5.1           3.7            1.5           0.4
105            5.8           2.7            3.9           1.2
66             5.8           2.7            4.1           1.0
0              6.0           2.2            5.0           1.5
122            5.2           2.7            3.9           1.4
67             5.2           3.5            1.5           0.2
28             6.9           3.1            5.4           2.1
40             5.4           3.4            1.7           0.2
44             6.1           2.8            4.0           1.3
60             7.7           2.6            6.9           2.3
123     

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f039d226a20>

In [32]:
# Evaluate the model.
eval_result2 = classifier2.evaluate(input_fn=lambda:eval_input_fn(test_x2, test_y2, BATCH_SIZE))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result2))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-19-04:26:42
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpjr1m_rwe/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-19-04:26:42
INFO:tensorflow:Saving dict for global step 500: accuracy = 0.9777778, average_loss = 0.0533011, global_step = 500, loss = 2.3985496

Test set accuracy: 0.978



In [33]:

# Generate predictions from the model
expected = ['Setosa', 'Versicolor', 'Virginica']

predict_x = {
    'SepalLengthCm': [5.1, 5.9, 6.9],
    'SepalWidthCm' : [3.3, 3.0, 3.1],
    'PetalLengthCm': [1.7, 4.2, 5.4],
    'PetalWidthCm' : [0.5, 1.5, 2.1],
}

predictions = classifier2.predict(input_fn=lambda:eval_input_fn(predict_x, labels=None, batch_size=BATCH_SIZE))

template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')

for pred_dict, expec in zip(predictions, expected):
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]

    print(template.format(SPECIES[class_id], 100 * probability, expec))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmpjr1m_rwe/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Prediction is "Setosa" (99.8%), expected "Setosa"

Prediction is "Versicolor" (98.5%), expected "Versicolor"

Prediction is "Virginica" (84.1%), expected "Virginica"
