[View in Colaboratory](https://colab.research.google.com/github/contractorwolf/tensorflow-irisdataset/blob/master/Iris_Colab.ipynb)

In [15]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving iris.csv to iris.csv
User uploaded file "iris.csv" with length 5107 bytes


In [2]:
import pandas as pd
import tensorflow as tf


# ML libraries 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score



# verify version of tensorflow, should be 1.4+ to use new Estimators features
print('TENSORFLOW VERSION: ' + tf.__version__)

TENSORFLOW VERSION: 1.7.0


In [0]:
# THIS SHOULD BE ONE FILE NOT TWO *********************
TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
TEST_URL  = "http://download.tensorflow.org/data/iris_test.csv"

CSV_COLUMN_NAMES = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth', 'Species']

SPECIES = ['Setosa', 'Versicolor', 'Virginica']


BATCH_SIZE = 50
TRAIN_STEPS = 500


# REMOVE ALL CSV PARSING *********************

# The remainder of this file contains a simple example of a csv parser,
#     implemented using the `Dataset` class.

# `tf.parse_csv` sets the types of the outputs to match the examples given in
#     the `record_defaults` argument.
CSV_TYPES = [[0.0], [0.0], [0.0], [0.0], [0]]

# define local file for training data with features and labels
csv_file = 'iris.csv'

#args.batch_size), steps=args.train_steps


In [32]:
# get full dataset from local CSV file
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [0]:
# drop unnecessary 'Id' column from the dataset, doesnt help predict the label
df.drop('Id', inplace=True, axis=1)

In [34]:
# ALWAYS randomize sample
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,6.7,2.5,5.8,1.8,Iris-virginica
1,6.7,3.3,5.7,2.1,Iris-virginica
2,6.8,3.2,5.9,2.3,Iris-virginica
3,7.7,2.8,6.7,2.0,Iris-virginica
4,6.7,3.0,5.2,2.3,Iris-virginica


In [35]:
# label encode the string label: Species
# it should be a numeric value, instead of a string value
# to do: use one-hot encoder instead of label encoder
le = LabelEncoder()
df.Species = le.fit_transform(df.Species)

# show the cleaned dataset
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,6.7,2.5,5.8,1.8,2
1,6.7,3.3,5.7,2.1,2
2,6.8,3.2,5.9,2.3,2
3,7.7,2.8,6.7,2.0,2
4,6.7,3.0,5.2,2.3,2


In [36]:
# set just the label column to y for the train_test_split()
data_label = df.Species
data_label.head()

0    2
1    2
2    2
3    2
4    2
Name: Species, dtype: int64

In [47]:
# remove the species column so that you can scale the rest of the values
# the values are already stored in y variable
# *** convert this to X values here, for clarity?
data_features = df.drop('Species', inplace=False, axis=1)
data_features.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,6.7,2.5,5.8,1.8
1,6.7,3.3,5.7,2.1
2,6.8,3.2,5.9,2.3
3,7.7,2.8,6.7,2.0
4,6.7,3.0,5.2,2.3


In [28]:
# scale data for better results
# so that columns with larger values dont get weighed more
# example: age change of 40 years (significant) vs income change of $40 (not significant)

sc = StandardScaler(copy=True, with_mean=True, with_std=True)
scaled_features = pd.DataFrame(sc.fit_transform(data_features), columns=[data_features.columns], index=data_features.index)

# show scaled X values 
scaled_features.head() 
# THIS IS A DATAFRAME

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,0.18983,-0.356361,0.421564,0.396172
1,-1.021849,1.032057,-1.227541,-0.787085
2,0.310998,-0.587764,0.535296,0.001753
3,-1.506521,1.26346,-1.568735,-1.312977
4,-0.294842,-0.819166,0.250967,0.133226


In [0]:
# do a 70/30 split of the data into a train_x and train_y (70% for training) 
# and a test_x and test_y (30% for validation) 
# X is a DATAFRAME of scaled features
# y is a SERIES of labels
train_x2, test_x2, train_y2, test_y2 = train_test_split(data_features, data_label, test_size=0.30, random_state=42)

In [0]:
# Feature columns describe how to use the input.
# you need to define the name and type for each column of your training data
my_feature_columns2 = []

# numeric coulmns



#added indicator column
my_feature_columns2.append(tf.feature_column.numeric_column(key='SepalLengthCm'));
my_feature_columns2.append(tf.feature_column.numeric_column(key='SepalWidthCm'));
my_feature_columns2.append(tf.feature_column.numeric_column(key='PetalLengthCm'));
my_feature_columns2.append(tf.feature_column.numeric_column(key='PetalWidthCm'));   

# other column types
# hashed strings or categorical features
# example:
# state = tf.feature_column.categorical_column_with_hash_bucket('state',100) 
# pick the right size for your data


In [0]:
def maybe_download():
    train_path = tf.keras.utils.get_file(TRAIN_URL.split('/')[-1], TRAIN_URL)
    test_path = tf.keras.utils.get_file(TEST_URL.split('/')[-1], TEST_URL)

    return train_path, test_path

def load_data(y_name='Species'):
    """Returns the iris dataset as (train_x, train_y), (test_x, test_y)."""
    train_path, test_path = maybe_download()

    train = pd.read_csv(train_path, names=CSV_COLUMN_NAMES, header=0)
    train_x, train_y = train, train.pop(y_name)

    test = pd.read_csv(test_path, names=CSV_COLUMN_NAMES, header=0)
    test_x, test_y = test, test.pop(y_name)

    return (train_x, train_y), (test_x, test_y)


def train_input_fn(features, labels, batch_size):
    """An input function for training"""
    # Convert the inputs to a Dataset.
    
    print(features)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if labels is None:
        # No labels, use only features.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset

  
# REMOVE ALL CSV PARSING *********************  
def _parse_line(line):
    # Decode the line into its fields
    fields = tf.decode_csv(line, record_defaults=CSV_TYPES)

    # Pack the result into a dictionary
    features = dict(zip(CSV_COLUMN_NAMES, fields))

    # Separate the label from the features
    label = features.pop('Species')

    return features, label


def csv_input_fn(csv_path, batch_size):
    # Create a dataset containing the text lines.
    dataset = tf.data.TextLineDataset(csv_path).skip(1)

    # Parse each line.
    dataset = dataset.map(_parse_line)

    # Shuffle, repeat, and batch the examples.
    dataset = dataset.shuffle(1000).repeat().batch(batch_size)

    # Return the dataset.
    return dataset

In [0]:
# Fetch the data
(train_x, train_y), (test_x, test_y) = load_data()

In [49]:
train_x.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth
0,6.4,2.8,5.6,2.2
1,5.0,2.3,3.3,1.0
2,4.9,2.5,4.5,1.7
3,4.9,3.1,1.5,0.1
4,5.7,3.8,1.7,0.3


In [50]:
train_x2.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
81,5.4,3.7,1.5,0.2
133,5.0,3.3,1.4,0.2
137,7.7,2.6,6.9,2.3
75,6.7,3.1,4.7,1.5
109,5.0,2.3,3.3,1.0


In [0]:
# Feature columns describe how to use the input.
my_feature_columns = []

for key in train_x.keys():
    my_feature_columns.append(tf.feature_column.numeric_column(key=key))

In [52]:
my_feature_columns

[_NumericColumn(key='SepalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SepalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalLength', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalWidth', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [53]:
my_feature_columns2

[_NumericColumn(key='SepalLengthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SepalWidthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalLengthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='PetalWidthCm', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

In [60]:
# Build 2 hidden layer DNN with 10, 10 units respectively.
classifier = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model must choose between 3 classes.
    n_classes=3)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpas9kwv2f', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f13914f4588>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [61]:
# Build 2 hidden layer DNN with 10, 10 units respectively.
classifier2 = tf.estimator.DNNClassifier(
    feature_columns=my_feature_columns2,
    # Two hidden layers of 10 nodes each.
    hidden_units=[10, 10],
    # The model must choose between 3 classes.
    n_classes=3)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpp47koft4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f13914f4630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [62]:
# Train the Model.
classifier.train(input_fn=lambda:train_input_fn(train_x, train_y, BATCH_SIZE), steps=TRAIN_STEPS)

     SepalLength  SepalWidth  PetalLength  PetalWidth
0            6.4         2.8          5.6         2.2
1            5.0         2.3          3.3         1.0
2            4.9         2.5          4.5         1.7
3            4.9         3.1          1.5         0.1
4            5.7         3.8          1.7         0.3
5            4.4         3.2          1.3         0.2
6            5.4         3.4          1.5         0.4
7            6.9         3.1          5.1         2.3
8            6.7         3.1          4.4         1.4
9            5.1         3.7          1.5         0.4
10           5.2         2.7          3.9         1.4
11           6.9         3.1          4.9         1.5
12           5.8         4.0          1.2         0.2
13           5.4         3.9          1.7         0.4
14           7.7         3.8          6.7         2.2
15           6.3         3.3          4.7         1.6
16           6.8         3.2          5.9         2.3
17           7.6         3.0

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f13914f4ac8>

In [63]:
# Train the Model.
classifier2.train(input_fn=lambda:train_input_fn(train_x2, train_y2, BATCH_SIZE), steps=TRAIN_STEPS)

     SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
81             5.4           3.7            1.5           0.2
133            5.0           3.3            1.4           0.2
137            7.7           2.6            6.9           2.3
75             6.7           3.1            4.7           1.5
109            5.0           2.3            3.3           1.0
96             5.1           3.7            1.5           0.4
105            6.4           3.2            5.3           2.3
66             7.7           3.8            6.7           2.2
0              6.7           2.5            5.8           1.8
122            5.6           2.7            4.2           1.3
67             6.0           2.2            4.0           1.0
28             5.3           3.7            1.5           0.2
40             6.4           2.7            5.3           1.9
44             5.0           3.4            1.6           0.4
60             6.3           2.5            5.0           1.9
123     

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f13914f4a90>

In [29]:
# Evaluate the model.
eval_result = classifier.evaluate(input_fn=lambda:eval_input_fn(test_x, test_y, BATCH_SIZE))

print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-05-18-04:20:09
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmprok3kc59/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-05-18-04:20:09
INFO:tensorflow:Saving dict for global step 500: accuracy = 0.93333334, average_loss = 0.073242165, global_step = 500, loss = 2.197265

Test set accuracy: 0.933



In [19]:


# Generate predictions from the model
expected = ['Setosa', 'Versicolor', 'Virginica']

predict_x = {
    'SepalLength': [5.1, 5.9, 6.9],
    'SepalWidth' : [3.3, 3.0, 3.1],
    'PetalLength': [1.7, 4.2, 5.4],
    'PetalWidth' : [0.5, 1.5, 2.1],
}

predictions = classifier.predict(input_fn=lambda:eval_input_fn(predict_x, labels=None, batch_size=BATCH_SIZE))

template = ('\nPrediction is "{}" ({:.1f}%), expected "{}"')

for pred_dict, expec in zip(predictions, expected):
    class_id = pred_dict['class_ids'][0]
    probability = pred_dict['probabilities'][class_id]

    print(template.format(SPECIES[class_id], 100 * probability, expec))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp4zdhd5z_/model.ckpt-5500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Prediction is "Setosa" (99.9%), expected "Setosa"

Prediction is "Versicolor" (100.0%), expected "Versicolor"

Prediction is "Virginica" (99.4%), expected "Virginica"
