# This tutorial provides an example of ow to load csv data from a file into a tf.data.Dataset.

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

In [2]:
import numpy as np
import tensorflow as tf

In [3]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

In [5]:
train_file_path = tf.keras.utils.get_file('train.csv', TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file('eval.csv', TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [6]:
#Make Numpy values easier to read
np.set_printoptions(precision=3, suppress=True)

###  numpy.set_printoptions
These options determine the way floating point numbers, arrays and other NumPy objects are displayed.
	
precision : int or None, optional
Number of digits of precision for floating point output (default 8). May be None if floatmode is not fixed, to print as many digits as necessary to uniquely specify the value.

https://docs.scipy.org/doc/numpy/reference/generated/numpy.set_printoptions.html

#### Load Data
To start, look at the top of the csv file to see how it is formatted

In [15]:
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

In [16]:
def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size = 5,
        label_name = LABEL_COLUMN,
        na_value='?',
        num_epochs=1,
        ignore_errors=True,
        **kwargs)
    return dataset


raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

Instructions for updating:
Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, num_parallel_calls=tf.data.experimental.AUTOTUNE)` instead. If sloppy execution is desired, use `tf.data.Options.experimental_determinstic`.


## what is tensorflow dataset?
#### tf.data.experimental.make_csv_dataset

https://www.tensorflow.org/api_docs/python/tf/data/experimental/make_csv_dataset?hl=ko&version=stable
![image.png](attachment:image.png)


file_pattern: List of files or patterns of file paths containing CSV records. See tf.io.gfile.glob for pattern rules.

Returns:
A dataset, where each element is a (features, labels) tuple that corresponds to a batch of batch_size CSV rows. The features dictionary maps feature column names to Tensors containing the corresponding column data, and labels is a Tensor containing the column data for the label column specified by label_name.

In [18]:
type(raw_train_data)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset

In [20]:
def show_batch(dataset):
    for batch, label in dataset.take(1):
        for key, value in batch.items():
            print('{:20s}: {}'.format(key, value.numpy()))
            
show_batch(raw_train_data)

sex                 : [b'male' b'male' b'male' b'male' b'male']
age                 : [20. 32. 71. 16. 28.]
n_siblings_spouses  : [0 1 0 0 0]
parch               : [0 0 0 0 0]
fare                : [ 9.225 15.85  49.504  9.217  7.896]
class               : [b'Third' b'Third' b'First' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'y' b'n' b'y' b'y' b'y']


Each item in the dataset is a batch, represented as a tuple of (many examples, many labels). The data from the examples is organized in column-based tensors (rather than row-based tensors), each with as many elements as the batch size (5 in this case).

It might help to see this yourself.

As you can see, the column in the CSV are named. The dataset constructor will pick these names up automatically.
If the file you are working with does not contain the column names in the first line, pass them in a list of strings to the column_names argument in the make_csv_dataset function.

In [23]:
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)
show_batch(temp_dataset)

sex                 : [b'female' b'male' b'male' b'male' b'male']
age                 : [15. 24. 21. 56. 39.]
n_siblings_spouses  : [0 2 0 0 0]
parch               : [1 0 0 0 0]
fare                : [211.337  24.15   73.5    26.55    7.925]
class               : [b'First' b'Third' b'Second' b'First' b'Third']
deck                : [b'B' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton' b'Southampton'
 b'Southampton']
alone               : [b'n' b'n' b'y' b'y' b'y']


In [24]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']

temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)

show_batch(temp_dataset)

age                 : [50. 34. 28.  2.  4.]
n_siblings_spouses  : [2 0 0 0 0]
class               : [b'First' b'Third' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'G' b'unknown']
alone               : [b'n' b'y' b'y' b'n' b'n']


## Data Preprocessing

A CSV file can contain a variety of data types. Typically you want to convert from those mixed types to a fixed length vector before feeding the data into your model.

Tensorflow ahs a built-in system for describing common input conversions: tf.feature_column.

You can preprocess your data using any tool you like (like nltk or sklearn), and just pass the processed output to TensorFlow.

The primary advantage of doing the preprocessing inside your model is that when you export the model it includes the preprocessing. This way you can pass the raw data directly to your model.

### Continuous data
If your data is already in an appropriate numeric format, you can pack the data into a vector before passing it off to the model.

In [25]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path,
                          select_columns = SELECT_COLUMNS,
                          column_defaults = DEFAULTS)
show_batch(temp_dataset)

age                 : [22. 28. 28. 11. 28.]
n_siblings_spouses  : [0. 0. 2. 5. 0.]
parch               : [0. 0. 0. 2. 0.]
fare                : [ 9.35   7.75  21.679 46.9    7.829]


In [26]:
example_batch, labels_batch = next(iter(temp_dataset))

Here's a simple function that will pack together all the columns:


In [27]:
def pack(features, label):
    return tf.stack(list(features.values()), axis=-1), label

Apply this to each element of the dataset:

In [28]:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
    print(features.numpy())
    print()
    print(labels.numpy())

[[ 7.     0.     2.    26.25 ]
 [59.     0.     0.    13.5  ]
 [32.     0.     0.     8.05 ]
 [28.     0.     0.     7.75 ]
 [28.     0.     0.     7.829]]

[1 0 1 1 0]


If You have mixed datatypes you may want to separate out these simple numeric fields. The
#### tf.feature_column 
api can handle them, but this incurs some overhead and should be avoided unless really nescessary. Switch back to the mixed dataset.

In [29]:
show_batch(raw_train_data)

sex                 : [b'male' b'male' b'male' b'male' b'female']
age                 : [28. 24. 28. 28. 30.]
n_siblings_spouses  : [0 0 0 0 0]
parch               : [0 0 0 0 0]
fare                : [ 6.858  7.496  8.05  39.6   12.475]
class               : [b'Third' b'Third' b'Third' b'First' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Queenstown' b'Southampton' b'Southampton' b'Cherbourg' b'Southampton']
alone               : [b'y' b'y' b'y' b'y' b'y']


In [30]:
example_batch, labels_batch = next(iter(temp_dataset)) 

In [31]:
class PackNumericFeatures(object):
  def __init__(self, names):
    self.names = names

  def __call__(self, features, labels):
    numeric_features = [features.pop(name) for name in self.names]
    numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features]
    numeric_features = tf.stack(numeric_features, axis=-1)
    features['numeric'] = numeric_features

    return features, labels

In [32]:
NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

In [33]:
show_batch(packed_train_data)


sex                 : [b'male' b'male' b'male' b'male' b'female']
class               : [b'Third' b'Third' b'Third' b'Second' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'G']
embark_town         : [b'Southampton' b'Southampton' b'Southampton' b'Cherbourg' b'Southampton']
alone               : [b'y' b'y' b'y' b'n' b'n']
numeric             : [[28.     0.     0.     8.05 ]
 [28.     0.     0.     7.896]
 [28.     0.     0.    56.496]
 [29.     1.     0.    27.721]
 [29.     1.     1.    10.462]]


In [34]:
example_batch, labels_batch = next(iter(packed_train_data)) 


In [35]:
import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [36]:
MEAN = np.array(desc.T['mean'])
STD = np.array(desc.T['std'])

In [37]:
def normalize_numeric_data(data, mean, std):
    return (data-mean)/std

In [38]:
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)
numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn = normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column


NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x000001B76EFB23A8>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))

In [39]:
example_batch['numeric']


<tf.Tensor: id=764, shape=(5, 4), dtype=float32, numpy=
array([[ 21.   ,   2.   ,   2.   , 262.375],
       [ 28.   ,   0.   ,   0.   ,   7.896],
       [ 27.   ,   1.   ,   0.   ,  13.858],
       [ 28.   ,   2.   ,   0.   ,  21.679],
       [ 50.   ,   0.   ,   0.   ,  10.5  ]], dtype=float32)>

In [40]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

array([[-0.69 ,  1.264,  2.043,  4.176],
       [-0.13 , -0.474, -0.479, -0.485],
       [-0.21 ,  0.395, -0.479, -0.376],
       [-0.13 ,  1.264, -0.479, -0.233],
       [ 1.628, -0.474, -0.479, -0.437]], dtype=float32)

## Categorical data
Some of the columns in the CSV data are categorical columns. That is, the content should be one of a limited set of options.

Use the tf.feature_column API to create a collection with a tf.feature_column.indicator_column for each categorical column.

In [41]:
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone' : ['y', 'n']
}

In [42]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
  cat_col = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list=vocab)
  categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [43]:
# See what you just created.
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [44]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
Instructions for updating:
The old _FeatureColumn APIs are being deprecated. Please use the new FeatureColumn APIs instead.
[0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1.]


## Combined preprocessing layer
Add the two feature column collections and pass them to a tf.keras.layers.DenseFeatures to create an input layer that will extract and preprocess both input types:

In [45]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)

In [46]:
print(preprocessing_layer(example_batch).numpy()[0])


[ 0.     1.     1.     0.     0.     0.     1.     0.     0.     0.
  0.     0.     0.     0.     0.     1.     0.     0.    -0.69   1.264
  2.043  4.176  0.     1.   ]


In [47]:
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

In [48]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data

In [49]:
model.fit(train_data, epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1b76f001f08>

In [50]:
test_loss, test_accuracy = model.evaluate(test_data)

print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))



Test Loss 0.44506336797801954, Test Accuracy 0.8333333134651184


In [51]:
predictions = model.predict(test_data)

# Show some results
for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
  print("Predicted survival: {:.2%}".format(prediction[0]),
        " | Actual outcome: ",
        ("SURVIVED" if bool(survived) else "DIED"))


Predicted survival: 87.13%  | Actual outcome:  DIED
Predicted survival: 98.23%  | Actual outcome:  DIED
Predicted survival: 56.00%  | Actual outcome:  DIED
Predicted survival: 0.25%  | Actual outcome:  DIED
Predicted survival: 23.85%  | Actual outcome:  DIED
