##### Copyright 2019 The TensorFlow Authors.



In [1]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Load CSV data

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/tutorials/load_data/csv"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/load_data/csv.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/docs/blob/master/site/en/tutorials/load_data/csv.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/docs/site/en/tutorials/load_data/csv.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

This tutorial provides an example of how to load CSV data from a file into a `tf.data.Dataset`.

The data used in this tutorial are taken from the Titanic passenger list. The model will predict the likelihood a passenger survived based on characteristics like age, gender, ticket class, and whether the person was traveling alone.

## Setup

In [2]:
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  # Best-effort: any error raised by the line above is deliberately ignored
  # so the cell does not stop the notebook where the magic is unavailable.
  pass


In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals
import functools

import numpy as np
import pandas as pd
import tensorflow as tf

In [4]:
# Download locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch each file, caching it relative to the current working directory.
# "." is used instead of the Windows-only ".\\" spelling so that the same
# cache directory is addressed on every operating system.
train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL, cache_dir=".")
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL, cache_dir=".")

In [5]:
# Make numpy values easier to read: print floats with 3 digits of precision
# and suppress scientific notation for small values.
np.set_printoptions(precision=3, suppress=True)

## Load data


The only column you need to identify explicitly is the one with the value that the model is intended to predict. 

In [6]:
# Name of the CSV column that holds the label the model should predict.
LABEL_COLUMN = 'survived'
# NOTE(review): presumably the possible label values (0 / 1); not referenced
# anywhere else in the visible notebook -- confirm whether it is still needed.
LABELS = [0, 1]

Now read the CSV data from the file and create a dataset. 

(For the full documentation, see `tf.data.experimental.make_csv_dataset`)


In [7]:
def get_dataset(file_path, **kwargs):
  """Build a batched dataset of (features, label) pairs from a CSV file.

  Args:
    file_path: Path of the CSV file to read.
    **kwargs: Extra keyword arguments forwarded to
      `tf.data.experimental.make_csv_dataset` (for example `column_names`,
      `select_columns` or `column_defaults`). A keyword given here overrides
      the notebook default of the same name instead of raising a
      "multiple values for keyword argument" `TypeError`.

  Returns:
    The dataset returned by `make_csv_dataset` for the merged options.
  """
  options = {
      'batch_size': 3,  # Small batches keep the printed examples readable.
      'label_name': LABEL_COLUMN,  # Column that is split off as the label.
      'na_value': "?",  # Extra input string that is treated as a missing value.
      'num_epochs': 1,  # Go through the file once instead of repeating.
      'ignore_errors': True,  # Skip records that cannot be parsed.
  }
  # Caller-supplied options take precedence over the defaults above.
  options.update(kwargs)
  return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the raw training and evaluation datasets with get_dataset's defaults.
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

The following shows the structure of a dataset.

shapes: (OrderedDict([(sex, (?,)), (age, (?,)), (n_siblings_spouses, (?,)), (parch, (?,)), (fare, (?,)), (class, (?,)), (deck, (?,)), (embark_town, (?,)), (alone, (?,))]), (?,))

Note that there are two collections: the first holds the feature columns, the second holds the labels.

types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)

These are the types of each column, followed by the type of the label.

In [8]:
raw_train_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [9]:
raw_train_data.element_spec

(OrderedDict([('sex', TensorSpec(shape=(None,), dtype=tf.string, name=None)),
              ('age', TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
              ('n_siblings_spouses',
               TensorSpec(shape=(None,), dtype=tf.int32, name=None)),
              ('parch', TensorSpec(shape=(None,), dtype=tf.int32, name=None)),
              ('fare', TensorSpec(shape=(None,), dtype=tf.float32, name=None)),
              ('class', TensorSpec(shape=(None,), dtype=tf.string, name=None)),
              ('deck', TensorSpec(shape=(None,), dtype=tf.string, name=None)),
              ('embark_town',
               TensorSpec(shape=(None,), dtype=tf.string, name=None)),
              ('alone',
               TensorSpec(shape=(None,), dtype=tf.string, name=None))]),
 TensorSpec(shape=(None,), dtype=tf.int32, name=None))

In [10]:
def show_batch(dataset):
  """Print the first two elements (batches) of `dataset`.

  Each element is expected to be a pair: a mapping from column name to the
  batch of values for that column, and the batch of labels. For every
  element the 1-based position and the labels are printed first, followed
  by one line per column with the column's `.numpy()` value.

  Args:
    dataset: Object whose `take(2)` result yields `(columns, labels)` pairs.
  """
  for position, (columns, labels) in enumerate(dataset.take(2), start=1):
    print("{}\nEtiqueta {}".format(position, labels))
    for name in columns:
      print("{:20s}: {}".format(name, columns[name].numpy()))

In [11]:
show_batch(raw_train_data)

1
Etiqueta [1 1 0]
sex                 : [b'female' b'female' b'male']
age                 : [24. 28. 33.]
n_siblings_spouses  : [1 1 0]
parch               : [2 0 0]
fare                : [65.    51.862 12.275]
class               : [b'Second' b'First' b'Second']
deck                : [b'unknown' b'D' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'n' b'y']
2
Etiqueta [0 1 1]
sex                 : [b'male' b'male' b'male']
age                 : [19. 48. 44.]
n_siblings_spouses  : [3 1 0]
parch               : [2 0 0]
fare                : [263.     52.      7.925]
class               : [b'First' b'First' b'Third']
deck                : [b'C' b'C' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'n' b'y']


In [47]:
def muestra(input):
  """Print the labels and every feature column of one dataset element.

  Args:
    input: Object whose `take(1)` result is iterable and yields
      `(features, label)` pairs, where `features` is a mapping from column
      name to value that contains a "sex" entry. The parameter name shadows
      the `input` builtin; it is kept so existing callers are unaffected.

  Raises:
    StopIteration: If `input.take(1)` yields no element.
  """
  # Restrict the dataset to its first element (one batch).
  datos = input.take(1)
  # Fetch that element with the builtin next(), which works for every
  # iterator; the `.next()` method is not part of the Python 3 iterator
  # protocol and is missing on ordinary iterators.
  valor, label = next(iter(datos))
  print(label)
  print(valor["sex"])
  print("Etiqueta {}".format(label))
  for key, value in valor.items():
    print("{:20s}: {}".format(key, value))
  
  

Each item in the dataset is a batch, represented as a tuple of (*many examples*, *many labels*). The data from the examples is organized in column-based tensors (rather than row-based tensors), each with as many elements as the batch size (3 in this case).

It might help to see this yourself.

In [48]:
muestra(raw_train_data)

tf.Tensor([1 0 1], shape=(3,), dtype=int32)
tf.Tensor([b'female' b'male' b'male'], shape=(3,), dtype=string)
Etiqueta [1 0 1]
sex                 : [b'female' b'male' b'male']
age                 : [14. 28. 32.]
n_siblings_spouses  : [1 0 0]
parch               : [0 0 0]
fare                : [11.242  8.05   7.854]
class               : [b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Southampton']
alone               : [b'n' b'y' b'y']


As you can see, the columns in the CSV are named. The dataset constructor will pick these names up automatically. If the file you are working with does not contain the column names in the first line, pass them in a list of strings to the `column_names` argument in the `make_csv_dataset` function.

In [14]:
# Explicit names for the CSV columns; the label column 'survived' comes first.
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']

# Same file as before, but with the column names supplied by the caller.
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

1
Etiqueta [1 0 0]
sex                 : [b'male' b'male' b'male']
age                 : [ 4. 11. 28.]
n_siblings_spouses  : [1 5 1]
parch               : [1 2 0]
fare                : [11.133 46.9   15.85 ]
class               : [b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'n' b'n']
2
Etiqueta [0 1 1]
sex                 : [b'male' b'female' b'female']
age                 : [28. 34. 33.]
n_siblings_spouses  : [0 0 1]
parch               : [0 1 0]
fare                : [ 6.95 23.   53.1 ]
class               : [b'Third' b'Second' b'First']
deck                : [b'unknown' b'unknown' b'E']
embark_town         : [b'Queenstown' b'Southampton' b'Southampton']
alone               : [b'y' b'n' b'n']


This example is going to use all the available columns. If you need to omit some columns from the dataset, create a list of just the columns you plan to use, and pass it into the (optional) `select_columns` argument of the constructor.


In [15]:
# Subset of columns to load; it includes the label column 'survived'.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']

# Only the selected columns appear in the batches printed below.
temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)

show_batch(temp_dataset)

1
Etiqueta [1 1 0]
age                 : [52. 27. 28.]
n_siblings_spouses  : [0 0 0]
class               : [b'First' b'Third' b'Second']
deck                : [b'C' b'unknown' b'unknown']
alone               : [b'y' b'n' b'y']
2
Etiqueta [0 0 1]
age                 : [34. 26. 35.]
n_siblings_spouses  : [1 0 0]
class               : [b'Second' b'Third' b'First']
deck                : [b'unknown' b'unknown' b'B']
alone               : [b'n' b'y' b'y']


## Data preprocessing

A CSV file can contain a variety of data types. Typically you want to convert from those mixed types to a fixed length vector before feeding the data into your model.

TensorFlow has a built-in system for describing common input conversions: `tf.feature_column`, see [this tutorial](../keras/feature_columns) for details.


You can preprocess your data using any tool you like (like [nltk](https://www.nltk.org/) or [sklearn](https://scikit-learn.org/stable/)), and just pass the processed output to TensorFlow. 


The primary advantage of doing the preprocessing inside your model is that when you export the model it includes the preprocessing. This way you can pass the raw data directly to your model.

### Continuous data

If your data is already in an appropriate numeric format, you can pack the data into a vector before passing it off to the model:

In [43]:
# Keep only the label and the numeric columns.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
# One default per selected column, in the same order.
# NOTE(review): presumably the Python type of each default also sets the
# column dtype -- the count columns are printed as floats below; confirm.
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path, 
                           select_columns=SELECT_COLUMNS,
                           column_defaults = DEFAULTS)

show_batch(temp_dataset)

1
Etiqueta [0 0 0]
age                 : [42. 24. 59.]
n_siblings_spouses  : [1. 2. 0.]
parch               : [0. 0. 0.]
fare                : [52.   24.15  7.25]
2
Etiqueta [0 1 1]
age                 : [25. 35. 24.]
n_siblings_spouses  : [0. 0. 0.]
parch               : [0. 0. 2.]
fare                : [13.    26.288 16.7  ]


In [44]:
example_batch, labels_batch = next(iter(temp_dataset)) 

In [96]:
# Each call starts a new pass over temp_dataset; the recorded output shows
# that the two passes print different batches.
show_batch(temp_dataset)
show_batch(temp_dataset)

1
Etiqueta [1 0 1]
age                 : [35. 21.  6.]
n_siblings_spouses  : [0. 0. 0.]
parch               : [0. 0. 1.]
fare                : [512.329   7.775  33.   ]
2
Etiqueta [0 0 1]
age                 : [33. 18. 24.]
n_siblings_spouses  : [0. 1. 2.]
parch               : [0. 0. 3.]
fare                : [ 5.     6.496 18.75 ]
1
Etiqueta [1 1 0]
age                 : [49. 40. 28.]
n_siblings_spouses  : [1. 0. 0.]
parch               : [0. 0. 0.]
fare                : [76.729 31.     8.05 ]
2
Etiqueta [1 0 1]
age                 : [25. 61. 22.]
n_siblings_spouses  : [1. 0. 0.]
parch               : [0. 0. 0.]
fare                : [  7.775   6.238 151.55 ]


Here's a simple function that will pack together all the columns:

In [97]:
def pack(features, label):
  """Stack every feature column into one tensor, passing the label through.

  Args:
    features: Mapping from column name to that column's batch of values.
    label: Returned unchanged.

  Returns:
    A `(stacked, label)` tuple, where `stacked` is the result of `tf.stack`
    over the mapping's values (in mapping order) along the last axis.
  """
  columns = [column for column in features.values()]
  stacked = tf.stack(columns, axis=-1)
  return stacked, label

Apply this to each element of the dataset:

In [98]:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
  print(features.numpy())
  print()
  print(labels.numpy())

[[24.     0.     0.     7.496]
 [28.     0.     0.     8.05 ]
 [35.     0.     0.    26.   ]]

[0 0 0]


If you have mixed datatypes you may want to separate out these simple-numeric fields. The `tf.feature_column` API can handle them, but this incurs some overhead and should be avoided unless really necessary. Switch back to the mixed dataset:

In [None]:
show_batch(raw_train_data)

In [None]:
# NOTE(review): this draws a batch from temp_dataset (numeric columns only)
# although the text above switches back to the mixed dataset; example_batch
# is reassigned from packed_train_data further down -- confirm the intent.
example_batch, labels_batch = next(iter(temp_dataset)) 

So define a more general preprocessor that selects a list of numeric features and packs them into a single column:

In [None]:
class PackNumericFeatures(object):
  """Callable that packs selected columns into a single 'numeric' feature.

  An instance is meant to be applied to `(features, labels)` pairs, e.g. via
  `dataset.map(PackNumericFeatures(names))`.
  """

  def __init__(self, names):
    """Remember which feature names should be packed.

    Args:
      names: Iterable of keys to remove from the features mapping and pack.
    """
    self.names = names

  def __call__(self, features, labels):
    """Return `features` with the selected columns replaced by 'numeric'.

    Args:
      features: Mapping from column name to values; must contain every name
        in `self.names` and provide a `copy()` method (as `dict` does).
      labels: Returned unchanged.

    Returns:
      A `(packed_features, labels)` tuple. `packed_features` is a copy of
      `features` without the selected columns and with a new 'numeric' entry
      holding those columns cast to `tf.float32` and stacked along the last
      axis, in the order given by `self.names`.

    Raises:
      KeyError: If a selected name is missing from `features`.
    """
    # Work on a shallow copy so the caller's mapping is left untouched and
    # the call can be repeated on the same batch.
    packed = features.copy()
    numeric_features = [
        tf.cast(packed.pop(name), tf.float32) for name in self.names]
    packed['numeric'] = tf.stack(numeric_features, axis=-1)

    return packed, labels

In [None]:
# Columns that are packed into the single 'numeric' feature.
NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

# Apply the packing step to every element of the training and test datasets.
packed_train_data = raw_train_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))

In [None]:
show_batch(packed_train_data)

In [None]:
example_batch, labels_batch = next(iter(packed_train_data)) 

#### Data Normalization

Continuous data should always be normalized.

In [None]:
# Summary statistics of the numeric training columns, computed with pandas
# (imported with the other dependencies at the top of the notebook).
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

In [None]:
# Per-column mean and standard deviation taken from the 'mean' and 'std'
# rows of the summary table, one value per numeric column.
MEAN = np.array(desc.loc['mean'])
STD = np.array(desc.loc['std'])

In [None]:
def normalize_numeric_data(data, mean, std):
  """Standardize `data`: subtract `mean`, then divide the result by `std`.

  Args:
    data: Values to normalize.
    mean: Value(s) subtracted from `data`.
    std: Value(s) the centered data is divided by.

  Returns:
    `(data - mean) / std`, using whatever arithmetic the operands define.
  """
  centered = data - mean
  return centered / std


Now create a numeric column. The `tf.feature_column.numeric_column` API accepts a `normalizer_fn` argument, which will be run on each batch.

Bind the `MEAN` and `STD` to the normalizer fn using [`functools.partial`](https://docs.python.org/3/library/functools.html#functools.partial).

In [None]:
# Bind MEAN and STD so the normalizer only needs the data argument.
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

# Feature column over the packed 'numeric' feature, with one entry per name
# in NUMERIC_FEATURES; the bound normalizer is passed as its normalizer_fn.
numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
# See what you just created.
numeric_column

When you train the model, include this feature column to select and center this block of numeric data:

In [None]:
example_batch['numeric']

In [None]:
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns)
numeric_layer(example_batch).numpy()

The mean based normalization used here requires knowing the means of each column ahead of time.

### Categorical data

Some of the columns in the CSV data are categorical columns. That is, the content should be one of a limited set of options.

Use the `tf.feature_column` API to create a collection with a `tf.feature_column.indicator_column` for each categorical column.



In [None]:
# Vocabulary of each categorical column. The spellings must match the values
# in the CSV exactly; a value that is not listed here is treated as
# out-of-vocabulary by the vocabulary-based feature columns built from it.
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    # The data spells the port 'Southampton' (single 'h').
    'embark_town' : ['Cherbourg', 'Southampton', 'Queenstown'],
    'alone' : ['y', 'n']
}


In [None]:
# One indicator column per categorical feature, each wrapping a
# vocabulary-list categorical column built from that feature's vocabulary.
categorical_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature, vocabulary_list=vocab))
    for feature, vocab in CATEGORIES.items()
]

In [None]:
# See what you just created.
categorical_columns

In [None]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])

This will become part of a data processing input later when you build the model.

### Combined preprocessing layer

Add the two feature column collections and pass them to a `tf.keras.layers.DenseFeatures` to create an input layer that will extract and preprocess both input types:

In [None]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)

In [None]:
print(preprocessing_layer(example_batch).numpy()[0])

## Build the model

Build a `tf.keras.Sequential`, starting with the `preprocessing_layer`.

In [None]:
# Stack the preprocessing layer, two 128-unit ReLU layers and a single
# sigmoid output unit.
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as
# an additional metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])

## Train, evaluate, and predict

Now the model can be instantiated and trained.

In [None]:
# Shuffle the training elements with a buffer of 500; the elements of
# packed_train_data are already batches, so whole batches are shuffled.
train_data = packed_train_data.shuffle(500)
# The test data is used as produced by the packing step.
test_data = packed_test_data

In [None]:
model.fit(train_data, epochs=20)

Once the model is trained, you can check its accuracy on the `test_data` set.

In [None]:
test_loss, test_accuracy = model.evaluate(test_data)

print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))

Use `tf.keras.Model.predict` to infer labels on a batch or a dataset of batches.

In [None]:
# Predictions for the whole test set, in the order of this particular pass.
predictions = model.predict(test_data)

# Show some results. The predictions and the labels printed together are
# taken from the same pass over the data: iterating test_data a second time
# is not guaranteed to yield the examples in the same order as the pass used
# by model.predict above, which would pair each prediction with another
# passenger's outcome. Re-batching also makes sure that up to
# NUM_RESULTS_TO_SHOW examples are available, independent of the batch size
# the dataset was created with.
NUM_RESULTS_TO_SHOW = 10
for features_batch, survived_batch in (
    test_data.unbatch().batch(NUM_RESULTS_TO_SHOW).take(1)):
  batch_predictions = np.asarray(model.predict_on_batch(features_batch))
  for prediction, survived in zip(batch_predictions, survived_batch.numpy()):
    print("Predicted survival: {:.2%}".format(prediction[0]),
          " | Actual outcome: ",
          ("SURVIVED" if bool(survived) else "DIED"))

