In [1]:
 import functools

import numpy as np
import tensorflow as tf
 

In [2]:
 # Titanic CSVs: each URL is fetched with `get_file`, which returns the local path of the downloaded file.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
train_file_path = tf.keras.utils.get_file(fname="train.csv", origin=TRAIN_DATA_URL)

TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
test_file_path = tf.keras.utils.get_file(fname="eval.csv", origin=TEST_DATA_URL)
 

In [3]:
 # Print numpy floats with 3 digits of precision and without scientific notation.
np.set_printoptions(suppress=True, precision=3)
 

In [6]:
# Display the local path that `tf.keras.utils.get_file` returned for the training CSV.
train_file_path

'C:\\Users\\dhk13\\.keras\\datasets\\train.csv'

In [7]:
 # CSV column used as the label; passed as `label_name` when the datasets are built.
 LABEL_COLUMN = 'survived'
# NOTE(review): presumably the two values the label can take; not referenced in the visible cells, so confirm.
LABELS = [0, 1]
 

In [8]:
 # CSV -> tf.data reader with notebook-friendly, overridable defaults.
def get_dataset(file_path, **kwargs):
  """Build a batched dataset from a CSV file.

  Thin wrapper around `tf.data.experimental.make_csv_dataset` that supplies
  a set of default options. Every default can be overridden through
  `kwargs` (for example `get_dataset(path, batch_size=32)`); before, passing
  one of them raised `TypeError` because the keyword reached the call twice.

  Args:
    file_path: CSV file path forwarded unchanged to `make_csv_dataset`.
    **kwargs: Extra keyword arguments for `make_csv_dataset`, such as
      `column_names` or `select_columns`. Values given here take precedence
      over the defaults below.

  Returns:
    The dataset returned by `make_csv_dataset`. Because `label_name` is set,
    its elements are `(features, label)` pairs.
  """
  options = {
      "batch_size": 5,  # Artificially small to make examples easier to show.
      "label_name": LABEL_COLUMN,
      "na_value": "?",
      "num_epochs": 1,
      "ignore_errors": True,
  }
  # Caller-supplied values win over the defaults.
  options.update(kwargs)
  return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the training and evaluation datasets from the downloaded CSVs using get_dataset's defaults.
raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)
 

In [9]:
# Display the dataset's repr to inspect the shapes and dtypes of its elements.
raw_train_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [10]:
 # Inspection helper: peeks at a single batch.
def show_batch(dataset):
  """Print each feature of the first batch of `dataset`, one per line.

  Only `dataset.take(1)` is consumed, and the label half of every
  `(features, label)` element is ignored. Each printed line is the feature
  name left-aligned in a 20-character field, then `: `, then the result of
  calling `.numpy()` on the feature's value.

  Args:
    dataset: Object whose `take(1)` yields `(features, label)` pairs, where
      `features` maps names to values exposing `.numpy()`.
  """
  first_only = dataset.take(1)
  for features, _label in first_only:
    for column, values in features.items():
      row = "{:20s}: {}".format(column, values.numpy())
      print(row)
 

In [11]:
 # Print the feature columns of one batch taken from the training dataset.
 show_batch(raw_train_data)
 

sex                 : [b'male' b'female' b'male' b'male' b'female']
age                 : [28. 38. 35. 41.  5.]
n_siblings_spouses  : [0 0 0 0 2]
parch               : [0 0 0 0 1]
fare                : [13.    80.     7.896  7.125 19.258]
class               : [b'Second' b'First' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'B' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'unknown' b'Cherbourg' b'Southampton' b'Cherbourg']
alone               : [b'y' b'y' b'y' b'y' b'n']


In [12]:
 # Name the CSV columns explicitly via `column_names`, then preview one batch.
CSV_COLUMNS = [
    'survived', 'sex', 'age', 'n_siblings_spouses', 'parch',
    'fare', 'class', 'deck', 'embark_town', 'alone',
]

temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)
show_batch(temp_dataset)
 

sex                 : [b'female' b'male' b'female' b'male' b'male']
age                 : [24. 18. 41. 21. 30.]
n_siblings_spouses  : [0 0 0 0 0]
parch               : [0 0 1 1 0]
fare                : [69.3   11.5   19.5   77.287 13.   ]
class               : [b'First' b'Second' b'Second' b'First' b'Second']
deck                : [b'B' b'unknown' b'unknown' b'D' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'y' b'y' b'n' b'n' b'y']


In [13]:
 # Restrict the dataset to a subset of the CSV columns (label column included), then preview one batch.
SELECT_COLUMNS = [
    'survived', 'age', 'n_siblings_spouses',
    'class', 'deck', 'alone',
]

temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)
show_batch(temp_dataset)
 

age                 : [60. 28. 25. 50. 39.]
n_siblings_spouses  : [1 0 0 2 0]
class               : [b'First' b'First' b'Third' b'First' b'Second']
deck                : [b'B' b'unknown' b'unknown' b'unknown' b'unknown']
alone               : [b'n' b'y' b'y' b'n' b'y']
