In [2]:
import functools

import numpy as np
import pandas as pd
import tensorflow as tf

In [3]:
# Remote locations of the Titanic training and evaluation CSV files.
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

# Fetch both files; the values kept here are later used as CSV file paths.
train_file_path = tf.keras.utils.get_file(fname="train.csv", origin=TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file(fname="eval.csv", origin=TEST_DATA_URL)

Downloading data from https://storage.googleapis.com/tf-datasets/titanic/train.csv
Downloading data from https://storage.googleapis.com/tf-datasets/titanic/eval.csv


In [21]:
# Load the training CSV into a DataFrame and preview its first rows.
# (pandas is imported in the notebook's import cell.)
train_data = pd.read_csv(train_file_path)
train_data.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [5]:
# Name of the CSV column used as the prediction target; get_dataset passes it
# to make_csv_dataset as label_name.
LABEL_COLUMN = "survived"
# Values of the label column (presumably 0 = did not survive, 1 = survived --
# confirm against the dataset description).
LABELS = [0, 1]

In [7]:
def get_dataset(file_path, **kwargs):
    """Build a batched dataset of (features, label) pairs from a CSV file.

    Args:
        file_path: Path of the CSV file to read.
        **kwargs: Extra keyword arguments forwarded to
            ``tf.data.experimental.make_csv_dataset`` (for example
            ``select_columns`` or ``column_defaults``). They take precedence
            over the defaults set here, so ``batch_size``, ``label_name``,
            ``num_epochs`` and ``ignore_errors`` can be overridden as well.

    Returns:
        The dataset returned by ``make_csv_dataset``.
    """
    options = {
        "batch_size": 5,
        "label_name": LABEL_COLUMN,
        "num_epochs": 1,
        "ignore_errors": True,
    }
    # Caller-supplied options win over the defaults. Passing any of the four
    # keys above used to fail with "got multiple values for keyword argument".
    options.update(kwargs)
    return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the raw (unprocessed) training and evaluation datasets with the same reader settings.
raw_train_data, raw_test_data = map(get_dataset, (train_file_path, test_file_path))

In [12]:
# raw_train_data

In [13]:
# raw_test_data

In [16]:
def show_batch(dataset):
    """Print every feature column of the first batch of ``dataset``.

    Each line shows the column name, left-aligned in a 20-character field,
    followed by the column's values converted with ``.numpy()``. The labels
    of the batch are not printed.

    Args:
        dataset: Object whose ``take(1)`` yields ``(features, label)`` pairs,
            where ``features`` maps column names to values that have a
            ``numpy()`` method.
    """
    first_batch = dataset.take(1)
    for feature_columns, _unused_label in first_batch:
        for column_name, column_values in feature_columns.items():
            line = "{:20s}: {}".format(column_name, column_values.numpy())
            print(line)

In [17]:
# Print the feature columns of one batch of the raw training data.
show_batch(raw_train_data)

sex                 : [b'female' b'male' b'male' b'male' b'female']
age                 : [28. 28. 28. 19. 32.]
n_siblings_spouses  : [1 0 0 0 1]
parch               : [0 0 0 0 1]
fare                : [15.5     7.8958  7.2292  7.65   15.5   ]
class               : [b'Third' b'Third' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'F' b'unknown']
embark_town         : [b'Queenstown' b'Southampton' b'Cherbourg' b'Southampton' b'Queenstown']
alone               : [b'n' b'y' b'y' b'y' b'n']


In [24]:
# Select a subset of the CSV columns by name. The list includes the label
# column 'survived', which get_dataset passes to the reader as label_name.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'class', 'deck', 'alone']
temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)
show_batch(temp_dataset)

age                 : [21. 37. 28. 22. 35.]
n_siblings_spouses  : [0 1 0 0 0]
class               : [b'Third' b'Second' b'Second' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'unknown' b'unknown']
alone               : [b'y' b'n' b'y' b'y' b'y']


In [26]:
# Keep only the label and the numeric columns.
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
# One default per entry of SELECT_COLUMNS. With the float defaults the reader
# returns n_siblings_spouses and parch as floats (without defaults they are
# read as integers), so every feature column ends up with the same dtype.
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(
    train_file_path,
    select_columns=SELECT_COLUMNS,
    column_defaults=DEFAULTS,
)
show_batch(temp_dataset)

age                 : [48. 20. 31. 45.  9.]
n_siblings_spouses  : [1. 0. 0. 0. 1.]
parch               : [0. 0. 0. 0. 1.]
fare                : [39.6     7.8542  8.6833  6.975  15.9   ]


In [27]:
# Pull a single (features, labels) batch out of temp_dataset for inspection.
example_batch, labels_batch = next(iter(temp_dataset))

In [32]:
# pack together all columns

def pack(features, label):
    """Stack all feature columns into one tensor along a new last axis.

    Args:
        features: Mapping from column name to that column's values; the
            values are stacked in the mapping's iteration order.
        label: Label value(s) belonging to ``features``; returned unchanged.

    Returns:
        A ``(stacked_features, label)`` tuple.
    """
    columns = list(features.values())
    stacked = tf.stack(columns, axis=-1)
    return stacked, label

In [34]:
# Apply pack to every batch so the feature columns become a single tensor.
pack_dataset = temp_dataset.map(pack)

# Show the packed features and the labels of the first batch, separated by a blank line.
for features, labels in pack_dataset.take(1):
    print(features.numpy(), labels.numpy(), sep="\n\n")

[[ 4.      4.      2.     31.275 ]
 [22.      0.      2.     49.5   ]
 [71.      0.      0.     49.5042]
 [33.      0.      0.      8.6542]
 [28.      0.      0.      7.75  ]]

[0 1 0 0 1]


In [37]:
class PackNumericFeatures(object):
    """Callable that packs selected feature columns into one float32 tensor.

    Given a ``(features, labels)`` pair, it returns a features mapping in
    which the columns listed in ``names`` are replaced by a single
    ``"numeric"`` entry. It is used in this notebook as the function passed
    to ``Dataset.map``.
    """

    def __init__(self, names):
        """Remember which columns to pack.

        Args:
            names: Column names to pack; their order is the order of the
                values along the last axis of the packed tensor.
        """
        self.names = names

    def __call__(self, features, labels):
        """Pack the configured columns of ``features``.

        Args:
            features: Mapping from column name to that column's values. It is
                not modified.
            labels: Labels belonging to ``features``; returned unchanged.

        Returns:
            A ``(packed_features, labels)`` tuple. ``packed_features`` is a
            copy of ``features`` without the packed columns and with a new
            ``"numeric"`` entry holding them, cast to float32 and stacked
            along a new last axis.

        Raises:
            KeyError: If a configured column name is missing from ``features``.
        """
        # Work on a shallow copy: popping from the caller's mapping would make
        # a second call on the same batch fail with KeyError.
        remaining = features.copy()
        numeric_columns = [
            tf.cast(remaining.pop(name), tf.float32) for name in self.names
        ]
        remaining["numeric"] = tf.stack(numeric_columns, axis=-1)

        return remaining, labels

In [38]:
# Columns that are packed into the single "numeric" feature.
NUMERIC_FEATURES = ['age', 'n_siblings_spouses', 'parch', 'fare']

# Apply the same packing step to the training and the evaluation data.
packed_train_data = raw_train_data.map(PackNumericFeatures(NUMERIC_FEATURES))
packed_test_data = raw_test_data.map(PackNumericFeatures(NUMERIC_FEATURES))

In [39]:
# Print one batch after packing: the numeric columns now appear as a single "numeric" entry.
show_batch(packed_train_data)

sex                 : [b'male' b'male' b'female' b'female' b'male']
class               : [b'Third' b'First' b'Third' b'Third' b'Second']
deck                : [b'unknown' b'C' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Queenstown' b'Southampton' b'Southampton']
alone               : [b'y' b'n' b'n' b'n' b'n']
numeric             : [[ 28.       0.       0.       8.05  ]
 [ 38.       0.       1.     153.4625]
 [ 32.       1.       1.      15.5   ]
 [ 28.       3.       1.      25.4667]
 [ 34.       1.       0.      21.    ]]


In [40]:
# Summary statistics of the numeric columns. This reuses the `train_data`
# frame loaded from train_file_path earlier instead of importing pandas and
# reading the same CSV a second time.
desc = train_data[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [41]:
# Per-feature mean and standard deviation as NumPy arrays, taken from the
# "mean" and "std" rows of the describe() table (one value per column of desc).
MEAN = np.array(desc.loc["mean"])
STD = np.array(desc.loc["std"])

In [42]:
def normalize_numeric_data(data, mean, std):
    """Standardise ``data`` by subtracting ``mean`` and dividing by ``std``.

    The arithmetic is delegated to the operands' own ``-`` and ``/``
    operators, so any values that support them can be passed.

    Args:
        data: Values to normalise.
        mean: Value(s) subtracted from ``data``.
        std: Value(s) the centred data is divided by. A zero entry is not
            handled specially.

    Returns:
        ``(data - mean) / std``.
    """
    centered = data - mean
    return centered / std