In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

import warnings
warnings.filterwarnings("ignore")

# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
# Record the interpreter and library versions this notebook was run with,
# so the outputs below can be reproduced against the same stack.
print(tf.__version__)
print(sys.version_info)

for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=7, releaselevel='final', serial=0)
matplotlib 3.2.2
numpy 1.18.5
pandas 1.0.5
sklearn 0.21.2
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
# Locations of the Titanic training and evaluation splits (relative to the
# notebook's working directory).
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

# Read both splits, training first, into separate DataFrames.
train_df, eval_df = (pd.read_csv(csv_path)
                     for csv_path in (train_file, eval_file))

# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


In [3]:
# Separate the "survived" label from each split. pop() removes the column
# from the DataFrame in place, so train_df / eval_df hold only features
# afterwards; the CSVs must be reloaded before this cell can be run again.
y_train, y_eval = train_df.pop("survived"), eval_df.pop("survived")

In [4]:
# Columns holding discrete values; each is wrapped in an indicator column below.
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
# Columns passed to the model as float32 numbers.
numeric_columns = ['age', 'fare']

feature_columns = []
for categorical_column in categorical_columns:
    # The vocabulary is the set of distinct values seen in the training split.
    vocab = train_df[categorical_column].unique()
    print(categorical_column, vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_list(
                categorical_column, vocab)))

# Named `numeric_column` (not `categorical_column`) so the loop variable
# matches the kind of column it actually iterates over.
for numeric_column in numeric_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(
            numeric_column, dtype=tf.float32))

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']


In [5]:
# # Categorical (discrete) features
# categorical_columns = ["sex","n_siblings_spouses","parch","class","deck","embark_town","alone"]
# # Numeric (continuous) features
# numeric_columns = ["age","fare"]

# feature_columns = []

# # Process the categorical feature data
# for categorical_column in categorical_columns:
#     # Collect every distinct value the column takes
#     vocab = train_df[categorical_column].unique()
#     print(categorical_column,vocab)
#     # indicator_column: one-hot encode the categorical data
#     feature_columns.append(tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_vocabulary_list(
#         categorical_column,vocab
#     )))
    
# # Process the numeric feature data
# for categorical_column in numeric_columns:
#     feature_columns.append(tf.feature_column.numeric_column(categorical_column,dtype=tf.float32))


# print(feature_columns)

In [6]:
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32, shuffle_buffer_size = 10000):
    """Build a batched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        data_df: Feature table. It is converted with ``dict(data_df)``, so
            every column becomes one named entry of the feature dict (the
            callers in this notebook pass a pandas DataFrame).
        label_df: Labels, sliced row-for-row alongside ``data_df``.
        epochs: Number of times the data is repeated.
        shuffle: When true, shuffle the examples before repeating.
        batch_size: Number of examples per emitted batch.
        shuffle_buffer_size: Buffer size handed to ``Dataset.shuffle``; only
            used when ``shuffle`` is true. Defaults to 10000, the value that
            was previously hard-coded.

    Returns:
        The dataset after optional shuffling, ``repeat(epochs)`` and
        ``batch(batch_size)``, applied in that order.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    return dataset.repeat(epochs).batch(batch_size)

# train_dataset = make_dataset(train_df,y_train,batch_size=5)



In [7]:
# Display the training labels (the "survived" column popped from train_df).
y_train

0      0
1      1
2      1
3      1
4      0
      ..
622    0
623    0
624    1
625    0
626    0
Name: survived, Length: 627, dtype: int64

In [8]:

# Directory handed to the baseline estimator as its model_dir.
output_dir = 'baseline_model'
# A single idempotent call: creates the directory if needed and is safe to
# re-run when it already exists.
os.makedirs(output_dir, exist_ok=True)

# Two-class baseline estimator; note that no feature_columns are passed to it.
baseline_estimator = tf.compat.v1.estimator.BaselineClassifier(
    model_dir = output_dir,
    n_classes = 2)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'baseline_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000001498907DCC8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [9]:

# Fit the baseline on the training split, repeated for 100 epochs. The
# input_fn is a zero-argument callable so the dataset is built on demand.
baseline_estimator.train(
    input_fn=lambda: make_dataset(train_df, y_train, epochs=100),
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
Instructions for updating:
Use `tf.cast` instead.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into baseline_model\model.ckpt.
INFO:tensorflow:loss = 22.18071, step = 0
INFO:tensorflow:global_step/sec: 356.686
INFO:tensorflow:loss = 19.829483, step = 100 (0.268 sec)
INFO:tensorflow:global_step/sec: 501.302
INFO:tensorflow:loss = 19.92646, step = 200 (0.198 sec)
INFO:tensorflow:global_step/sec: 542.025
INFO:tensorflow:loss = 22.572939, step = 300 (0.186 sec)
INFO:tensorflow:global_step/sec: 533.338
INFO:tensorflow:loss = 22.591326

<tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifier at 0x14989094c88>

In [10]:
# Score the baseline on the evaluation split: one unshuffled pass in
# batches of 20.
baseline_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df,
        y_eval,
        epochs=1,
        shuffle=False,
        batch_size=20,
    )
)

INFO:tensorflow:Calling model_fn.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-15T16:33:51Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from baseline_model\model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-07-15-16:33:52
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.625, accuracy_baseline = 0.625, auc = 0.5, auc_precision_recall = 0.6875, average_loss = 0.6619658, global_step = 1960, label/mean = 0.375, loss = 12.482783, precision = 0.0, prediction/mean = 0.3888027, recall = 0.0
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1960: baseline_model\model.ckpt-1960


{'accuracy': 0.625,
 'accuracy_baseline': 0.625,
 'auc': 0.5,
 'auc_precision_recall': 0.6875,
 'average_loss': 0.6619658,
 'label/mean': 0.375,
 'loss': 12.482783,
 'precision': 0.0,
 'prediction/mean': 0.3888027,
 'recall': 0.0,
 'global_step': 1960}

In [11]:

# Directory handed to the linear estimator as its model_dir.
linear_output_dir = 'linear_model'
# Create this model's own directory (linear_output_dir, not the baseline's
# `output_dir`); exist_ok=True keeps the cell safe to re-run.
os.makedirs(linear_output_dir, exist_ok=True)

# Two-class linear classifier over the feature columns built earlier.
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    feature_columns = feature_columns)

# Train on the training split repeated for 100 epochs; the call is the
# cell's last expression, so its return value is displayed.
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000014A07762188>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x14a077a3808>

In [12]:
# Score the linear model on the evaluation split: one unshuffled pass in
# batches of 20.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df,
        y_eval,
        epochs=1,
        shuffle=False,
        batch_size=20,
    )
)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-15T16:41:11Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model\model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-07-15-16:41:13
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.77272725, accuracy_baseline = 0.625, auc = 0.8345578, auc_precision_recall = 0.7801232, average_loss = 0.47881338, global_step = 1960, label/mean = 0.375, loss = 0.47469786, precision = 0.7294118, prediction/mean = 0.33076525, recall = 0.6262626
INFO:te

{'accuracy': 0.77272725,
 'accuracy_baseline': 0.625,
 'auc': 0.8345578,
 'auc_precision_recall': 0.7801232,
 'average_loss': 0.47881338,
 'label/mean': 0.375,
 'loss': 0.47469786,
 'precision': 0.7294118,
 'prediction/mean': 0.33076525,
 'recall': 0.6262626,
 'global_step': 1960}

In [14]:

# Directory handed to the DNN estimator as its model_dir.
dnn_output_dir = 'dnn_model'
# Create this model's own directory (dnn_output_dir, not the baseline's
# `output_dir`); exist_ok=True keeps the cell safe to re-run.
os.makedirs(dnn_output_dir, exist_ok=True)

# Two-class classifier with two hidden layers of 128 units each.
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns = feature_columns,
    hidden_units=[128,128],
    # activation_fn is documented as a callable, so pass the op itself
    # rather than the string "relu".
    activation_fn=tf.nn.relu,
    optimizer="Adam"
)

# Train on the training split repeated for 100 epochs; the call is the
# cell's last expression, so its return value is displayed.
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'dnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000014A079B29C8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x14a13ce0288>

In [15]:
# Score the DNN on the evaluation split: one unshuffled pass in batches
# of 20.
dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(
        eval_df,
        y_eval,
        epochs=1,
        shuffle=False,
        batch_size=20,
    )
)

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-07-15T16:44:30Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from dnn_model\model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-07-15-16:44:31
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.78409094, accuracy_baseline = 0.625, auc = 0.84582186, auc_precision_recall = 0.81326735, average_loss = 0.50721735, global_step = 1960, label/mean = 0.375, loss = 0.5091315, precision = 0.6779661, prediction/mean = 0.44203782, recall = 0.8080808
INFO:tens

{'accuracy': 0.78409094,
 'accuracy_baseline': 0.625,
 'auc': 0.84582186,
 'auc_precision_recall': 0.81326735,
 'average_loss': 0.50721735,
 'label/mean': 0.375,
 'loss': 0.5091315,
 'precision': 0.6779661,
 'prediction/mean': 0.44203782,
 'recall': 0.8080808,
 'global_step': 1960}