In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
libraries = (mpl, np, pd, sklearn, tf, keras)
for lib in libraries:
    print(lib.__name__, lib.__version__)

2.2.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.2.1
numpy 1.18.5
pandas 1.0.4
sklearn 0.23.1
tensorflow 2.2.0
tensorflow.keras 2.3.0-tf


In [2]:
# Local copies of the Titanic CSVs; upstream sources:
#   https://storage.googleapis.com/tf-datasets/titanic/train.csv
#   https://storage.googleapis.com/tf-datasets/titanic/eval.csv
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

train_df, eval_df = pd.read_csv(train_file), pd.read_csv(eval_file)

# Quick look: training shape, then the first rows of each split.
print(train_df.shape)
print(train_df.head(), eval_df.head(), sep="\n")

(627, 10)
   survived     sex   age  n_siblings_spouses  parch     fare  class     deck  \
0         0    male  22.0                   1      0   7.2500  Third  unknown   
1         1  female  38.0                   1      0  71.2833  First        C   
2         1  female  26.0                   0      0   7.9250  Third  unknown   
3         1  female  35.0                   1      0  53.1000  First        C   
4         0    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
   survived     sex   age  n_siblings_spouses  parch     fare   class  \
0         0    male  35.0                   0      0   8.0500   Third   
1         0    male  54.0                   0      0  51.8625   First   
2         1  female  58.0                   0      0  26.5500   First   
3         1  female  55.0                   0      0  16.0000  Second   


In [3]:
# Split the 'survived' label column off each frame.
# DataFrame.pop removes the column from the frame in place, so an unguarded pop
# raises KeyError when this cell is executed a second time. The membership
# checks make the cell safe to re-run: on the first run they behave exactly like
# the plain pops, on later runs the existing y_train / y_eval are kept.
if 'survived' in train_df.columns:
    y_train = train_df.pop('survived')
if 'survived' in eval_df.columns:
    y_eval = eval_df.pop('survived')

# Features first, then labels, for both splits.
print(train_df.head())
print(eval_df.head())
print(y_train.head())
print(y_eval.head())

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  
      sex   age  n_siblings_spouses  parch     fare   class     deck  \
0    male  35.0                   0      0   8.0500   Third  unknown   
1    male  54.0                   0      0  51.8625   First        E   
2  female  58.0                   0      0  26.5500   First        C   
3  female  55.0                   0      0  16.0000  Second  unknown   
4    male  34.0                   0      0  13.0000  Second        D   

  

In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric feature columns.
train_df.describe()

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [4]:
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class',
                       'deck', 'embark_town', 'alone']
numeric_columns = ['age', 'fare']

feature_columns = []

# Categorical features: the vocabulary is the set of distinct values found in
# the training frame, and each vocabulary column is wrapped in an indicator column.
for column_name in categorical_columns:
    vocab = train_df[column_name].unique()
    print(column_name, vocab)
    # See the official TensorFlow docs for categorical_column_with_vocabulary_list.
    vocab_column = tf.feature_column.categorical_column_with_vocabulary_list(
        column_name, vocab)
    feature_columns.append(tf.feature_column.indicator_column(vocab_column))

# Numeric features are declared as float32 numeric columns.
for column_name in numeric_columns:
    print(column_name)
    feature_columns.append(
        tf.feature_column.numeric_column(column_name, dtype=tf.float32))

sex ['male' 'female']
n_siblings_spouses [1 0 3 4 2 5 8]
parch [0 1 2 5 3 4]
class ['Third' 'First' 'Second']
deck ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone ['n' 'y']
age
fare


In [5]:
# Display the list of feature column objects built above.
feature_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='n_siblings_spouses', vocabulary_list=(1, 0, 3, 4, 2, 5, 8), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='parch', vocabulary_list=(0, 1, 2, 5, 3, 4), dtype=tf.int64, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('Third', 'First', 'Second'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('unknown', 'C', 'G', 'A', 'B', 'D', 'F', 'E'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town',

In [6]:
# Check the container type of the training features.
type(train_df)

pandas.core.frame.DataFrame

In [7]:
# Shape of the training features once the 'survived' column has been popped off.
train_df.shape

(627, 9)

In [7]:
# Check the container type of the training labels.
type(y_train)

pandas.core.series.Series

In [8]:
# Slice the feature dict and the labels row by row, then print the first
# (features, label) element to see what the estimator input looks like.
features_and_labels = (dict(train_df), y_train)
dataset = tf.data.Dataset.from_tensor_slices(features_and_labels)

for example in dataset.take(1):
    print(example)

({'sex': <tf.Tensor: shape=(), dtype=string, numpy=b'male'>, 'age': <tf.Tensor: shape=(), dtype=float64, numpy=22.0>, 'n_siblings_spouses': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'parch': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'fare': <tf.Tensor: shape=(), dtype=float64, numpy=7.25>, 'class': <tf.Tensor: shape=(), dtype=string, numpy=b'Third'>, 'deck': <tf.Tensor: shape=(), dtype=string, numpy=b'unknown'>, 'embark_town': <tf.Tensor: shape=(), dtype=string, numpy=b'Southampton'>, 'alone': <tf.Tensor: shape=(), dtype=string, numpy=b'n'>}, <tf.Tensor: shape=(), dtype=int64, numpy=0>)


In [9]:
def make_dataset(data_df, label_df, epochs = 10, shuffle = True,
                 batch_size = 32, shuffle_buffer_size = 10000):
    """Build a batched tf.data.Dataset of (feature-dict, label) pairs.

    Args:
        data_df: Feature table; it is passed through dict() so that each
            column becomes one entry of the feature dictionary.
        label_df: Labels, sliced row by row alongside data_df.
        epochs: Repeat count handed to Dataset.repeat.
        shuffle: When true, shuffle the elements before repeating.
        batch_size: Number of elements per batch.
        shuffle_buffer_size: Buffer size handed to Dataset.shuffle. Defaults
            to 10000, the value that used to be hard-coded here.

    Returns:
        The dataset after optional shuffling, repeating and batching.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        (dict(data_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size)
    # Repeat first, then batch: the repeated stream is cut into batches, so
    # only the very last batch of the whole stream can be smaller than batch_size.
    dataset = dataset.repeat(epochs).batch(batch_size)
    return dataset

In [10]:
linear_output_dir = 'linear_model'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: one call, no
# check-then-create race, and it also works if the path ever becomes nested.
os.makedirs(linear_output_dir, exist_ok=True)
# NOTE(review): the evaluation log further down restores model.ckpt-3920 although
# a single run of this cell ends at step 1960, which suggests that re-running the
# cell resumes from checkpoints left in model_dir. Remove the directory before
# this cell when a fresh run is wanted.
# Linear classifier model
linear_estimator = tf.estimator.LinearClassifier(
    model_dir = linear_output_dir,
    n_classes = 2,
    # Pass in the feature_columns defined earlier
    feature_columns = feature_columns)
# Train for 100 passes over the training data (shuffled, default batch size).
linear_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'linear_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use Variable.

<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7fe2bdfefbe0>

In [14]:
# List the variable names exposed by the trained linear estimator
# (model weights as well as the Ftrl optimizer's state, per the output).
linear_estimator.get_variable_names()

['global_step',
 'linear/linear_model/age/weights',
 'linear/linear_model/alone_indicator/weights',
 'linear/linear_model/bias_weights',
 'linear/linear_model/class_indicator/weights',
 'linear/linear_model/deck_indicator/weights',
 'linear/linear_model/embark_town_indicator/weights',
 'linear/linear_model/fare/weights',
 'linear/linear_model/n_siblings_spouses_indicator/weights',
 'linear/linear_model/parch_indicator/weights',
 'linear/linear_model/sex_indicator/weights',
 'training/Ftrl/decay',
 'training/Ftrl/l1_regularization_strength',
 'training/Ftrl/l2_regularization_strength',
 'training/Ftrl/learning_rate',
 'training/Ftrl/learning_rate_power',
 'training/Ftrl/linear/linear_model/age/weights/accumulator',
 'training/Ftrl/linear/linear_model/age/weights/linear',
 'training/Ftrl/linear/linear_model/alone_indicator/weights/accumulator',
 'training/Ftrl/linear/linear_model/alone_indicator/weights/linear',
 'training/Ftrl/linear/linear_model/bias_weights/accumulator',
 'training/Ft

In [15]:
# Read one stored variable by name: the Ftrl 'linear' slot for the parch indicator weights.
# NOTE(review): this is optimizer state under 'training/Ftrl/...', not the model weight
# 'linear/linear_model/parch_indicator/weights' - confirm which one was intended.
linear_estimator.get_variable_value('training/Ftrl/linear/linear_model/parch_indicator/weights/linear')

array([[-1.2879492 ],
       [-2.9463675 ],
       [-1.4032975 ],
       [ 1.2244579 ],
       [-0.91094524],
       [ 2.236593  ]], dtype=float32)

In [17]:
# Read the Ftrl 'accumulator' slot for the sex indicator weights (one row per vocabulary entry).
# NOTE(review): optimizer state again, not the model weight 'linear/linear_model/sex_indicator/weights'.
linear_estimator.get_variable_value('training/Ftrl/linear/linear_model/sex_indicator/weights/accumulator')

array([[7.7724013],
       [4.0160937]], dtype=float32)

In [18]:
# List the files the linear estimator wrote to its model directory (graph and checkpoints).
!ls -l linear_model

总用量 1648
-rw-rw-r-- 1 luke luke    130 May  7 15:11 checkpoint
-rw-rw-r-- 1 luke luke 925955 May  7 15:11 graph.pbtxt
-rw-rw-r-- 1 luke luke    448 May  7 15:11 model.ckpt-0.data-00000-of-00001
-rw-rw-r-- 1 luke luke   1777 May  7 15:11 model.ckpt-0.index
-rw-rw-r-- 1 luke luke 367755 May  7 15:11 model.ckpt-0.meta
-rw-rw-r-- 1 luke luke    448 May  7 15:11 model.ckpt-1960.data-00000-of-00001
-rw-rw-r-- 1 luke luke   1777 May  7 15:11 model.ckpt-1960.index
-rw-rw-r-- 1 luke luke 367755 May  7 15:11 model.ckpt-1960.meta


In [None]:
!rm -rf linear_model

In [12]:
# Evaluate the linear model on the held-out split: one pass, original row order.
linear_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-05-03T15:29:13Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from linear_model/model.ckpt-3920
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 1.02220s
INFO:tensorflow:Finished evaluation at 2022-05-03-15:29:14
INFO:tensorflow:Saving dict for global step 3920: accuracy = 0.78409094, accuracy_baseline = 0.625, auc = 0.83752674, auc_precision_recall = 0.7863668, average_loss = 0.48858187, global_step = 3920, label/mean = 0.375, loss = 0.4695381, precision = 0.68421054, prediction/me

{'accuracy': 0.78409094,
 'accuracy_baseline': 0.625,
 'auc': 0.83752674,
 'auc_precision_recall': 0.7863668,
 'average_loss': 0.48858187,
 'label/mean': 0.375,
 'loss': 0.4695381,
 'precision': 0.68421054,
 'prediction/mean': 0.43869576,
 'recall': 0.7878788,
 'global_step': 3920}

In [18]:
# Delete any existing DNN model directory - presumably so the training cell
# below does not pick up checkpoints from an earlier run - then list the
# working directory to confirm.
!rm -rf dnn_model
!ls

chapter_5.tar.gz	linear_model_new_features
data			tf01_keras_to_estimator.ipynb
dnn_model_new_features	tf02_premade_estimators.ipynb
linear_model		tf03_premade_estimators-new_feature.ipynb


In [19]:
# The cells below use the DNN estimator
dnn_output_dir = './dnn_model'
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: one call, no
# check-then-create race.
os.makedirs(dnn_output_dir, exist_ok=True)
# Create the DNN estimator
dnn_estimator = tf.estimator.DNNClassifier(
    model_dir = dnn_output_dir,
    n_classes = 2,
    feature_columns=feature_columns,
    # Hidden layers of the DNN: three layers with 128 units each
    hidden_units = [128, 128, 128],
    # Activation function
    activation_fn = tf.nn.relu,
    # LinearClassifier takes an optimizer argument as well; the linear cell
    # simply leaves it at its default
    optimizer = 'Adam')
# Start training: 100 passes over the training data
dnn_estimator.train(input_fn = lambda : make_dataset(
    train_df, y_train, epochs = 100))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './dnn_model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floa

<tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2 at 0x7fe1f2f3a898>

In [20]:
# List the variable names exposed by the trained DNN estimator
# (layer kernels/biases as well as the Adam optimizer's state, per the output).
dnn_estimator.get_variable_names()

['dnn/hiddenlayer_0/bias',
 'dnn/hiddenlayer_0/kernel',
 'dnn/hiddenlayer_1/bias',
 'dnn/hiddenlayer_1/kernel',
 'dnn/hiddenlayer_2/bias',
 'dnn/hiddenlayer_2/kernel',
 'dnn/logits/bias',
 'dnn/logits/kernel',
 'global_step',
 'training/Adam/beta_1',
 'training/Adam/beta_2',
 'training/Adam/decay',
 'training/Adam/dnn/hiddenlayer_0/bias/m',
 'training/Adam/dnn/hiddenlayer_0/bias/v',
 'training/Adam/dnn/hiddenlayer_0/kernel/m',
 'training/Adam/dnn/hiddenlayer_0/kernel/v',
 'training/Adam/dnn/hiddenlayer_1/bias/m',
 'training/Adam/dnn/hiddenlayer_1/bias/v',
 'training/Adam/dnn/hiddenlayer_1/kernel/m',
 'training/Adam/dnn/hiddenlayer_1/kernel/v',
 'training/Adam/dnn/hiddenlayer_2/bias/m',
 'training/Adam/dnn/hiddenlayer_2/bias/v',
 'training/Adam/dnn/hiddenlayer_2/kernel/m',
 'training/Adam/dnn/hiddenlayer_2/kernel/v',
 'training/Adam/dnn/logits/bias/m',
 'training/Adam/dnn/logits/bias/v',
 'training/Adam/dnn/logits/kernel/m',
 'training/Adam/dnn/logits/kernel/v',
 'training/Adam/learning

In [15]:
# Shape of the Adam 'm' slot for the first hidden layer's kernel.
# The 128 matches hidden_units; the 34 presumably is the input width:
# 32 indicator slots (2+7+6+3+8+4+2 vocabulary entries) plus the 2 numeric columns.
dnn_estimator.get_variable_value('training/Adam/dnn/hiddenlayer_0/kernel/m').shape

(34, 128)

In [21]:
# Evaluate the DNN model on the held-out split: one pass, original row order.
dnn_estimator.evaluate(
    input_fn=lambda: make_dataset(eval_df, y_eval, epochs=1, shuffle=False))

INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2022-05-03T16:15:40Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./dnn_model/model.ckpt-1960
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Inference Time : 0.91049s
INFO:tensorflow:Finished evaluation at 2022-05-03-16:15:41
INFO:tensorflow:Saving dict for global step 1960: accuracy = 0.81060606, accuracy_baseline = 0.625, auc = 0.8351087, auc_precision_recall = 0.7745796, average_loss = 0.5145089, global_step = 1960, label/mean = 0.375, loss = 0.4921469, precision = 0.7752809, prediction/mean =

{'accuracy': 0.81060606,
 'accuracy_baseline': 0.625,
 'auc': 0.8351087,
 'auc_precision_recall': 0.7745796,
 'average_loss': 0.5145089,
 'label/mean': 0.375,
 'loss': 0.4921469,
 'precision': 0.7752809,
 'prediction/mean': 0.35656777,
 'recall': 0.6969697,
 'global_step': 1960}