# 目录
## 1. 导入模块
## 2. 加载泰坦尼克数据
## 3. 定义输入数据的格式feature_columns
  - `tf.feature_column.indicator_column`
  - `tf.feature_column.categorical_column_with_vocabulary_list`
  - `tf.feature_column.numeric_column`

## 4. tf.data.Dataset制作可迭代的数据
  - `tf.data.Dataset.from_tensor_slices`
  - `shuffle`
  - `repeat`
  - `batch`
  - `make_one_shot_iterator`
  - `get_next`
  
## 5. 定义estimator模型
### 5.1 模型函数定义
  - 参数`features` `labels` `mode` `params`
  - 数据输入层 `tf.feature_column.input_layer`，传入 features 数据（字典：列名 --> 该列的数值）和 feature_columns（按列名描述每一列如何转换为输入）
  - 三种mode，`tf.estimator.ModeKeys.PREDICT` `tf.estimator.ModeKeys.EVAL` `tf.estimator.ModeKeys.TRAIN`
  
### 5.2 生成estimator模型
  - `tf.estimator.Estimator`
  

## 6. estimator 训练

## 7. estimator 测试

## 1. 导入模块

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from tensorflow import keras
import tensorflow as tf
import sys
import os
import time
import datetime

# Record the name and version of every library this notebook relies on.
for lib in (np, pd, mpl, sklearn, keras, tf):
    print(lib.__name__, lib.__version__)

numpy 1.17.2
pandas 0.25.1
matplotlib 3.1.1
sklearn 0.21.3
tensorflow.python.keras.api._v1.keras 2.2.4-tf
tensorflow 1.15.0


## 2. 加载泰坦尼克数据

In [2]:
# Locations of the Titanic training and evaluation CSV files.
train_file = "./data/titanic/train.csv"
eval_file = "./data/titanic/eval.csv"

# Training split: DataFrame.pop removes the "survived" column from train_df
# and returns it as a pd.Series, so train_df is left with feature columns only.
train_df = pd.read_csv(train_file)
y_train = train_df.pop("survived")
x_train = train_df.copy()

# Evaluation split, handled the same way.
eval_df = pd.read_csv(eval_file)
y_eval = eval_df.pop("survived")
x_eval = eval_df.copy()

## 3. 定义输入数据的格式feature_columns

In [3]:
# Columns holding continuous values; every other column is treated as categorical.
numerical_columns = ["age", "fare"]
# Filter in the DataFrame's own column order. A set difference would yield an
# arbitrary order, making the printed vocabularies and the column list differ
# from one run to the next.
categorical_columns = [
    column for column in x_train.columns.tolist()
    if column not in numerical_columns
]

feature_columns = []

# Categorical inputs: the vocabulary is the list of distinct values seen in
# the training data, wrapped in an indicator column.
for categorical_column in categorical_columns:
    vocab = x_train[categorical_column].unique()

    print(categorical_column, "---> ",vocab)
    feature_columns.append(
        tf.feature_column.indicator_column(
            # column name --> list of the categories it can take
            tf.feature_column.categorical_column_with_vocabulary_list(categorical_column, vocab)
        )
    )

# Continuous inputs: fed to the model as float32 numeric columns.
for numerical_column in numerical_columns:
    feature_columns.append(
        tf.feature_column.numeric_column(numerical_column, dtype=tf.float32)
    )

n_siblings_spouses --->  [1 0 3 4 2 5 8]
deck --->  ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
class --->  ['Third' 'First' 'Second']
alone --->  ['n' 'y']
sex --->  ['male' 'female']
embark_town --->  ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
parch --->  [0 1 2 5 3 4]


## 4. tf.data.Dataset制作可迭代的数据

In [4]:
def make_dataset(train_df, label_df, shuffle=False, epochs=10, batch_size=32):
    """Build a tf.data input pipeline and return its next-batch tensors.

    Args:
        train_df: pd.DataFrame of features, one column per feature.
        label_df: pd.Series of labels, sliced together with train_df.
        shuffle: when True, shuffle the examples with a 10000-element buffer.
        epochs: number of times the data is repeated.
        batch_size: number of examples per batch.

    Returns:
        The result of ``get_next()`` on a one-shot iterator over the
        dataset: a ``(features, labels)`` pair, where features is a dict
        keyed by column name.
    """
    # The features must be a dict keyed by column name so that they can be
    # matched against the names used in feature_columns.
    dataset = tf.data.Dataset.from_tensor_slices((dict(train_df), label_df))
    if shuffle:
        dataset = dataset.shuffle(10000)
    dataset = dataset.repeat(epochs).batch(batch_size)
    # Dataset.make_one_shot_iterator() is deprecated; the compat.v1 function
    # is the supported way to obtain the same one-shot iterator.
    return tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()

## 5. 定义estimator模型

### 5.1 模型函数定义

In [5]:
def model_fn(features, labels, mode, params):
    """Model function for a custom ``tf.estimator.Estimator``.

    Builds a fully connected classifier: a feature-column input layer,
    one ReLU dense layer per entry in ``params["hidden_units"]``, and a
    linear output layer with ``params["class_num"]`` units.

    Args:
        features: feature data handed to ``tf.feature_column.input_layer``.
        labels: class labels used for the loss and the accuracy metric
            (not read in PREDICT mode).
        mode: one of ``tf.estimator.ModeKeys.PREDICT`` / ``EVAL`` / ``TRAIN``.
        params: dict with the keys "feature_columns", "hidden_units" and
            "class_num".

    Returns:
        The ``tf.estimator.EstimatorSpec`` for the requested mode, or None
        when ``mode`` is none of the three handled values.
    """
    mode_keys = tf.estimator.ModeKeys

    # Network structure: input layer -> hidden ReLU layers -> linear logits.
    net = tf.feature_column.input_layer(features, params["feature_columns"])
    for width in params["hidden_units"]:
        net = tf.layers.dense(net, units=width, activation=tf.nn.relu)
    logits = tf.layers.dense(net, units=params["class_num"], activation=None)

    # Predicted class = index of the largest logit in each row.
    predictions = tf.argmax(logits, axis=1)

    # PREDICT: only the forward pass is needed.
    if mode == mode_keys.PREDICT:
        outputs = {
            "logits": logits,
            "probabilities": tf.nn.softmax(logits),
            "class_ids": predictions[:, tf.newaxis],
        }
        return tf.estimator.EstimatorSpec(mode=mode, predictions=outputs)

    # EVAL: loss plus an accuracy metric.
    loss = tf.losses.sparse_softmax_cross_entropy(logits=logits, labels=labels)
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions, name="acc_op")
    if mode == mode_keys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, eval_metric_ops={"accuracy": accuracy}
        )

    # TRAIN: minimise the loss with Adam, advancing the global step.
    optimizer = tf.train.AdamOptimizer(1e-3)
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
    if mode == mode_keys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

    return None

### 5.2 生成estimator模型

In [7]:
# Directory passed to the estimator as its model_dir.
output_dir = "customized_estimator"
# Create the directory only when it is missing; exist_ok=True avoids a
# separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)

# Custom estimator built from model_fn; params is forwarded to model_fn,
# which reads the three keys below.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=output_dir,
    params={
        "feature_columns": feature_columns,
        "hidden_units": [100, 100],
        "class_num": 2,
    },
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'customized_estimator', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7faed14d17f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


## 6. estimator 训练

In [9]:
def _train_input_fn():
    """Input function for training: shuffled data, 100 epochs, batches of 32."""
    return make_dataset(x_train, y_train, shuffle=True, epochs=100, batch_size=32)


estimator.train(input_fn=_train_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from customized_estimator/model.ckpt-196
Instructions for updating:
Use standard file utilities to get mtimes.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 196 into customized_estimator/model.ckpt.
INFO:tensorflow:loss = 0.45267773, step = 196
INFO:tensorflow:global_step/sec: 308.639
INFO:tensorflow:loss = 0.45816305, step = 296 (0.325 sec)
INFO:tensorflow:global_step/sec: 469.416
INFO:tensorflow:loss = 0.34162676, step = 396 (0.213 sec)
INFO:tensorflow:global_step/sec: 476.16
INFO:tensorflow:loss = 0.4171638, step = 496 (0.210 sec)
INFO:tensorflow:global_step/sec: 472.983
INFO:tensorflow:loss = 0.34072155, step = 596 (0.211 sec)
INFO:tensorflow:global_step/sec: 474.336
INFO:tensorflow:loss = 0.49389106, step = 696 (0.211 sec)

<tensorflow_estimator.python.estimator.estimator.Estimator at 0x7faed16224a8>

## 7. estimator 测试

In [10]:
def _eval_input_fn():
    """Input function for evaluation: unshuffled data, one epoch, batches of 32."""
    return make_dataset(x_eval, y_eval, shuffle=False, epochs=1, batch_size=32)


estimator.evaluate(input_fn=_eval_input_fn)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2020-01-23T13:49:43Z
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from customized_estimator/model.ckpt-2156
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2020-01-23-13:49:44
INFO:tensorflow:Saving dict for global step 2156: accuracy = 0.8068182, global_step = 2156, loss = 0.47885895
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 2156: customized_estimator/model.ckpt-2156


{'accuracy': 0.8068182, 'loss': 0.47885895, 'global_step': 2156}