## 加载数据

In [2]:
# Install the nightly GPU build of TensorFlow; -q keeps pip's output quiet.
# NOTE(review): the output saved with this cell is a traceback raised inside
# pip while it was reading the download, so the install appears not to have
# completed on the recorded run -- confirm it succeeds before running the
# cells that import tensorflow.
!pip install -q tf-nightly-gpu

ERROR: Exception:
Traceback (most recent call last):
  File "d:\python\python36\lib\site-packages\pip\_vendor\urllib3\response.py", line 397, in _error_catcher
    yield
  File "d:\python\python36\lib\site-packages\pip\_vendor\urllib3\response.py", line 479, in read
    data = self._fp.read(amt)
  File "d:\python\python36\lib\site-packages\pip\_vendor\cachecontrol\filewrapper.py", line 62, in read
    data = self.__fp.read(amt)
  File "d:\python\python36\lib\http\client.py", line 449, in read
    n = self.readinto(b)
  File "d:\python\python36\lib\http\client.py", line 493, in readinto
    n = self.fp.readinto(b)
  File "d:\python\python36\lib\socket.py", line 586, in readinto
    return self._sock.recv_into(b)
  File "d:\python\python36\lib\ssl.py", line 1009, in recv_into
    return self.read(nbytes, buffer)
  File "d:\python\python36\lib\ssl.py", line 871, in read
    return self._sslobj.read(len, buffer)
  File "d:\python\python36\lib\ssl.py", line 631, in read
    v = self._sslobj

In [None]:
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve

# Switch TensorFlow to eager execution. A later cell calls `.numpy()` directly
# on the tensor returned by `fc.input_layer`, which relies on eager mode.
tf.enable_eager_execution()


In [None]:
# Only surface TensorFlow log messages at ERROR level, and fix TensorFlow's
# random seed to a constant value.
tf.logging.set_verbosity(tf.logging.ERROR)
tf.set_random_seed(123)

# Training split: read the CSV, then detach the 'survived' label column so the
# frame is left holding features only.
dftrain = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_train.csv')
y_train = dftrain.pop('survived')

# Evaluation split, handled the same way.
dfeval = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_eval.csv')
y_eval = dfeval.pop('survived')

## 预览数据

In [None]:
# Show the first rows of the training features (the 'survived' label column
# was already popped off into y_train).
dftrain.head()

查看数据的统计摘要

In [None]:
# Summary statistics of the training features.
dftrain.describe()

对单个特征逐一进行分析预览

In [None]:
# Distribution of passenger age, in 20 bins.
dftrain['age'].hist(bins=20);

In [None]:
# Number of training rows per value of 'sex', as horizontal bars.
dftrain['sex'].value_counts().plot.barh();

In [None]:
# Number of training rows per value of 'class', as horizontal bars.
dftrain['class'].value_counts().plot.barh();

In [None]:
# Number of training rows per value of 'embark_town', as horizontal bars.
dftrain['embark_town'].value_counts().plot.barh();

In [None]:
# Re-attach the label column to the features, then plot the mean of
# 'survived' within each 'sex' group as horizontal bars.
ax = (
    pd.concat([dftrain, y_train], axis=1)
    .groupby('sex')['survived']
    .mean()
    .plot.barh()
)
ax.set_xlabel('% survive');

## 创建特征列

In [None]:
# Short alias for the feature-column module used throughout the notebook.
fc = tf.feature_column

# Features treated as categories; they are one-hot encoded when the feature
# columns are built.
CATEGORICAL_COLUMNS = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]

# Features used directly as numeric values.
NUMERIC_COLUMNS = [
    'age',
    'fare',
]
  
def one_hot_cat_column(feature_name, vocab):
  """Build an indicator (one-hot) feature column for a categorical feature.

  Args:
    feature_name: Key of the feature the column reads.
    vocab: Collection of category values making up the vocabulary.

  Returns:
    The vocabulary-list categorical column for `feature_name`, wrapped in
    `fc.indicator_column`.
  """
  categorical = fc.categorical_column_with_vocabulary_list(feature_name, vocab)
  return fc.indicator_column(categorical)
# Categorical features are one-hot encoded, using the distinct values found in
# the training data as each feature's vocabulary.
feature_columns = [
    one_hot_cat_column(name, dftrain[name].unique())
    for name in CATEGORICAL_COLUMNS
]

# Numeric features become float32 numeric columns.
feature_columns.extend(
    fc.numeric_column(name, dtype=tf.float32) for name in NUMERIC_COLUMNS)

In [None]:
# 独热码（one-hot 编码）的转换效果 / effect of the one-hot encoding

In [None]:
# Take the first training row and push its 'class' value through a one-hot
# column built from a three-value vocabulary.
example = dftrain.head(1)
class_fc = one_hot_cat_column('class', ('First', 'Second', 'Third'))
print('Feature value: "{}"'.format(example['class'].iloc[0]))
print(
    'One-hot encoded: ',
    fc.input_layer(features=dict(example), feature_columns=[class_fc]).numpy())

## 创建输入函数

In [None]:
# Number of training examples. The input function below uses this value as
# both the shuffle buffer size and the batch size, so one batch can hold the
# entire (small) dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
  """Create an input function that yields (features, labels) batches.

  Args:
    X: Mapping-like collection of feature columns; converted with `dict(X)`
      each time the returned function is called.
    y: Labels aligned with the rows of `X`.
    n_epochs: Value passed to `Dataset.repeat`; the default of None is the
      setting used for training.
    shuffle: When true, shuffle with a buffer of NUM_EXAMPLES elements.

  Returns:
    A zero-argument function that builds and returns the dataset.
  """
  def input_fn():
    ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
    ds = ds.shuffle(NUM_EXAMPLES) if shuffle else ds
    # Repeat for the requested number of epochs, then emit batches of
    # NUM_EXAMPLES elements.
    return ds.repeat(n_epochs).batch(NUM_EXAMPLES)
  return input_fn

# Training input: shuffled and repeated with the defaults (n_epochs=None).
train_input_fn = make_input_fn(dftrain, y_train)
# Evaluation input: a single unshuffled pass over the evaluation split.
eval_input_fn = make_input_fn(dfeval, y_eval, n_epochs=1, shuffle=False)

## 训练与评估模型
操作步骤：
 - 初始化模型，指定特征与超参数
 - 将训练数据喂给模型，开始训练模型
 - 使用评估集评估模型性能

先训练并评估一个线性分类模型作为基准参考，稍后将其与提升树模型做对比

In [None]:
# Baseline: a linear classifier over the same feature columns.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

# Fit for at most 100 steps.
linear_est.train(input_fn=train_input_fn, max_steps=100)

# Score on the evaluation split and report accuracy next to the baseline
# accuracy metric.
results = linear_est.evaluate(input_fn=eval_input_fn)
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])

评估提升树模型

In [None]:
# The input functions emit batches of NUM_EXAMPLES elements, i.e. the whole
# dataset at once, and the data fits in memory, so each tree layer is built
# from a single batch.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(
    feature_columns=feature_columns,
    n_batches_per_layer=n_batches)

# Training ends once the configured number of trees has been built, rather
# than being governed by the step count.
est.train(input_fn=train_input_fn, max_steps=100)

# Score on the evaluation split and report accuracy next to the baseline
# accuracy metric.
results = est.evaluate(input_fn=eval_input_fn)
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])

出于性能方面的考虑，当数据可以完全放入内存时，推荐使用 `boosted_trees_classifier_train_in_memory` 函数。
使用此函数时，不应该对输入数据进行批处理，因为它直接对整个数据集进行操作。

In [None]:
def make_inmemory_train_input_fn(X, y):
  """Create an input function that returns the whole dataset unbatched.

  Args:
    X: Mapping-like collection of feature columns; converted with `dict(X)`
      each time the returned function is called.
    y: Labels, returned as-is.

  Returns:
    A zero-argument function returning the tuple `(dict(X), y)`.
  """
  def input_fn():
    features = dict(X)
    labels = y
    return features, labels
  return input_fn


# Training data is handed over whole (no batching); evaluation still goes
# through the batched input function.
train_input_fn = make_inmemory_train_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, n_epochs=1, shuffle=False)

est = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
    train_input_fn=train_input_fn,
    feature_columns=feature_columns)
print(est.evaluate(input_fn=eval_input_fn)['accuracy'])

TensorFlow模型经过优化，可以同时对一批或多个示例进行预测。

In [None]:
# Collect the predictions for the evaluation split, then keep the entry at
# index 1 of each prediction's 'probabilities'.
pred_dicts = list(est.predict(input_fn=eval_input_fn))
probs = pd.Series([p['probabilities'][1] for p in pred_dicts])

probs.plot.hist(bins=20, title='predicted probabilities');

最后，还可以查看结果的受试者工作特征（ROC）曲线，它能帮助我们更好地了解真阳性率与假阳性率之间的权衡。

In [None]:
# `roc_curve` and `plt` are imported in the notebook's top import cell, so a
# missing dependency is reported up front rather than in the last cell.

# ROC points for the predicted probabilities against the evaluation labels.
# The third return value is bound to a real name: assigning to `_` in a
# notebook would overwrite IPython's last-output variable.
fpr, tpr, thresholds = roc_curve(y_eval, probs)

plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
# Start both axes at 0 and leave the upper limits automatic.
plt.xlim(0,)
plt.ylim(0,);