In [None]:
import tensorflow as tf
# Show which TensorFlow release is installed; later cells use the
# tf.feature_column and tf.estimator APIs from this import.
print(tf.__version__)

In [None]:
import pandas as pd

# Separators used when printing the dataset summaries below.
linesep = '-\n______________________________________________________________\n'
sep = '---'

# Download the Titanic training and evaluation splits as DataFrames.
dftrain = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_eval.csv')

# For each split print a banner, the column index, the first rows and the
# summary statistics; every piece is followed by `linesep`.
print('***** train ******',
      dftrain.columns,
      dftrain.head(),
      dftrain.describe(),
      sep=linesep, end=linesep)
print('***** eval ******',
      dfeval.columns,
      dfeval.head(),
      dfeval.describe(),
      sep=linesep, end=linesep)

# Row counts of the two splits, joined by `sep`.
print(len(dftrain), len(dfeval), sep=sep)



In [None]:
# Histogram of the training split's `age` column, using 20 bins.
dftrain['age'].hist(bins=20)

In [None]:
# Single-letter color codes; also reused by the survival chart further down.
colors = list('bgrcmyk')

# Bar chart of how many training rows carry each `sex` value.
dftrain['sex'].value_counts().plot.bar(color=colors)




In [None]:
# Split the label column off both frames. `pop` removes 'survived' from the
# frame in place, so re-running this cell without reloading the data raises
# a KeyError.
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')

# Re-attach the labels to a temporary frame and chart the mean of `survived`
# for each `sex` value.
# NOTE(review): the plotted value is a mean, not a percentage, unless the
# column is scaled 0-100 -- confirm the '% survive' label is intended.
train_with_labels = pd.concat([dftrain, y_train], axis=1)
survival_by_sex = train_with_labels.groupby('sex').survived.mean()
ax = survival_by_sex.plot.barh(color=colors)
ax.set_xlabel('% survive')

In [None]:
# Shorthand for the feature-column API used by the cells below.
fc = tf.feature_column

# Columns that get one-hot encoded.
CATEGORICAL_COLUMNS = [
    'sex',
    'n_siblings_spouses',
    'parch',
    'class',
    'deck',
    'embark_town',
    'alone',
]
# Columns fed to the model as float32 numeric features.
NUMERIC_COLUMNS = ['age', 'fare']
  
def one_hot_cat_column(feature_name, vocab):
  """Build an indicator column over a vocabulary-list categorical column.

  Args:
    feature_name: Key of the feature the column reads.
    vocab: Values handed to `categorical_column_with_vocabulary_list` as the
      vocabulary.

  Returns:
    The result of wrapping the categorical column in `fc.indicator_column`.
  """
  categorical = fc.categorical_column_with_vocabulary_list(feature_name, vocab)
  return fc.indicator_column(categorical)

feature_columns = []

# Categorical features first: one-hot encode each one, taking the vocabulary
# from the distinct values present in the training frame.
feature_columns.extend(
    one_hot_cat_column(name, dftrain[name].unique())
    for name in CATEGORICAL_COLUMNS)

# Numeric features follow, declared as float32.
feature_columns.extend(
    fc.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS)



In [None]:
# Look at the first training row and at its 'class' value.
example = dftrain.head(1)
print(example)
example_class = example['class'].iloc[0]
print(example_class)

# Indicator column for 'class' with an explicit three-value vocabulary.
class_fc = one_hot_cat_column('class', ('First', 'Second', 'Third'))

print('Feature value: "{}"'.format(example_class))
# NOTE(review): the one-hot preview below is disabled. As written it calls
# `fc`, which is bound to the tf.feature_column module and is not callable.
#print('One-hot encoded: ', fc(dict(example), [class_fc]).numpy())

In [None]:
# The dataset is small, so one batch holds every training example.
NUM_EXAMPLES = y_train.shape[0]

def make_input_fn(X, y, n_epochs=None, shuffle=True):
  """Return a zero-argument function that builds a tf.data.Dataset.

  Args:
    X: Features; converted with `dict(X)` before slicing.
    y: Labels, sliced alongside the features.
    n_epochs: Passed to `Dataset.repeat`. The default None is used for
      training, to cycle through the data as many times as needed.
    shuffle: When true, shuffle with a buffer of NUM_EXAMPLES.

  Returns:
    A function that, when called, returns the dataset batched with a batch
    size of NUM_EXAMPLES (read from the module scope at call time).
  """
  def input_fn():
    ds = tf.data.Dataset.from_tensor_slices((dict(X), y))
    ds = ds.shuffle(NUM_EXAMPLES) if shuffle else ds
    # Repeat first, then emit batches of NUM_EXAMPLES rows; in-memory
    # training does not use smaller batches.
    return ds.repeat(n_epochs).batch(NUM_EXAMPLES)
  return input_fn

# Training input: defaults apply (shuffled, n_epochs=None).
train_input_fn = make_input_fn(X=dftrain, y=y_train)
# Evaluation input: a single unshuffled pass over the eval split.
eval_input_fn = make_input_fn(X=dfeval, y=y_eval, n_epochs=1, shuffle=False)

In [None]:
# Linear classifier over the feature columns built above.
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)

# Fit for at most 100 steps.
linear_est.train(input_fn=train_input_fn, max_steps=100)

# Score on the evaluation split and report accuracy next to the baseline
# accuracy returned in the same metrics dict.
results = linear_est.evaluate(input_fn=eval_input_fn)
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])


In [None]:

# Since the data fits into memory, use the entire dataset per layer; it will
# be faster. The input functions above define one batch as the entire dataset,
# hence n_batches_per_layer=1.
classifier = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=1)

# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
classifier.train(train_input_fn, max_steps=100)

# Eval. The labels are formatted the same way as in the in-memory training
# cell, so the two reports can be compared line by line.
metrics = classifier.evaluate(eval_input_fn)
print('Accuracy : ', metrics['accuracy'])
print('Baseline: ', metrics['accuracy_baseline'])

In [None]:

def make_inmemory_train_input_fn(X, y):
  """Return a zero-argument function yielding the whole data set at once.

  Args:
    X: Features; converted to a plain dict each time the function is called.
    y: Labels, returned unchanged.

  Returns:
    A function that returns the tuple `(dict(X), y)`.
  """
  def input_fn():
    features = dict(X)
    return features, y
  return input_fn


# Note: this rebinds `train_input_fn`, replacing the Dataset-based input
# function defined earlier with the in-memory variant.
train_input_fn = make_inmemory_train_input_fn(X=dftrain, y=y_train)
eval_input_fn = make_input_fn(X=dfeval, y=y_eval, n_epochs=1, shuffle=False)

# Same boosted-trees setup as before, with in-memory training switched on.
classifier = tf.estimator.BoostedTreesClassifier(
    feature_columns=feature_columns,
    n_batches_per_layer=1,
    train_in_memory=True)

classifier.train(input_fn=train_input_fn, max_steps=100)

# Score on the evaluation split.
metrics = classifier.evaluate(input_fn=eval_input_fn)
print('Accuracy : ', metrics['accuracy'])
print('Baseline: ', metrics['accuracy_baseline'])


In [None]:
# Materialize the predictions for the evaluation split.
pred_dicts = list(classifier.predict(input_fn=eval_input_fn))
# Keep the entry at index 1 of each prediction's 'probabilities' array.
probs = pd.Series([entry['probabilities'][1] for entry in pred_dicts])

probs.plot.hist(bins=20, title='predicted probabilities');

In [None]:

from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

# ROC points for the predicted probabilities against the evaluation labels;
# the third return value of roc_curve is discarded.
fpr, tpr, _ = roc_curve(y_eval, probs)

plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
# Pin both axes to start at 0 and leave the upper limits automatic.
plt.xlim(left=0)
plt.ylim(bottom=0);