# Download data from GCS

In [7]:
from google.cloud import storage
import pandas as pd
from io import StringIO

# Open the GCS bucket that holds train_df.csv and test_df.csv.
storage_client = storage.Client()
bucket = storage_client.get_bucket('caip_notebooks_demo_temp')

# Training split: fetch the object's bytes, decode as UTF-8, parse in memory.
blob = storage.Blob("train_df.csv", bucket)
train_csv_text = blob.download_as_string().decode('utf-8')
train_df = pd.read_csv(StringIO(train_csv_text))

# Test split: same steps against the second object.
blob = storage.Blob("test_df.csv", bucket)
test_csv_text = blob.download_as_string().decode('utf-8')
test_df = pd.read_csv(StringIO(test_csv_text))

## Model
### Input functions

The [Estimator framework](https://www.tensorflow.org/get_started/premade_estimators#overview_of_programming_with_estimators) provides [input functions](https://www.tensorflow.org/api_docs/python/tf/estimator/inputs/pandas_input_fn) that wrap Pandas dataframes.

In [8]:
from absl import logging

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
# Feeds the whole training frame in shuffled order; num_epochs=None puts no
# limit on the number of passes over the data.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_df,
    y=train_df["polarity"],
    num_epochs=None,
    shuffle=True,
)

# Feeds the whole training frame in its original order, for prediction.
predict_train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_df,
    y=train_df["polarity"],
    shuffle=False,
)

# Feeds the test frame in its original order, for prediction.
predict_test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=test_df,
    y=test_df["polarity"],
    shuffle=False,
)

### Feature columns

TF-Hub provides a [feature column](https://github.com/tensorflow/hub/blob/master/docs/api_docs/python/hub/text_embedding_column.md) that applies a module to the given text feature and passes the module's outputs on to the rest of the model. In this tutorial we will be using the [nnlm-en-dim128 module](https://tfhub.dev/google/nnlm-en-dim128/1). For the purpose of this tutorial, the most important facts are:

* The module takes **a batch of sentences in a 1-D tensor of strings** as input.
* The module is responsible for **preprocessing of sentences** (e.g. removal of punctuation and splitting on spaces).
* The module works with any input (e.g. **nnlm-en-dim128** hashes words not present in the vocabulary into ~20,000 buckets).

In [9]:
# Text embedding column: applies the TF-Hub module below to the "sentence"
# feature.
nnlm_module_url = "https://tfhub.dev/google/nnlm-en-dim128/1"
embedded_text_feature_column = hub.text_embedding_column(
    key="sentence", module_spec=nnlm_module_url)

### Estimator

For classification we can use a [DNN Classifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNClassifier) (note further remarks about different modelling of the label function at the end of the tutorial).

In [10]:
# Two-class DNN classifier over the embedded text feature: hidden layers of
# 500 and 100 units, optimized with Adagrad at a learning rate of 0.003.
adagrad = tf.train.AdagradOptimizer(learning_rate=0.003)
estimator = tf.estimator.DNNClassifier(
    feature_columns=[embedded_text_feature_column],
    hidden_units=[500, 100],
    n_classes=2,
    optimizer=adagrad,
)

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.






INFO:tensorflow:Using config: {'_experimental_distribute': None, '_global_id_in_cluster': 0, '_train_distribute': None, '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_is_chief': True, '_service': None, '_protocol': None, '_device_fn': None, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_experimental_max_worker_delay_secs': None, '_eval_distribute': None, '_task_type': 'worker', '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc3a99e94a8>, '_evaluation_master': '', '_task_id': 0, '_model_dir': '/tmp/tmpm4yf25wz', '_master': '', '_session_creation_timeout_secs': 7200}


INFO:tensorflow:Using config: {'_experimental_distribute': None, '_global_id_in_cluster': 0, '_train_distribute': None, '_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_is_chief': True, '_service': None, '_protocol': None, '_device_fn': None, '_save_checkpoints_secs': 600, '_num_worker_replicas': 1, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_experimental_max_worker_delay_secs': None, '_eval_distribute': None, '_task_type': 'worker', '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_tf_random_seed': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fc3a99e94a8>, '_evaluation_master': '', '_task_id': 0, '_model_dir': '/tmp/tmpm4yf25wz', '_master': '', '_session_creation_timeout_secs': 7200}


### Training

Train the estimator for a fixed number of steps (the cell below runs only 2 steps, enough for a quick demo run).

In [11]:
# Number of training steps. Only 2 steps are run here, which is a quick run
# rather than a full training. For comparison, 1,000 steps would mean 128,000
# training examples with the default batch size, roughly 5 epochs since the
# training dataset contains 25,000 examples.
TRAIN_STEPS = 2
estimator.train(input_fn=train_input_fn, steps=TRAIN_STEPS);

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpm4yf25wz/model.ckpt.


INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmpm4yf25wz/model.ckpt.


INFO:tensorflow:loss = 88.42821, step = 1


INFO:tensorflow:loss = 88.42821, step = 1


INFO:tensorflow:Saving checkpoints for 2 into /tmp/tmpm4yf25wz/model.ckpt.


INFO:tensorflow:Saving checkpoints for 2 into /tmp/tmpm4yf25wz/model.ckpt.


INFO:tensorflow:Loss for final step: 87.030464.


INFO:tensorflow:Loss for final step: 87.030464.


# Export artifacts

In [19]:
def serving_input_receiver_fn():
  """Build the serving-time input receiver used when exporting the model.

  Creates a string placeholder with an unspecified batch dimension and
  exposes it under the "sentence" key. The same mapping is used both as the
  features handed to the model and as the receiver tensors fed by callers.

  Returns:
    A tf.estimator.export.ServingInputReceiver.
  """
  sentences = tf.placeholder(dtype=tf.string, shape=[None])
  inputs = {"sentence": sentences}
  return tf.estimator.export.ServingInputReceiver(
      features=inputs, receiver_tensors=inputs)

# Export the trained estimator as a SavedModel under the local 'saved_model'
# directory, using serving_input_receiver_fn to define the serving inputs.
estimator.export_savedmodel('saved_model', serving_input_receiver_fn)
# Removing old model (just in case): recursively delete whatever is currently
# stored under the models/saved_model/ prefix in the bucket.
!gsutil -m rm -rf "gs://caip_notebooks_demo_temp/models/saved_model/"
# Uploading latest model: copy everything under ./saved_model/ to the bucket.
# NOTE(review): the local ./saved_model directory is not cleared by this cell,
# so exports left over from earlier runs would be uploaded as well. Confirm
# whether that is intended.
!gsutil -m cp -r "./saved_model/*" "gs://caip_notebooks_demo_temp/models/saved_model/"

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']


INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Train: None


INFO:tensorflow:Signatures INCLUDED in export for Regress: ['regression']


INFO:tensorflow:Signatures INCLUDED in export for Regress: ['regression']


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Eval: None


INFO:tensorflow:Signatures INCLUDED in export for Classify: ['classification', 'serving_default']


INFO:tensorflow:Signatures INCLUDED in export for Classify: ['classification', 'serving_default']


INFO:tensorflow:Restoring parameters from /tmp/tmpm4yf25wz/model.ckpt-2


INFO:tensorflow:Restoring parameters from /tmp/tmpm4yf25wz/model.ckpt-2


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets added to graph.


INFO:tensorflow:Assets written to: saved_model/temp-b'1576611704'/assets


INFO:tensorflow:Assets written to: saved_model/temp-b'1576611704'/assets


INFO:tensorflow:SavedModel written to: saved_model/temp-b'1576611704'/saved_model.pb


INFO:tensorflow:SavedModel written to: saved_model/temp-b'1576611704'/saved_model.pb


Removing gs://caip_notebooks_demo_temp/models/saved_model/1576606089/assets/tokens.txt#1576611676457268...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576606089/saved_model.pb#1576611676135790...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576606089/variables/variables.data-00000-of-00002#1576611675965270...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576606089/variables/variables.data-00001-of-00002#1576611686243030...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576606089/variables/variables.index#1576611675723877...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576608066/assets/tokens.txt#1576611676168166...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576611381/assets/tokens.txt#1576611677264107...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576611363/assets/tokens.txt#1576611676571906...
Removing gs://caip_notebooks_demo_temp/models/saved_model/1576608066/saved_model.pb#15766116757