In [1]:
import warnings
# Suppress every Python warning for the rest of the session; this also hides
# deprecation notices raised by the libraries imported below.
warnings.filterwarnings("ignore")

In [2]:
import functools
import io
import re
import glob
import tempfile
import math

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.python.client import device_lib

import tensorflow_transform as tft
from tensorflow_transform.beam import impl as beam_impl
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

import apache_beam as beam
from apache_beam.io import tfrecordio

from scipy.stats import truncnorm

from sklearn.model_selection import ParameterGrid

  from ._random import sample_without_replacement


In [3]:
# Only log TensorFlow errors, and pin the TensorFlow and NumPy RNGs to one seed.
SEED = 1
tf.logging.set_verbosity(tf.logging.ERROR)
tf.set_random_seed(SEED)
np.random.seed(SEED)

In [4]:
def get_available_gpus():
    """Return the names of the local devices whose device_type is 'GPU'."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

get_available_gpus()

[u'/device:GPU:0']

### Download data

In [5]:
#!gsutil cp gs://djr-data/movie-reviews/aclImdb_v1.tar.gz .
#!tar -xzf aclImdb_v1.tar.gz && rm aclImdb_v1.tar.gz

In [6]:
#!gsutil cp gs://djr-data/movie-reviews/glove.twitter.27B.zip .
#!unzip glove.twitter.27B.zip -d glove && rm glove.twitter.27B.zip

### Load data into TFRecords

NOTE: RE2, the regex engine used by `tf.regex_replace(...)`, does not support constructs for which only backtracking solutions are known to exist, so backreferences and look-around assertions are unavailable. The proper-noun preprocessing below relies on a look-ahead, so it cannot be expressed with `tf.regex_replace(...)`; it has to run in plain Python, outside the serving input function.

In [7]:
# this pulls out our proper nouns and treats them as single words
def proper_preprocessing(review):
    """Fuse each proper-noun phrase in ``review`` into one ``~``-joined token.

    Three substitutions are applied in order:
      1. every phrase matched by the proper-noun pattern is wrapped in [[ ]];
      2. each run of dots/whitespace that is followed, with no bracket in
         between, by a closing ]] is replaced with a single ~;
      3. the [[ ]] wrappers are removed again.

    Returns the rewritten review string.
    """
    substitutions = (
        (r"([A-Z]([a-z]+|\.)(?:\s+[A-Z]([a-z]+|\.))*(?:\s+[a-z][a-z\-]+){0,2}\s+[A-Z]([a-z]+|\.)(?:\s+([0-9]+(?:,[0-9]+)?))?)", '[[\\1]]'),
        (r"[\.\s]+(?=[^\[\]]*]])", '~'),
        (r"(?:[\[]{2})(.*?)(?:[\]]{2})", '\\1'),
    )

    for pattern, replacement in substitutions:
        review = re.sub(pattern, replacement, review)
    return review

In [8]:
def load_data(g, out):
    """Serialize every review file matching glob ``g`` into the TFRecord file ``out``.

    Each record is a ``tf.train.Example`` with two features:
      * ``review``: the file contents after ``proper_preprocessing``;
      * ``label``: 1 when the file's parent directory is named ``pos``, else 0.

    Files are written in shuffled order (NumPy global RNG).

    Args:
        g: glob pattern selecting the review text files.
        out: path of the TFRecord file to write; its parent directory is
            created when it does not exist yet.
    """
    import os  # local import: only this helper needs it

    inputs = glob.glob(g)
    np.random.shuffle(inputs)

    # The writer is given a path inside ``data/``; make sure that directory exists
    # so the cell also works on a fresh checkout.
    out_dir = os.path.dirname(out)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    with tf.python_io.TFRecordWriter(out) as writer:
        for path in inputs:
            # Take the sentiment from the immediate parent directory instead of a
            # fixed position in the path, so a different glob prefix (e.g. an
            # absolute path) does not silently label every review as negative.
            # Note that ``[posneg]*`` in the patterns below is a character class:
            # it matches any directory starting with one of p/o/s/n/e/g, and
            # everything that is not ``pos`` gets label 0.
            sentiment_dir = os.path.basename(os.path.dirname(path))
            label = 1 if sentiment_dir == 'pos' else 0
            with open(path, 'r') as f:
                review = f.read()

            example = tf.train.Example()
            example.features.feature['review'].bytes_list.value.append(proper_preprocessing(review))
            example.features.feature['label'].int64_list.value.append(label)

            writer.write(example.SerializeToString())

load_data('aclImdb/train/[posneg]*/*.txt', 'data/train.tfrecord')
load_data('aclImdb/test/[posneg]*/*.txt', 'data/test.tfrecord')

### Use TFT to preprocess data

In [9]:
# Feature spec and TFT metadata for the raw examples: one string review and
# one int64 label per record.
RAW_DATA_FEATURE = {}
RAW_DATA_FEATURE['review'] = tf.FixedLenFeature(shape=[1], dtype=tf.string)
RAW_DATA_FEATURE['label'] = tf.FixedLenFeature(shape=[1], dtype=tf.int64)

raw_data_schema = dataset_schema.from_feature_spec(RAW_DATA_FEATURE)
RAW_DATA_METADATA = dataset_metadata.DatasetMetadata(raw_data_schema)

In [10]:
# Delete the transform function and transformed metadata under tft_output, if present.
!rm -Rf tft_output/transform_fn tft_output/transformed_metadata

In [11]:
# train our tft transformer
# Fits the TFT preprocessing on the training records, applies the fitted
# transform to both splits, and writes the transformed records plus the
# transform function to disk.
with beam.Pipeline() as pipeline:
    # NOTE(review): the scratch directory returned by mkdtemp() is not removed
    # anywhere in this cell.
    with beam_impl.Context(temp_dir=tempfile.mkdtemp()):
        # Decodes serialized tf.Example records according to the raw schema.
        coder = tft.coders.ExampleProtoCoder(RAW_DATA_METADATA.schema)

        train_data = (
            pipeline
            | 'ReadTrain' >> tfrecordio.ReadFromTFRecord('data/train.tfrecord')
            | 'DecodeTrain' >> beam.Map(coder.decode))

        test_data = (
            pipeline
            | 'ReadTest' >> tfrecordio.ReadFromTFRecord('data/test.tfrecord')
            | 'DecodeTest' >> beam.Map(coder.decode))

        
        # remove links, tags, quotes, apostrophes
        # (proper nouns were already fused into single tokens by
        # proper_preprocessing when the raw TFRecords were written)
        # then pad punctuation with spaces, lowercase, split on spaces, and map
        # each term to a vocabulary index
        def preprocessing_fn(inputs):
            """Turn raw ``review`` strings into terms and vocabulary indices.

            Returns a dict with ``terms`` (the space-split tokens),
            ``terms_indices`` (their vocabulary ids) and the untouched ``label``.
            """
            # Alternation of everything to delete: URLs, <...> tags, single and double quotes.
            remove = '|'.join(["https?:\/\/(www\.)?([^\s]*)", "<([^>]+)>", "\'", "\""])
            # The capture group sits inside the `+`, so a run of punctuation is
            # replaced by one captured character (presumably the last one in the
            # run -- confirm against RE2 semantics).
            punctuation = r"([.,;!?\(\)\/])+"
            
            # Flatten to a 1-D batch of strings.
            reviews = tf.reshape(inputs['review'], [-1])
            
            reviews = tf.regex_replace(reviews, remove, '')
            # Surround punctuation with spaces, then collapse whitespace runs to one space.
            reviews = tf.regex_replace(tf.regex_replace(reviews, punctuation, ' \\1 '), r"\s+", ' ')
            
            # Lowercase A-Z with one regex_replace op per letter.
            for letter in list('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
                reviews = tf.regex_replace(reviews, letter, letter.lower())
                
            terms = tf.string_split(reviews, ' ')
            # NOTE(review): with frequency_threshold=5 and num_oov_buckets=1, terms
            # seen fewer than 5 times are presumably left out of the vocabulary and
            # mapped to the single out-of-vocabulary id rather than removed -- confirm
            # against the TFT documentation.
            terms_indices = tft.compute_and_apply_vocabulary(terms, frequency_threshold=5, num_oov_buckets=1, vocab_filename='vocab')
            
            return {
                'terms': terms,
                'terms_indices': terms_indices,
                'label': inputs['label']
            }

        
        # Analyze (fit) on the training set only, and transform it in the same step.
        (transformed_train_data, transformed_metadata), transform_fn = (
            (train_data, RAW_DATA_METADATA)
            | 'AnalyzeAndTransform' >> beam_impl.AnalyzeAndTransformDataset(preprocessing_fn))

        # Apply the transform fitted on the training set to the test set.
        transformed_test_data, _ = (
            ((test_data, RAW_DATA_METADATA), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())
        
        # Encodes transformed rows back into serialized tf.Example records.
        transformed_data_coder = tft.coders.ExampleProtoCoder(transformed_metadata.schema)

        _ = (
            transformed_train_data
            | 'EncodeTrain' >> beam.Map(transformed_data_coder.encode)
            | 'WriteTrain' >> tfrecordio.WriteToTFRecord('data/train_transformed.tfrecord'))

        _ = (
            transformed_test_data
            | 'EncodeTest' >> beam.Map(transformed_data_coder.encode)
            | 'WriteTest' >> tfrecordio.WriteToTFRecord('data/test_transformed.tfrecord'))
        
        # Persist the transform function so it can be reloaded from 'tft_output' later.
        _ = (
            transform_fn
            | 'WriteTransformFn' >> transform_fn_io.WriteTransformFn('tft_output'))



### Initialize word embeddings with [GloVe](https://nlp.stanford.edu/projects/glove/)

In [12]:
# get vocabulary
# Reads back the vocabulary stored under the name 'vocab' in the 'tft_output' directory.
tf_transform_output = tft.TFTransformOutput('tft_output')
vocab = tf_transform_output.vocabulary_by_name('vocab')
# Number of vocabulary entries.
# NOTE(review): presumably this excludes the out-of-vocabulary bucket -- confirm.
vocab_size = len(vocab)

In [13]:
# load glove embeddings
embedding_size = 100
glove_path = 'glove/glove.twitter.27B.{}d.txt'.format(embedding_size)

# word -> float32 vector; each line is a word followed by its coefficients
glove_embeddings = {}
with open(glove_path, mode='r') as glove_file:
    for glove_line in glove_file:
        fields = glove_line.strip().split()
        glove_embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')

In [14]:
# create initialized embedding matrix
# Start every row (vocabulary entries plus one extra row) from a truncated
# normal on [-2, 2]; rows whose word has a GloVe vector are overwritten below.
embedding_matrix = truncnorm.rvs(a=-2, b=2, size=(vocab_size+1, embedding_size))

# Global mean / standard deviation over all GloVe coefficients.
glove_np = pd.DataFrame(glove_embeddings).values
glove_mu = np.mean(glove_np)
glove_std = np.std(glove_np)

for row, word in enumerate(vocab):
    # Words without a GloVe vector keep their random initialization.
    if word in glove_embeddings:
        standardized = (glove_embeddings[word] - glove_mu)/glove_std
        embedding_matrix[row] = np.clip(standardized, -2, 2)

# Scale all rows by 1/sqrt(embedding_size).
embedding_matrix = embedding_matrix / math.sqrt(embedding_size)
    
def embedding_initializer(shape=None, dtype=tf.float32, partition_info=None):
    """Variable initializer that returns the precomputed ``embedding_matrix``.

    Args:
        shape: requested variable shape. When given, it must equal the shape of
            ``embedding_matrix``; previously it was ignored, so a size mismatch
            between the model and the matrix went undetected here.
        dtype: requested dtype; must be ``tf.float32``.
        partition_info: accepted for the initializer calling convention; unused.

    Returns:
        The module-level ``embedding_matrix`` array.

    Raises:
        AssertionError: if ``dtype`` is not ``tf.float32``.
        ValueError: if ``shape`` is given and differs from ``embedding_matrix.shape``.
    """
    assert dtype is tf.float32
    if shape is not None and tuple(shape) != embedding_matrix.shape:
        raise ValueError(
            'embedding_initializer was asked for shape {} but embedding_matrix has shape {}'.format(
                tuple(shape), embedding_matrix.shape))
    return embedding_matrix

### Build classifier

In [15]:
# input function
feature_spec = tf_transform_output.transformed_feature_spec()

def input_fn(input_file_pattern, num_epochs=None, batch_size=25, shuffle=True):
    """Build a (features, labels) input pipeline over transformed TFRecord files.

    Args:
        input_file_pattern: glob pattern of the TFRecord files to read.
        num_epochs: value passed to ``Dataset.repeat``.
        batch_size: number of examples per batch.
        shuffle: when true, shuffle examples with a buffer of 100000 before batching.

    Returns:
        A tuple of the parsed feature dict (without 'label') and the 'label' tensor.
    """
    def parse_example(serialized):
        return tf.parse_single_example(serialized, feature_spec)

    dataset = tf.data.TFRecordDataset(glob.glob(input_file_pattern)).map(parse_example)

    if shuffle:
        dataset = dataset.shuffle(100000)

    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)

    batch = dataset.make_one_shot_iterator().get_next()
    batch_labels = batch.pop('label')
    return batch, batch_labels

In [16]:
# create estimator spec
def make_model(features, labels, mode, params, config):
    """Estimator ``model_fn`` for the convolutional review classifier.

    Graph: embedded term ids -> dropout -> 1-D convolution (kernel size 3) ->
    max over axis 1 -> dense -> dropout -> 2-way logits.

    Args:
        features: dict holding the sparse ``terms_indices`` feature.
        labels: class labels; not used in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict with ``dropout``, ``conv_filters``, ``dense_units`` and
            ``learning_start``.
        config: not used by this function.

    Returns:
        A ``tf.estimator.EstimatorSpec`` for the requested mode.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    # hyperparameters
    drop_rate = params['dropout']
    n_filters = params['conv_filters']
    n_dense = params['dense_units']
    initial_lr = params['learning_start']

    # Densify the sparse ids, filling missing positions with index vocab_size.
    # NOTE(review): vocab_size also looks like the id of the single OOV bucket
    # produced by the transform, so padding and OOV terms would share one
    # embedding row -- confirm this is intended.
    sparse_ids = features['terms_indices']
    dense_ids = tf.sparse_to_dense(sparse_ids.indices, sparse_ids.dense_shape, sparse_ids.values, default_value=vocab_size)
    embedded = tf.contrib.layers.embed_sequence(dense_ids, vocab_size=vocab_size+1, embed_dim=embedding_size, initializer=embedding_initializer)

    # network body; dropout is only active while training
    hidden = tf.layers.dropout(embedded, rate=drop_rate, training=is_training)
    hidden = tf.layers.conv1d(inputs=hidden, filters=n_filters, kernel_size=3, strides=1, activation=tf.nn.leaky_relu)
    hidden = tf.reduce_max(input_tensor=hidden, axis=1)
    hidden = tf.layers.dense(hidden, units=n_dense, activation=tf.nn.leaky_relu)
    hidden = tf.layers.dropout(hidden, rate=drop_rate, training=is_training)
    logits = tf.layers.dense(hidden, 2)

    predicted_classes = tf.argmax(logits, 1)
    predicted_probs = tf.nn.softmax(logits)

    # PREDICT: expose class and probabilities, also as the 'predict' export signature
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'class': predicted_classes, 'prob': predicted_probs}
        return tf.estimator.EstimatorSpec(
            mode,
            predictions=predictions,
            export_outputs={'predict': tf.estimator.export.PredictOutput(outputs=predictions)})

    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)

    # TRAIN: Adagrad with a cosine-decayed learning rate and gradient-norm clipping at 5.0
    if is_training:
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.cosine_decay(learning_rate=initial_lr, global_step=global_step, alpha=0.05, decay_steps=5000)
        optimizer = tf.contrib.estimator.clip_gradients_by_norm(
            tf.train.AdagradOptimizer(learning_rate=learning_rate), 5.0)
        return tf.estimator.EstimatorSpec(
            mode, loss=loss, train_op=optimizer.minimize(loss, global_step=global_step))

    # EVAL: accuracy of the predicted class and AUC of the class-1 probability
    return tf.estimator.EstimatorSpec(
        mode,
        loss=loss,
        eval_metric_ops={
            'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes),
            'auc': tf.metrics.auc(labels=labels, predictions=predicted_probs[:, 1])
        })

### Train classifiers for each set of hyperparameters

In [17]:
# make hyperparameter grid
# 2 * 3 * 3 * 3 = 54 hyperparameter combinations
search_space = {
    'dropout': [0.2, 0.3],
    'conv_filters': [50, 100, 200],
    'dense_units': [50, 100, 200],
    'learning_start': [0.05, 0.1, 0.2]
}

param_grid = list(ParameterGrid(search_space))

In [18]:
# train all our models
def _train_input_fn():
    """Five shuffled passes over the transformed training records."""
    return input_fn('data/train_transformed.tfrecord*', num_epochs=5)


def _test_input_fn():
    """One pass over the transformed test records."""
    return input_fn('data/test_transformed.tfrecord*', num_epochs=1)


# One estimator and one evaluation result per hyperparameter combination,
# kept in the same order as param_grid.
classifiers, classifiers_stats = [], []

for p in param_grid:
    run_config = tf.estimator.RunConfig(session_config=tf.ConfigProto(log_device_placement=True))
    classifier = tf.estimator.Estimator(model_fn=make_model, params=p, config=run_config)

    classifier.train(input_fn=_train_input_fn)
    classifier_stats = classifier.evaluate(input_fn=_test_input_fn)

    classifiers.append(classifier)
    classifiers_stats.append(classifier_stats)

### Evaluate and select best classifier

In [19]:
# select best hyperparameters
# Rank all runs by test AUC, best first. sorted() is used instead of
# zip(...).sort() because zip() returns an iterator without .sort() on
# Python 3; the result is the same list on Python 2.
results = sorted(zip(param_grid, classifiers, classifiers_stats), key=lambda x: -x[2]['auc'])
best_params, best_classifier, best_classifier_stats = results[0]
best_params

{'conv_filters': 200,
 'dense_units': 200,
 'dropout': 0.3,
 'learning_start': 0.1}

In [20]:
# overall stats
def _evaluate_split(file_pattern, column):
    """Evaluate best_classifier on one pass over the files; return the metrics as a one-column DataFrame."""
    metrics = best_classifier.evaluate(input_fn=lambda: input_fn(file_pattern, num_epochs=1))
    return pd.DataFrame.from_dict(metrics, orient='index', columns=[column])


train_stats = _evaluate_split('data/train_transformed.tfrecord*', 'train')
test_stats = _evaluate_split('data/test_transformed.tfrecord*', 'test')

# Side-by-side table: one row per metric, one column per split.
stats = train_stats.join(test_stats)
stats

Unnamed: 0,train,test
loss,0.12346,0.24877
auc,0.99162,0.96591
global_step,5000.0,5000.0
accuracy,0.95612,0.90092


### Export

In [21]:
def serving_input_fn():
    """Build the serving receiver: raw review strings in, TFT-transformed features out."""
    review = tf.placeholder(dtype=tf.string)
    # The raw feature dict passed to the transform also carries a label, so
    # supply a constant zero as a stand-in.
    label = tf.zeros(dtype=tf.int64, shape=[1, 1])

    raw_features = {'review': review, 'label': label}
    receiver_tensors = {'review': review}

    return tf.estimator.export.ServingInputReceiver(
        tf_transform_output.transform_raw_features(raw_features),
        receiver_tensors)


# Export the best model under 'exports' and decode the returned path to text.
export_path = best_classifier.export_savedmodel(
    export_dir_base='exports',
    serving_input_receiver_fn=serving_input_fn).decode('utf-8')

In [22]:
# Smoke-test the exported SavedModel: run its 'predict' signature on two sample reviews.
!!venv/bin/saved_model_cli run --input_exprs 'review=["this is a terrible movie", "this is a great movie"]'  \
--dir $export_path --tag_set serve --signature_def predict

['2018-09-28 19:35:45.244595: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:897] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero',
 '2018-09-28 19:35:45.245774: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1405] Found device 0 with properties: ',
 'name: Tesla P100-PCIE-16GB major: 6 minor: 0 memoryClockRate(GHz): 1.3285',
 'pciBusID: 0000:00:04.0',
 'totalMemory: 15.90GiB freeMemory: 354.06MiB',
 '2018-09-28 19:35:45.245806: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1484] Adding visible gpu devices: 0',
 '2018-09-28 19:35:45.562533: I tensorflow/core/common_runtime/gpu/gpu_device.cc:965] Device interconnect StreamExecutor with strength 1 edge matrix:',
 '2018-09-28 19:35:45.562583: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971]      0 ',
 '2018-09-28 19:35:45.562592: I tensorflow/core/common_runtime/gpu/gpu_device.cc:984] 0:   N ',
 '2018-09-28 19:35:45.562758: I tensor