# Enterprise Deep Learning with TensorFlow: openSAP

## SAP Innovation Center Network

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

## Load the necessary modules

In [11]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np

from functools import reduce

import os
import shutil

## Setting up our environment

In [12]:
tf.logging.set_verbosity(tf.logging.INFO)

In [21]:
TRAINING_POS_PATH = "data/sentiment/rt-polarity-pos.txt"
TRAINING_NEG_PATH = "data/sentiment/rt-polarity-neg.txt"
NUM_WORDS = 200
TRAIN_RATIO = .8
NUM_EPOCHS = 50
BATCH_SIZE = 512
LEARNING_RATE = .001
NUM_LAYERS = 1
NUM_UNITS = 10
DROPOUT_PROB = .8
EMBEDDING_SIZE = 10
MODEL_PATH = "dump/"
EVERY_N_ITER = 500
CLEAN = True

In [22]:
# Clean
if CLEAN:
    if os.path.exists(MODEL_PATH):
        shutil.rmtree(MODEL_PATH)
        os.mkdir(MODEL_PATH)

## Creating our data set

In [18]:
# create data/sentiment folder, if it doesn't exist
import os
if not os.path.exists("data/sentiment"):
    os.makedirs("data/sentiment")
    
#Download sentence polarity movie review dataset 
# https://www.cs.cornell.edu/people/pabo/movie-review-data/
!wget https://raw.githubusercontent.com/abromberg/sentiment_analysis/master/polarityData/rt-polaritydata/rt-polarity-pos.txt --directory-prefix=./data/sentiment/
!wget https://raw.githubusercontent.com/abromberg/sentiment_analysis/master/polarityData/rt-polaritydata/rt-polarity-neg.txt --directory-prefix=./data/sentiment/

--2017-11-05 17:43:43--  https://raw.githubusercontent.com/abromberg/sentiment_analysis/master/polarityData/rt-polaritydata/rt-polarity-pos.txt
Resolving raw.githubusercontent.com... 151.101.112.133
Connecting to raw.githubusercontent.com|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 626168 (611K) [text/plain]
Saving to: './data/sentiment/rt-polarity-pos.txt'


2017-11-05 17:43:43 (2.44 MB/s) - './data/sentiment/rt-polarity-pos.txt' saved [626168/626168]

--2017-11-05 17:43:43--  https://raw.githubusercontent.com/abromberg/sentiment_analysis/master/polarityData/rt-polaritydata/rt-polarity-neg.txt
Resolving raw.githubusercontent.com... 151.101.112.133
Connecting to raw.githubusercontent.com|151.101.112.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 612290 (598K) [text/plain]
Saving to: './data/sentiment/rt-polarity-neg.txt'


2017-11-05 17:43:44 (2.18 MB/s) - './data/sentiment/rt-polarity-neg.txt' saved [612290/6

In [30]:
# Load datasets
def clean_corpus(fname):
    sentences = []
    for line in open(fname, encoding="utf-8", errors="ignore"):
        # Remove leading/trailing whitespace at the boundaries
        line = line.strip()

        # Don't include empty lines
        if not line:
            continue

        # Remove non-alphanumeric characters (excl. spaces)
        line = "".join([char for char in line if char.isalnum() or char == " "])

        # Remove leading/trailing spaces between words
        line = " ".join([word for word in line.split(" ") if word.strip()])

        # Split into words
        words = line.split(" ")

        # Add to collection
        sentences.append(words)
    return sentences

In [31]:
# Define input
sentences_pos = clean_corpus(fname=TRAINING_POS_PATH)
sentences_neg = clean_corpus(fname=TRAINING_NEG_PATH)


In [32]:
# Truncate after word threshold
sentences_pos = [sentence[:NUM_WORDS] for sentence in sentences_pos]
sentences_neg = [sentence[:NUM_WORDS] for sentence in sentences_neg]

In [33]:
# Determine dictionary (0 is used for padding)
sentences = sentences_pos + sentences_neg
dictionary = [word for sentence in sentences for word in sentence]
dictionary = list(set(dictionary))
dictionary = dict(zip(dictionary, range(1, len(dictionary) + 1)))

In [34]:
# Convert sentences to sequences of integers
sentences_pos = [[dictionary[word] for word in sentence] for sentence in sentences_pos]
sentences_neg = [[dictionary[word] for word in sentence] for sentence in sentences_neg]

In [35]:
# Check integrity
dictionary_inv = {b: a for a, b in dictionary.items()}
print("POS: " + " ".join([dictionary_inv[index] for index in sentences_pos[0]]))
print("NEG: " + " ".join([dictionary_inv[index] for index in sentences_neg[0]]))

POS: the rock is destined to be the 21st centurys new conan and that hes going to make a splash even greater than arnold schwarzenegger jeanclaud van damme or steven segal
NEG: simplistic silly and tedious


In [36]:
# Pad sentences to same length from the left side (with zeros)
def pad_zeros(sentence):
    if len(sentence) == NUM_WORDS:
        return sentence
    else:
        return [0] * (NUM_WORDS - len(sentence)) + sentence

In [37]:
sentences_pos = [pad_zeros(sentence) for sentence in sentences_pos]
sentences_neg = [pad_zeros(sentence) for sentence in sentences_neg]

In [38]:
# Create data set
data_pos = np.array(sentences_pos, dtype=np.int32)
data_pos_labels = np.ones(shape=[len(sentences_pos)], dtype=np.int32)

data_neg = np.array(sentences_neg, dtype=np.int32)
data_neg_labels = np.zeros(shape=[len(sentences_neg)], dtype=np.int32)

data = np.vstack((data_pos, data_neg))
data_labels = np.concatenate((data_pos_labels, data_neg_labels))

In [39]:
# Split into training/test set
np.random.shuffle(data)
num_rows = data.shape[0]

split_train = int(num_rows * TRAIN_RATIO)
train, train_labels = data[:split_train, :], data_labels[:split_train]
test, test_labels = data[split_train:, :], data_labels[split_train:]

In [40]:
# Create input function
def get_input_fn(x, y=None, batch_size=128, num_epochs=1, shuffle=False):
    return tf.estimator.inputs.numpy_input_fn(x={"x": x},
                                              y=y,
                                              batch_size=batch_size,
                                              num_epochs=num_epochs,
                                              shuffle=shuffle)

In [41]:
# Set model params
model_params = {"learning_rate": LEARNING_RATE,
                "num_layers": NUM_LAYERS,
                "num_units": NUM_UNITS,
                "embedding_size": EMBEDDING_SIZE,
                "dropout_prob": DROPOUT_PROB,
                "vocabulary_size": len(dictionary) + 1}

In [42]:
# Log loss
loss_hook = tf.train.LoggingTensorHook(["loss"], every_n_iter=EVERY_N_ITER)

## Define our model

In [43]:
# Define LSTM model function
def lstm_model_fn(features, labels, mode, params):

    # Define input layer
    input_layer = features["x"]

    # Embedding layer
    word_embeddings = tf.get_variable(name="word_embeddings",
                                      shape=[params["vocabulary_size"], params["embedding_size"]],
                                      initializer=tf.random_normal_initializer())
    input_layer = tf.nn.embedding_lookup(word_embeddings, input_layer)

    # LSTM (with dropout)
    basic_lstm_cells = [tf.contrib.rnn.BasicLSTMCell(num_units=params["num_units"],
                                                     activation=tf.nn.tanh)
                        for _ in range(params["num_layers"])]
    dropout_lstm_cells = [tf.nn.rnn_cell.DropoutWrapper(basic_lstm_cell, output_keep_prob=params["dropout_prob"])
                          for basic_lstm_cell in basic_lstm_cells]
    multi_lstm_cells = tf.nn.rnn_cell.MultiRNNCell(dropout_lstm_cells)
    outputs, states = tf.nn.dynamic_rnn(multi_lstm_cells, input_layer, dtype=tf.float32)

    # Extract final state (last hidden state of sequence of topmost layer)
    final_state = states[-1].h

    # Fully connected layer (with linear activation)
    logits = tf.squeeze(tf.layers.dense(inputs=final_state, units=1, activation=None))

    # Define output
    sentiment = tf.sigmoid(logits)

    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions={"sentiment": sentiment})

    # Cast labels
    labels = tf.cast(labels, dtype=tf.float32)

    # Define loss
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), name="loss")

    with tf.name_scope("summaries"):
        tf.summary.scalar("cross_entropy", loss)

    # Optimizer
    optimizer = tf.train.RMSPropOptimizer(learning_rate=params["learning_rate"])
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op)

## Instantiate our estimator

In [44]:
# Instantiate Estimator
nn = tf.estimator.Estimator(model_fn=lstm_model_fn,
                            params=model_params,
                            model_dir=MODEL_PATH)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'dump/', '_tf_random_seed': 1, '_save_summary_steps': 100, '_save_checkpoints_secs': 600, '_save_checkpoints_steps': None, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100}


## Start Training

In [45]:
# Train
nn.train(input_fn=get_input_fn(x=train,
                               y=train_labels,
                               batch_size=BATCH_SIZE,
                               num_epochs=NUM_EPOCHS,
                               shuffle=True),
         hooks=[loss_hook])

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into dump/model.ckpt.
INFO:tensorflow:loss = 0.702975
INFO:tensorflow:loss = 0.702975, step = 1
INFO:tensorflow:global_step/sec: 4.70295
INFO:tensorflow:loss = 0.69077, step = 101 (21.264 sec)
INFO:tensorflow:global_step/sec: 4.12874
INFO:tensorflow:loss = 0.697919, step = 201 (24.220 sec)
INFO:tensorflow:global_step/sec: 4.32652
INFO:tensorflow:loss = 0.709841, step = 301 (23.117 sec)
INFO:tensorflow:global_step/sec: 3.96112
INFO:tensorflow:loss = 0.718091, step = 401 (25.246 sec)
INFO:tensorflow:global_step/sec: 4.16157
INFO:tensorflow:loss = 0.733326 (117.877 sec)
INFO:tensorflow:loss = 0.733326, step = 501 (24.029 sec)
INFO:tensorflow:global_step/sec: 4.37983
INFO:tensorflow:loss = 0.749342, step = 601 (22.829 sec)
INFO:tensorflow:global_step/sec: 4.07951
INFO:tensorflow:loss = 0.752695, step = 701 (24.517 sec)
INFO:tensorflow:global_step/sec: 3.47833
INFO:tensorflow:loss = 0.761257, step = 801 (28

<tensorflow.python.estimator.estimator.Estimator at 0x1133eaf60>

In [46]:
# Test
eval_dict = nn.evaluate(input_fn=get_input_fn(x=test,
                                              y=test_labels,
                                              batch_size=test.shape[0]))

print("Cross entropy (test set): {0:.2f}".format(eval_dict["loss"]))

INFO:tensorflow:Starting evaluation at 2017-11-05-16:49:02
INFO:tensorflow:Restoring parameters from dump/model.ckpt-833
INFO:tensorflow:Finished evaluation at 2017-11-05-16:49:03
INFO:tensorflow:Saving dict for global step 833: global_step = 833, loss = 1.04197
Cross entropy (test set): 1.04


## Start Prediction

In [47]:
# Predict
prediction = nn.predict(input_fn=get_input_fn(x=test,
                                              y=test_labels,
                                              batch_size=test.shape[0]))
sentiments = np.array([p["sentiment"] for p in prediction])

INFO:tensorflow:Restoring parameters from dump/model.ckpt-833


In [48]:
# Find indices that would sort array (ascending order)
idx = np.argsort(sentiments)
idx_lo = idx[:5]
idx_hi = idx[-5:]

## What is the sentiment of some example sentences?

In [49]:
# Map word indices back to strings
dictionary_inv[0] = ""
map2str = np.vectorize(dictionary_inv.__getitem__)
test_str = map2str(test)
test_str = np.apply_along_axis(lambda row: reduce(lambda a, b: a.strip() + " " + b.strip(), row), axis=1, arr=test_str)

In [50]:
# Most negative
print("NEGATIVE:")
for i in idx_lo:
    print("\t{} ::: {:.3f}".format(test_str[i], sentiments[i]))
print()

NEGATIVE:
	you may think you have figured out the con and the players in this debut film by argentine director fabian bielinsky but while you were thinking someone made off with your wallet ::: 0.521
	its quite an achievement to set and shoot a movie at the cannes film festival and yet fail to capture its visual appeal or its atmosphere ::: 0.527
	it might be tempting to regard mr andrew and his collaborators as oddballs but mr earnharts quizzical charming movie allows us to see them finally as artists ::: 0.533
	a crisply made movie that is no more than mildly amusing ::: 0.536
	with a cast that includes some of the top actors working in independent film lovely amazing involves us because it is so incisive so bleakly amusing about how we go about our lives ::: 0.538



In [51]:
# Most positive
print("POSITIVE:")
for i in idx_hi[::-1]:
    print("\t{} ::: {:.3f}".format(test_str[i], sentiments[i]))

POSITIVE:
	the people in abc africa are treated as docile mostly wordless ethnographic extras ::: 0.761
	apparently romantic comedy with a fresh point of view just doesnt figure in the present hollywood program ::: 0.761
	those unfamiliar with mormon traditions may find the singles ward occasionally bewildering ::: 0.749
	it lacks the compassion goodnatured humor and the level of insight that made eyres first film something of a sleeper success ::: 0.744
	playfully profound and crazier than michael jackson on the top floor of a skyscraper nursery surrounded by open windows ::: 0.743
