In [None]:
pip install -U tensorflow-serving-api==1.15.* --user

In [None]:
import tensorflow as tf

# Show which TensorFlow build the kernel picked up.
print(tf.__version__)
# `tf.logging` is a deprecated alias in TF 1.15 (and is gone in TF 2.x);
# the compat.v1 path is the supported spelling and behaves the same.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

## 0. Imports

In [None]:
import pandas as pd
import numpy as np
import os

## 1. Configuration work

In [None]:
# Google Cloud project and Cloud Storage bucket that receive the artifacts.
PROJECT = 'news-ml-257304'
BUCKET = 'news-ml'

# Object-name prefixes inside the bucket. The backslash replacement keeps "/"
# as the separator even when os.path.join runs on Windows.
ROOT = 'mlpipeline'
MODEL_DIR, PACKAGES_DIR = (
    os.path.join(ROOT, leaf).replace("\\", "/") for leaf in ('models', 'packages')
)

In [None]:
# Point the gcloud CLI at PROJECT so the later gcloud commands run against it.
!gcloud config set project {PROJECT}

In [None]:
# WARNING: destructive. Recursively removes everything stored under the ROOT
# prefix of BUCKET so the pipeline starts from an empty location.
!gsutil rm -r gs://{BUCKET}/{ROOT}

## 2. Get data

### 2.1. Input data

In [None]:
# Translates the numeric codes found in the dataset's sentiment column into
# readable label names.
sentiment_mapping = {
    0: 'negative',
    2: 'neutral',
    4: 'positive'
}

Data can be downloaded from: https://www.kaggle.com/kazanova/sentiment140

In [None]:
# The CSV has no header row, so its six columns arrive numbered 0-5. Name them
# in file order and keep only the two columns used later on.
df_twitter = (
    pd.read_csv('training.csv', encoding='latin1', header=None)
    .rename(columns=dict(enumerate(
        ['sentiment', 'id', 'posted_at', 'query', 'username', 'text']
    )))
    [['sentiment', 'text']]
)

In [None]:
# Add a readable label column derived from the numeric sentiment code.
df_twitter = df_twitter.assign(
    sentiment_label=df_twitter["sentiment"].map(sentiment_mapping)
)

In [None]:
# Number of rows that received a (non-null) sentiment label.
df_twitter["sentiment_label"].count()

### 2.2. Data processing fn

In [None]:
%%writefile preprocess.py

from tensorflow.python.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
import re


class TextPreprocessor(object):
    """Cleans tweets and converts them to fixed-length integer sequences.

    Args:
        vocab_size: Passed to the Keras ``Tokenizer`` as ``num_words``.
        max_sequence_length: Length every output sequence is padded or
            truncated to.
    """

    def __init__(self, vocab_size, max_sequence_length):
        self._vocab_size = vocab_size
        self._max_sequence_length = max_sequence_length
        self._tokenizer = None  # populated by fit()

    def _clean_line(self, text):
        """Remove URLs, @mentions, #hashtags and the retweet marker, then
        lowercase and strip surrounding whitespace."""
        text = re.sub(r"http\S+", "", text)
        # Handles and hashtags may contain underscores; include them so that
        # "@some_user" does not leave "_user" behind.
        text = re.sub(r"@[A-Za-z0-9_]+", "", text)
        text = re.sub(r"#[A-Za-z0-9_]+", "", text)
        # Drop "RT" only as a standalone token; a plain substring replace
        # would also mangle words such as "START".
        text = re.sub(r"\bRT\b", "", text)
        text = text.lower()
        text = text.strip()
        return text

    def fit(self, text_list):
        """Create the vocabulary from the cleaned input corpus.

        The tokenizer is fitted on the same cleaned text that transform()
        feeds it, so the vocabulary is not filled with URL fragments and
        user handles that never appear at transform time.
        """
        text_list_cleaned = [self._clean_line(txt) for txt in text_list]
        tokenizer = text.Tokenizer(num_words=self._vocab_size)
        tokenizer.fit_on_texts(text_list_cleaned)
        self._tokenizer = tokenizer

    def transform(self, text_list):
        """Convert texts to padded integer sequences.

        Returns:
            The result of ``pad_sequences`` with ``maxlen`` set to
            ``max_sequence_length``.

        Raises:
            RuntimeError: if called before fit().
        """
        if self._tokenizer is None:
            raise RuntimeError("TextPreprocessor.transform() called before fit()")

        # Transform text to sequence of integers
        text_list = [self._clean_line(txt) for txt in text_list]
        text_sequence = self._tokenizer.texts_to_sequences(text_list)

        # Fix sequence length to max value. Sequences shorter than the length
        # are padded at the beginning and longer sequences are truncated at
        # the beginning (the pad_sequences defaults).
        padded_text_sequence = sequence.pad_sequences(
          text_sequence, maxlen=self._max_sequence_length)
        return padded_text_sequence

A quick sanity check of the preprocessor:

In [None]:
from preprocess import TextPreprocessor

# Tiny vocabulary cap (5) and sequence length (5), just to exercise
# fit() and transform() end to end.
processor = TextPreprocessor(5, 5)
processor.fit(['hello machine learning','test'])
# "lol" does not occur in the fitted corpus above.
processor.transform(['hello machine learning',"lol"])

### 2.3. Prep data

In [None]:
# 'neutral' has no entry here, so any neutral row maps to NaN when the labels
# are built with Series.map(CLASSES).
CLASSES = {'negative': 0, 'positive': 1}  # label-to-int mapping
VOCAB_SIZE = 25000  # Cap on the vocabulary size used for tokenization
MAX_SEQUENCE_LENGTH = 50  # Sentences will be truncated/padded to this length

In [None]:
from preprocess import TextPreprocessor
from sklearn.model_selection import train_test_split

sents = df_twitter.text
labels = np.array(df_twitter.sentiment_label.map(CLASSES))

# Fixed seed so that re-running the notebook yields the same splits (and thus
# the same vocabulary and comparable metrics).
SPLIT_SEED = 42

# Train and test split.
# The first split sets 10% of the rows aside (they are not used below); the
# second divides the remaining rows 75% / 25% into train and test.
X, _, y, _ = train_test_split(
    sents, labels, test_size=0.1, random_state=SPLIT_SEED)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SPLIT_SEED)

# Create vocabulary from training corpus.
processor = TextPreprocessor(VOCAB_SIZE, MAX_SEQUENCE_LENGTH)
processor.fit(X_train)

# Preprocess the data
train_texts_vectorized = processor.transform(X_train)
eval_texts_vectorized = processor.transform(X_test)

In [None]:
import pickle

# Persist the fitted preprocessor; the prediction class loads this file by
# name from the model directory.
with open('./processor_state.pkl', 'wb') as f:
    pickle.dump(processor, f)

## 3. Model

In [None]:
# Hyperparameters

LEARNING_RATE = .001  # intended optimizer learning rate
EMBEDDING_DIM = 50  # embedding vector width; the GloVe file loaded below is the 50d one
FILTERS = 64  # convolution filters per branch
DROPOUT_RATE = 0.5  # rate used by both dropout layers of the model
POOL_SIZE = 3  # passed to create_model as pool_size
NUM_EPOCH = 25  # upper bound on epochs passed to model.fit
BATCH_SIZE = 128  # batch size passed to model.fit
KERNEL_SIZES = [2, 5, 8]  # one convolution branch is built per kernel width

### 3.1. Basic model

In [None]:
def create_model(vocab_size, embedding_dim, filters, kernel_sizes, dropout_rate, pool_size, embedding_matrix):
    """Build the (uncompiled) multi-branch 1D-CNN sentiment classifier.

    The input length is taken from the module-level MAX_SEQUENCE_LENGTH.

    Args:
        vocab_size: Tokenizer vocabulary cap; the embedding table gets
            ``vocab_size + 1`` rows.
        embedding_dim: Width of each embedding vector.
        filters: Number of convolution filters in every branch.
        kernel_sizes: Iterable of kernel widths; one branch is built per entry.
        dropout_rate: Rate of the dropout applied after the embedding and
            after the merged convolution branches.
        pool_size: Window of the max-pooling layer in every branch.
        embedding_matrix: Initial embedding weights; must have shape
            ``(vocab_size + 1, embedding_dim)``.

    Returns:
        A ``tf.keras`` Model mapping int32 token-id sequences to one sigmoid
        score per example.
    """
    # Input layer
    model_input = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

    # Embedding layer, initialised from the supplied matrix
    z = tf.keras.layers.Embedding(
        input_dim=vocab_size + 1,
        output_dim=embedding_dim,
        input_length=MAX_SEQUENCE_LENGTH,
        weights=[embedding_matrix]
    )(model_input)

    z = tf.keras.layers.Dropout(dropout_rate)(z)

    # Convolutional block: one conv -> max-pool -> flatten branch per kernel size
    conv_blocks = []
    for kernel_size in kernel_sizes:
        conv = tf.keras.layers.Convolution1D(
            filters=filters,
            kernel_size=kernel_size,
            padding="valid",
            activation="relu",
            bias_initializer='random_uniform',
            strides=1)(z)
        # Honour the caller's pool_size; it used to be ignored in favour of a
        # hard-coded window of 2.
        conv = tf.keras.layers.MaxPooling1D(pool_size=pool_size)(conv)
        conv = tf.keras.layers.Flatten()(conv)
        conv_blocks.append(conv)

    # Concatenate needs at least two inputs, so a single branch is used as is.
    z = tf.keras.layers.Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    z = tf.keras.layers.Dropout(dropout_rate)(z)
    z = tf.keras.layers.Dense(100, activation="relu")(z)
    model_output = tf.keras.layers.Dense(1, activation="sigmoid")(z)
    model = tf.keras.models.Model(model_input, model_output)

    return model

### 3.2. Pretrained Glove embeddings

The pretrained GloVe embeddings can be downloaded from https://nlp.stanford.edu/projects/glove/
- Download the Twitter archive [here](http://nlp.stanford.edu/data/glove.twitter.27B.zip)
- Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download); this notebook uses the 50d file (`glove.twitter.27B.50d.txt`)


In [None]:
def get_coefs(word, *arr):
    """Pair a token with its embedding values converted to a float32 array.

    Args:
        word: The token (first field of a GloVe line).
        *arr: The remaining fields of the line, one per vector component.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the GloVe text file into {token: float32 vector}. The context manager
# closes the file handle once parsing is done, and blank lines are skipped so
# they cannot reach get_coefs without a token.
with open('glove.twitter.27B.50d.txt', 'r', encoding='utf8') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.strip().split()) for line in glove_file if line.strip()
    )

In [None]:
word_index = processor._tokenizer.word_index
# Number of vocabulary entries that can receive a pretrained vector.
nb_words = min(VOCAB_SIZE, len(word_index))

# The model's Embedding layer is created with input_dim = VOCAB_SIZE + 1, so
# the weight matrix must always have that many rows, even when the fitted
# vocabulary is smaller than VOCAB_SIZE. Rows without a pretrained vector
# (including index 0, the padding id) stay all-zero.
embedding_matrix = np.zeros((VOCAB_SIZE + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    # Indices at or above VOCAB_SIZE fall outside the cap applied above.
    if i >= VOCAB_SIZE:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

### 3.3. Create, compile and train model

In [None]:
# Instantiate the network from the hyperparameters and the GloVe weights.
model = create_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    filters=FILTERS,
    kernel_sizes=KERNEL_SIZES,
    dropout_rate=DROPOUT_RATE,
    pool_size=POOL_SIZE,
    embedding_matrix=embedding_matrix,
)

In [None]:
# Compile model with learning parameters.

# Take the learning rate from the hyperparameter cell instead of repeating the
# literal, so changing LEARNING_RATE actually affects training.
optimizer = tf.keras.optimizers.Nadam(lr=LEARNING_RATE)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['acc'])

In [None]:
#Keras train

# Halve the learning rate after 3 epochs without a val_acc gain of 0.005.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_acc', min_delta=0.005, patience=3, factor=0.5)

# Stop after 5 epochs without a val_loss improvement of 0.005.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0.005, patience=5, verbose=0, mode='auto')

history = model.fit(
    train_texts_vectorized,
    y_train,
    validation_data=(eval_texts_vectorized, y_test),
    epochs=NUM_EPOCH,
    batch_size=BATCH_SIZE,
    verbose=2,
    callbacks=[lr_schedule, early_stop, tf.keras.callbacks.History()],
)

In [None]:
# Keep the per-epoch metrics dictionary so it can be inspected later without
# retraining.
with open("history.pkl", 'wb') as history_file:
    pickle.dump(history.history, history_file)

In [None]:
# Save the trained model under the file name that
# CustomModelPrediction.from_path loads.
model.save('keras_saved_model.h5')

## 4. Deployment

### 4.1. Prepare custom model prediction

In [None]:
%%writefile model_prediction.py

import os
import pickle
import numpy as np


class CustomModelPrediction(object):
  """Prediction routine wrapping the Keras model and its text preprocessor.

  Args:
    model: Object exposing ``predict(data)`` that returns one score per row.
    processor: Object exposing ``transform(texts)``.
  """

  def __init__(self, model, processor):
    self._model = model
    self._processor = processor

  def _postprocess(self, predictions):
    """Convert raw scores into ``{"label", "score"}`` dictionaries.

    Scores that round to 1 are labelled 'positive', all others 'negative'.
    The reported score is rounded to 4 decimals.
    """
    labels = ['negative', 'positive']
    # The model emits one single-element row per instance. Flatten so every
    # item is a true scalar: calling int()/float() on a 1-element array is
    # deprecated in recent NumPy releases.
    scores = np.asarray(predictions).reshape(-1)
    return [
        {
            "label": labels[int(np.round(score))],
            "score": float(np.round(score, 4))
        } for score in scores]


  def predict(self, instances, **kwargs):
    """Preprocess the raw texts, score them and return labelled results.

    Args:
      instances: Iterable of raw text strings.
      **kwargs: Accepted for interface compatibility; not used.

    Returns:
      A list with one ``{"label", "score"}`` dict per instance.
    """
    preprocessed_data = self._processor.transform(instances)
    predictions =  self._model.predict(preprocessed_data)
    labels = self._postprocess(predictions)
    return labels


  @classmethod
  def from_path(cls, model_dir):
    """Load the saved model and pickled processor from ``model_dir``."""
    # Imported lazily so that importing this module does not require
    # TensorFlow until a model is actually loaded.
    import tensorflow.keras as keras
    model = keras.models.load_model(
      os.path.join(model_dir,'keras_saved_model.h5'))
    # NOTE: pickle.load executes arbitrary code from the file; model_dir must
    # only ever contain artifacts produced by this pipeline.
    with open(os.path.join(model_dir, 'processor_state.pkl'), 'rb') as f:
      processor = pickle.load(f)

    return cls(model, processor)

Local test of the prediction class before deploying:

In [None]:
# Two sample tweets for the local prediction test.
requests = ["God I hate the north", "god I love this"]

In [None]:
from model_prediction import CustomModelPrediction

# Load the model and processor files from the current directory and run the
# sample requests through the full predict() path.
classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(requests)
results

### 4.2. Package it

In [None]:
%%writefile setup.py

from setuptools import setup

# Package metadata; both modules written by this notebook are shipped as
# scripts.
package_config = dict(
  name="tweet_sentiment_classifier",
  version="0.1",
  include_package_data=True,
  scripts=["preprocess.py", "model_prediction.py"],
)

setup(**package_config)

Build the source distribution and copy it to Cloud Storage:

In [None]:
# Build the source distribution, then copy the resulting tarball to the
# packages prefix in the bucket.
!python setup.py sdist
!gsutil cp ./dist/tweet_sentiment_classifier-0.1.tar.gz gs://{BUCKET}/{PACKAGES_DIR}/tweet_sentiment_classifier-0.1.tar.gz

In [None]:
# Upload the two files that CustomModelPrediction.from_path reads from the
# model directory.
!gsutil cp keras_saved_model.h5 gs://{BUCKET}/{MODEL_DIR}/
!gsutil cp processor_state.pkl gs://{BUCKET}/{MODEL_DIR}/

## 5. Create model and version

In [None]:
# Names and settings substituted into the gcloud ai-platform commands below.
MODEL_NAME='twitter_model'
VERSION_NAME='v1'
RUNTIME_VERSION='1.15'  # passed to --runtime-version
REGION='us-central1'  # passed to --regions

In [None]:
# Create the model resource that the version below is attached to.
!gcloud ai-platform models create {MODEL_NAME} --regions {REGION}

In [None]:
# Remove the existing version so it can be recreated under the same name;
# --quiet suppresses the interactive confirmation prompt.
!gcloud ai-platform versions delete {VERSION_NAME} --model {MODEL_NAME} --quiet

In [None]:
!gcloud beta ai-platform versions create {VERSION_NAME} \
--model {MODEL_NAME} \
--origin gs://{BUCKET}/{MODEL_DIR} \
--python-version 3.5 \
--runtime-version {RUNTIME_VERSION} \
--package-uris gs://{BUCKET}/{PACKAGES_DIR}/tweet_sentiment_classifier-0.1.tar.gz \
--prediction-class=model_prediction.CustomModelPrediction

## 6. Testing

In [None]:
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import json

In [None]:
# Sample texts sent to the deployed model in the cells below.
requests = [
    "god this episode sucks",
    "meh, I kinda like it",
    "what were the writer thinking, omg!",
    "omg! what a twist, who would'v though :o!",
    "woohoow, sansa for the win!"
]

In [None]:
# Wrap the texts in the request body; they are sent under the 'instances' key.
request_data = {'instances': requests}

# No explicit credentials object is built here: the API client in the next
# cell is created without a `credentials` argument.

In [None]:
%%time

api = discovery.build(
  'ml', 'v1',
  discoveryServiceUrl='https://storage.googleapis.com/cloud-ml/discovery/ml_v1_discovery.json')

# Model-level resource name: the request is served by the model's default
# version. A version-specific name used to be assigned here as well, but it
# was overwritten on the very next line and never used. To pin one version,
# use 'projects/{}/models/{}/versions/{}'.format(PROJECT, MODEL_NAME, VERSION_NAME).
parent = 'projects/{}/models/{}'.format(PROJECT, MODEL_NAME)
response = api.projects().predict(body=request_data, name=parent).execute()

# The service reports prediction failures in the response body under an
# "error" key; surface them here rather than as a KeyError on "predictions".
if 'error' in response:
    raise RuntimeError(response['error'])

In [None]:
# Display the predictions returned by the service.
response["predictions"]