# Why use docker/tensorflow serving?
- flask etc, upon single user's inference, locks up the server cpu/gpu and other users have to wait
- flask cannot load large models, eg computer vision model with 1gb weights
- extremely cumbersome to rewrite code for each flask deployment/model updates and onboard other team members.
- with tf serving, you can export new models to the deployment folder and tf serving will automatically update to the latest detected saved model.


### Let's begin with a simple sentiment analysis model on an amazon reviews dataset.

In [42]:
# %%writefile -a train.py

import pandas as pd
import numpy as np
import os
import time
import tensorflow as tf
import tensorflow_hub as hub

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
float_formatter = "{:.2f}".format
np.set_printoptions(formatter={'float_kind':float_formatter})

Writing train.py


In [3]:

###? IN LINUX YOU CAN READ THE FIRST FEW LINES OF THE CSV FROM CLI, UNFORTUNATELY NO EQUIVALENT IN WINDOWS.
# !head -n 2 ./data/amazonfinefoodreviews/Reviews.csv

In [43]:
# %%writefile -a train.py 

def load_dataset(filepath, num_samples):
    df = pd.read_csv(filepath, usecols=[6,9], nrows=num_samples)
    df.columns = ['rating','text']
    df['label'] = df['rating'].apply(lambda x: 1 if x>=4 else 0 if x==3 else -1)

    text = df['text'].tolist()
    # text = [str(t).encode('ascii', 'replace') for t in text]
    text = np.array(text, dtype=object)

    labels = df['label'].tolist()
    labels = np.array(pd.get_dummies(labels), dtype=int)[:]

    return labels, text

Appending to train.py


In [13]:

###? LOAD DATASET
labels, text = load_dataset(filepath='./data/amazonfinefoodreviews/Reviews.csv', num_samples=5)

for lbl,txt in list(zip(labels, text)):
    print(lbl,txt)

[0 1] I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
[1 0] Product arrived labeled as Jumbo Salted Peanuts...the peanuts were actually small sized unsalted. Not sure if this was an error or if the vendor intended to represent the product as "Jumbo".
[0 1] This is a confection that has been around a few centuries.  It is a light, pillowy citrus gelatin with nuts - in this case Filberts. And it is cut into tiny squares and then liberally coated with powdered sugar.  And it is a tiny mouthful of heaven.  Not too chewy, and very flavorful.  I highly recommend this yummy treat.  If you are familiar with the story of C.S. Lewis' "The Lion, The Witch, and The Wardrobe" - this is the treat that seduces Edmund into selling out his Brother and Sisters to the Witch.
[1 0] If you are 

### We'll use a pre-trained model from tfhub.dev 
- these pretrained NNs return 50 dims and 128 dims respectively. go to tfhub.dev to browse other available models.
- https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1
- https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1


In [44]:
# %%writefile -a train.py 

def get_model():
    hub_layer = hub.KerasLayer('https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1', output_shape=[128], input_shape=[], dtype=tf.string, name='input', trainable=False)

    model = tf.keras.Sequential()
    model.add(hub_layer)
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(3, activation='softmax', name='output'))
    model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
    model.summary()
    return model

Appending to train.py


### The pre-trained model already does text preprocessing for us. Let's get a sense of the embeddings it produces.

In [22]:
statements = [
    'very bad',
    'atrocious',
    'good',
    'this is interesting',
]

embed = hub.load('https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1')
embeddings = embed(statements)
embeddings = pd.DataFrame(embeddings)
embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127
0,0.225497,-0.097742,0.23063,0.139735,0.086153,-0.045874,0.071118,-0.160892,0.112284,0.011471,-0.007685,-0.075731,-0.092246,-0.120212,-0.040878,0.18859,0.045904,0.006215,-0.124716,0.082918,0.118537,0.050865,-0.06102,-0.034868,0.024469,0.085191,0.109251,-0.024774,0.011921,0.146256,-0.037281,0.080321,-0.095374,-0.043692,0.136369,0.042527,0.053423,-0.100668,-0.034157,0.115415,-0.135856,-0.059653,-0.008103,0.105087,0.045498,-0.033328,-0.006868,0.004209,0.091305,-0.003479,-0.03337,0.105743,-0.112079,0.06811,-0.041359,-0.018282,-0.13267,0.016037,-0.096326,-0.003783,-0.059288,-0.059012,0.078189,-0.089968,0.046925,0.117595,-0.164911,-0.042599,0.09708,0.028123,-0.200099,0.088148,0.027126,-0.029836,0.045751,0.100097,0.051409,-0.131577,0.039602,-0.101337,0.0185,-0.029483,0.025566,-0.152488,0.136041,-0.041233,-0.017325,0.068525,0.108027,0.191753,0.034966,0.055814,-0.058237,-0.01229,0.030932,0.137788,0.047096,-0.037614,-0.04231,0.137921,0.164177,0.104449,0.124679,-0.109762,-0.015248,-0.188383,-0.210898,0.028174,-0.034462,-0.093645,-0.068922,-0.084514,-0.024698,0.107229,-0.042552,-0.14707,0.200573,0.060148,-0.022853,0.095694,0.162381,0.030324,0.004255,0.068805,0.030952,-0.146935,0.108283,0.048812
1,0.081005,0.004228,-0.053044,0.04052,0.024845,-0.008541,0.179885,0.05603,-0.036178,0.012456,0.044646,-0.046362,-0.058848,-0.065474,-0.073071,0.147705,-0.00553,-0.028087,-0.045952,0.049049,-0.002582,-0.00843,-0.073209,0.073207,-0.107293,0.103643,0.00663,-0.052113,0.079209,0.060558,-0.042594,0.041347,-0.104618,-0.068099,0.040426,0.021027,0.181957,-0.093084,-0.11065,0.033879,-0.117493,-0.068428,-0.067708,0.02373,0.008597,-0.035992,0.016599,0.030511,0.195763,0.029416,-0.101782,-0.032113,-0.186283,0.007604,-0.039727,0.013488,-0.028124,0.07382,-0.198047,-0.012959,-0.099689,-0.034324,-0.015093,0.005853,-0.025376,0.181673,-0.087652,-0.018117,0.050967,0.077971,-0.226367,-0.038977,0.033673,-0.021974,-0.165693,0.055661,0.095474,0.021724,0.069713,0.036554,-0.033836,-0.084823,-0.006852,-0.018747,-0.050216,0.054739,0.045196,0.191172,-0.015382,0.236838,0.109173,0.056431,-0.026758,0.085752,0.025446,0.166266,0.029647,-0.124076,0.097932,0.151055,0.08289,0.046979,0.057959,-0.088994,-0.019969,-0.041011,-0.164735,0.071074,-0.000972,-0.045036,0.0271,0.078719,-0.078794,0.043396,0.043838,-0.15506,0.175683,0.152753,-0.023194,0.044113,0.16254,-0.036762,-0.014564,0.092364,-0.087684,-0.120937,0.202286,-0.008139
2,0.123925,-0.052066,0.138516,0.071824,0.129243,-0.16227,0.139032,-0.001358,0.152213,0.068851,-0.019834,-0.091487,-0.024753,-0.131013,-0.113271,0.037514,0.060303,0.044764,0.000226,0.083523,0.127654,-0.055403,-0.032948,0.062515,0.212051,-0.086504,0.082693,-0.041964,0.008191,0.021028,-0.021348,0.053067,0.023826,-0.053263,0.138226,0.034548,-0.003407,-0.066503,-0.06656,0.201065,-0.044213,-0.099786,-0.111227,0.112045,0.02347,0.029281,-0.095862,0.104739,0.138966,0.017368,-0.009061,-0.09456,-0.090627,0.074103,-0.08625,0.049916,0.004081,-0.098276,-0.092278,-0.026923,0.03028,0.035743,0.058266,-0.002155,-0.016706,0.129651,-0.090009,-0.065476,0.092167,0.080453,-0.020104,0.058897,-0.069826,-0.009155,0.079503,0.202416,0.019736,-0.102979,-0.01687,-0.048642,0.014475,0.066373,0.028614,0.023247,0.092261,0.141972,0.058337,0.093477,0.088508,0.173028,0.046415,0.061534,-0.096793,0.036895,0.053152,-0.011135,3e-06,-0.040696,0.022206,0.108058,0.180483,0.030415,-0.030597,-0.09553,-0.035163,-0.089695,-0.102284,0.092725,-0.10942,-0.034426,0.168165,0.022106,-0.139659,0.165264,-0.136218,0.012064,0.084255,0.117795,-0.073345,0.118273,0.1013,-0.055486,-0.043401,0.068995,-0.025478,-0.056114,0.200029,0.029064
3,0.293592,-0.056428,-0.024551,0.071801,-0.071654,-0.116774,-0.023651,0.003016,0.156659,-0.056435,-0.046234,-0.209807,-0.100383,-0.155047,-0.087509,-0.012929,-0.012676,0.106753,-0.178907,0.164996,0.066246,0.044185,0.120277,-0.039545,0.127516,-0.017064,0.043701,-0.050214,0.017625,0.114384,-0.068798,-0.036497,0.031297,-0.037767,0.072512,0.070644,-0.093135,-0.033789,-0.105243,0.03678,-0.111621,-0.076665,0.017004,0.094211,-0.045597,-0.091296,-0.104439,-0.285935,-0.005899,-0.011326,-0.003503,-0.051269,-0.13077,0.097393,-0.057601,0.099895,0.052592,-0.06107,0.059657,-0.019298,-0.003554,-0.139596,0.010336,-0.032279,0.04447,-0.003107,-0.038024,-0.232843,-0.006948,0.106289,-0.041557,0.080512,-0.1064,-0.135224,0.083332,0.005111,-0.136972,-0.091987,-0.077191,-0.02733,-0.085932,0.092863,-0.088209,0.003616,0.029883,0.068965,-0.018657,0.085188,0.327313,0.347952,0.005294,0.013236,-0.00908,-0.043094,-0.098621,0.109728,-0.023589,-0.007549,0.003626,0.055322,0.06279,0.102514,0.06806,0.049274,0.124404,-0.135533,-0.131421,-0.05887,-0.062151,0.023403,0.014734,-0.048934,0.071324,-0.014838,-0.091412,-0.08745,-0.014189,-0.003481,-0.07155,-0.060644,0.026855,-0.067761,-0.059928,0.128818,0.036933,0.031695,0.049605,-0.023125


### Train model

In [45]:
# %%writefile -a train.py 

def train(EPOCHS=8, BATCH_SIZE=8, TRAIN_FILE='train.csv', VAL_FILE='test.csv'):
    WORKDIR = os.getcwd()
    print("Loading train and val data...")
    y_train, x_train = load_dataset(TRAIN_FILE, num_samples=100000)
    y_val, x_val     = load_dataset(VAL_FILE,   num_samples=10000)

    print("Training the model...")
    model = get_model()
    model.fit(x_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS,
              verbose=1, validation_data=(x_val, y_val),
              callbacks=[tf.keras.callbacks.ModelCheckpoint(os.path.join(WORKDIR, 'model_checkpoint'), monitor='val_loss', verbose=1, save_best_model=True, save_weights_only=False, mode='auto')])
    
    return model

def export_model(model, base_path='amazon_review/'):
    path = os.path.join(base_path, str(int(time.time())))
    tf.saved_model.save(model, path)

Appending to train.py


In [46]:
# %%writefile -a train.py 

###? TRAIN AND EXPORT MODEL AS PROTOBUF
if __name__=='__main__':
    model = train(TRAIN_FILE='./data/amazonfinefoodreviews/train.csv', VAL_FILE='./data/amazonfinefoodreviews/test.csv')
    export_model(model)

Appending to train.py


### Quick Inference Check

In [39]:
test_sentences = [
    'waste of time.',
    'this sucks',
    'brilliant, i love it',
]

[ model.predict([t]) for t in test_sentences ]



[array([[0.64, 0.09, 0.27]], dtype=float32),
 array([[0.91, 0.00, 0.09]], dtype=float32),
 array([[0.01, 0.00, 0.98]], dtype=float32)]

### Deploying the model
- Run the following in CLI.
  
        docker pull tensorflow/serving:2.8.0

        docker run -p 8500:8500 -p 8501:8501 \
            --mount type=bind,source=d:/sentimentanalysis/amazon_review/,target=/models/am_rvw \
            -e MODEL_NAME=am_rvw \
            -t tensorflow/serving:2.8.0

        (sidenote, this command also accomplishes the same:)
        docker run -p 8500:8500 -p 8501:8501 -v d:/sentimentanalysis/amazon_review:/models/am_rvw -e MODEL_NAME=am_rvw -t tensorflow/serving:2.8.0

- port `8500` is for the **gRPC API**, and `8501` is for **REST API**. The short of it is, REST API is easier/convenient, but gRPC is more performant, efficient.
            
- in my case, training and building on tensorflow 2.9.0, I had to deploy on tensorflow/serving:2.8.0 otherwise I kept getting the following crash during deployment/inference:
  - tensorflow-serving      | terminate called after throwing an instance of 'std::bad_alloc'
  - tensorflow-serving      |   what():  std::bad_alloc
  - which seemed to be an out-of-memory thing according to most threads, however even after increasing my WSL2 backend memory=20GB and swap=4GB, it would still crash, and Docker stats showed the container only around 600mb. A day of trial and erroring around with tf versions I finally found the above combination to work.

### Inference using REST and curl from command line:


- in windows, you have to escape the quotes
  - `curl -d "{\"instances\": [\"this sucks\"]}" -X POST http://127.0.0.1:8501/v1/models/am_rvw:predict`
- linux/mac
  - `curl -d '{"instances": ["this sucks"]}' -X POST http://127.0.0.1:8501/v1/models/am_rvw:predict`

endpoints here:

        # LATEST MODEL
        http://localhost:8501/v1/models/amrvw
        # SPECIFIC MODEL
        http://localhost:8501/v1/models/amrvw/versions/1:predict

### Inference using a python REST client

In [47]:
%%writefile tf_serving_rest_client.py

import json
import requests
import sys

def get_rest_url(model_name, host='127.0.0.1', port='8501', verb='predict', version=None):
    """ generate the URL path"""
    url = "http://{host}:{port}/v1/models/{model_name}".format(host=host, port=port, model_name=model_name)
    if version:
        url += 'versions/{version}'.format(version=version)
    url += ':{verb}'.format(verb=verb)
    return url

def get_model_prediction(model_input, model_name='amrvw', signature_name='serving_default'):
    """ no error handling at all, just poc"""

    url = get_rest_url(model_name)
    #In the row format, inputs are keyed to instances key in the JSON request.
    #When there is only one named input, specify the value of instances key to be the value of the input:
    
    # in our case, no difference between using "instances" or "inputs".
    # data = {"instances": [model_input]}
    data = {"inputs": [model_input]}
    
    rv = requests.post(url, data=json.dumps(data))
    if rv.status_code != requests.codes.ok:
        rv.raise_for_status()
    
    return rv.json()['predictions']

if __name__ == '__main__':

    print("\nGenerate REST url ...")
    url = get_rest_url(model_name='amrvw')
    print(url)
    
    while True:
        print("\nEnter an Amazon review [:q for Quit]")
        if sys.version_info[0] <= 3:
            sentence = input()
        if sentence == ':q':
            break
        model_input = sentence
        model_prediction = get_model_prediction(model_input)
        print("The model predicted ...")
        print(model_prediction)


Overwriting tf_serving_rest_client.py


### Inference using a gRPC client

In [48]:
%%writefile tf_serving_grpc_client.py
import sys
import grpc
from grpc.beta import implementations
import tensorflow as tf
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2, get_model_metadata_pb2
from tensorflow_serving.apis import prediction_service_pb2_grpc


def get_stub(host='127.0.0.1', port='8500'):
    channel = grpc.insecure_channel('127.0.0.1:8500') 
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    return stub


def get_model_prediction(model_input, stub, model_name='amrvw', signature_name='serving_default'):
    """ no error handling at all, just poc"""
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.model_spec.signature_name = signature_name
    request.inputs['input_input'].CopyFrom(tf.make_tensor_proto(model_input))
    response = stub.Predict.future(request, 5.0)  # 5 seconds
    return response.result().outputs["output"].float_val


def get_model_version(model_name, stub):
    request = get_model_metadata_pb2.GetModelMetadataRequest()
    request.model_spec.name = 'amrvw'
    request.metadata_field.append("signature_def")
    response = stub.GetModelMetadata(request, 10)
    # signature of loaded model is available here: response.metadata['signature_def']
    return response.model_spec.version.value

if __name__ == '__main__':
    print("\nCreate RPC connection ...")
    stub = get_stub()
    while True:
        print("\nEnter an Amazon review [:q for Quit]")
        if sys.version_info[0] <= 3:
            sentence = raw_input() if sys.version_info[0] < 3 else input()
        if sentence == ':q':
            break
        model_input = [sentence]
        model_prediction = get_model_prediction(model_input, stub)
        print("The model predicted ...")
        print(model_prediction)

Overwriting tf_serving_grpc_client.py
