In [5]:
import os
import urllib
import zipfile
import pandas as pd

In [6]:
data_directory = os.path.expanduser('~/data/fastai/lesson4')
model_path = '/tmp/fastai/lesson4'

if not os.path.isdir(data_directory):
    os.makedirs(data_directory)
if not os.path.isdir(model_path):
    os.makedirs(model_path)
    
movielens_folder = os.path.join(data_directory, 'ml-latest-small')

Download the dataset if we don't have it locally

In [7]:
fallback_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

if not os.path.isdir(movielens_folder):
    local_zip_path = os.path.join(data_directory, os.path.basename(fallback_url))
    if not os.path.isfile(local_zip_path):
        # Download zip file
        urllib.request.urlretrieve(fallback_url, local_zip_path)
    # Unzip file
    with zipfile.ZipFile(local_zip_path, 'r') as z:
        z.extractall(data_directory)

## Data setup

In [8]:
ratings_path = os.path.join(movielens_folder, 'ratings.csv')
ratings = pd.read_csv(ratings_path)
ratings = ratings.drop(['timestamp'], axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [9]:
len(ratings)

100004

In [10]:
movies_path = os.path.join(movielens_folder, 'movies.csv')
movie_names = pd.read_csv(movies_path, index_col='movieId')['title'].to_dict()

In [11]:
users = ratings['userId'].unique()
movies = ratings['movieId'].unique()

In [12]:
userid2idx = { userId: index for index, userId in enumerate(users) }
movieid2idx = { movieId: index for index, movieId in enumerate(movies) }

In [13]:
# TODO: write index to user and movie label metadata to model folder for tensorboard to use and display

Update movie and user Ids in ratings to be the index so we have a contiguous integer range for embeddings

In [14]:
ratings['userId'] = ratings['userId'].apply(userid2idx.get)
ratings['movieId'] = ratings['movieId'].apply(movieid2idx.get)

In [15]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,0,0,2.5
1,0,1,3.0
2,0,2,3.0
3,0,3,2.0
4,0,4,4.0


## Neural Net

In [16]:
import tensorflow as tf

  return f(*args, **kwds)


In [17]:
tf.__version__

'1.4.0-rc1'

In [18]:
x = ratings[['userId', 'movieId']]
y = ratings['rating']
train_input_fn = tf.estimator.inputs.pandas_input_fn(x, y=y, target_column='rating', shuffle=True, num_epochs=8)

In [19]:
user_embedding = tf.feature_column.embedding_column(
    categorical_column=tf.feature_column.categorical_column_with_identity('userId', num_buckets=len(users), default_value=0), 
    dimension=50)
movie_embedding = tf.feature_column.embedding_column(
    categorical_column=tf.feature_column.categorical_column_with_identity('movieId', num_buckets=len(movies), default_value=0), 
    dimension=50)

In [20]:
feature_columns = [user_embedding, movie_embedding]
estimator = tf.estimator.DNNRegressor(
    feature_columns=feature_columns,
    hidden_units=[70], 
    dropout=0.75,
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    model_dir=model_path
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/fastai/lesson4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12103be48>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [21]:
estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Restoring parameters from /tmp/fastai/lesson4/model.ckpt-6251
INFO:tensorflow:Saving checkpoints for 6252 into /tmp/fastai/lesson4/model.ckpt.
INFO:tensorflow:loss = 67.1762, step = 6252
INFO:tensorflow:global_step/sec: 209.879
INFO:tensorflow:loss = 141.99, step = 6352 (0.478 sec)
INFO:tensorflow:global_step/sec: 255.541
INFO:tensorflow:loss = 88.3842, step = 6452 (0.391 sec)
INFO:tensorflow:global_step/sec: 271.15
INFO:tensorflow:loss = 148.684, step = 6552 (0.369 sec)
INFO:tensorflow:global_step/sec: 261.923
INFO:tensorflow:loss = 56.9887, step = 6652 (0.382 sec)
INFO:tensorflow:global_step/sec: 274.597
INFO:tensorflow:loss = 90.7707, step = 6752 (0.365 sec)
INFO:tensorflow:global_step/sec: 273.052
INFO:tensorflow:loss = 77.5105, step = 6852 (0.366 sec)
INFO:tensorflow:global_step/sec: 277.005
INFO:tensorflow:loss = 91.5582, step = 6952 (0.362 sec)
INFO:tensorflow:global_step/sec: 274.168
INFO:tensorflow:loss = 78.4077, ste

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x12103bac8>

In [22]:
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x, y=y, target_column='rating', shuffle=False, num_epochs=1)
estimator.evaluate(input_fn=train_input_fn)

INFO:tensorflow:Starting evaluation at 2017-10-31-06:38:29
INFO:tensorflow:Restoring parameters from /tmp/fastai/lesson4/model.ckpt-12502
INFO:tensorflow:Finished evaluation at 2017-10-31-06:38:39
INFO:tensorflow:Saving dict for global step 12502: average_loss = 0.623869, global_step = 12502, loss = 79.8457


{'average_loss': 0.62386942, 'global_step': 12502, 'loss': 79.845703}

## Serving

In [23]:
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
export_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
estimator.export_savedmodel('exports', export_input_fn)

INFO:tensorflow:Restoring parameters from /tmp/fastai/lesson4/model.ckpt-12502
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b"exports/temp-b'1509431920'/saved_model.pb"


b'exports/1509431920'

## Prediction w/Serving

Build the server docker image. Will include outputs from export above

In [None]:
!docker build -t movie-rating-tfserving .

Run the docker image and expose `localhost:8500` for the grpc server

In [None]:
!docker run --rm -d -p 8500:8500 movie-rating-tfserving

Generate the python grpc client if needed

In [24]:
serving_src_root = os.path.expanduser('~/developer/serving')

In [48]:
!python3 -m grpc_tools.protoc -I{serving_src_root} -I{serving_src_root}/tensorflow --python_out={os.getcwd()} --grpc_python_out={os.getcwd()} {serving_src_root}/tensorflow_serving/apis/*.proto

In [25]:
from grpc.beta import implementations
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
from tensorflow_serving.apis import get_model_metadata_pb2

In [26]:
def _int_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

channel = implementations.insecure_channel('localhost', int(8500))
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

In [27]:
def make_request(userId:int, movieId:int):
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'default'
    request.model_spec.signature_name = 'serving_default'

    feature_dict = {
        'userId': _int_feature(userId),
        'movieId': _int_feature(movieId)
    }
    label = 0

    example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
    serialized = example.SerializeToString()

    request.inputs['inputs'].CopyFrom(
        tf.contrib.util.make_tensor_proto(serialized, shape=[1]))

    result_future = stub.Predict.future(request, 5.0)
    prediction = result_future.result()

    predicted_rating = prediction.outputs["outputs"].float_val[0]
    actual_rating = ratings[(ratings['userId'] == userId) & (ratings['movieId'] == movieId)]['rating'][0]
    print(f'Predicted value: {predicted_rating} vs actual {actual_rating}')

In [30]:
%time make_request(0, 0)

Predicted value: 2.4376325607299805 vs actual 2.5
CPU times: user 4.47 ms, sys: 1.57 ms, total: 6.04 ms
Wall time: 5.39 ms
