In [1]:
import os
import urllib
import urllib.request
import zipfile

import pandas as pd

In [2]:
# Where the MovieLens download is cached, and where the estimator writes its checkpoints.
data_directory = os.path.expanduser('~/data/fastai/lesson4')
model_path = '/tmp/fastai/lesson4'

# exist_ok=True keeps the cell idempotent and avoids the check-then-create race
# of a separate os.path.isdir() test.
os.makedirs(data_directory, exist_ok=True)
os.makedirs(model_path, exist_ok=True)

# Folder that the extracted dataset is expected to live in.
movielens_folder = os.path.join(data_directory, 'ml-latest-small')

Download the dataset if we don't have it locally

In [3]:
fallback_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Fetch and unpack the archive only when the extracted folder is missing, so
# re-running the notebook does not download again.
if not os.path.isdir(movielens_folder):
    local_zip_path = os.path.join(data_directory, os.path.basename(fallback_url))
    if not os.path.isfile(local_zip_path):
        # Download under a temporary name and rename on success: an interrupted
        # download must not leave a truncated file that a later run would
        # mistake for a complete archive.
        partial_zip_path = local_zip_path + '.part'
        urllib.request.urlretrieve(fallback_url, partial_zip_path)
        os.replace(partial_zip_path, local_zip_path)
    # Extracting into data_directory is expected to produce movielens_folder.
    with zipfile.ZipFile(local_zip_path, 'r') as z:
        z.extractall(data_directory)

## Data setup

In [4]:
# Load the ratings table and drop the timestamp column, which later cells do not use.
ratings_path = os.path.join(movielens_folder, 'ratings.csv')
ratings = pd.read_csv(ratings_path).drop('timestamp', axis='columns')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [5]:
# Number of rating rows loaded.
len(ratings)

100004

In [6]:
# Lookup table from raw MovieLens movieId to the movie title.
movies_path = os.path.join(movielens_folder, 'movies.csv')
movie_names = (
    pd.read_csv(movies_path)
    .set_index('movieId')['title']
    .to_dict()
)

In [7]:
# Distinct raw ids in order of first appearance in the ratings table.
users = pd.unique(ratings['userId'])
movies = pd.unique(ratings['movieId'])

In [8]:
# Map every raw id to its 0-based position in the corresponding unique-id array.
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [9]:
# TODO: write index-to-label metadata for users and movies to the model folder so TensorBoard can display it

Replace the movie and user IDs in `ratings` with their indices, so that each column holds a contiguous integer range suitable for embeddings.

In [10]:
# Translate raw ids to their contiguous indices.
# Compute both columns before assigning so that `ratings` stays untouched if
# the lookup is incomplete.
user_index = ratings['userId'].map(userid2idx)
movie_index = ratings['movieId'].map(movieid2idx)

# Series.map yields NaN for ids that are missing from the lookup. That happens
# when this cell is re-run on already-remapped data (indices are then looked up
# as if they were raw ids), so fail loudly instead of corrupting the table.
if user_index.isnull().any() or movie_index.isnull().any():
    raise ValueError(
        'ratings contains ids that are missing from userid2idx/movieid2idx; '
        'reload ratings before re-running this cell')

ratings['userId'] = user_index
ratings['movieId'] = movie_index

In [11]:
# Spot-check that the id columns now hold 0-based indices.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,0,0,2.5
1,0,1,3.0
2,0,2,3.0
3,0,3,2.0
4,0,4,4.0


## Neural Net

In [12]:
import tensorflow as tf

  return f(*args, **kwds)


In [13]:
# Record the TensorFlow version this notebook was run with.
tf.__version__

'1.4.0-rc1'

In [14]:
# Features are the two index columns; the label is the rating.
x = ratings.loc[:, ['userId', 'movieId']]
y = ratings.loc[:, 'rating']

# Shuffled training input that makes 8 passes over the data.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x,
    y=y,
    target_column='rating',
    shuffle=True,
    num_epochs=8,
)

In [15]:
def _index_embedding(key, num_ids, dimension=50):
    """Build an embedding column for an integer feature holding ids in [0, num_ids).

    Args:
        key: Feature name (the column name in the input DataFrame).
        num_ids: Number of distinct ids; used as the identity column's bucket count.
        dimension: Length of each embedding vector.

    Returns:
        The column produced by ``tf.feature_column.embedding_column``.
    """
    # default_value=0: ids outside [0, num_ids) are replaced by id 0 instead of
    # raising, so an unknown id is treated as index 0.
    categorical = tf.feature_column.categorical_column_with_identity(
        key, num_buckets=num_ids, default_value=0)
    return tf.feature_column.embedding_column(
        categorical_column=categorical, dimension=dimension)


user_embedding = _index_embedding('userId', len(users))
movie_embedding = _index_embedding('movieId', len(movies))

In [34]:
# Regressor with one hidden layer, fed by the user and movie embeddings.
feature_columns = [user_embedding, movie_embedding]
estimator = tf.estimator.DNNRegressor(
    hidden_units=[70],
    feature_columns=feature_columns,
    model_dir=model_path,  # checkpoints are written to this directory
    optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
    dropout=0.75,  # probability of dropping a unit, not of keeping it
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/fastai/lesson4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x10a5f4978>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [17]:
# No `steps` limit is passed, so training runs until train_input_fn is
# exhausted (it was built with num_epochs=8).
estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/fastai/lesson4/model.ckpt.
INFO:tensorflow:loss = 1867.53, step = 1
INFO:tensorflow:global_step/sec: 228.592
INFO:tensorflow:loss = 840.253, step = 101 (0.439 sec)
INFO:tensorflow:global_step/sec: 299.857
INFO:tensorflow:loss = 313.73, step = 201 (0.333 sec)
INFO:tensorflow:global_step/sec: 286.427
INFO:tensorflow:loss = 282.478, step = 301 (0.349 sec)
INFO:tensorflow:global_step/sec: 281.557
INFO:tensorflow:loss = 231.264, step = 401 (0.355 sec)
INFO:tensorflow:global_step/sec: 280.165
INFO:tensorflow:loss = 299.091, step = 501 (0.358 sec)
INFO:tensorflow:global_step/sec: 266.103
INFO:tensorflow:loss = 321.18, step = 601 (0.376 sec)
INFO:tensorflow:global_step/sec: 268.07
INFO:tensorflow:loss = 261.147, step = 701 (0.372 sec)
INFO:tensorflow:global_step/sec: 276.649
INFO:tensorflow:loss = 415.107, step = 801 (0.361 sec)
INFO:tensorflow:global_step/sec: 266.756
INFO:tensorflow:loss = 228.497,

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x116b8df60>

In [18]:
# Deterministic single pass over the data for evaluation.
# NOTE: x/y are the same rows the model was trained on, so these metrics
# describe fit to the training set, not generalisation to unseen ratings.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(
    x, y=y, target_column='rating', shuffle=False, num_epochs=1)

# Evaluate with the evaluation input (unshuffled, one epoch) rather than the
# shuffled 8-epoch training input.
estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-10-31-02:25:14
INFO:tensorflow:Restoring parameters from /tmp/fastai/lesson4/model.ckpt-6251
INFO:tensorflow:Finished evaluation at 2017-10-31-02:25:23
INFO:tensorflow:Saving dict for global step 6251: average_loss = 0.664216, global_step = 6251, loss = 85.0095


{'average_loss': 0.66421634, 'global_step': 6251, 'loss': 85.009491}

## Serving

In [35]:
# Export a SavedModel whose serving input parses serialized tf.Example protos
# according to the estimator's feature columns.
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
export_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
estimator.export_savedmodel(
    export_dir_base='exports',
    serving_input_receiver_fn=export_input_fn,
)

INFO:tensorflow:Restoring parameters from /tmp/fastai/lesson4/model.ckpt-6251
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: b"exports/temp-b'1509419744'/saved_model.pb"


b'exports/1509419744'

## Prediction w/Serving

Build the server Docker image. It will include the outputs of the export step above.

In [None]:
# The trailing `.` makes the current directory the build context.
!docker build -t movie-rating-tfserving .

Run the Docker image and expose `localhost:8500` for the gRPC server.

In [None]:
# --rm removes the container when it stops, -d runs it detached, and
# -p 8500:8500 publishes container port 8500 on host port 8500.
!docker run --rm -d -p 8500:8500 movie-rating-tfserving

Generate the Python gRPC client if needed.

In [43]:
# Root of a local TensorFlow Serving source checkout, used to locate the
# .proto files. Set the TF_SERVING_SRC environment variable to override the
# default location.
serving_src_root = os.environ.get('TF_SERVING_SRC') or os.path.expanduser('~/developer/serving')

In [48]:
# Compile the TensorFlow Serving API .proto files into Python modules (messages
# and gRPC stubs) written to the current working directory. The {...}
# expressions are interpolated by IPython before the shell runs the command.
!python3 -m grpc_tools.protoc -I{serving_src_root} -I{serving_src_root}/tensorflow --python_out={os.getcwd()} --grpc_python_out={os.getcwd()} {serving_src_root}/tensorflow_serving/apis/*.proto

In [64]:
from grpc.beta import implementations
from tensorflow_serving.apis import predict_pb2
from tensorflow_serving.apis import prediction_service_pb2
from tensorflow_serving.apis import get_model_metadata_pb2

In [65]:
def _int_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding a one-element int64 list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

# Unencrypted gRPC channel to port 8500 on localhost, plus the prediction-service stub bound to it.
channel = implementations.insecure_channel('localhost', 8500)
stub = prediction_service_pb2.beta_create_PredictionService_stub(channel)

In [93]:
def make_request(userId:int, movieId:int):
    """Request a rating prediction from the model server and print it next to the known rating.

    Args:
        userId: User index, as stored in ``ratings['userId']``.
        movieId: Movie index, as stored in ``ratings['movieId']``.

    Prints one line with the predicted rating and the rating recorded in
    ``ratings`` for that pair, or ``n/a`` when the pair has no recorded rating.
    Returns None.
    """
    request = predict_pb2.PredictRequest()
    request.model_spec.name = 'default'
    request.model_spec.signature_name = 'serving_default'

    # Pack both features into a tf.Example and send its serialized bytes as a
    # length-1 string tensor under the 'inputs' key.
    feature_dict = {
        'userId': _int_feature(userId),
        'movieId': _int_feature(movieId)
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
    serialized = example.SerializeToString()

    request.inputs['inputs'].CopyFrom(
        tf.contrib.util.make_tensor_proto(serialized, shape=[1]))

    # Second argument is the RPC timeout (5.0, presumably seconds).
    result_future = stub.Predict.future(request, 5.0)
    prediction = result_future.result()

    predicted_rating = prediction.outputs["outputs"].float_val[0]

    # Select by position, not by label: the filtered Series keeps the original
    # row labels, so `[0]` only works when the matching row is row 0.
    matches = ratings.loc[
        (ratings['userId'] == userId) & (ratings['movieId'] == movieId), 'rating']
    actual_rating = matches.iloc[0] if len(matches) else 'n/a'
    print(f'Predicted value: {predicted_rating} vs actual {actual_rating}')

In [94]:
# Smoke test with user index 0 and movie index 0 (the first row of `ratings`).
make_request(0, 0)

Predicted value: 2.4376325607299805 vs actual 2.5
