In [10]:
import os
import urllib
import urllib.request
import zipfile

import pandas as pd

In [8]:
# Local folder holding the downloaded dataset, and the folder handed to the
# estimator as its model directory.
data_directory = os.path.expanduser('~/data/fastai/lesson4')
model_path = '/tmp/fastai/lesson4'

# exist_ok=True creates the folder only when it is missing, without the
# check-then-create race of a separate os.path.isdir() test, and makes the
# cell safe to re-run.
for required_directory in (data_directory, model_path):
    os.makedirs(required_directory, exist_ok=True)

# Folder the MovieLens archive unpacks into.
movielens_folder = os.path.join(data_directory, 'ml-latest-small')

Download the dataset if we don't have it locally

In [9]:
# Archive containing the MovieLens "ml-latest-small" dataset.
fallback_url = 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip'

# Nothing to do when the extracted dataset folder is already present.
if not os.path.isdir(movielens_folder):
    local_zip_path = os.path.join(data_directory, os.path.basename(fallback_url))
    if not os.path.isfile(local_zip_path):
        # Download under a temporary name and rename only after the transfer
        # succeeded. Otherwise an interrupted download would leave a truncated
        # archive behind, and the next run would skip the download and fail
        # while unzipping it.
        partial_zip_path = local_zip_path + '.part'
        try:
            urllib.request.urlretrieve(fallback_url, partial_zip_path)
            os.replace(partial_zip_path, local_zip_path)
        finally:
            # Only still present when the download or the rename failed.
            if os.path.exists(partial_zip_path):
                os.remove(partial_zip_path)
    # Unzip file
    with zipfile.ZipFile(local_zip_path, 'r') as z:
        z.extractall(data_directory)

## Data setup

In [25]:
# Load the ratings table and drop its timestamp column in one chained step,
# then preview the first rows.
ratings_path = os.path.join(movielens_folder, 'ratings.csv')
ratings = pd.read_csv(ratings_path).drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [26]:
# Number of rating rows that were loaded.
ratings.shape[0]

100004

In [27]:
# Build a {movieId: title} lookup from the movies file.
movies_path = os.path.join(movielens_folder, 'movies.csv')
movies_table = pd.read_csv(movies_path, index_col='movieId')
movie_names = movies_table.loc[:, 'title'].to_dict()

In [28]:
# Distinct user and movie ids, in order of first appearance in `ratings`.
users, movies = (ratings[id_column].unique() for id_column in ('userId', 'movieId'))

In [29]:
# Lookup tables from each raw id to its position in `users` / `movies`.
userid2idx = dict(zip(users, range(len(users))))
movieid2idx = dict(zip(movies, range(len(movies))))

In [30]:
# TODO: write the index -> user and index -> movie label metadata to the model folder so TensorBoard can display it

Replace the raw user and movie IDs in `ratings` with their indices, so that both columns hold a contiguous, zero-based integer range suitable for embeddings.

In [31]:
# Translate the raw ids into their indices with a dictionary lookup.
user_index = ratings['userId'].map(userid2idx)
movie_index = ratings['movieId'].map(movieid2idx)

# This cell overwrites the id columns in place. Running it a second time looks
# up values that are already indices; any of them that is not a raw id comes
# back as a missing value. Validate before assigning, so that `ratings` is left
# untouched and the problem is reported instead of silently corrupting the
# columns.
if user_index.isnull().any() or movie_index.isnull().any():
    raise ValueError(
        'ratings contains ids that are missing from userid2idx/movieid2idx; '
        'was this cell already run? Reload ratings before remapping again.'
    )

ratings['userId'] = user_index.astype('int64')
ratings['movieId'] = movie_index.astype('int64')

In [32]:
# Show the first rows now that the id columns have been replaced by indices.
ratings.head()

Unnamed: 0,userId,movieId,rating
0,0,0,2.5
1,0,1,3.0
2,0,2,3.0
3,0,3,2.0
4,0,4,4.0


## Neural Net

In [18]:
import tensorflow as tf

  return f(*args, **kwds)


In [19]:
# Display the version of the TensorFlow package that was imported.
tf.__version__

'1.4.0-rc1'

In [45]:
# Features are the two index columns; the label is the rating.
feature_names = ['userId', 'movieId']
x = ratings[feature_names]
y = ratings['rating']

# Training input: shuffled, repeated for 8 epochs.
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x,
    y=y,
    target_column='rating',
    shuffle=True,
    num_epochs=8,
)

In [46]:
def _identity_embedding(key, num_ids):
    """Return a 50-dimensional embedding column for the integer feature `key`.

    The feature is wrapped in an identity categorical column with `num_ids`
    buckets and `default_value=0`, which is then embedded.
    """
    identity_column = tf.feature_column.categorical_column_with_identity(
        key, num_buckets=num_ids, default_value=0)
    return tf.feature_column.embedding_column(
        categorical_column=identity_column,
        dimension=50)


# One embedding per id feature, sized by the number of distinct ids.
user_embedding = _identity_embedding('userId', len(users))
movie_embedding = _identity_embedding('movieId', len(movies))

In [50]:
# Adam optimizer with the learning rate used for this model.
adam_optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

# Regressor over the two embeddings with a single hidden layer of 70 units;
# it uses `model_path` as its model directory.
estimator = tf.estimator.DNNRegressor(
    hidden_units=[70],
    feature_columns=[user_embedding, movie_embedding],
    model_dir=model_path,
    optimizer=adam_optimizer,
    dropout=0.75,
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/fastai/lesson4', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x109355048>, '_task_type': 'worker', '_task_id': 0, '_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [51]:
# Train the regressor on the batches produced by train_input_fn. No `steps` or
# `max_steps` argument is passed here.
estimator.train(input_fn=train_input_fn)

INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into /tmp/fastai/lesson4/model.ckpt.
INFO:tensorflow:loss = 1552.69, step = 1
INFO:tensorflow:global_step/sec: 214.18
INFO:tensorflow:loss = 903.199, step = 101 (0.469 sec)
INFO:tensorflow:global_step/sec: 258.562
INFO:tensorflow:loss = 382.138, step = 201 (0.387 sec)
INFO:tensorflow:global_step/sec: 272.584
INFO:tensorflow:loss = 302.374, step = 301 (0.366 sec)
INFO:tensorflow:global_step/sec: 278.191
INFO:tensorflow:loss = 313.725, step = 401 (0.359 sec)
INFO:tensorflow:global_step/sec: 281.252
INFO:tensorflow:loss = 274.853, step = 501 (0.356 sec)
INFO:tensorflow:global_step/sec: 280.167
INFO:tensorflow:loss = 233.222, step = 601 (0.357 sec)
INFO:tensorflow:global_step/sec: 237.301
INFO:tensorflow:loss = 179.12, step = 701 (0.422 sec)
INFO:tensorflow:global_step/sec: 226.043
INFO:tensorflow:loss = 248.087, step = 801 (0.444 sec)
INFO:tensorflow:global_step/sec: 222.636
INFO:tensorflow:loss = 207.675

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x1093551d0>

In [52]:
# Evaluation input: a single, unshuffled pass over the data.
# NOTE: x and y are the same frames the training input was built from, so the
# reported metrics are training-set metrics, not held-out ones.
eval_input_fn = tf.estimator.inputs.pandas_input_fn(x, y=y, target_column='rating', shuffle=False, num_epochs=1)

# Evaluate with eval_input_fn. Previously train_input_fn was passed here, which
# left eval_input_fn unused and ran the evaluation over 8 shuffled epochs.
estimator.evaluate(input_fn=eval_input_fn)

INFO:tensorflow:Starting evaluation at 2017-10-25-04:38:03
INFO:tensorflow:Restoring parameters from /tmp/fastai/lesson4/model.ckpt-6251
INFO:tensorflow:Finished evaluation at 2017-10-25-04:38:12
INFO:tensorflow:Saving dict for global step 6251: average_loss = 0.66668, global_step = 6251, loss = 85.3248


{'average_loss': 0.66668004, 'global_step': 6251, 'loss': 85.324806}