In [1]:
import tensorflow as tf
# Show which TensorFlow release this kernel is using (the saved output below reports 2.3.0).
print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.3.0


In [2]:
# Locations of the sample CSV files. Both origins are machine-specific
# `file://` URLs: edit them to point at your own checkout before running.
# `get_file` returns the local path of the file it fetched from `origin`.
training_samples_file_path = tf.keras.utils.get_file(
    fname="trainingSamples.csv",
    origin=r"file:///Working\GitHub\SparrowRecSys\src\main\resources\webroot\sampledata\trainingSamples.csv",
)
test_samples_file_path = tf.keras.utils.get_file(
    fname="testSamples.csv",
    origin=r"file:///Working\GitHub\SparrowRecSys\src\main\resources\webroot\sampledata\testSamples.csv",
)

In [4]:
# load sample as tf dataset
def get_dataset(file_path, batch_size=12, num_epochs=1, shuffle=True):
    """Build a batched ``tf.data`` pipeline of ``(features, label)`` pairs from a CSV file.

    Args:
        file_path: Path (or file pattern) of the CSV sample file to read.
        batch_size: Number of rows combined into one batch. Defaults to 12.
        num_epochs: Number of times the file is repeated. Defaults to 1, i.e.
            a single pass per iteration over the returned dataset.
        shuffle: Whether rows are shuffled. Defaults to True, which is what
            ``make_csv_dataset`` does when the argument is omitted. Pass False
            for a deterministic row order, e.g. when predictions must be
            matched against labels from a separate pass over the dataset.

    Returns:
        The dataset produced by ``tf.data.experimental.make_csv_dataset``;
        the ``label`` column is split off as the label of each batch.
    """
    return tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=batch_size,
        label_name='label',
        # The literal string "0" is treated as a missing value in addition to
        # empty fields.
        na_value="0",
        num_epochs=num_epochs,
        shuffle=shuffle,
        # Rows that fail to parse are skipped silently instead of raising.
        ignore_errors=True)

In [5]:
# Build one input pipeline per split: training samples first, then test samples.
train_dataset, test_dataset = (
    get_dataset(training_samples_file_path),
    get_dataset(test_samples_file_path),
)

In [6]:
# Display the dataset repr to inspect feature names, shapes and dtypes.
train_dataset

<PrefetchDataset shapes: (OrderedDict([(movieId, (None,)), (userId, (None,)), (rating, (None,)), (timestamp, (None,)), (releaseYear, (None,)), (movieGenre1, (None,)), (movieGenre2, (None,)), (movieGenre3, (None,)), (movieRatingCount, (None,)), (movieAvgRating, (None,)), (movieRatingStddev, (None,)), (userRatedMovie1, (None,)), (userRatedMovie2, (None,)), (userRatedMovie3, (None,)), (userRatedMovie4, (None,)), (userRatedMovie5, (None,)), (userRatingCount, (None,)), (userAvgReleaseYear, (None,)), (userReleaseYearStddev, (None,)), (userAvgRating, (None,)), (userRatingStddev, (None,)), (userGenre1, (None,)), (userGenre2, (None,)), (userGenre3, (None,)), (userGenre4, (None,)), (userGenre5, (None,))]), (None,)), types: (OrderedDict([(movieId, tf.int32), (userId, tf.int32), (rating, tf.float32), (timestamp, tf.int32), (releaseYear, tf.int32), (movieGenre1, tf.string), (movieGenre2, tf.string), (movieGenre3, tf.string), (movieRatingCount, tf.int32), (movieAvgRating, tf.float32), (movieRatingSt

In [7]:
# Vocabulary shared by every genre feature (user and movie side alike).
# The list order fixes the integer id assigned to each genre, so keep it stable.
genre_vocab = [
    'Film-Noir',
    'Action',
    'Adventure',
    'Horror',
    'Romance',
    'War',
    'Comedy',
    'Western',
    'Documentary',
    'Sci-Fi',
    'Drama',
    'Thriller',
    'Crime',
    'Fantasy',
    'Animation',
    'IMAX',
    'Mystery',
    'Children',
    'Musical',
]

In [8]:
# Map every genre feature name to the vocabulary used to encode it.
# All entries point at the same `genre_vocab` list object.
GENRE_FEATURES = {
    feature_name: genre_vocab
    for feature_name in (
        'userGenre1',
        'userGenre2',
        'userGenre3',
        'userGenre4',
        'userGenre5',
        'movieGenre1',
        'movieGenre2',
        'movieGenre3',
    )
}

In [12]:
# Categorical inputs: each genre feature is looked up in its vocabulary and
# then represented by a 10-dimensional embedding column.
categorical_columns = []
for feature, vocab in GENRE_FEATURES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(feature, vocab)
    emb_col = tf.feature_column.embedding_column(categorical_column=cat_col, dimension=10)
    categorical_columns.append(emb_col)

In [15]:
# def call_feature_columns(feature_columns, inputs):
#   # This is a convenient way to call a `feature_column` outside of an estimator
#   # to display its output.
#   feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
#   return feature_layer(inputs)

In [16]:
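# NOTE: this demo is commented out; the tensor shown beneath this cell appears to be
# saved output from an earlier run in which it (and the helper above) were active.
# vocab_col = tf.feature_column.categorical_column_with_vocabulary_list(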
#     'col',
#     vocabulary_list=['small', 'medium', 'large'],
#     num_oov_buckets=0)
# embedding_col = tf.feature_column.embedding_column(vocab_col, 4)
# call_feature_columns(embedding_col, {'col': ['small', 'medium', 'large']})

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[ 0.21985447, -0.6182158 ,  0.15023518,  0.1025155 ],
       [-0.14397748,  0.16230303, -0.00668169,  0.2759317 ],
       [ 0.32666937, -0.43590993,  0.23393054,  0.29681933]],
      dtype=float32)>

In [18]:
# Movie id -> 10-dimensional embedding. The id itself is used as the bucket
# index, so values are expected to fall in [0, 1001).
movie_col = tf.feature_column.categorical_column_with_identity(
    key='movieId',
    num_buckets=1001,
)
movie_emb_col = tf.feature_column.embedding_column(categorical_column=movie_col, dimension=10)
categorical_columns.append(movie_emb_col)

In [19]:
# User id -> 10-dimensional embedding. The id itself is used as the bucket
# index, so values are expected to fall in [0, 30001).
user_col = tf.feature_column.categorical_column_with_identity(
    key='userId',
    num_buckets=30001,
)
user_emb_col = tf.feature_column.embedding_column(categorical_column=user_col, dimension=10)
categorical_columns.append(user_emb_col)

In [20]:
# Numerical inputs: fed to the network as plain numeric columns, one per
# feature name, in the order listed here.
numerical_columns = [
    tf.feature_column.numeric_column(feature_name)
    for feature_name in (
        'releaseYear',
        'movieRatingCount',
        'movieAvgRating',
        'movieRatingStddev',
        'userRatingCount',
        'userAvgRating',
        'userRatingStddev',
    )
]

In [21]:
# Embedding + MLP architecture, assembled layer by layer:
#   feature columns -> Dense(128, relu) -> Dense(128, relu) -> Dense(1, sigmoid)
model = tf.keras.Sequential()
# Turns the raw feature dict into one dense input vector per example.
model.add(tf.keras.layers.DenseFeatures(numerical_columns + categorical_columns))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dense(128, activation='relu'))
# Single sigmoid unit: the output is a value in (0, 1) for the binary label.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [22]:
# Configure training: Adam optimizer, binary cross-entropy loss, and three
# metrics. The metric order matters: `model.evaluate` reports the loss first
# and then the metrics in exactly this order.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(curve='ROC'),
        tf.keras.metrics.AUC(curve='PR'),
    ],
)

In [23]:
# Fit on the training pipeline for five passes over the data.
model.fit(x=train_dataset, epochs=5)

Epoch 1/5
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x283bbb42850>

In [24]:
# Score the held-out samples. `evaluate` returns the loss followed by the
# compiled metrics in order: accuracy, ROC AUC, PR AUC.
test_metrics = model.evaluate(test_dataset)
test_loss, test_accuracy, test_roc_auc, test_pr_auc = test_metrics
print('\n\nTest Loss {}, Test Accuracy {}, Test ROC AUC {}, Test PR AUC {}'.format(*test_metrics))

Consider rewriting this model with the Functional API.


Test Loss 0.6122167706489563, Test Accuracy 0.684982180595398, Test ROC AUC 0.7477628588676453, Test PR AUC 0.7747372984886169


In [25]:
# print some predict results
# Predictions for the whole test set, kept so that later cells relying on
# `predictions` still work. Their row order is that of this one pass only.
predictions = model.predict(test_dataset)

# `make_csv_dataset` shuffles by default, so a second pass over `test_dataset`
# does not yield rows in the same order as the pass above. Pairing
# `predictions[:12]` with the labels of a fresh pass would therefore compare
# unrelated rows. Instead, pull ONE batch and score that same batch, so every
# printed prediction sits next to its own label. `take(1)` also avoids
# materialising the entire test set just to read its first batch.
for sample_features, sample_labels in test_dataset.take(1):
    sample_predictions = model.predict_on_batch(sample_features)
    for prediction, goodRating in zip(sample_predictions[:12], sample_labels.numpy()[:12]):
        print("Predicted good rating: {:.2%}".format(prediction[0]),
              " | Actual rating label: ",
              ("Good Rating" if bool(goodRating) else "Bad Rating"))

Consider rewriting this model with the Functional API.
Predicted good rating: 7.49%  | Actual rating label:  Good Rating
Predicted good rating: 15.55%  | Actual rating label:  Bad Rating
Predicted good rating: 90.37%  | Actual rating label:  Good Rating
Predicted good rating: 42.65%  | Actual rating label:  Good Rating
Predicted good rating: 67.70%  | Actual rating label:  Good Rating
Predicted good rating: 15.50%  | Actual rating label:  Bad Rating
Predicted good rating: 77.35%  | Actual rating label:  Bad Rating
Predicted good rating: 59.73%  | Actual rating label:  Bad Rating
Predicted good rating: 33.75%  | Actual rating label:  Bad Rating
Predicted good rating: 58.96%  | Actual rating label:  Good Rating
Predicted good rating: 19.45%  | Actual rating label:  Bad Rating
Predicted good rating: 1.93%  | Actual rating label:  Good Rating
