## Install

In [None]:
!pip install deepr faiss_cpu

## Download the dataset

In [12]:
!wget http://files.grouplens.org/datasets/movielens/ml-20m.zip
!unzip ml-20m.zip

--2020-08-03 15:35:03--  http://files.grouplens.org/datasets/movielens/ml-20m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 198702078 (189M) [application/zip]
Saving to: 'ml-20m.zip'


2020-08-03 15:35:12 (24.0 MB/s) - 'ml-20m.zip' saved [198702078/198702078]

Archive:  ml-20m.zip
   creating: ml-20m/
  inflating: ml-20m/genome-scores.csv  
  inflating: ml-20m/genome-tags.csv  
  inflating: ml-20m/links.csv        
  inflating: ml-20m/movies.csv       
  inflating: ml-20m/ratings.csv      
  inflating: ml-20m/README.txt       
  inflating: ml-20m/tags.csv         


In [14]:
import os
# Folder extracted from the MovieLens archive, resolved against the current working directory
dataset_path = f"{os.getcwd()}/ml-20m"

In [None]:
import logging

import tensorflow as tf

import deepr as dpr
from deepr.examples import movielens

In [None]:
# Upper bound on the number of training steps (passed to TrainSpec and the training hooks)
max_steps = 100_000

# Input: ratings file from the extracted MovieLens archive
path_ratings = f"{dataset_path}/ratings.csv"

# Everything produced by this notebook lives under a single root folder
path_root = "wan"
path_model = f"{path_root}/model"
path_data = f"{path_root}/data"
path_variables = f"{path_root}/variables"  # exported biases and embeddings
path_predictions = f"{path_root}/predictions.parquet.snappy"
path_saved_model = f"{path_root}/saved_model"

# Files handled by the BuildRecords job
path_mapping = f"{path_data}/mapping.txt"
path_train = f"{path_data}/train.tfrecord.gz"
path_eval = f"{path_data}/eval.tfrecord.gz"
path_test = f"{path_data}/test.tfrecord.gz"

# Create the output folders; exist_ok keeps the cell re-runnable
for directory in (path_root, path_model, path_data):
    dpr.io.Path(directory).mkdir(exist_ok=True)

## Training

Here we first download the MovieLens dataset, then train a model using the deepr library.
It takes about 30 minutes to obtain movie embeddings and a timeline embedding model.

## Build TF record

The job takes the ratings CSV as input and creates 3 TFRecord files (train, validation, test). Each file contains timelines of user ratings split into input and target.

In [None]:
# Build TF Records before defining the train job (need vocab size)
# The job reads the ratings CSV and produces the train / eval / test TFRecord files
# (and, presumably, the mapping file that is read later to get the vocab size).
build = movielens.jobs.BuildRecords(
    # --- input ---
    path_ratings=path_ratings,
    # --- output files ---
    path_mapping=path_mapping,
    path_train=path_train,
    path_eval=path_eval,
    path_test=path_test,
    # --- filtering / construction of the timelines ---
    min_rating=4,
    min_length=5,
    num_negatives=8,
    target_ratio=0.2,
    # --- split sizes ---
    size_eval=10_000,
    size_test=10_000,
    # --- reproducible shuffling ---
    shuffle_timelines=True,
    seed=2020,
)

# Unlike the jobs defined later, this one is executed immediately
build.run()

## Training

This defines a training job that builds a model transforming a timeline of movie embeddings into a user embedding.
It takes the TFRecords as input and, upon completion, writes the following artifacts:
* dataframe of the biases and the embeddings
* saved model : protobuf containing the model definition and weights

In this specific instance we train an average model with a BPR loss and compute the triplet precision on the validation set.

In [None]:
# Define the train job (the predict and evaluate jobs are defined in the following cells)

# Values needed by more than one component below, named once so they stay in sync
vocab_size = dpr.vocab.size(path_mapping)  # requires the mapping file, hence build.run() first
embedding_dim = 100
batch_size = 128

train = dpr.jobs.Trainer(
    path_model=path_model,
    # model and loss share the vocabulary size and the embedding dimension
    pred_fn=movielens.layers.AverageModel(vocab_size=vocab_size, dim=embedding_dim),
    loss_fn=movielens.layers.BPRLoss(vocab_size=vocab_size, dim=embedding_dim),
    optimizer_fn=dpr.optimizers.TensorflowOptimizer("LazyAdam", 0.001),
    # input pipelines: the evaluation records are read without shuffling
    train_input_fn=dpr.readers.TFRecordReader(path_train),
    eval_input_fn=dpr.readers.TFRecordReader(path_eval, shuffle=False),
    prepro_fn=movielens.prepros.RecordPrepro(
        min_input_size=3,
        min_target_size=3,
        max_input_size=50,
        max_target_size=50,
        buffer_size=1024,
        batch_size=batch_size,
        repeat_size=None,
        prefetch_size=1,
        num_parallel_calls=8,
    ),
    train_spec=dpr.jobs.TrainSpec(max_steps=max_steps),
    eval_spec=dpr.jobs.EvalSpec(steps=None, start_delay_secs=30, throttle_secs=30),
    final_spec=dpr.jobs.FinalSpec(steps=None),
    exporters=[
        # the training will keep the model with the best triplet precision
        dpr.exporters.BestCheckpoint(metric="triplet_precision", mode="increase"),
        # export biases and embeddings as a dataframe
        dpr.exporters.SaveVariables(
            path_variables=path_variables,
            variable_names=["biases", "embeddings"],
        ),
        # export a saved model using specified fields as input
        dpr.exporters.SavedModel(
            path_saved_model=path_saved_model,
            fields=[
                dpr.Field(name="inputPositives", shape=(None,), dtype=tf.int64),
                dpr.Field(name="inputMask", shape=(None,), dtype=tf.bool),
            ],
        ),
    ],
    train_hooks=[
        # log metrics, hyperparams, initial values to the console, and optionally mlflow and graphite
        dpr.hooks.LoggingTensorHookFactory(
            name="training",
            functions={
                "memory_gb": dpr.hooks.ResidentMemory(unit="gb"),
                "max_memory_gb": dpr.hooks.MaxResidentMemory(unit="gb"),
            },
            every_n_iter=300,
            use_graphite=False,
            use_mlflow=False,
        ),
        dpr.hooks.SummarySaverHookFactory(save_steps=300),
        dpr.hooks.NumParamsHook(use_mlflow=False),
        dpr.hooks.LogVariablesInitHook(use_mlflow=False),
        dpr.hooks.StepsPerSecHook(
            name="training",
            batch_size=batch_size,  # same value as the batch size used by prepro_fn
            every_n_steps=300,
            skip_after_step=max_steps,
            use_mlflow=False,
            use_graphite=False,
        ),
        # stop the training if triplet precision does not improve
        dpr.hooks.EarlyStoppingHookFactory(
            metric="triplet_precision",
            mode="increase",
            max_steps_without_improvement=1000,
            min_steps=5_000,
            run_every_steps=300,
            final_step=max_steps,
        ),
    ],
    eval_hooks=[dpr.hooks.LoggingTensorHookFactory(name="validation", at_end=True)],
    final_hooks=[dpr.hooks.LoggingTensorHookFactory(name="final_validation", at_end=True)],
    train_metrics=[
        dpr.metrics.StepCounter(name="num_steps"),
        dpr.metrics.DecayMean(tensors=["loss"], decay=0.98),
    ],
    eval_metrics=[dpr.metrics.Mean(tensors=["loss", "triplet_precision"])],
    final_metrics=[dpr.metrics.Mean(tensors=["loss", "triplet_precision"])],
    run_config=dpr.jobs.RunConfig(
        save_checkpoints_steps=300,
        save_summary_steps=300,
        keep_checkpoint_max=None,
        log_step_count_steps=300,
    ),
    config_proto=dpr.jobs.ConfigProto(
        inter_op_parallelism_threads=8,
        intra_op_parallelism_threads=8,
        gpu_device_count=0,
        cpu_device_count=48,
    ),
)

## Predict job
The predict job reloads the test dataset and the saved model, performs the inference and writes the user embeddings to a dataframe to be reused later by the evaluation job.

In [None]:
# Inference job: feeds the test records (read in file order, no shuffling) to the
# exported saved model and writes the result to path_predictions.
predict = movielens.jobs.Predict(
    input_fn=dpr.readers.TFRecordReader(path_test, shuffle=False),
    prepro_fn=movielens.prepros.RecordPrepro(),
    path_saved_model=path_saved_model,
    path_predictions=path_predictions,
)

## Evaluation job
This job takes the user embeddings, uses faiss to retrieve the k nearest neighbors in the product embedding space,
and computes metrics using the target timelines.

In [None]:
# compute the validation metrics
# One Evaluate job per cut-off k; each one reads the predictions written by the
# predict job and the embeddings / biases exported by the train job.
evaluate = [
    movielens.jobs.Evaluate(
        k=num_neighbors,
        path_biases=path_variables + "/biases",
        path_embeddings=path_variables + "/embeddings",
        path_predictions=path_predictions,
    )
    for num_neighbors in (10, 20, 50)
]


## Pipeline
All the job definitions are lazy, and so is the pipeline.
Calling `run` on it will actually perform all these steps.

In [None]:
# Assemble the pipeline: training, prediction, then one evaluation job per k.
# Nothing is executed here; the pipeline only runs when run() is called.
pipeline = dpr.jobs.Pipeline([train, predict, *evaluate])

In [None]:
# Execute the pipeline assembled above; this is where the jobs actually run
pipeline.run()

## KNN Search

Let's check whether the movie embeddings produced make sense:
* load the movie embeddings
* load the movie titles
* build a kNN index on the embeddings
* run a query with a known movie to check that its closest neighbors make sense

In [17]:
import numpy as np
import pyarrow.parquet as pq
import faiss
import pyarrow.csv as pc
from IPython.display import display
import pandas as pd

# Movie embeddings exported by the training job; rows are addressed by mapping index below
embeddings = np.vstack(pq.read_table("wan/variables/embeddings").to_pandas().to_numpy())

# movie id -> embedding row: the position of an id in mapping.txt is its row index.
# The file is read inside a `with` block so the handle is closed, and splitlines()
# tolerates a trailing newline (split("\n") would yield a final "" that int() rejects).
with open("wan/data/mapping.txt", "r") as mapping_file:
    mapp = {int(movie_id): indice for indice, movie_id in enumerate(mapping_file.read().splitlines())}

# embedding row -> movie id
inversed_map = {indice: movie_id for movie_id, indice in mapp.items()}

# Flat inner-product index over all movie embeddings
index = faiss.IndexFlatIP(embeddings.shape[-1])
index.add(np.ascontiguousarray(embeddings))

def knn_query(index, query, ksearch):
    """Return the nearest neighbours of `query` as `(movie_id, score)` pairs.

    Args:
        index: object exposing a faiss-style `search(queries, k)` method.
        query: a single embedding vector; it is expanded to a batch of one
            before being handed to the index.
        ksearch: number of neighbours to request.

    Returns:
        List of `(movie_id, score)` tuples in the order returned by the index.
        Index positions are translated to movie ids through the notebook-level
        `inversed_map`. With the `IndexFlatIP` index built in this notebook the
        score is an inner product.
        The list can be shorter than `ksearch`: faiss fills the slots it cannot
        satisfy (fewer than `ksearch` vectors in the index) with the label -1,
        and those slots are dropped instead of raising a KeyError.
    """
    scores, labels = index.search(np.expand_dims(query, 0), ksearch)
    return [
        (inversed_map[label], score)
        for label, score in zip(labels[0], scores[0])
        if label >= 0  # -1 marks "no result" in faiss
    ]
# MovieLens catalogue; the movieId, title and genres columns are used by the display helpers
movies = (
    pc.read_csv(f"{dataset_path}/movies.csv")
    .to_pandas()
)
    
def display_results_df(results):
    """Display kNN results as a table with one row per `(movie_id, distance)` pair.

    Genres and title of each movie are looked up in the notebook-level `movies`
    frame; the first matching row is used. A movie id that is absent from
    `movies` raises an IndexError.
    """
    rows = []
    for movie_id, distance in results:
        match = movies[movies.movieId == movie_id]
        rows.append([match["genres"].to_numpy()[0], match["title"].to_numpy()[0], distance])

    table = pd.DataFrame(rows, columns=['Genre', 'Title', 'Distance'])
    display(table)
    
def display_movie(movie_id):
    """Display the genres and title of one movie as a single-row table.

    The movie is looked up in the notebook-level `movies` frame; the first
    matching row is used. An id that is absent from `movies` raises an
    IndexError.
    """
    match = movies[movies.movieId == movie_id]
    genres = match["genres"].to_numpy()[0]
    title = match["title"].to_numpy()[0]

    display(pd.DataFrame([[genres, title]], columns=['Genre', 'Title']))


In [18]:
# Query movie: the first catalogue entry whose title contains "star wars" (case-insensitive)
p = movies.loc[
    movies.title.str.lower().str.contains("star wars"),
    "movieId",
].to_numpy()[0]

print("Query")
display_movie(p)

print("Knn results")
# Search with the query movie's own embedding row and show its 5 nearest neighbours
display_results_df(knn_query(index, embeddings[mapp[p]], 5))


Query


Unnamed: 0,Genre,Title
0,Action|Adventure|Sci-Fi,Star Wars: Episode IV - A New Hope (1977)


Knn results


Unnamed: 0,Genre,Title,Distance
0,Action|Adventure|Sci-Fi,Star Wars: Episode IV - A New Hope (1977),24.18565
1,Action|Adventure|Sci-Fi,Star Wars: Episode VI - Return of the Jedi (1983),21.296133
2,Action|Adventure|Sci-Fi,Star Wars: Episode V - The Empire Strikes Back...,20.707195
3,Action|Adventure,Raiders of the Lost Ark (Indiana Jones and the...,18.069126
4,Action|Adventure|Sci-Fi|Thriller,Star Trek: First Contact (1996),16.387402


As you can see, the genres of the nearest neighbors are similar and the movies are related to the query movie.