# Training Embeddings for Users and Movies

This tutorial shows how to learn **item embeddings** from ratings.

We use a simple NN architecture, along with the conditional cost function used by the [Swivel](https://arxiv.org/pdf/1602.02215.pdf) algorithm.

The learnt embeddings are then extracted from the model and saved as a TSV file.

<img src="cooc2emb.png" width="600" height="400"/>

The following are the steps of this tutorial:


1. Define input data metadata
2. Implement data input function
3. Create feature columns
4. Create a custom estimator
5. Define the train and evaluate experiment
6. Set the experiment parameters
7. Run the experiment
8. Extract the learnt **movie embeddings** from the model
9. Export the saved model to serve as a **user embedding** lookup


<a href="https://colab.research.google.com/github/ksalama/data2cooc2emb2ann/blob/master/02-Training_Embeddings_for_Users_and_Movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup

In [1]:
!pip install -r ../requirements.txt



In [2]:
import os
import math
import numpy as np
import tensorflow as tf
from datetime import datetime

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Root folder for everything this notebook reads and writes.
WORKSPACE = './workspace'
# Folder holding the co-occurrence TFRecords, the vocab files and info.log.
COOC_DIR = '/'.join([WORKSPACE, 'cooc'])
# Parent folder of the per-model checkpoint directories.
MODELS_DIR = '/'.join([WORKSPACE, 'models'])
# Random seed passed to the estimator's RunConfig.
SEED = 19831060

In [4]:
!echo "Files:"
!ls {COOC_DIR}/
!echo ""

!echo "info:"
!head {COOC_DIR}/info.log
!echo ""

!echo "user vocab file:"
!head {COOC_DIR}/vocab-0.txt
!echo ""

!echo "movie vocab file:"
!head {COOC_DIR}/vocab-1.txt

Files:
cooc-00000-of-00001.tfrecords  info.log  vocab-0.txt  vocab-1.txt

info:
min: 1
P: 1000209
max: 5

user vocab file:
1
2
3
4
5
6
7
8
9
10

movie vocab file:
1193
661
914
3408
2355
1197
1287
2804
594
919


## 1.  Metadata

In [5]:
# Parsing spec for the co-occurrence records: every feature is a scalar
# (shape=()), either a string or a float32.
FEATURES_SCHEMA = {
    feature_name: tf.FixedLenFeature(dtype=feature_dtype, shape=())
    for feature_name, feature_dtype in [
        ('item1', tf.string),
        ('item2', tf.string),
        ('score', tf.float32),
        ('weight', tf.float32),
        ('type', tf.string),
    ]
}

# 'score' is split off as the label by the input function; 'weight' is the
# per-example weight read by the model function.
TARGET_FEATURE_NAME = 'score'
WEIGHT_FEATURE_NAME = 'weight'




## 2.  Data Input Function

In [6]:
def make_input_fn(file_pattern,
                  batch_size=128, num_epochs=1, mode=tf.estimator.ModeKeys.EVAL):
    """Build a zero-argument input function for an estimator.

    Args:
        file_pattern: pattern of the TFRecord files to read.
        batch_size: number of examples per batch; the final partial batch
            is dropped.
        num_epochs: how many passes over the data (None is passed through
            unchanged to the dataset builder).
        mode: a `tf.estimator.ModeKeys` value; records are shuffled only
            when it is TRAIN.

    Returns:
        A callable that creates the batched features dataset, parsed with
        `FEATURES_SCHEMA` and with `TARGET_FEATURE_NAME` as the label key.
    """
    is_training = mode == tf.estimator.ModeKeys.TRAIN

    def _input_fn():
        return tf.data.experimental.make_batched_features_dataset(
            file_pattern=file_pattern,
            batch_size=batch_size,
            features=FEATURES_SCHEMA,
            reader=tf.data.TFRecordDataset,
            label_key=TARGET_FEATURE_NAME,
            num_epochs=num_epochs,
            shuffle=is_training,
            # Small shuffle buffer: two batches' worth of records.
            shuffle_buffer_size=2 * batch_size,
            sloppy_ordering=True,
            drop_final_batch=True,
        )

    return _input_fn

## 3. Feature Columns

In [7]:
def create_feature_columns(embedding_size, vocab1_file, vocab2_file):
    """Create the two embedding feature columns used by the model.

    Args:
        embedding_size: dimension of each embedding vector.
        vocab1_file: vocabulary file for the 'item1' feature (users).
        vocab2_file: vocabulary file for the 'item2' feature (movies).

    Returns:
        A two-element list: the 'item1' embedding column followed by the
        'item2' embedding column.
    """
    # User -> item1, Movie -> item2 (the order of the result matters to callers).
    vocab_file_by_key = (
        ('item1', vocab1_file),
        ('item2', vocab2_file),
    )

    return [
        tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_file(
                key=feature_key,
                vocabulary_file=vocab_file,
            ),
            embedding_size,
        )
        for feature_key, vocab_file in vocab_file_by_key
    ]

## 4.  Custom Estimator

In [8]:
def compute_loss(labels, predictions, weights, types):
    """Sum a weighted per-example cost that depends on the example type.

    Args:
        labels: target scores.
        predictions: predicted scores, same shape as `labels`.
        weights: per-example weights.
        types: per-example string tags; 'P' marks a positive sample.

    Returns:
        A scalar tensor: the sum over all examples of
        `0.5 * weight * error**2` for 'P' examples and
        `weight * softplus(error)` for every other example, where
        `error = prediction - label`.
    """
    residuals = predictions - labels

    # Both costs are computed for every example; tf.where then keeps the
    # one that matches the example's type.
    squared_cost = 0.5 * weights * tf.math.square(residuals)
    softplus_cost = weights * tf.math.softplus(residuals)

    is_positive = tf.equal(types, 'P')
    per_example_loss = tf.where(is_positive, squared_cost, softplus_cost)

    return tf.reduce_sum(per_example_loss)

def model_fn(features, labels, mode, params):
    """Estimator model function: dot product of user and movie embeddings.

    In PREDICT mode only the 'item1' (user) embedding is looked up and
    returned, so the exported model acts as an embedding lookup. In TRAIN
    and EVAL modes the dot product of the 'item1' and 'item2' embeddings
    is passed through a sigmoid, rescaled to
    [params.min_value, params.max_value], and scored with `compute_loss`.

    Args:
        features: dict of feature tensors. 'item1' is always required;
            'item2', 'type' and the weight feature are required outside
            PREDICT mode.
        labels: target scores (unused in PREDICT mode).
        mode: a `tf.estimator.ModeKeys` value.
        params: object exposing `embedding_size`, `vocab1_file`,
            `vocab2_file`, `min_value`, `max_value` and `learning_rate`.

    Returns:
        A `tf.estimator.EstimatorSpec` for the given mode.
    """
    feature_columns = create_feature_columns(
        params.embedding_size, params.vocab1_file, params.vocab2_file)

    # User -> item1
    # NOTE: this must stay the first `input_layer` call. The embedding
    # extraction code looks the movie weights up under the scope name
    # 'input_layer_1', i.e. the scope of the second call.
    item1_layer = tf.feature_column.input_layer(
        features={'item1': features['item1']},
        feature_columns=[feature_columns[0]])

    if mode == tf.estimator.ModeKeys.PREDICT:
        # Serving only needs the user embedding; no labels or item2 here.
        predictions = item1_layer
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'predictions': tf.estimator.export.PredictOutput(predictions)
            },
        )

    # Movie -> item2
    item2_layer = tf.feature_column.input_layer(
        features={'item2': features['item2']},
        feature_columns=[feature_columns[1]])

    dot_product = tf.keras.layers.Dot(axes=1)([item1_layer, item2_layer])
    # Map the unbounded dot product onto the [min_value, max_value] range.
    value_range = params.max_value - params.min_value
    scaled_scores = value_range * tf.sigmoid(dot_product) + params.min_value

    loss = compute_loss(
        labels=labels,
        # Squeeze only the trailing unit axis of the dot product, so that a
        # batch holding a single example keeps its batch dimension.
        predictions=tf.squeeze(scaled_scores, axis=-1),
        weights=features[WEIGHT_FEATURE_NAME],
        types=features['type'],
    )

    # The optimizer (and its slot variables) is only needed when training.
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = tf.train.AdamOptimizer(params.learning_rate).minimize(
            loss=loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
    )


def create_estimator(params, run_config):
    """Wrap `model_fn` in a `tf.estimator.Estimator`.

    Args:
        params: hyper-parameters object handed to `model_fn`.
        run_config: `tf.estimator.RunConfig` for the estimator.

    Returns:
        The configured estimator.
    """
    return tf.estimator.Estimator(
        model_fn=model_fn,
        config=run_config,
        params=params,
    )

## 5. Experiment

In [9]:
def run_experiment(params, run_config):
    """Train and evaluate a fresh estimator and report the elapsed time.

    Any existing content of `run_config.model_dir` is deleted first.

    Args:
        params: hyper-parameters object; reads `train_data_files`,
            `eval_data_files`, `batch_size`, `traning_steps`, `eval_steps`
            and `eval_throttle_secs`.
        run_config: `tf.estimator.RunConfig` for the estimator.

    Returns:
        The estimator after `tf.estimator.train_and_evaluate` has finished.
    """
    clock_format = "%H:%M:%S"
    separator = "......................................."

    # Training: repeat the data indefinitely; max_steps bounds the run.
    train_input_fn = make_input_fn(
        params.train_data_files,
        batch_size=params.batch_size,
        num_epochs=None,
        mode=tf.estimator.ModeKeys.TRAIN,
    )
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn,
        max_steps=params.traning_steps,
    )

    # Evaluation: make_input_fn's default mode (EVAL) applies here.
    eval_input_fn = make_input_fn(
        params.eval_data_files,
        num_epochs=None,
        batch_size=params.batch_size,
    )
    eval_spec = tf.estimator.EvalSpec(
        name=datetime.utcnow().strftime("%H%M%S"),
        input_fn=eval_input_fn,
        steps=params.eval_steps,
        start_delay_secs=0,
        throttle_secs=params.eval_throttle_secs,
    )

    tf.logging.set_verbosity(tf.logging.INFO)

    # Always start from an empty model directory.
    previous_run_exists = tf.gfile.Exists(run_config.model_dir)
    if previous_run_exists:
        print("Removing previous artefacts...")
        tf.gfile.DeleteRecursively(run_config.model_dir)

    print("")
    estimator = create_estimator(params, run_config)
    print("")

    started_at = datetime.utcnow()
    print("Experiment started at {}".format(started_at.strftime(clock_format)))
    print(separator)

    tf.estimator.train_and_evaluate(
        estimator=estimator,
        train_spec=train_spec,
        eval_spec=eval_spec,
    )

    finished_at = datetime.utcnow()
    print(separator)
    print("Experiment finished at {}".format(finished_at.strftime(clock_format)))
    print("")
    elapsed_seconds = (finished_at - started_at).total_seconds()
    print("Experiment elapsed time: {} seconds".format(elapsed_seconds))

    return estimator

## 6. Parameters 

In [10]:
MODEL_NAME = 'cooc2emb-01'
model_dir = os.path.join(MODELS_DIR, MODEL_NAME)
info_file = os.path.join(COOC_DIR, 'info.log')

# Fallback score bounds, used only when info.log is missing or unreadable.
# NOTE(review): the fallback minimum (15) is larger than the fallback
# maximum (-5); confirm these values are intended.
min_value = 15
max_value = -5

# Parsed "key: value" pairs from info.log (values converted to float).
info_map = {}

if os.path.exists(info_file):
    try:
        with open(info_file) as f:
            for line in f:
                # Blank lines carry no information; skip them instead of
                # letting them abort the whole parse.
                if not line.strip():
                    continue
                key, value = line.split(":", 1)
                info_map[key.strip()] = float(value)
        # Assign both bounds together so a failure never leaves one bound
        # parsed and the other at its fallback value.
        min_value, max_value = (
            math.floor(info_map['min']),
            math.ceil(info_map['max']),
        )
    except (OSError, ValueError, KeyError, OverflowError) as error:
        # Best effort: keep the fallback bounds, but do not hide the problem.
        print(
            "WARNING: could not read score bounds from {} ({!r}); "
            "using min_value={} and max_value={}".format(
                info_file, error, min_value, max_value)
        )
    
class HParams:
    """Bare attribute container; hyper-parameters are set on instances."""

# Hyper-parameters of the experiment. Training and evaluation read the same
# co-occurrence files.
# NOTE: the key 'traning_steps' is spelled exactly as the rest of the
# notebook reads it (run_experiment and extract_embeddings).
params = HParams()
vars(params).update({
    'train_data_files': "{}/cooc-*.tfrecords".format(COOC_DIR),
    'eval_data_files': "{}/cooc-*.tfrecords".format(COOC_DIR),
    'vocab1_file': os.path.join(COOC_DIR, 'vocab-0.txt'),
    'vocab2_file': os.path.join(COOC_DIR, 'vocab-1.txt'),
    'embedding_size': 32,
    'min_value': min_value,
    'max_value': max_value,
    'batch_size': 265,
    'traning_steps': 30000,
    'learning_rate': 0.001,
    'eval_steps': 1,
    'eval_throttle_secs': 0,
})

print(vars(params))

# Checkpoint every 1000 steps and keep only the three most recent ones.
run_config = tf.estimator.RunConfig(
    model_dir=model_dir,
    tf_random_seed=SEED,
    save_checkpoints_steps=1000,
    keep_checkpoint_max=3,
)

{'train_data_files': './workspace/cooc/cooc-*.tfrecords', 'eval_data_files': './workspace/cooc/cooc-*.tfrecords', 'vocab1_file': './workspace/cooc/vocab-0.txt', 'vocab2_file': './workspace/cooc/vocab-1.txt', 'embedding_size': 32, 'min_value': 1, 'max_value': 5, 'batch_size': 265, 'traning_steps': 30000, 'learning_rate': 0.001, 'eval_steps': 1, 'eval_throttle_secs': 0}


## 7. Run

In [11]:
# Train and evaluate with the parameters defined above; the returned
# estimator is kept so it can be exported later in the notebook.
estimator = run_experiment(params, run_config)





INFO:tensorflow:Using config: {'_model_dir': './workspace/models/cooc2emb-01', '_tf_random_seed': 19831060, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 3, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f8f2209c210>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}

Experiment started at 15:27:51
.......................................
INFO:tensorflow:Not using Distribute Co

2021-09-23 15:27:51.886499: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2021-09-23 15:27:51.891199: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2299995000 Hz
2021-09-23 15:27:51.891724: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x55df67fe8150 executing computations on platform Host. Devices:
2021-09-23 15:27:51.891753: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>


INFO:tensorflow:Saving checkpoints for 0 into ./workspace/models/cooc2emb-01/model.ckpt.
INFO:tensorflow:loss = 217.38344, step = 1
INFO:tensorflow:global_step/sec: 213.562
INFO:tensorflow:loss = 226.87907, step = 101 (0.469 sec)
INFO:tensorflow:global_step/sec: 267.038
INFO:tensorflow:loss = 205.40399, step = 201 (0.375 sec)
INFO:tensorflow:global_step/sec: 263.236
INFO:tensorflow:loss = 173.58081, step = 301 (0.380 sec)
INFO:tensorflow:global_step/sec: 256.068
INFO:tensorflow:loss = 180.20935, step = 401 (0.391 sec)
INFO:tensorflow:global_step/sec: 255.156
INFO:tensorflow:loss = 171.92728, step = 501 (0.392 sec)
INFO:tensorflow:global_step/sec: 264.873
INFO:tensorflow:loss = 198.65341, step = 601 (0.377 sec)
INFO:tensorflow:global_step/sec: 258.186
INFO:tensorflow:loss = 207.334, step = 701 (0.387 sec)
INFO:tensorflow:global_step/sec: 262.314
INFO:tensorflow:loss = 217.5608, step = 801 (0.381 sec)
INFO:tensorflow:global_step/sec: 254.22
INFO:tensorflow:loss = 189.91565, step = 901 (0

<img src="loss.png" width="800" height="1000"/>

## 8. Extract movie embeddings

In [12]:
def extract_embeddings(
        checkpoint_path=None,
        tensor_name='input_layer_1/item2_embedding/embedding_weights:0'):
    """Read an embedding weight matrix out of a saved checkpoint.

    Args:
        checkpoint_path: checkpoint prefix to restore (its meta graph is
            read from `checkpoint_path + '.meta'`). Defaults to the
            checkpoint written at step `params.traning_steps` inside
            `model_dir`.
        tensor_name: name of the tensor to evaluate. Defaults to the
            movie (item2) embedding weights.

    Returns:
        A numpy array holding the evaluated tensor.
    """
    if checkpoint_path is None:
        checkpoint_path = os.path.join(
            model_dir, 'model.ckpt-{}'.format(params.traning_steps))

    # Import into a private graph so that tensor names from the checkpoint's
    # meta graph cannot collide with ops already present in the notebook's
    # default graph (for example when this cell is executed more than once).
    with tf.Graph().as_default() as graph:
        with tf.Session(graph=graph) as sess:
            saver = tf.train.import_meta_graph(checkpoint_path + '.meta')
            saver.restore(sess, checkpoint_path)
            # Movie -> item2 (with the default tensor name)
            weights_tensor = graph.get_tensor_by_name(tensor_name)
            weights = np.array(sess.run(weights_tensor))

    return weights

In [13]:
# Load the trained movie-embedding matrix, then show how many rows it has
# and what the first row looks like.
embeddings = extract_embeddings()
print(len(embeddings))
print(embeddings[0])



INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/model.ckpt-30000

3706
[ 0.6798268   0.5556367   1.1059098  -0.6754466  -0.27329913 -0.80857956
 -0.6825148   0.37080616 -0.16984008 -0.5403977   0.42919523  0.02541406
  1.1071475  -0.02919012  0.8224972   0.9321426  -1.1110322   0.83254176
 -0.650931   -0.61255014  0.271765   -0.26846492 -0.39904588 -1.0344254
  0.67169684  0.8867066  -0.05373385  1.0619622  -0.6333273  -0.20295696
  0.6395458   0.08317006]


In [14]:
# Movie vocabulary (one id per line) and destination of the TSV export.
vocab_path = os.path.join(COOC_DIR, 'vocab-1.txt')
output_path = os.path.join(WORKSPACE, 'embeddings.tsv')


def write_embeddings_to_tsv():
    """Write one TSV row per vocabulary line: the item id, then its vector.

    The n-th line of `vocab_path` is paired with the n-th row of the global
    `embeddings` array, and the result is written to `output_path`.
    """
    with open(output_path, 'w') as tsv_file, open(vocab_path) as vocab_file:
        for row, vocab_line in enumerate(vocab_file):
            fields = [vocab_line.strip()]
            fields.extend(str(value) for value in embeddings[row])
            tsv_file.write('\t'.join(fields) + '\n')


write_embeddings_to_tsv()

In [15]:
!head {output_path}

1193	0.6798268	0.5556367	1.1059098	-0.6754466	-0.27329913	-0.80857956	-0.6825148	0.37080616	-0.16984008	-0.5403977	0.42919523	0.025414065	1.1071475	-0.02919012	0.8224972	0.9321426	-1.1110322	0.83254176	-0.650931	-0.61255014	0.271765	-0.26846492	-0.39904588	-1.0344254	0.67169684	0.8867066	-0.053733848	1.0619622	-0.6333273	-0.20295696	0.6395458	0.08317006
661	0.39956775	-0.12685657	0.3994441	0.3577309	-0.4147067	-0.70409584	-0.07111848	-0.47574326	-0.45248806	0.11504769	0.072631724	-0.12559962	-0.08645887	-0.11595145	1.0908881	0.08897211	0.4393795	-0.08239584	-0.04049251	-0.41274592	-0.12589924	0.5660396	-0.15796429	-0.5901901	0.19269356	0.3904628	-0.03157688	-0.065891474	0.03805932	-0.05437888	0.06961365	0.9672038
914	0.7281641	0.8227486	0.3230356	-0.4052361	-0.18917897	-0.20509852	-0.0053347354	0.47330004	0.21962	-0.3326129	-0.92542213	0.31504893	0.27371508	0.7916412	-0.6852843	0.035883576	-1.5265368	0.57049793	-0.90304935	-0.7667272	-0.14371836	0.50079274	-0.4312241	-0.38702524	0.4431

## 9. Export saved model as user-embedding lookup

In [16]:
def make_serving_input_receiver_fn():
    """Return a raw serving input receiver fn taking a batch of 'item1' strings."""
    # User -> item1
    serving_features = {
        'item1': tf.placeholder(dtype=tf.string, shape=[None]),
    }
    return tf.estimator.export.build_raw_serving_input_receiver_fn(
        serving_features)

export_dir = os.path.join(model_dir, 'export')

# Clear earlier exports so the folder only holds the model written below.
if tf.gfile.Exists(export_dir):
    tf.gfile.DeleteRecursively(export_dir)

# `export_savedmodel` is deprecated; TensorFlow's own deprecation notice says
# it has been renamed to `export_saved_model`.
estimator.export_saved_model(
    export_dir_base=export_dir,
    serving_input_receiver_fn=make_serving_input_receiver_fn()
)


Instructions for updating:
This function has been renamed, use `export_saved_model` instead.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:vocabulary_size = 6040 in item1 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab-0.txt.
INFO:tensorflow:vocabulary_size = 3706 in item2 is inferred from the number of elements in the vocabulary_file ./workspace/cooc/vocab-1.txt.
INFO:tensorflow:Done calling model_fn.
Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predictions', 'serving_default']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:

b'./workspace/models/cooc2emb-01/export/1632410991'

In [17]:
export_dir = os.path.join(model_dir, "export")

# Exports live in numerically named sub-folders. os.listdir returns entries
# in arbitrary order, so pick the largest number explicitly rather than
# whichever entry happens to be listed first.
export_versions = [f for f in os.listdir(export_dir) if f.isdigit()]
if not export_versions:
    raise RuntimeError(
        "No exported model found under {}".format(export_dir))
saved_model_dir = os.path.join(export_dir, max(export_versions, key=int))

print(saved_model_dir)

predictor_fn = tf.contrib.predictor.from_saved_model(
    export_dir=saved_model_dir,
)

# Look up the embedding of the user whose id is '1'.
output = predictor_fn({'item1': ['1']})
print(output)

./workspace/models/cooc2emb-01/export/1632410991
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.loader.load or tf.compat.v1.saved_model.load. There will be a new function for importing SavedModels in Tensorflow 2.0.
INFO:tensorflow:Restoring parameters from ./workspace/models/cooc2emb-01/export/1632410991/variables/variables
{'output': array([[-0.07985792, -0.12831457,  0.02718114, -0.23858824,  0.31844494,
        -0.48679075, -0.28125256,  0.23553467,  0.04509607, -0.3351374 ,
         0.20016576,  0.0796771 , -0.12675636,  0.5088239 , -0.0141517