# Overview

The purpose of this notebook is to generate a pairwise (RankNet) dataset used to train a scoring model.

## Config

In [None]:
import pandas as pd
import cloudpickle
import pickle
import re
import jsonlines
from tqdm import tqdm 

In [None]:
# Load the pickled embeddings; the pair-generation cell indexes this object by the
# first 12 characters of each dash-less ISBN.
# NOTE(security): unpickling can execute arbitrary code -- only load files from a trusted source.
# The context manager guarantees the file handle is closed once loading finishes.
with open('./embedding_dictionary.p', 'rb') as embeddings_file:
    embeddings = cloudpickle.load(embeddings_file)

# Workbook with the rankings; every sheet is parsed separately further down.
xl = pd.ExcelFile('./RANKING_SEMANAL.xlsx')

## JSONLINES file 
We will use a jsonlines file as an intermediate product to store, for each pair of items, the two embeddings and the pairwise label.
For the RankNet neural network, the output labels are:
* 0.0 if the first item of the pair is ranked lower than the second.
* 0.5 if both items are equivalent (same class).
* 1.0 if the first item of the pair is ranked higher than the second.

For our use case, we will discretize the ranking in the following way:
* Class A: top 1 result of the week;
* Class B: top 2-5; 
* Class C: top 6-15;
* Class D: top 16-50;
* Class E: bottom 50.

In [50]:
# Inclusive upper rank (1-based) of each class, best class first:
# A = top 1, B = top 2-5, C = top 6-15, D = top 16-50. Anything beyond the last
# bound belongs to class E.
CLASS_UPPER_BOUNDS = (1, 5, 15, 50)


def _rank_class(position):
    """Return the class index (0 = A ... 4 = E) of a 0-based position in the ranking."""
    rank = position + 1
    for class_idx, upper_bound in enumerate(CLASS_UPPER_BOUNDS):
        if rank <= upper_bound:
            return class_idx
    return len(CLASS_UPPER_BOUNDS)


def _pair_label(idx, jdx):
    """Return the RankNet target for the ordered pair of 0-based positions (idx, jdx).

    1.0 if idx falls in a better class than jdx, 0.0 if it falls in a worse one,
    and 0.5 if both positions share a class (this includes every self-pair).
    """
    class_a, class_b = _rank_class(idx), _rank_class(jdx)
    if class_a == class_b:
        return 0.5
    return 1.0 if class_a < class_b else 0.0


sheets = xl.sheet_names  # see all sheet names
with jsonlines.open('./ranknet_covers.jsonl', mode='w') as writer:
    for sheet in tqdm(sheets):
        # Best effort per sheet, but report what is skipped instead of hiding it.
        # A bare `except:` would also swallow KeyboardInterrupt / kernel interrupts.
        try:
            data = xl.parse(sheet)
        except Exception as exc:
            tqdm.write(f'Skipping sheet {sheet!r}: could not be parsed ({exc!r})')
            continue
        if 'ISBN' not in data.columns:
            tqdm.write(f'Skipping sheet {sheet!r}: no ISBN column')
            continue

        # Resolve every embedding once per sheet. Items without an embedding are
        # dropped individually (keeping the original positions of the others)
        # rather than aborting the sheet half-way through its pairs.
        ranked = []
        missing = 0
        for position, raw_isbn in enumerate(data['ISBN']):
            # str() tolerates cells that were not read as text;
            # the lookup key is the first 12 characters of the dash-less ISBN.
            key = str(raw_isbn).replace('-', '')[:12]
            try:
                ranked.append((position, embeddings[key]))
            except KeyError:
                missing += 1
        if missing:
            tqdm.write(f'Sheet {sheet!r}: {missing} ISBN(s) without embedding were skipped')

        # Emit every ordered pair (self-pairs included, labelled 0.5).
        for idx, embedding_a in ranked:
            for jdx, embedding_b in ranked:
                writer.write({'a': embedding_a,
                              'b': embedding_b,
                              'label': _pair_label(idx, jdx)})



100%|██████████| 33/33 [01:58<00:00,  3.59s/it]


## TFRecords dataset

In [53]:
import tensorflow as tf

def _bytes_feature(value):
    """Wrap a single string / byte value in a tf.train.Feature holding a bytes_list."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so extract it first.
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw_value])
    return tf.train.Feature(bytes_list=bytes_list)

def serialize_example_ranknet(a, b, label):
    """
    Creates a serialized tf.Example message for one pair, ready to be written to a file.

    Args:
        a: tensor-like value for the first item of the pair; cast to float32.
        b: tensor-like value for the second item of the pair; cast to float32.
        label: numeric target of the pair; cast to float32 as well, so every
            feature in the record is a serialized float32 tensor regardless of
            whether the label arrives as a Python int, a float or a float64.

    Returns:
        The tf.train.Example serialized to a bytes string.
    """
    def _float_tensor_feature(value):
        # Serialize the value as a float32 tensor and wrap the bytes in a Feature.
        return _bytes_feature(tf.io.serialize_tensor(tf.cast(value, tf.float32)))

    # Dictionary mapping the feature name to the tf.Example-compatible data type.
    feature = {
        'a': _float_tensor_feature(a),
        'b': _float_tensor_feature(b),
        'label': _float_tensor_feature(label),
    }
    # Create a Features message using tf.train.Example.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


In [55]:
tfrecord_file = 'cover_ranknet.tfrecords'

# Convert the intermediate jsonlines file into TFRecords, one serialized
# tf.Example per stored pair. The work runs under the CPU device scope.
with tf.device('CPU'), \
        tf.io.TFRecordWriter(tfrecord_file) as writer, \
        jsonlines.open('./ranknet_covers.jsonl', mode='r') as reader:
    for record in tqdm(reader):
        serialized_pair = serialize_example_ranknet(record['a'], record['b'], record['label'])
        writer.write(serialized_pair)
                

51394it [02:07, 404.11it/s]
