In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from mlops_meetup import parquet_ingest, config, datawarehouse, modelling
import numpy as np
import pandas as pd

In [2]:
import prefect

In [3]:
# Declare the "Training run" flow. The calls below are made while the flow is
# the active context, and their return values are reused by later cells as
# keys into the finished flow state.
with prefect.Flow("Training run") as f:
    con = datawarehouse.connect_dw()
    lookups = datawarehouse.create_lookups(
        con
    )

    # The result of get_max_ids is unpacked into two values; the run log shows
    # them as the indexed tasks 'get_max_ids[0]' and 'get_max_ids[1]'.
    max_item_id, max_user_id = datawarehouse.get_max_ids(con)
    # upstream_tasks=[lookups] declares an ordering dependency only: the
    # lookups result is not passed to training_data as an argument.
    training_data = datawarehouse.training_data(con, upstream_tasks=[lookups])

# Execute the flow and keep the returned state object for the cells below.
state = f.run()

[2021-07-11 17:59:22+0100] INFO - prefect.FlowRunner | Beginning Flow run for 'Training run'
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'create_lookups': Starting task run...
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'create_lookups': Finished task run for task with final state: 'Success'
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'get_max_ids': Starting task run...
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'get_max_ids': Finished task run for task with final state: 'Success'
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'get_max_ids[1]': Starting task run...
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'get_max_ids[1]': Finished task run for task with final state: 'Success'
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'get_max_ids[0]': Starting task run...
[2021-07-11 17:59:22+0100] INFO - prefect.TaskRunner | Task 'get_max_ids[0]': Finished task run for task with final st

In [4]:
# Pull each task's return value out of the finished flow state.
# `state.result` maps task -> task state; the public `.result` property of a
# task state exposes the stored value, so there is no need to reach into the
# private `_result` attribute.
data = state.result[training_data].result
max_item_id_value = state.result[max_item_id].result
max_user_id_value = state.result[max_user_id].result

In [33]:
# Width of the learned user and item vectors.
embedding_size = 32

# Each input carries a single id per example.
user_input = layers.Input(name="user", shape=(1,))
item_input = layers.Input(name="item", shape=(1,))

# Embedding tables have (max id + 1) rows so that every id in 0..max id is a
# valid row index.
user_embedding_layer = layers.Embedding(
    input_dim=max_user_id_value + 1,
    output_dim=embedding_size,
    name="user_embedding",
)
user_embedding = user_embedding_layer(user_input)

item_embedding_layer = layers.Embedding(
    input_dim=max_item_id_value + 1,
    output_dim=embedding_size,
    name="item_embedding",
)
item_embedding = item_embedding_layer(item_input)

# With normalize=True the vectors are L2-normalised before the dot product,
# so the layer outputs the cosine similarity of the item and user vectors.
dot_layer = layers.Dot(axes=2, normalize=True, name="dot_product")
dot = dot_layer([item_embedding, user_embedding])

# Flatten the similarity to one value per example.
merged = layers.Reshape((1,))(dot)

model = keras.Model(inputs=[user_input, item_input], outputs=[merged])

# Regress the similarity onto the target scores produced by the batch generator.
model.compile(loss="mse", optimizer="nadam")




In [34]:
def make_empty_batch(n_positive_examples, negative_ratio):
    """Allocate a zero-filled batch buffer.

    Every positive example is accompanied by ``negative_ratio`` negatives, so
    the buffer has ``(1 + negative_ratio) * n_positive_examples`` rows. It has
    three columns, which ``batchifier`` fills with user id, item id and target
    score.

    Returns:
        A float ``numpy`` array of zeros with shape ``(rows, 3)``.
    """
    rows_per_positive = 1 + negative_ratio
    return np.zeros((rows_per_positive * n_positive_examples, 3))


def negative_sampler(data, positive_indexes, negative_ratio):
    """Draw negative (user, item) pairs to accompany a set of positive rows.

    Args:
        data: mapping with ``"user_id"`` and ``"item_id"`` entries of equal
            length that support NumPy-style integer-array and boolean-mask
            indexing.
        positive_indexes: row positions of the positive examples.
        negative_ratio: number of negatives to generate per positive.

    Returns:
        ``(negative_users, negative_items)``, each of length
        ``len(positive_indexes) * negative_ratio``. The users are the positive
        rows' users repeated ``negative_ratio`` times; the items are drawn
        uniformly at random (with replacement) from the rows that are not in
        ``positive_indexes``.

    NOTE(review): a sampled item is only guaranteed to come from a row outside
    ``positive_indexes``; it is not checked against the user's other rows, so
    some "negatives" may be real interactions.
    """
    # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    mask = np.ones(len(data["user_id"]), dtype=bool)
    mask[positive_indexes] = False

    users = data["user_id"][positive_indexes]
    negative_users = np.tile(users, negative_ratio)

    # Candidate items are those of every row not used as a positive.
    items = data["item_id"][mask]
    negative_items = np.random.choice(items, len(negative_users))

    return negative_users, negative_items

def batchifier(data, n_positive_examples, negative_ratio):
    """Endlessly generate shuffled training batches of positives and negatives.

    Args:
        data: mapping with ``"user_id"`` and ``"item_id"`` entries of equal
            length that support NumPy-style integer-array indexing.
        n_positive_examples: number of positive rows sampled (with
            replacement) for each batch.
        negative_ratio: number of negatives generated per positive.

    Yields:
        ``({"user": user_ids, "item": item_ids}, scores)`` where every array
        has ``(1 + negative_ratio) * n_positive_examples`` entries. Positives
        have score ``1`` and negatives ``-1``; rows are shuffled together.
    """
    ids = np.arange(len(data["user_id"]))
    while True:
        # Allocate a fresh buffer for every batch. The yielded arrays are
        # views into it, so reusing a single buffer would overwrite batches
        # the consumer is still holding as soon as the generator advances.
        batch = make_empty_batch(n_positive_examples, negative_ratio)

        positive_indexes = np.random.choice(ids, n_positive_examples)
        batch[:n_positive_examples, 0] = data["user_id"][positive_indexes]
        batch[:n_positive_examples, 1] = data["item_id"][positive_indexes]
        # give them a score of 1
        batch[:n_positive_examples, 2] = 1

        negative_users, negative_items = negative_sampler(
            data,
            positive_indexes,
            negative_ratio
        )

        # give them the score of -1
        batch[n_positive_examples:, 0] = negative_users
        batch[n_positive_examples:, 1] = negative_items
        batch[n_positive_examples:, 2] = -1

        # Shuffle whole rows so positives and negatives are interleaved.
        np.random.shuffle(batch)

        yield {
            "user": batch[:, 0],
            "item": batch[:, 1],
        }, batch[:, 2]

# Endless batch generator: 256 positives per batch, 10 negatives per positive.
a = batchifier(data, n_positive_examples=256, negative_ratio=10)

In [35]:
# The generator never ends, so steps_per_epoch defines an "epoch": the number
# of rows divided by the 256 positives drawn per batch.
model.fit(
    x=a,
    steps_per_epoch=len(data["user_id"]) // 256,
    epochs=25,
    verbose=2,
)

Epoch 1/25
56/56 - 1s - loss: 1.0256
Epoch 2/25
56/56 - 1s - loss: 1.0174
Epoch 3/25
56/56 - 1s - loss: 1.0009
Epoch 4/25
56/56 - 1s - loss: 0.9721
Epoch 5/25
56/56 - 1s - loss: 0.9118
Epoch 6/25
56/56 - 1s - loss: 0.8109
Epoch 7/25
56/56 - 1s - loss: 0.6918
Epoch 8/25
56/56 - 1s - loss: 0.5760
Epoch 9/25
56/56 - 1s - loss: 0.4908
Epoch 10/25
56/56 - 1s - loss: 0.4238
Epoch 11/25
56/56 - 1s - loss: 0.3789
Epoch 12/25
56/56 - 1s - loss: 0.3498
Epoch 13/25
56/56 - 1s - loss: 0.3306
Epoch 14/25
56/56 - 1s - loss: 0.3157
Epoch 15/25
56/56 - 1s - loss: 0.3069
Epoch 16/25
56/56 - 1s - loss: 0.2999
Epoch 17/25
56/56 - 1s - loss: 0.2950
Epoch 18/25
56/56 - 1s - loss: 0.2910
Epoch 19/25
56/56 - 1s - loss: 0.2876
Epoch 20/25
56/56 - 1s - loss: 0.2851
Epoch 21/25
56/56 - 1s - loss: 0.2829
Epoch 22/25
56/56 - 1s - loss: 0.2810
Epoch 23/25
56/56 - 1s - loss: 0.2794
Epoch 24/25
56/56 - 1s - loss: 0.2778
Epoch 25/25
56/56 - 1s - loss: 0.2769


<tensorflow.python.keras.callbacks.History at 0x16f8a43a0>

In [28]:
# Look the layer up by the name it was given at construction. A positional
# index into `model.layers` depends on how Keras orders the graph and would
# silently select a different layer if the architecture changed.
model.get_layer("user_embedding").embeddings

<tf.Variable 'user_embedding/embeddings:0' shape=(10980, 32) dtype=float32, numpy=
array([[-7.4076168e-03,  2.6773661e-05, -4.0767230e-02, ...,
         3.7153687e-02, -3.1127274e-02,  1.9083370e-02],
       [ 4.9473112e-03, -3.2740187e-02,  8.2770037e-03, ...,
        -8.9771915e-03, -4.5250885e-02, -4.5312863e-02],
       [-1.5054723e-02, -2.1298775e-02,  1.4644582e-02, ...,
         3.5936128e-02, -4.0803108e-02, -1.8823575e-02],
       ...,
       [-2.3449248e-02, -9.3753878e-03,  1.9535035e-02, ...,
         1.4218117e-02, -5.9859985e-03, -3.1603411e-02],
       [-6.4728200e-02, -7.0344158e-02,  3.4125306e-02, ...,
         2.4803118e-03, -2.2856964e-02, -5.3252839e-02],
       [-5.0490484e-02, -4.1035701e-02,  3.8105752e-02, ...,
         9.2800949e-03, -2.2715569e-02,  1.1011964e-03]], dtype=float32)>

In [27]:
# Number of distinct user ids present in the training data.
len(set(data["user_id"]))

10978