# Building a recommender system with embedding

In [1]:
%reload_ext jupyter_black

Imports

In [2]:
import os

# os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable use of gpu
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import fastai.collab, torch
from typing import Any, Tuple
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import LabelEncoder
import pickle

Import dataset from main embedding notebook

In [3]:
# Data loading

# Data_HM definition has to be here for pickle to understand what object it loads in
class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset
    no

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        batch_size: int,
        train_portion: float | None = None,
        test_portion: float | None = None,
        transform: Any = None,
        target_transform: Any = None,
    ) -> None:
        super().__init__()
        if train_portion is None:
            if test_portion is None:
                raise ValueError("Both train portion and test portion cannot be None.")
            self.train_portion = 1 - test_portion
        self.batch_size = batch_size
        self.df_id = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.train_portion = train_portion
        self.train, self.val = self.split_dataset()
        self.transform, self.target_transform = transform, target_transform

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = round(total_cases * (1 - portion_negatives))
        n_negative = total_cases - n_positive

        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]
        df_positive["label"] = 1

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we make a 2-column dataframe on same form as `df_positive`

        df_np = df_transactions[["customer_id", "article_id"]].to_numpy()
        neg_np = np.empty((n_negative, df_np.shape[1]), dtype="<U64")
        for i in range(n_negative):
            legit = False
            while not legit:
                sample = [
                    np.random.choice(df_np[:, col]) for col in range(df_np.shape[1])
                ]
                legit = not (
                    (df_np[:, 0] == sample[0]) & (df_np[:, 1] == sample[1])
                ).any()
            neg_np[i, :] = sample
        neg_np = np.column_stack((neg_np, [0] * neg_np.shape[0]))
        df_negative = pd.DataFrame(neg_np, columns=df_positive.columns)
        # Return a shuffled concatenation of the two dataframes
        full_data = (
            pd.concat((df_positive, df_negative)).sample(frac=1).reset_index(drop=True)
        )

        # Make label encodings of the IDs
        le_cust = LabelEncoder()
        le_art = LabelEncoder()
        le_cust.fit(full_data["customer_id"])
        le_art.fit(full_data["article_id"])
        cust_encode = le_cust.transform(full_data["customer_id"])
        art_encode = le_art.transform(full_data["article_id"])
        return pd.DataFrame(
            data={
                "customer_id": cust_encode,
                "article_id": art_encode,
                "label": full_data["label"].astype(np.uint8),
            }
        )

    def __len__(self):
        return len(self.df_id.index)

    def __getitem__(self, idx):
        row, label = self.df_id.iloc[idx, :-1].values, self.df_id.iloc[idx, -1]
        label = int(label)  # Stored as str initially
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split_dataset(self):
        """Split full data to train and validation Subset-objects

        Returns:
            Tuple[Subset, Subset]: Train and validation subsets
        """
        length = len(self)
        train_size = int(length * self.train_portion)
        valid_size = length - train_size
        train, val = torch.utils.data.random_split(self, [train_size, valid_size])
        return train, val

    def get_data_from_subset(self, subset: torch.utils.data.Subset):
        """Not in use currently, but can retrieve data from Subset object directly"""
        return subset.dataset.df_id.iloc[subset.indices]

    def get_DataLoader(self, trainDL: bool = True):
        subset = self.train if trainDL else self.val
        return DataLoader(dataset=subset, batch_size=self.batch_size)

In [4]:
def read_dataset_obj(src: str) -> Any:

    with open(src, "rb") as f:
        data = pickle.load(f)
    return data


dataset = read_dataset_obj("object_storage/HM_data.pckl")
dls = fastai.collab.CollabDataLoaders.from_df(dataset.df_id, bs=8).to("cpu")
# Total of 2000 articles to use here

Model specifications

In [56]:
class HM_baseline(torch.nn.Module):
    """Collborative filtering model with bias,
    Modified to work with fastai
    """

    def __init__(self, num_customer, num_articles, embedding_size):
        super(HM_baseline, self).__init__()
        self.customer_embed = torch.nn.Embedding(
            num_embeddings=num_customer + 1, embedding_dim=embedding_size
        )
        self.art_embed = torch.nn.Embedding(
            num_embeddings=num_articles + 1, embedding_dim=embedding_size
        )
        # self.customer_bias = torch.nn.Embedding(num_customer, 1)
        # self.article_bias = torch.nn.Embedding(num_articles, 1)

    def forward(self, row):
        try:
            customer_embed = self.customer_embed(row[:, 0])
            art_embed = self.art_embed(row[:, 1])
        except:
            print("FAIIIL")
            print(f"{row[:,0].shape = }")
            print(f"{row[:,1].shape = }")
            raise IndexError
        finally:

            dot_prod = (customer_embed * art_embed).sum(dim=1, keepdim=True)
            # Add bias nodes to model:
            # dot_prod = (
            #     dot_prod + self.customer_bias(row[:, 0]) + self.article_bias(row[:, 1])
            # )
            return torch.sigmoid(dot_prod).to("cpu")

Training

In [57]:
# Revert to basic fitting scheme....
dataset = read_dataset_obj("object_storage/HM_data.pckl")
n_cust = len(dls.classes["customer_id"])
n_art = len(dls.classes["article_id"])
dls = fastai.collab.CollabDataLoaders.from_df(dataset.df_id, bs=64).to("cpu")
model = HM_baseline(n_cust, n_art, 50)
learner = fastai.collab.Learner(dls, model, loss_func=fastai.losses.BCELossFlat())
learner.fit_one_cycle(5, 1e-3)

epoch,train_loss,valid_loss,time


FAIIIL
row[:,0].shape = torch.Size([64])
row[:,1].shape = torch.Size([64])


UnboundLocalError: local variable 'art_embed' referenced before assignment

In [18]:
# n_cust, n_art, _ = dls.all_cols.nunique()
from dataclasses import dataclass


@dataclass
class ModelParameters:
    learn_rate_max: float = 0.01
    weight_decay: float = 0.1
    epochs: int = 5
    emb_size: int = 600
    batch_size: int = 64
    lr_red_patience: int = 1
    lr_red_factor = 10


def test_hyperparams(params: ModelParameters):
    assert (
        params.emb_size >= params.batch_size
    ), "Embedding size cannot be smaller than batch size."
    dataset = read_dataset_obj("object_storage/HM_data.pckl")
    n_cust = len(dls.classes["customer_id"])
    n_art = len(dls.classes["article_id"])
    device = "cpu" if torch.cuda.is_available() else "cpu"
    learner = fastai.collab.Learner(
        dls=fastai.collab.CollabDataLoaders.from_df(
            dataset.df_id, bs=params.batch_size
        ).to(device),
        model=HM_baseline(n_cust, n_art, params.emb_size).to(device),
        loss_func=fastai.losses.BCELossFlat(),
        cbs=[
            fastai.callback.tracker.SaveModelCallback(),
            fastai.callback.tracker.ReduceLROnPlateau(
                patience=params.lr_red_patience, factor=params.lr_red_factor
            ),
        ],
    )
    learner.fit_one_cycle(
        n_epoch=params.epochs, lr_max=params.learn_rate_max, wd=params.weight_decay
    )


def main():
    print("Testing WEIGHT DECAY")
    for wd in (1e-4, 1e-3, 1e-2, 1e-1):
        param = ModelParameters(weight_decay=wd)
        test_hyperparams(param)
    print("Testing EMBEDDING SIZE")
    for emb_sz in (100, 500, 1000, 1e5):
        param = ModelParameters(emb_size=emb_sz)
        test_hyperparams(param)
    print("Testing BATCH SIZE")
    for batch_size in (1, 8, 32, 64, 128):
        param = ModelParameters(batch_size=batch_size)
        test_hyperparams(param)


main()

Testing WEIGHT DECAY


epoch,train_loss,valid_loss,time


IndexError: index out of range in self