# Building a recommender system with embedding

In [1]:
%reload_ext jupyter_black

In [2]:
# Cleaning up the datasets
from typing import Iterable
import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, Any
import sklearn.model_selection


def load_min_data(filename: str | Iterable):
    dfs = []
    if isinstance(filename, str):
        filename = [filename]
    for fn in filename:
        df = pd.read_csv(fn)
        # All min-datasets have an index column which has to be dropped:
        dfs.append(df.drop(df.columns[0], axis=1))
    return dfs


def clean_customer_data(df):
    # df = df.drop("FN", axis=1) # I they're not exactly equal
    df.loc[
        ~df["fashion_news_frequency"].isin(["Regularly", "Monthly"]),
        "fashion_news_frequency",
    ] = "None"
    return df

Due to the size of the data, it's important to generate negative labels in an efficient way. The function `pandas.DataFrame.sample()` takes almost five seconds for each sample, which is called at least `n_negative` times, we instead transform the dataframe to a NumPy array. Below is a comparison to highlight the importance of working with simpler objects.

In [3]:
def time_pd_vs_np(n_negative, df) -> Tuple[float, float]:
    """Compute time it takes to sample n_negative negative transactions

    Args:
        n_negative (int): Number of negative samples
        df (pd.DataFrame): Dataframe to sample from, requires columns 'customer_id' and 'article_id'

    Returns:
        Tuple[float, float]: Time taken using Pandas objects (first value) and NumPy objects (second value)
    """
    import time

    start_pd = time.time()
    num_written = 0
    tmpStr = "customer_id,article_id\n"
    while num_written < n_negative:
        # Choose random customer and article
        selection = np.array(
            [
                df["customer_id"].sample().values,
                df["article_id"].sample().values,
            ]
        ).flatten()
        if not (
            (df["customer_id"] == selection[0]) & (df["article_id"] == selection[1])
        ).any():
            tmpStr += f"{selection[0]}, {selection[1]}\n"
            num_written += 1
    with open("tmp.csv", "w") as f:
        f.write(tmpStr)
    df_negative = pd.read_csv("tmp.csv")
    os.remove("tmp.csv")
    time_pd = time.time() - start_pd

    # Numpy method
    start_np = time.time()
    df_np = df[["customer_id", "article_id"]].to_numpy()
    neg_np = np.empty((n_negative, df_np.shape[1]), dtype="<U64")
    for i in range(n_negative):
        legit = False
        while not legit:
            sample = [np.random.choice(df_np[:, col]) for col in range(df_np.shape[1])]
            legit = not ((df_np[:, 0] == sample[0]) & (df_np[:, 1] == sample[1])).any()
        neg_np[i, :] = sample
    time_np = time.time() - start_np

    return time_pd, time_np


def plot_negative_sampling(
    start: int,
    stop: int,
    step: int = 1,
    filename: str | None = None,
    persist_data: bool = True,
    cont_from_checkpoint: bool = True,
) -> None:
    """Plot the outputs of `time_pd_vs_np` for different ranges of n_negative

    Args:
        start (int): Range of n_negative (inclusive)
        stop (int): Range of n_negative (exclusive)
        step (int, optional): Step in range of n_negative. Defaults to 1.
        filename (str | None, optional): Plot output file name, if None, does not save file. Defaults to None.
        persist_data (bool, optional): Serialization option to store each iterate's result. Defaults to True.
        cont_from_checkpoint (bool, optional): Reads previous runs and doesn't recompute if done before.
                                                Defaults to True.
    """
    import matplotlib.pyplot as plt
    from tqdm import tqdm
    import pickle

    xax = list(range(start, stop, step))

    if cont_from_checkpoint:
        with open("plotData.pckl", "rb") as f:
            plot_values = pickle.load(f)

        # Add empty list for keys not covered by checkpoint:
        computed = set([x_i for x_i in plot_values.keys()])
        to_add = set(xax) - computed
        for elem in to_add:
            plot_values[elem] = []

        # Skip those already computed
        xax = [x for x in xax if x not in computed]

    else:
        plot_values = {x_i: [] for x_i in xax}

    for n_negative in tqdm(xax):
        time_pd, time_np = time_pd_vs_np(n_negative)
        plot_values[n_negative].extend([time_pd, time_np])

        if persist_data:
            with open("plotData.pckl", "wb") as f:
                pickle.dump(plot_values, f)

    plt.plot(
        plot_values.keys(),
        plot_values.values(),
        label=[
            "pandas.DataFrame.sample implementation",
            "NumPy.random.choice implementation",
        ],
    )
    plt.legend()
    plt.xlabel("Number of negative (generated) samples")
    plt.ylabel("Time [s]")
    plt.title("Comparison between sampling methods time")
    if filename is not None:
        plt.savefig(f"{filename}.pdf")
    plt.show()


# plot_negative_sampling(
#     start=1,
#     stop=50,
#     step=1,
#     filename="Comp_1_to_50",
#     persist_data=True,
#     cont_from_checkpoint=True,
# )

In [4]:
# Data loading
from sklearn.preprocessing import LabelEncoder


class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset
    no

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        batch_size: int,
        train_portion: float | None = None,
        test_portion: float | None = None,
        transform: Any = None,
        target_transform: Any = None,
    ) -> None:
        super().__init__()
        if train_portion is None:
            if test_portion is None:
                raise ValueError("Both train portion and test portion cannot be None.")
            self.train_portion = 1 - test_portion
        self.batch_size = batch_size
        self.df_id = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.train_portion = train_portion
        self.train, self.val = self.split_dataset()
        self.transform, self.target_transform = transform, target_transform

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = round(total_cases * (1 - portion_negatives))
        n_negative = total_cases - n_positive

        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]
        df_positive["label"] = 1

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we make a 2-column dataframe on same form as `df_positive`

        df_np = df_transactions[["customer_id", "article_id"]].to_numpy()
        neg_np = np.empty((n_negative, df_np.shape[1]), dtype="<U64")
        for i in range(n_negative):
            legit = False
            while not legit:
                sample = [
                    np.random.choice(df_np[:, col]) for col in range(df_np.shape[1])
                ]
                legit = not (
                    (df_np[:, 0] == sample[0]) & (df_np[:, 1] == sample[1])
                ).any()
            neg_np[i, :] = sample
        neg_np = np.column_stack((neg_np, [0] * neg_np.shape[0]))
        df_negative = pd.DataFrame(neg_np, columns=df_positive.columns)
        # Return a shuffled concatenation of the two dataframes
        full_data = (
            pd.concat((df_positive, df_negative)).sample(frac=1).reset_index(drop=True)
        )

        # Make label encodings of the IDs
        le_cust = LabelEncoder()
        le_art = LabelEncoder()
        le_cust.fit(full_data["customer_id"])
        le_art.fit(full_data["article_id"])
        cust_encode = le_cust.transform(full_data["customer_id"])
        art_encode = le_art.transform(full_data["article_id"])
        return pd.DataFrame(
            data={
                "customer_id": cust_encode,
                "article_id": art_encode,
                "label": full_data["label"].astype(np.uint8),
            }
        )

    def __len__(self):
        return len(self.df_id.index)

    def __getitem__(self, idx):
        row, label = self.df_id.iloc[idx, :-1].values, self.df_id.iloc[idx, -1]
        label = int(label)  # Stored as str for some reason
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split_dataset(self):
        """Split full data to train and validation Subset-objects

        Returns:
            Tuple[Subset, Subset]: Train and validation subsets
        """
        length = len(self)
        train_size = int(length * self.train_portion)
        valid_size = length - train_size
        train, val = torch.utils.data.random_split(self, [train_size, valid_size])
        return train, val

    def get_data_from_subset(subset: torch.utils.data.Subset):
        """Not in use currently, but can retrieve data from Subset object directly"""
        return subset.dataset.df_id.iloc[subset.indices]

    def get_DataLoader(self, trainDL: bool = True):
        subset = self.train if trainDL else self.val
        return DataLoader(dataset=subset, batch_size=self.batch_size)

In [5]:
# Embedding model (same model as Mind Data example)


class HM_model(torch.nn.Module):
    def __init__(self, num_customer, num_articles, embedding_size):
        super(HM_model, self).__init__()
        self.customer_embed = torch.nn.Embedding(
            num_embeddings=num_customer, embedding_dim=embedding_size
        )
        self.art_embed = torch.nn.Embedding(
            num_embeddings=num_articles, embedding_dim=embedding_size
        )

    def forward(self, customer_row, article_row):
        customer_embed = self.customer_embed(customer_row)
        art_embed = self.art_embed(article_row)
        dot_prod = torch.sum(torch.mul(customer_embed, art_embed), 1)
        return torch.sigmoid(dot_prod)

In [38]:
def train_one_epoch(model: HM_model, data: Data_HM, epoch_num: int, optimizer, loss):
    epoch_loss = 0
    for row, label in data.get_DataLoader(trainDL=True):
        optimizer.zero_grad()
        pred = model(row[:, 0], row[:, 1])
        loss_value = loss(pred.view(-1), torch.FloatTensor(label.tolist()))
        loss_value.backward()
        optimizer.step()
        epoch_loss += loss_value
    print(f"\t| Training loss for epoch {epoch_num+1}: {epoch_loss}")


def train(model, data, params):
    # Uses binary cross entropy at the moment
    loss_metric = torch.nn.BCELoss()
    optimizer = params.optimizer(
        model.parameters(), lr=params.lr_rate, weight_decay=params.weight_decay
    )
    for epoch in range(params.epochs):
        train_one_epoch(model, data, epoch, optimizer, loss_metric)
        if not epoch % params.validation_frequency:

            print(f"Provisory results for epoch {epoch+1}:")
            print(
                "Loss for training set",
                validate(model, data, train=True),
                sep="\t",
            )
            print(
                "Loss for validation set",
                validate(model, data, train=False),
                sep="\t",
            )
            print("-" * 20)


import utils.metrics as metric
import importlib

importlib.reload(metric)


def validate(model, data, train):
    # return # MAPk does not expect batch results so this doesn't work yet
    with torch.no_grad():
        preds, labels = [], []
        for row, label in data.get_DataLoader(trainDL=train):
            pred_i = model(row[:, 0], row[:, 1]).view(-1)
            preds.append(pred_i.detach().numpy())
            labels.append(label.detach().numpy())
        print(f"{preds = }, {labels = }")

        return metric.MAPk(
            k=3, preds=np.array(preds), true=np.array(labels)
        )  # TODO adjust to k=12
def save_dataset_obj(data: HM_model, dst: str) -> None:
    import pickle
    with open(dst, 'wb') as f: 
        pickle.dump(data, f)
def read_dataset_obj(src: str) -> None:
    import pickle
    with open(src, 'rb') as f:
        pickle.load(f)

In [8]:
from dataclasses import dataclass, asdict
from typing import Any


def main(use_min_dataset: bool = False):
    @dataclass
    class Hyperparameters:
        lr_rate: float = 1e-3  # TODO consider dynamically changing lr
        weight_decay: str = 1e-5
        epochs: int = 100
        validation_frequency: int = 10
        optimizer: Any = torch.optim.Adam
        embedding_size: int = 5
        # Add more here...

    # Load data
    if use_min_dataset:
        df_c, df_a, df_t = load_min_data(
            [
                f"dataset_sample/{n}_min.csv"
                for n in ("customer", "articles", "transactions")
            ]
        )
    else:
        df_c = pd.read_csv("dataset/customers.csv")
        # Articles IDs all start with 0 which disappears if cast to a number
        df_a = pd.read_csv("dataset/articles.csv", dtype={"article_id": str})
        df_t = pd.read_csv("dataset/transactions_train.csv", dtype={"article_id": str})
    df_c = clean_customer_data(df_c)

    # Transform to training and testing set
    dataset_params = {
        "total_cases": 20,
        "portion_negatives": 0.9,
        "df_transactions": df_t,
        "df_articles": df_a,
        "df_customers": df_c,
        "train_portion": 0.7,
        "batch_size": 5,
    }
    hyperparams = Hyperparameters()
    data = Data_HM(**dataset_params)
    n_cust, n_art, _ = data.df_id.nunique()
    model = HM_model(
        num_customer=n_cust,
        num_articles=n_art,
        embedding_size=hyperparams.embedding_size,
    )
    train(model, data, hyperparams)

    # Train, eval, save results and weights...


# main()
@dataclass
class Hyperparameters:
    lr_rate: float = 1e-3
    weight_decay: str = 1e-5
    epochs: int = 100
    validation_frequency: int = 10
    optimizer: Any = torch.optim.Adam
    embedding_size: int = 5
    # Add more here...


# Load data

df_c = pd.read_csv("dataset/customers.csv")
# Articles IDs all start with 0 which disappears if cast to a number
df_a = pd.read_csv("dataset/articles.csv", dtype={"article_id": str})
df_t = pd.read_csv("dataset/transactions_train.csv", dtype={"article_id": str})
df_c = clean_customer_data(df_c)

# Transform to training and testing set
dataset_params = {
    "total_cases": 2000,
    "portion_negatives": 0.9,
    "df_transactions": df_t,
    "df_articles": df_a,
    "df_customers": df_c,
    "train_portion": 0.7,
    "batch_size": 5,
}
hyperparams = Hyperparameters()
data = Data_HM(**dataset_params)
save_dataset_obj(data, "object_storage/HM_data.pckl")
n_cust, n_art, _ = data.df_id.nunique()
model = HM_model(
    num_customer=n_cust,
    num_articles=n_art,
    embedding_size=hyperparams.embedding_size,
)
# train(model, data, hyperparams)

## Debug

In [39]:
train(model, data, hyperparams)

	| Training loss for epoch 1: 3.042361259460449
Provisory results for epoch 1:
preds = [array([0.98983264, 0.43170428, 0.05614689, 0.26709473, 0.0503511 ],
      dtype=float32), array([7.1629530e-01, 9.9749869e-01, 4.9718261e-01, 9.6894181e-01,
       1.7172622e-04], dtype=float32), array([0.48154533, 0.08522618, 0.01036763, 0.41108695], dtype=float32)], labels = [array([0, 0, 0, 0, 0], dtype=int64), array([0, 0, 0, 1, 0], dtype=int64), array([0, 0, 0, 0], dtype=int64)]
Loss for training set	0.0
preds = [array([0.94313633, 0.6272779 , 0.17086524, 0.04010845, 0.004976  ],
      dtype=float32), array([0.31926334], dtype=float32)], labels = [array([1, 0, 0, 0, 0], dtype=int64), array([0], dtype=int64)]
Loss for validation set	0.0
--------------------
	| Training loss for epoch 2: 3.0270113945007324
	| Training loss for epoch 3: 3.012789011001587
	| Training loss for epoch 4: 2.9987945556640625
	| Training loss for epoch 5: 2.9849116802215576
	| Training loss for epoch 6: 2.971127033233642

  k=3, preds=np.array(preds), true=np.array(labels)


	| Training loss for epoch 58: 2.340359926223755
	| Training loss for epoch 59: 2.329638719558716
	| Training loss for epoch 60: 2.318960666656494
	| Training loss for epoch 61: 2.3083279132843018
Provisory results for epoch 61:
preds = [array([0.9763106 , 0.3202147 , 0.02399621, 0.14680271, 0.0233381 ],
      dtype=float32), array([5.2045709e-01, 9.9311358e-01, 3.5671180e-01, 9.8655707e-01,
       1.3778529e-04], dtype=float32), array([0.36568642, 0.04001066, 0.00471401, 0.25767764], dtype=float32)], labels = [array([0, 0, 0, 0, 0], dtype=int64), array([0, 0, 0, 1, 0], dtype=int64), array([0, 0, 0, 0], dtype=int64)]
Loss for training set	0.0
preds = [array([0.85305727, 0.59002054, 0.27670065, 0.1081333 , 0.02364885],
      dtype=float32), array([0.33123136], dtype=float32)], labels = [array([1, 0, 0, 0, 0], dtype=int64), array([0], dtype=int64)]
Loss for validation set	0.0
--------------------
	| Training loss for epoch 62: 2.2977380752563477
	| Training loss for epoch 63: 2.287191390