# Building a recommender system with embedding

In [48]:
%reload_ext jupyter_black

In [49]:
# Cleaning up the datasets
from typing import Iterable
import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, Any
import sklearn.model_selection


def load_min_data(filename: str | Iterable):
    dfs = []
    if isinstance(filename, str):
        filename = [filename]
    for fn in filename:
        df = pd.read_csv(fn)
        # All min-datasets have an index column which has to be dropped:
        dfs.append(df.drop(df.columns[0], axis=1))
    return dfs


def clean_customer_data(df):
    # df = df.drop("FN", axis=1) # I they're not exactly equal
    df.loc[
        ~df["fashion_news_frequency"].isin(["Regularly", "Monthly"]),
        "fashion_news_frequency",
    ] = "None"
    return df

In [50]:
import pandas as pd
import numpy as np

# df = pd.read_csv("dataset/transactions_train.csv", dtype={"article_id": str})
# df_np = df[["customer_id", "article_id"]].to_numpy()

Due to the size of the data, it's important to generate negative labels in an efficient way. The function `pandas.DataFrame.sample()` takes almost five seconds for each sample, which is called at least `n_negative` times, we instead transform the dataframe to a NumPy array. Below is a comparison to highlight the importance of working with simpler objects.

In [51]:
def time_pd_vs_np(n_negative, df):
    import time

    start_pd = time.time()
    num_written = 0
    tmpStr = "customer_id,article_id\n"
    while num_written < n_negative:
        # Choose random customer and article
        selection = np.array(  # TODO this can probably be optimized further
            [
                df["customer_id"].sample().values,
                df["article_id"].sample().values,
            ]
        ).flatten()
        if not (
            (df["customer_id"] == selection[0]) & (df["article_id"] == selection[1])
        ).any():
            tmpStr += f"{selection[0]}, {selection[1]}\n"
            num_written += 1
    with open("tmp.csv", "w") as f:
        f.write(tmpStr)
    df_negative = pd.read_csv("tmp.csv")
    os.remove("tmp.csv")
    time_pd = time.time() - start_pd

    # Numpy method
    start_np = time.time()
    df_np = df[["customer_id", "article_id"]].to_numpy()
    neg_np = np.empty((n_negative, df_np.shape[1]), dtype="<U64")
    for i in range(n_negative):
        legit = False
        while not legit:
            sample = [np.random.choice(df_np[:, col]) for col in range(df_np.shape[1])]
            legit = not ((df_np[:, 0] == sample[0]) & (df_np[:, 1] == sample[1])).any()
        neg_np[i, :] = sample
    time_np = time.time() - start_np

    return time_pd, time_np


def plot_negative_sampling(
    start,
    stop,
    step: int = 1,
    filename: str | None = None,
    persist_data: bool = True,
    cont_from_checkpoint: bool = True,
):
    import matplotlib.pyplot as plt
    from tqdm import tqdm
    import pickle

    xax = list(range(start, stop, step))

    if cont_from_checkpoint:
        with open("plotData.pckl", "rb") as f:
            plot_values = pickle.load(f)
        # Add empty list for keys not covered by checkpoint:
        computed = set([x_i for x_i in plot_values.keys()])
        to_add = set(xax) - computed
        for elem in to_add:
            plot_values[elem] = []
        # Skip those already computed
        xax = [x for x in xax if x not in computed]

    else:
        plot_values = {x_i: [] for x_i in xax}

    for n_negative in tqdm(xax):
        time_pd, time_np = time_pd_vs_np(n_negative)
        plot_values[n_negative].extend([time_pd, time_np])

        if persist_data:
            with open("plotData.pckl", "wb") as f:
                pickle.dump(plot_values, f)

    plt.plot(
        plot_values.keys(),
        plot_values.values(),
        label=[
            "pandas.DataFrame.Sample implementation",
            "NumPy.random.choice implementation",
        ],
    )
    plt.legend()
    plt.xlabel("Number of negative (generated) samples")
    plt.ylabel("Time [s]")
    plt.title("Comparison between sampling methods time")
    if filename is not None:
        plt.savefig(f"{filename}.pdf")
    plt.show()


# plot_negative_sampling(
#     start=1,
#     stop=50,
#     step=1,
#     filename="Comp_1_to_50",
#     persist_data=True,
#     cont_from_checkpoint=True,
# )

In [68]:
# Data loading


class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset
    no

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        batch_size: int,
        train_portion: float | None = None,
        test_portion: float | None = None,
        transform: Any = None,
        target_transform: Any = None,
    ) -> None:
        super().__init__()
        if train_portion is None:
            if test_portion is None:
                raise ValueError("Both train portion and test portion cannot be None.")
            self.train_portion = 1 - test_portion
        self.batch_size = batch_size
        self.df_id = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.train_portion = train_portion
        self.train, self.val = self.split_dataset()
        self.transform, self.target_transform = transform, target_transform

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = int(total_cases * (1 - portion_negatives))
        n_negative = total_cases - n_positive

        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]
        df_positive["label"] = 1

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we make a 2-column dataframe on same form as `df_positive`

        df_np = df_transactions[["customer_id", "article_id"]].to_numpy()
        neg_np = np.empty((n_negative, df_np.shape[1]), dtype="<U64")
        for i in range(n_negative):
            legit = False
            while not legit:
                sample = [
                    np.random.choice(df_np[:, col]) for col in range(df_np.shape[1])
                ]
                legit = not (
                    (df_np[:, 0] == sample[0]) & (df_np[:, 1] == sample[1])
                ).any()
            neg_np[i, :] = sample
        neg_np = np.column_stack((neg_np, [0] * neg_np.shape[0]))
        df_negative = pd.DataFrame(neg_np, columns=df_positive.columns)
        # Return a shuffled concatenation of the two dataframes
        return (
            pd.concat((df_positive, df_negative)).sample(frac=1).reset_index(drop=True)
        )

    def __len__(self):
        return len(self.df_id.index)

    def __getitem__(self, idx):
        row, label = self.df_id.iloc[idx, :-1].values, self.df_id.iloc[idx, -1].values
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split_dataset(self):
        """Split full data to train and validation Subset-objects

        Returns:
            Tuple[Subset, Subset]: Train and validation subsets
        """
        length = len(self)
        train_size = int(length * self.train_portion)
        valid_size = length - train_size
        train, val = torch.utils.data.random_split(self, [train_size, valid_size])
        return train, val

    def get_data_from_subset(subset):
        return subset.dataset.df_id.iloc[subset.indices]

    def get_DataLoader(self, trainDL: bool = True):
        subset = self.train if trainDL else self.val
        return DataLoader(dataset=subset, batch_size=self.batch_size)

In [53]:
# Embedding model (same model as Mind Data example)


class HM_model(torch.nn.Module):
    def __init__(self, num_customer, num_articles, embedding_size):
        super(HM_model, self).__init__()
        self.customer_embed = torch.nn.Embedding(
            num_embeddings=num_customer, embedding_dim=embedding_size
        )
        self.art_embed = torch.nn.Embedding(
            num_embeddings=num_articles, embedding_dim=embedding_size
        )

    def forward(self, customer_row, article_row):
        customer_embed = self.customer_embed(customer_row)
        art_embed = self.art_embed(article_row)
        dot_prod = torch.sum(torch.mul(customer_embed, art_embed), 1)
        return torch.sigmoid(dot_prod)

In [54]:
def train_one_epoch(model: HM_model, data: Data_HM, epoch_num: int, optimizer, loss):
    epoch_loss = 0
    for row in data.get_DataLoader(trainDL=True):
        customer_id, article_id, label = row
        optimizer.zero_grad()
        # TODO: MindData seem to pass along several IDs and not only one. Which is correct?
        pred = model(customer_id, article_id)
        loss_value = loss(pred.view(-1), label)
        loss_value.backward()
        optimizer.step()
        epoch_loss += loss_value
    print(f"\t| Training loss for epoch {epoch_num+1}: {epoch_loss}")


def train(model, data, params):
    # Uses binary cross entropy at the moment
    loss_metric = torch.nn.BCELoss()  # TODO change to MAP12 once the rest works
    optimizer = params.optimizer(
        model.parameters(), lr=params.lr_rate, weight_decay=params.weight_decay
    )
    for epoch in range(params.epochs):
        train_one_epoch(model, data, epoch, optimizer, loss_metric)
        if not epoch % params.validation_frequency:

            print(f"Provisory results for epoch {epoch+1}:")
            print(
                "Loss for training set",
                validate(model, data, train=True),
                sep="\t",
            )
            print(
                "Loss for validation set",
                validate(model, data, train=False),
                sep="\t",
            )
            print("-" * 20)


import utils.metrics as metric


def validate(model, data, train):
    with torch.no_grad():
        preds, labels = [], []
        for row in data.getDataLoader(trainDL=train):
            customer_id, article_id, label = row
            pred_i = model(customer_id, article_id).view(-1)
            preds.append(
                pred_i.detach().numpy()
            )  # TODO same case for our case? not sure
            labels.append(label.detach().numpy())
        return metric.MAPk(k=12, preds=preds, true=labels)

In [55]:
test = pd.DataFrame(
    {
        "customer_id": ["a", "b", "c", "c"],
        "article_id": ["0", "01", "04", "08"],
        "label": [0, 0, 1, 1],
    }
)
n_cust, n_art, _ = test.nunique()
print(n_cust, n_art)

3 4


In [57]:
from dataclasses import dataclass, asdict
from typing import Any


def main(use_min_dataset: bool = False):
    @dataclass
    class Hyperparameters:
        lr_rate: float = 1e-3
        weight_decay: str = 1e-5
        epochs: int = 100
        validation_frequency: int = 10
        optimizer: Any = torch.optim.Adam
        embedding_size: int = 5
        # Add more here...

    # Load data
    if use_min_dataset:
        df_c, df_a, df_t = load_min_data(  # TODO change this so we just pd.readcsv
            [
                f"dataset_sample/{n}_min.csv"
                for n in ("customer", "articles", "transactions")
            ]
        )
    else:
        df_c = pd.read_csv("dataset/customers.csv")
        # Articles IDs all start with 0 which disappears if cast to a number
        df_a = pd.read_csv("dataset/articles.csv", dtype={"article_id": str})
        df_t = pd.read_csv("dataset/transactions_train.csv", dtype={"article_id": str})
    df_c = clean_customer_data(df_c)

    # Transform to training and testing set
    dataset_params = {
        "total_cases": 20,
        "portion_negatives": 0.9,
        "df_transactions": df_t,
        "df_articles": df_a,
        "df_customers": df_c,
        "train_portion": 0.7,
        "batch_size": 5,
    }
    hyperparams = Hyperparameters()
    data = Data_HM(**dataset_params)
    n_cust, n_art, _ = data.df_id.nunique()
    model = HM_model(
        num_customer=n_cust,
        num_articles=n_art,
        embedding_size=hyperparams.embedding_size,
    )
    train(model, data, hyperparams)

    # Train, eval, save results and weights...


# main()
@dataclass
class Hyperparameters:
    lr_rate: float = 1e-3
    weight_decay: str = 1e-5
    epochs: int = 100
    validation_frequency: int = 10
    optimizer: Any = torch.optim.Adam
    embedding_size: int = 5
    # Add more here...


# Load data

df_c = pd.read_csv("dataset/customers.csv")
# Articles IDs all start with 0 which disappears if cast to a number
df_a = pd.read_csv("dataset/articles.csv", dtype={"article_id": str})
df_t = pd.read_csv("dataset/transactions_train.csv", dtype={"article_id": str})
df_c = clean_customer_data(df_c)

# Transform to training and testing set
dataset_params = {
    "total_cases": 20,
    "portion_negatives": 0.9,
    "df_transactions": df_t,
    "df_articles": df_a,
    "df_customers": df_c,
    "train_portion": 0.7,
    "batch_size": 5,
}
hyperparams = Hyperparameters()
data = Data_HM(**dataset_params)
n_cust, n_art, _ = data.df_id.nunique()
model = HM_model(
    num_customer=n_cust,
    num_articles=n_art,
    embedding_size=hyperparams.embedding_size,
)
# train(model, data, hyperparams)

## Debug

AttributeError: 'HM_model' object has no attribute 'getDataLoader'

In [73]:
row, lab = data.__getitem__(2)
row.values

array(['61afed752a3e8fbf7d69c393156b19722a8c3ad5d01738e80ed821f50f919b58',
       '0693243016'], dtype=object)

In [65]:
dl = data.get_DataLoader(trainDL=True)
for el in dl:
    pass

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'pandas.core.series.Series'>