# Building a recommender system with embedding

In [1]:
%reload_ext jupyter_black

In [2]:
# Cleaning up the datasets
from typing import Iterable


def load_min_data(filename: str | Iterable):
    dfs = []
    if isinstance(filename, str):
        filename = [filename]
    for fn in filename:
        df = pd.read_csv(fn)
        # All min-datasets have an index column which has to be dropped:
        dfs.append(df.drop(df.columns[0], axis=1))
    return dfs


def clean_customer_data(df):
    # df = df.drop("FN", axis=1) # I they're not exactly equal
    df.loc[
        ~df["fashion_news_frequency"].isin(["Regularly", "Monthly"]),
        "fashion_news_frequency",
    ] = "None"
    return df

In [3]:
# New data loading principle
import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Tuple, Any


class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        train_portion: float | None = None,
        test_portion: float | None = None,
        transform: Any = None,
        target_transform: Any = None,
    ) -> None:
        super().__init__()  # TODO not sure if we need this
        self.pos, self.neg = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.df = pd.concat(
            [
                self.merge_dfs_add_label(
                    self.pos,
                    df_articles,
                    df_customers,
                    positive=True,
                ),
                self.merge_dfs_add_label(
                    self.neg,
                    df_articles,
                    df_customers,
                    positive=False,
                ),
            ]
        ).reset_index(drop=True)
        self.train, self.test = self.split(train_portion, test_portion)
        self.transform, self.target_transform = transform, target_transform

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = int(total_cases * (1 - portion_negatives))
        n_negative = int(total_cases * portion_negatives)
        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we write this tuple to a csv which is transformed into a DataFrame similar to `df_positive`

        num_written = 0
        tmpStr = "customer_id,article_id\n"
        while num_written < n_negative:
            # Choose random customer and article
            selection = np.array(  # TODO this can probably be optimized further
                [
                    df_transactions["customer_id"].sample().values,
                    df_transactions["article_id"].sample().values,
                ]
            ).flatten()
            if not (
                (df_transactions["customer_id"] == selection[0])
                & (df_transactions["article_id"] == selection[1])
            ).any():
                tmpStr += f"{selection[0]}, {selection[1]}\n"
                num_written += 1
        with open("tmp.csv", "w") as f:
            f.write(tmpStr)
        df_negative = pd.read_csv("tmp.csv")
        os.remove("tmp.csv")
        return df_positive, df_negative

    def merge_dfs_add_label(
        self,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        positive: bool = False,
    ) -> pd.DataFrame:
        """Merge customer and article data to the sampled data `df_transactions`, excluding customer/article IDs

        Args:
            df_transactions (pd.DataFrame): DataFrame from `generate_dataset`
            df_articles (pd.DataFrame): Articles DataFrame
            df_customers (pd.DataFrame): Customers DataFrame
            positive (bool, optional): Wether or not df_transactions represent positive labels. Defaults to False.

        Returns:
            pd.DataFrame: DF with all columns included
        """
        columns_articles = [
            "article_id",
            "prod_name",
            "product_type_name",
            "product_group_name",
            "graphical_appearance_name",
            "colour_group_name",
            "perceived_colour_value_name",
            "perceived_colour_master_name",
            "department_name",
            "index_name",
            "index_group_name",
            "section_name",
            "garment_group_name",
            "detail_desc",
        ]
        # TODO consider storing blacklisted cols instead of whitelisted

        df_articles = df_articles[columns_articles]

        df = pd.merge(
            df_transactions, df_customers, how="inner", on=["customer_id"]
        ).drop(["customer_id"], axis=1)
        df = pd.merge(df, df_articles, how="inner", on=["article_id"]).drop(
            ["article_id"], axis=1
        )
        df["label"] = 1 if positive else 0
        return df

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        row, label = self.df.iloc[idx, :-1], self.df.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split(
        self, train_portion: float | None = None, test_portion: float | None = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split full dataset into training and validation set. Note that only one of train_portion or
            test_portion are required (test_portion = 100% - test_portion)

        Args:
            train_portion (float | None, optional): Percentage of rows assigned to training set. Defaults to None.
            test_portion (float | None, optional): Percentage of rows assigned to validation set. Defaults to None.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Train-set and validation-set
        """
        assert any(
            [train_portion, test_portion]
        ), "At least one of train or test portion must be float"
        if train_portion is None:
            train_portion = 1 - test_portion
        train = self.df.sample(frac=train_portion)
        test = (
            pd.merge(self.df, train, indicator=True, how="outer")
            .query('_merge=="left_only"')
            .drop("_merge", axis=1)
        )
        return train.reset_index(drop=True), test.reset_index(drop=True)


class HM_train(Data_HM):
    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
        transform=None,
        target_transform=None,
    ) -> None:
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
            transform,
            target_transform,
        )

    def __getitem__(self, idx):
        row, label = self.train.iloc[idx, :-1], self.train.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label


class HM_val(Data_HM):
    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
        transform=None,
        target_transform=None,
    ) -> None:
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
            transform,
            target_transform,
        )

    def __getitem__(self, idx):
        row, label = self.test.iloc[idx, :-1], self.test.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

In [4]:
# Embedding models (same model as Mind Data example)


class HM_model(torch.nn.Module):
    def __init__(self, num_customer, num_articles, embedding_size):
        super(HM_model, self).__init__()
        self.customer_embed = torch.nn.Embedding(
            num_embeddings=num_customer, embedding_dim=embedding_size
        )
        self.art_embed = (
            torch.nn.Embedding(  # TODO shouldn't this be article embeddings?
                num_embeddings=num_articles, embedding_dim=embedding_size
            )
        )

    def forward(self, row):
        row_embed = self.customer_embed(row)
        # art_embed = self.art_embed(items)
        # dot_prod = torch.sum(torch.mul(customer_embed, art_embed), 1)
        # return torch.sigmoid(dot_prod)
        return torch.sigmoid(row_embed)

In [20]:
def train_one_epoch(model: HM_model, data, epoch_num: int, optimizer, loss):
    epoch_loss = 0
    for batch, row in enumerate(
        data
    ):  # TODO not sure if we can enumerate DataLoader like that
        row, labels = row[:-1], row[-1]  # TODO probably wont work
        optimizer.zero_grad()
        print(row[0].values)
        pred = model(torch.tensor(row[0].values))
        loss_value = loss(pred.view(-1), labels)
        loss_value.backward()
        optimizer.step()
        epoch_loss += loss_value
    print(f"\t| Training loss for epoch {epoch_num+1}: {epoch_loss}")


def train(model, train_DL, val_DL, params):
    # Uses binary cross entropy at the moment
    loss_metric = torch.nn.BCELoss()  # TODO change to MAP12 once the rest works
    optimizer = params.optimizer(
        model.parameters(), lr=params.lr_rate, weight_decay=params.weight_decay
    )
    for epoch in range(params.epochs):
        train_one_epoch(model, train_DL, epoch, optimizer, loss_metric)
        if not epoch % params.validation_frequency:

            print(f"Provisory results for epoch {epoch+1}:")
            print(
                "MAP12 for training set",
                validate(model, train_DL, train=True),
                sep="\t",
            )
            print(
                "MAP12 for validation set",
                validate(model, val_DL, train=False),
                sep="\t",
            )
            print("-" * 20)


import utils.metrics as metric


def validate(model, val_DL, train):
    with torch.no_grad():
        preds, labels = [], []
        for row in val_DL:
            row, label = row[:-1], row[-1]  # TODO same case as train
            pred_i = model(row).view(-1)
            preds.append(
                pred_i.detach().numpy()
            )  # TODO same case for our case? not sure
            labels.append(label.detach().numpy())
        return metric.MAPk(k=12, preds=preds, true=labels)

In [21]:
from dataclasses import dataclass, asdict
from typing import Any


def main():
    @dataclass
    class Hyperparameters:
        lr_rate: float = 1e-3
        weight_decay: str = 1e-5
        epochs: int = 100
        validation_frequency: int = 10
        optimizer: Any = torch.optim.Adam
        # Add more here...

    # Load data
    df_c, df_a, df_t = load_min_data(
        [
            f"dataset_sample/{n}_min.csv"
            for n in ("customer", "articles", "transactions")
        ]
    )
    df_c = clean_customer_data(df_c)

    # Transform to training and testing set
    dataset_params = {
        "total_cases": 20,
        "portion_negatives": 0.9,
        "df_transactions": df_t,
        "df_articles": df_a,
        "df_customers": df_c,
        "train_portion": 0.7,
    }
    data_train = HM_train(**dataset_params)
    data_test = HM_val(**dataset_params)

    model = HM_model(num_customer=20, num_articles=20, embedding_size=5)
    train(model, data_train, data_test, Hyperparameters())

    # Train, eval, save results and weights...


main()

[nan nan 'ACTIVE' 'None' 48.0
 '0f23f9b1e451204de97aca27b98e60c089c8f7ed13e6d89517bbae16af70b27f'
 'Bradley trousers' 'Trousers' 'Garment Lower body' 'Solid' 'Black' 'Dark'
 'Black' 'Jersey Basic' 'Ladieswear' 'Ladieswear' 'Womens Everyday Basics'
 'Jersey Basic'
 'Joggers in lightweight sweatshirt fabric made from a cotton blend with covered elastication and a drawstring at the waist. Tapered legs with jersey ribbing at the hems.']


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.