# Building a recommender system with embedding

In [1]:
%reload_ext jupyter_black

In [2]:
# Cleaning up the datasets
from typing import Iterable


def load_min_data(filename: str | Iterable):
    dfs = []
    if isinstance(filename, str):
        filename = [filename]
    for fn in filename:
        df = pd.read_csv(fn)
        # All min-datasets have an index column which has to be dropped:
        dfs.append(df.drop(df.columns[0], axis=1))
    return dfs


def clean_customer_data(df):
    """Normalise the ``fashion_news_frequency`` column in place.

    Every value other than "Regularly" or "Monthly" (missing values included)
    is replaced by the string "None".

    Args:
        df: Customers DataFrame; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    # NOTE(review): the "FN" column is deliberately kept (an earlier drop was
    # disabled) - presumably because it is not an exact duplicate of another
    # column; confirm before removing it.
    column = "fashion_news_frequency"
    is_known = df[column].isin(["Regularly", "Monthly"])
    df.loc[~is_known, column] = "None"
    return df

In [3]:
# New data loading principle
import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Tuple


class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        train_portion: float | None = None,
        test_portion: float | None = None,
    ) -> None:
        super().__init__()  # TODO not sure if we need this
        self.pos, self.neg = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.df = pd.concat(
            [
                self.merge_dfs_add_label(
                    self.pos,
                    df_articles,
                    df_customers,
                    positive=True,
                ),
                self.merge_dfs_add_label(
                    self.neg,
                    df_articles,
                    df_customers,
                    positive=False,
                ),
            ]
        ).reset_index(drop=True)
        self.train, self.test = self.split(train_portion, test_portion)

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = int(total_cases * (1 - portion_negatives))
        n_negative = int(total_cases * portion_negatives)
        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we write this tuple to a csv which is transformed into a DataFrame similar to `df_positive`

        num_written = 0
        tmpStr = "customer_id,article_id\n"
        while num_written < n_negative:
            # Choose random customer and article
            selection = np.array(  # TODO this can probably be optimized further
                [
                    df_transactions["customer_id"].sample().values,
                    df_transactions["article_id"].sample().values,
                ]
            ).flatten()
            if not (
                (df_transactions["customer_id"] == selection[0])
                & (df_transactions["article_id"] == selection[1])
            ).any():
                tmpStr += f"{selection[0]}, {selection[1]}\n"
                num_written += 1
        with open("tmp.csv", "w") as f:
            f.write(tmpStr)
        df_negative = pd.read_csv("tmp.csv")
        os.remove("tmp.csv")
        return df_positive, df_negative

    def merge_dfs_add_label(
        self,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        positive: bool = False,
    ) -> pd.DataFrame:
        """Merge customer and article data to the sampled data `df_transactions`, excluding customer/article IDs

        Args:
            df_transactions (pd.DataFrame): DataFrame from `generate_dataset`
            df_articles (pd.DataFrame): Articles DataFrame
            df_customers (pd.DataFrame): Customers DataFrame
            positive (bool, optional): Wether or not df_transactions represent positive labels. Defaults to False.

        Returns:
            pd.DataFrame: DF with all columns included
        """
        columns_articles = [
            "article_id",
            "prod_name",
            "product_type_name",
            "product_group_name",
            "graphical_appearance_name",
            "colour_group_name",
            "perceived_colour_value_name",
            "perceived_colour_master_name",
            "department_name",
            "index_name",
            "index_group_name",
            "section_name",
            "garment_group_name",
            "detail_desc",
        ]
        # TODO consider storing blacklisted cols instead of whitelisted

        df_articles = df_articles[columns_articles]

        df = pd.merge(
            df_transactions, df_customers, how="inner", on=["customer_id"]
        ).drop(["customer_id"], axis=1)
        df = pd.merge(df, df_articles, how="inner", on=["article_id"]).drop(
            ["article_id"], axis=1
        )
        df["label"] = 1 if positive else 0
        return df

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        row, label = self.df.iloc[idx, :-1], self.df.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split(
        self, train_portion: float | None = None, test_portion: float | None = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split full dataset into training and validation set. Note that only one of train_portion or
            test_portion are required (test_portion = 100% - test_portion)

        Args:
            train_portion (float | None, optional): Percentage of rows assigned to training set. Defaults to None.
            test_portion (float | None, optional): Percentage of rows assigned to validation set. Defaults to None.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Train-set and validation-set
        """
        assert any(
            [train_portion, test_portion]
        ), "At least one of train or test portion must be float"
        if train_portion is None:
            train_portion = 1 - test_portion
        train = self.df.sample(frac=train_portion)
        test = (
            pd.merge(self.df, train, indicator=True, how="outer")
            .query('_merge=="left_only"')
            .drop("_merge", axis=1)
        )
        return train.reset_index(drop=True), test.reset_index(drop=True)


class HM_train(Data_HM):
    """Training view of ``Data_HM``: length and items refer to ``self.train``.

    Args:
        total_cases: Total number of samples (positives plus negatives).
        portion_negatives: Fraction of ``total_cases`` that is negative.
        df_transactions: Transactions to sample from.
        df_articles: Articles table.
        df_customers: Customers table.
        train_portion: Fraction of rows assigned to the training split.
        test_portion: Fraction of rows assigned to the test split.
        transform: Optional callable applied to each feature row.
        target_transform: Optional callable applied to each label.
    """

    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
        transform=None,
        target_transform=None,
    ) -> None:
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
        )
        # Set here (after the base initialiser) so __getitem__ always finds
        # both attributes.
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the training split."""
        # Must agree with __getitem__, which indexes self.train and not the
        # full dataset; otherwise iteration runs past the end of the split.
        return len(self.train.index)

    def __getitem__(self, idx):
        """Return ``(row, label)`` for position ``idx`` of the training split."""
        row, label = self.train.iloc[idx, :-1], self.train.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label


class HM_val(Data_HM):
    """Validation view of ``Data_HM``: length and items refer to ``self.test``.

    Args:
        total_cases: Total number of samples (positives plus negatives).
        portion_negatives: Fraction of ``total_cases`` that is negative.
        df_transactions: Transactions to sample from.
        df_articles: Articles table.
        df_customers: Customers table.
        train_portion: Fraction of rows assigned to the training split.
        test_portion: Fraction of rows assigned to the test split.
        transform: Optional callable applied to each feature row.
        target_transform: Optional callable applied to each label.
    """

    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
        transform=None,
        target_transform=None,
    ) -> None:
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
        )
        # Set here (after the base initialiser) so __getitem__ always finds
        # both attributes.
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the validation split."""
        # Must agree with __getitem__, which indexes self.test and not the
        # full dataset; otherwise iteration runs past the end of the split.
        return len(self.test.index)

    def __getitem__(self, idx):
        """Return ``(row, label)`` for position ``idx`` of the validation split."""
        row, label = self.test.iloc[idx, :-1], self.test.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

In [4]:
# Embedding models (same model as Mind Data example)


class HM_model(torch.nn.Module):
    """Dot-product embedding model.

    Holds one embedding table indexed by ``users`` and one indexed by
    ``items``; the score of a pair is the sigmoid of the dot product of the
    two looked-up vectors.

    Args:
        num_customer: Number of rows in the customer embedding table.
        num_transactions: Number of rows in the item embedding table.
        embedding_size: Dimension shared by both embedding tables.
    """

    def __init__(self, num_customer, num_transactions, embedding_size):
        super().__init__()
        self.customer_embed = torch.nn.Embedding(num_customer, embedding_size)
        self.trans_embed = torch.nn.Embedding(num_transactions, embedding_size)

    def forward(self, users, items):
        """Return the sigmoid of the per-pair dot product of the embeddings."""
        user_vectors = self.customer_embed(users)
        item_vectors = self.trans_embed(items)
        # Elementwise product summed over dim 1 is the row-wise dot product.
        scores = (user_vectors * item_vectors).sum(dim=1)
        return scores.sigmoid()

In [5]:
""" Pseudocode for one epoch
epoch_loss = 0
* For each point in data
    * retrieve column info + label and set to separate variables
    * optimizer.zero_grad # Not sure if we should do this or not..
    * compute prediction via model(row_info)
    * compute loss_value via loss(prediction.view(-1), labels)
    * loss.backward()
    * optimizer.step()
    * Potentially: LR scheduler.step()

    epoch_loss += loss_value

print("Epoch", "Loss", "Loss per data sample", sep="\t")
print(epoch+1, epoch_loss, epoch_loss/len(data), sep="\t")
print("-"*20)
"""


def train_one_epoch(model: torch.nn.Module, data, epoch_num: int, optimizer, loss):
    """Run one optimisation pass over ``data`` and return the summed loss.

    Args:
        model: Module being trained; it is called with the batch inputs.
        data: Iterable of batches. Each batch is a sequence whose last element
            holds the labels and whose preceding elements are passed to
            ``model`` positionally, e.g. ``(users, items, labels)`` or
            ``(features, labels)``.
        epoch_num: Index of the current epoch (currently unused).
        optimizer: Optimiser providing ``zero_grad()`` and ``step()``.
        loss: Callable ``loss(prediction, labels)`` returning a scalar tensor.

    Returns:
        float: Sum of the per-batch loss values over the epoch.
    """
    epoch_loss = 0.0
    for batch in data:
        # Separate the labels from the model inputs; the original code read an
        # undefined `labels` name here.
        *inputs, labels = batch
        optimizer.zero_grad()
        pred = model(*inputs)
        # NOTE(review): BCELoss expects floating-point labels; no cast is done
        # here, so the data pipeline has to supply them - confirm.
        loss_value = loss(pred.view(-1), labels)
        loss_value.backward()
        optimizer.step()
        # .item() detaches the value so the autograd graph of every batch is
        # not kept alive through the running total.
        epoch_loss += loss_value.item()
    return epoch_loss


def train(model, train_DL, params):
    """Train ``model`` for ``params.epochs`` epochs with periodic validation.

    Args:
        model: Model to train.
        train_DL: Iterable of training batches; also passed to ``validate``.
        params: Object exposing ``optimizer``, ``epochs`` and
            ``validation_frequency``.
    """
    # Binary cross entropy for now.
    criterion = torch.nn.BCELoss()  # TODO change to MAP12 once the rest works
    optimizer = params.optimizer
    for epoch in range(params.epochs):
        train_one_epoch(model, train_DL, epoch, optimizer, criterion)
        if epoch % params.validation_frequency != 0:
            continue
        # Validation epochs (epoch 0 included): both calls receive the
        # training loader and differ only in the `train` flag.
        validate(model, train_DL, train=True)
        validate(model, train_DL, train=False)


def validate(model, DL, train=False):
    """Placeholder for model evaluation; not implemented yet.

    Args:
        model: Model to evaluate.
        DL: Data to evaluate on.
        train: Flag passed by the training loop. Defaults to False.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError

In [6]:
from dataclasses import dataclass, asdict


def main():
    """Load the sample data, build the datasets and the model.

    Returns:
        pd.DataFrame: The training split of the training dataset.
    """

    @dataclass
    class Hyperparameters:
        # Training configuration; not consumed anywhere in this function yet.
        lr_rate: float = 1e-3
        weight_decay: str = "l2_reg"
        epochs: int = 100
        validation_frequency: int = 10
        # Add more here...

    # Load the three sample tables and normalise the customer table.
    sample_files = [
        f"dataset_sample/{n}_min.csv"
        for n in ("customer", "articles", "transactions")
    ]
    customers, articles, transactions = load_min_data(sample_files)
    customers = clean_customer_data(customers)

    # Shared constructor arguments for the training and validation datasets.
    dataset_params = dict(
        total_cases=20,
        portion_negatives=0.9,
        df_transactions=transactions,
        df_articles=articles,
        df_customers=customers,
        train_portion=0.7,
    )
    data_train = HM_train(**dataset_params)
    # NOTE(review): this builds a second, independently sampled and split
    # dataset, so it presumably does not complement `data_train` - confirm.
    data_test = HM_val(**dataset_params)

    model = HM_model(num_customer=20, num_transactions=20, embedding_size=5)

    # TODO: train, eval, save results and weights...
    return data_train.train


# Run the pipeline; as the last expression of the cell, the returned training
# split is what the notebook displays below.
main()

Unnamed: 0,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc,label
0,1.0,1.0,ACTIVE,Regularly,27.0,76c7d8747a83d1c09d66e311b142d77c0fa1d8fb646c5a...,Kai jumper,Sweater,Garment Upper body,Solid,Black,Dark,Black,Knitwear,Ladieswear,Ladieswear,Womens Everyday Collection,Knitwear,"Jumper in a soft, rib-knit cotton blend with a...",0
1,,,ACTIVE,,51.0,43382428f3fb89df36d48da5c4845c72a9e09e722edc8b...,RICHIE HOOD,Hoodie,Garment Upper body,Solid,Light Pink,Light,Pink,Heavy Basic Jersey,Menswear,Menswear,Men Underwear,Jersey Basic,Hoodie in sweatshirt fabric made from a cotton...,0
2,1.0,1.0,ACTIVE,Regularly,57.0,c6f0cb557151fb401df27f8626791514784426f0819c2d...,2p mop pins,Hair clip,Accessories,Solid,Off White,Dusty Light,White,Hair Accessories,Ladies Accessories,Ladieswear,Womens Small accessories,Accessories,Metal hair grips decorated with plastic beads....,0
3,,,ACTIVE,,30.0,b06ca1435925b80373dfac59343ea90447bd1a8dba9c9e...,Tilda tank,Vest top,Garment Upper body,Solid,Greenish Khaki,Medium Dusty,Khaki green,Basic 1,Divided,Divided,Divided Basics,Jersey Basic,"Cropped, fitted top in cotton jersey with narr...",0
4,1.0,1.0,ACTIVE,Regularly,21.0,00b7efd47eeb50702752f1b9ffd8ebd953a54124aead10...,Lola Denim Shorts,Shorts,Garment Lower body,Solid,Black,Dark,Black,Shorts,Divided,Divided,Divided Collection,Shorts,"Short, 5-pocket shorts in washed, slightly str...",0
5,1.0,1.0,ACTIVE,Regularly,24.0,ae784a123dea3d58fbb72ecd0981382345bb11ac35fbe3...,Boulevard- TVP- TM,T-shirt,Garment Upper body,Embroidery,White,Light,White,Tops Fancy Jersey,Divided,Divided,Divided Collection,Jersey Fancy,T-shirt in soft viscose jersey with a slight s...,0
6,1.0,1.0,ACTIVE,Regularly,34.0,fe814f9d1b8a657f20a30666c230cba0ad41a3172d5a6d...,Noora,Sweater,Garment Upper body,Placement print,Dark Blue,Dark,Blue,Young Girl Jersey Basic,Children Sizes 134-170,Baby/Children,Girls Underwear & Basics,Jersey Basic,Cropped top in soft sweatshirt fabric with lon...,0
7,,,ACTIVE,,33.0,bf98f222ef9eb34b3bba2745452d5f6d2f395edc79a4f1...,Bonnie,Sweater,Garment Upper body,Stripe,Light Beige,Dusty Light,Beige,Knitwear,Ladieswear,Ladieswear,Womens Tailoring,Knitwear,"Jumper in a soft, fine-knit viscose blend with...",0
8,1.0,1.0,ACTIVE,Regularly,37.0,17163f9ac92c6145ed68dd724a4aa8d227709ddd044dc9...,JUST PINK DRESS(1),Dress,Garment Full body,Solid,Dark Pink,Bright,Pink,Jersey,Ladieswear,Ladieswear,Womens Casual,Jersey Fancy,"Sleeveless, calf-length dress in soft jersey m...",0
9,1.0,1.0,ACTIVE,Regularly,34.0,fe814f9d1b8a657f20a30666c230cba0ad41a3172d5a6d...,Leonora off-shoulder,Top,Garment Upper body,Solid,White,Light,White,Basic 1,Divided,Divided,Divided Basics,Jersey Basic,"Short, off-the-shoulder top in ribbed jersey w...",0
