# Building a recommender system with embeddings

In [1]:
%reload_ext jupyter_black

In [2]:
# Cleaning up the datasets
from typing import Iterable


def load_min_data(filename: str | Iterable):
    dfs = []
    if isinstance(filename, str):
        filename = [filename]
    for fn in filename:
        df = pd.read_csv(fn)
        # All min-datasets have an index column which has to be dropped:
        dfs.append(df.drop(df.columns[0], axis=1))
    return dfs


def clean_customer_data(df):
    """Normalise the `fashion_news_frequency` column of the customer table.

    Every value other than "Regularly" or "Monthly" (missing values included)
    is overwritten with the string "None". The frame is modified in place and
    the same object is returned.

    Args:
        df: Customer DataFrame containing a `fashion_news_frequency` column.

    Returns:
        The same DataFrame, after the in-place update.
    """
    # Dropping the "FN" column was considered at some point but is left disabled.
    column = "fashion_news_frequency"
    recognised = df[column].isin(["Regularly", "Monthly"])
    df.loc[~recognised, column] = "None"
    return df

In [4]:
# New data loading principle
import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from typing import Tuple


class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        train_portion: float | None = None,
        test_portion: float | None = None,
    ) -> None:
        super().__init__()  # TODO not sure if we need this
        self.pos, self.neg = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.df = pd.concat(
            [
                self.merge_dfs_add_label(
                    self.pos,
                    df_articles,
                    df_customers,
                    positive=True,
                ),
                self.merge_dfs_add_label(
                    self.neg,
                    df_articles,
                    df_customers,
                    positive=False,
                ),
            ]
        ).reset_index(drop=True)
        self.train, self.test = self.split(train_portion, test_portion)

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = int(total_cases * (1 - portion_negatives))
        n_negative = int(total_cases * portion_negatives)
        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we write this tuple to a csv which is transformed into a DataFrame similar to `df_positive`

        num_written = 0
        tmpStr = "customer_id,article_id\n"
        while num_written < n_negative:
            # Choose random customer and article
            selection = np.array(  # TODO this can probably be optimized further
                [
                    df_transactions["customer_id"].sample().values,
                    df_transactions["article_id"].sample().values,
                ]
            ).flatten()
            if not (
                (df_transactions["customer_id"] == selection[0])
                & (df_transactions["article_id"] == selection[1])
            ).any():
                tmpStr += f"{selection[0]}, {selection[1]}\n"
                num_written += 1
        with open("tmp.csv", "w") as f:
            f.write(tmpStr)
        df_negative = pd.read_csv("tmp.csv")
        os.remove("tmp.csv")
        return df_positive, df_negative

    def merge_dfs_add_label(
        self,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        positive: bool = False,
    ) -> pd.DataFrame:
        """Merge customer and article data to the sampled data `df_transactions`, excluding customer/article IDs

        Args:
            df_transactions (pd.DataFrame): DataFrame from `generate_dataset`
            df_articles (pd.DataFrame): Articles DataFrame
            df_customers (pd.DataFrame): Customers DataFrame
            positive (bool, optional): Wether or not df_transactions represent positive labels. Defaults to False.

        Returns:
            pd.DataFrame: DF with all columns included
        """
        columns_articles = [
            "article_id",
            "prod_name",
            "product_type_name",
            "product_group_name",
            "graphical_appearance_name",
            "colour_group_name",
            "perceived_colour_value_name",
            "perceived_colour_master_name",
            "department_name",
            "index_name",
            "index_group_name",
            "section_name",
            "garment_group_name",
            "detail_desc",
        ]
        # TODO consider storing blacklisted cols instead of whitelisted

        df_articles = df_articles[columns_articles]

        df = pd.merge(
            df_transactions, df_customers, how="inner", on=["customer_id"]
        ).drop(["customer_id"], axis=1)
        df = pd.merge(df, df_articles, how="inner", on=["article_id"]).drop(
            ["article_id"], axis=1
        )
        df["label"] = 1 if positive else 0
        return df

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        row, label = self.df.iloc[idx, :-1], self.df.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split(
        self, train_portion: float | None = None, test_portion: float | None = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split full dataset into training and validation set. Note that only one of train_portion or
            test_portion are required (test_portion = 100% - test_portion)

        Args:
            train_portion (float | None, optional): Percentage of rows assigned to training set. Defaults to None.
            test_portion (float | None, optional): Percentage of rows assigned to validation set. Defaults to None.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Train-set and validation-set
        """
        assert any(
            [train_portion, test_portion]
        ), "At least one of train or test portion must be float"
        if train_portion is None:
            train_portion = 1 - test_portion
        train = self.df.sample(frac=train_portion)
        test = (
            pd.merge(self.df, train, indicator=True, how="outer")
            .query('_merge=="left_only"')
            .drop("_merge", axis=1)
        )
        return train.reset_index(drop=True), test.reset_index(drop=True)


class HM_train(Data_HM):
    """Training view of the HM dataset: items are served from `self.train`.

    The base class builds the full dataset and the train/test split; this
    subclass makes both `__len__` and `__getitem__` refer to the training
    split, so that index-based consumers stay within bounds.
    """

    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
        transform=None,
        target_transform=None,
    ) -> None:
        """Build the dataset via the base class and register optional transforms.

        Args:
            total_cases, portion_negatives, df_transactions, df_articles,
            df_customers, train_portion, test_portion: Forwarded unchanged to
                `Data_HM.__init__`.
            transform (callable, optional): Applied to the feature row. Defaults to None.
            target_transform (callable, optional): Applied to the label. Defaults to None.
        """
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
        )
        # Set here so `__getitem__` never meets a missing attribute.
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of rows in the training split (not in the full dataset)."""
        return len(self.train.index)

    def __getitem__(self, idx):
        """Return `(features, label)` of row `idx` of the training split."""
        row, label = self.train.iloc[idx, :-1], self.train.iloc[idx, -1]
        if self.transform is not None:
            row = self.transform(row)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return row, label


class HM_val(Data_HM):
    """Validation view of the HM dataset: items are served from `self.test`.

    The base class builds the full dataset and the train/test split; this
    subclass makes both `__len__` and `__getitem__` refer to the validation
    split, so that index-based consumers stay within bounds.
    """

    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
        transform=None,
        target_transform=None,
    ) -> None:
        """Build the dataset via the base class and register optional transforms.

        Args:
            total_cases, portion_negatives, df_transactions, df_articles,
            df_customers, train_portion, test_portion: Forwarded unchanged to
                `Data_HM.__init__`.
            transform (callable, optional): Applied to the feature row. Defaults to None.
            target_transform (callable, optional): Applied to the label. Defaults to None.
        """
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
        )
        # Set here so `__getitem__` never meets a missing attribute.
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of rows in the validation split (not in the full dataset)."""
        return len(self.test.index)

    def __getitem__(self, idx):
        """Return `(features, label)` of row `idx` of the validation split."""
        row, label = self.test.iloc[idx, :-1], self.test.iloc[idx, -1]
        if self.transform is not None:
            row = self.transform(row)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return row, label

In [5]:
# Embedding models (same model as Mind Data example)


class HM_model(torch.nn.Module):
    """Dot-product model over two embedding tables.

    One table is indexed by `users`, the other by `items`; the score of a pair
    is the sigmoid of the dot product of the two looked-up vectors.
    """

    def __init__(self, num_customer, num_transactions, embedding_size):
        """Create both embedding tables.

        Args:
            num_customer: Number of rows in the customer embedding table.
            num_transactions: Number of rows in the second embedding table.
            embedding_size: Dimension shared by both embeddings.
        """
        super().__init__()
        self.customer_embed = torch.nn.Embedding(num_customer, embedding_size)
        self.trans_embed = torch.nn.Embedding(num_transactions, embedding_size)

    def forward(self, users, items):
        """Score every (user, item) index pair.

        Args:
            users: Index tensor into the customer embedding table.
            items: Index tensor into the second embedding table.

        Returns:
            Sigmoid of the per-pair dot product, reduced over dimension 1.
        """
        user_vectors = self.customer_embed(users)
        item_vectors = self.trans_embed(items)
        scores = (user_vectors * item_vectors).sum(dim=1)
        return scores.sigmoid()

In [6]:
""" Pseudocode for one epoch
epoch_loss = 0
* For each point in data
    * retrieve column info + label and set to separate variables
    * optimizer.zero_grad # Not sure if we should do this or not..
    * compute prediction via model(row_info)
    * compute loss_value via loss(prediction.view(-1), labels)
    * loss.backward()
    * optimizer.step()
    * Potentially: LR scheduler.step()

    epoch_loss += loss_value

print("Epoch", "Loss", "Loss per data sample", sep="\t")
print(epoch+1, epoch_loss, epoch_loss/len(data), sep="\t")
print("-"*20)
"""


def train_one_epoch(model: torch.nn.Module, data, epoch_num: int, optimizer, loss):
    """Run one optimisation pass over `data` and return the summed loss.

    Args:
        model (torch.nn.Module): Model to train. It is called as
            `model(*inputs)` when a batch's inputs are a tuple/list, otherwise
            as `model(inputs)`.
        data: Iterable yielding `(inputs, labels)` batches.
        epoch_num (int): Index of the current epoch; currently unused.
        optimizer: Optimizer exposing `zero_grad()` and `step()`.
        loss: Callable `loss(predictions, labels)` returning a scalar tensor.

    Returns:
        float: Sum of the per-batch loss values over the epoch.
    """
    epoch_loss = 0.0
    for inputs, labels in data:
        optimizer.zero_grad()
        if isinstance(inputs, (tuple, list)):
            # Multi-input models (e.g. separate user and item indices).
            pred = model(*inputs)
        else:
            pred = model(inputs)
        loss_value = loss(pred.view(-1), labels)
        loss_value.backward()
        optimizer.step()
        # `.item()` detaches the scalar; summing the tensors themselves would
        # keep every batch's autograd graph alive for the whole epoch.
        epoch_loss += loss_value.item()
    return epoch_loss


def train(model, train_DL, params):
    """Training entry point; currently a stub.

    Only the loss object is created so far. None of the arguments is used yet
    and nothing is returned; the intended loop is sketched in the string below.
    """
    # Uses binary cross entropy at the moment
    criterion = torch.nn.BCELoss()  # TODO change to MAP12 once the rest works

    """ Psudeocode
    * Initialize loss function and optimizer
    * For epoch in epochs:
        * Retrieve data from train_DL # Example uses custom sample_training_data 
        * train_one_epoch(...)
        * Report eval statistics for each n-th epoch
            * Both training accuracy and validation accuracy
    """
    return None


def validate(model, DL, train=False):
    """Placeholder for model evaluation; not implemented yet.

    All arguments are accepted and ignored, and nothing is computed.
    """
    return None

In [8]:
from dataclasses import dataclass, asdict


def main():
    """Load the sample data, build both datasets and the model.

    Training is not wired in yet: the function returns right after the model
    is created.

    Returns:
        The `train` DataFrame of the training dataset.
    """

    @dataclass
    class Hyperparameters:
        # Declared for later use; nothing reads these values yet.
        lr_rate: float = 1e-3
        weight_decay: str = "l2_reg"
        # Add more here...

    # Load data
    table_names = ("customer", "articles", "transactions")
    csv_paths = [f"dataset_sample/{n}_min.csv" for n in table_names]
    customers, articles, transactions = load_min_data(csv_paths)
    customers = clean_customer_data(customers)

    # Transform to training and testing set
    dataset_kwargs = dict(
        total_cases=20,
        portion_negatives=0.9,
        df_transactions=transactions,
        df_articles=articles,
        df_customers=customers,
        train_portion=0.7,
    )
    training_set = HM_train(**dataset_kwargs)
    # Built from the same arguments, but sampled independently of the training set.
    validation_set = HM_val(**dataset_kwargs)

    network = HM_model(num_customer=20, num_transactions=20, embedding_size=5)
    return training_set.train

    # Train, eval, save results and weights...


# Run the pipeline. `main` returns the training split DataFrame; as the last
# expression of the cell it is presumably what the notebook renders below.
main()

Unnamed: 0,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc,label
0,,,ACTIVE,,31.0,71b88711bd37db08ac1549d73432852f664cb48be8d893...,Emily,Blouse,Garment Upper body,Solid,Dark Green,Medium,Green,Blouse,Ladieswear,Ladieswear,Womens Everyday Collection,Blouses,Short top in an airy cotton weave. Square neck...,0
1,,,ACTIVE,,48.0,4385d55783e67067b9768f0877107d3e6cd64f3bee08ce...,Rebecka Dress,Dress,Garment Full body,All over pattern,White,Light,White,Jersey,Ladieswear,Ladieswear,Mama,Jersey Fancy,"Short, fitted dress in soft, organic cotton je...",0
2,,,ACTIVE,,33.0,bf98f222ef9eb34b3bba2745452d5f6d2f395edc79a4f1...,Twister,Dress,Garment Full body,Melange,Grey,Medium Dusty,Grey,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,"Calf-length jersey dress with draped sides, sh...",0
3,1.0,1.0,ACTIVE,Regularly,31.0,5d3e0bac03cdc876b5a1d84fd7c2b2e7cefa40ddbe62be...,Simple as That Triangle Top,Bikini top,Swimwear,Other structure,Dark Green,Medium Dusty,Green,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear,"Lined, non-wired, triangle bikini top with a w...",0
4,1.0,1.0,ACTIVE,Regularly,29.0,efd9030a7d5b5f1a6d03a1a24b1fab9931492256ce29e5...,Charlene cardigan,Cardigan,Garment Upper body,Melange,Grey,Medium Dusty,Grey,Knitwear,Ladieswear,Ladieswear,Womens Trend,Knitwear,"Cardigan in chunky-knit, soft wool with button...",0
5,1.0,1.0,ACTIVE,Regularly,27.0,76c7d8747a83d1c09d66e311b142d77c0fa1d8fb646c5a...,POW Meet the parents dress.,Dress,Garment Full body,Solid,Dark Pink,Medium Dusty,Pink,Jersey,Ladieswear,Ladieswear,H&M+,Jersey Fancy,Calf-length lace dress with adjustable spaghet...,0
6,1.0,1.0,ACTIVE,Regularly,50.0,4e63dc705eb70e92b59132da6ff80f9841982d030f5178...,Cava Shirt Dress new,Dress,Garment Full body,All over pattern,Black,Dark,Black,Dresses,Divided,Divided,Divided Collection,Dresses Ladies,Short dress in a softly draping weave with a c...,0
7,1.0,1.0,ACTIVE,Regularly,21.0,00b7efd47eeb50702752f1b9ffd8ebd953a54124aead10...,Timeless Push Bra,Bikini top,Swimwear,Other structure,Black,Dark,Black,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear,Fully lined bikini top with a textured-stripe ...,0
8,,,ACTIVE,,44.0,91bbfabb4917109c8a4d300fa95e514b7bbc251cea44e3...,Chia Seamless HW Tights,Leggings/Tights,Garment Lower body,Solid,Orange,Bright,Orange,Ladies Sport Bottoms,Sport,Sport,Ladies H&M Sport,Jersey Fancy,"Sports tights in ribbed, fast-drying functiona...",0
9,,,PRE-CREATE,,25.0,570e2e8d4b0f09c94a88e52fb936234bf57da9943e142a...,Headphones EARS,Other accessories,Accessories,Solid,Light Turquoise,Light,Turquoise,Girls Small Acc/Bags,"Children Accessories, Swimwear",Baby/Children,"Kids Accessories, Swimwear & D",Accessories,"Adjustable, glittery, on-ear headphones with p...",1
