# Exploring modifications to embedding model

We start by calling necessary dataset functions...

In [1]:
%reload_ext jupyter_black

In [2]:
import torch, pickle, numpy as np, pandas as pd
from torch.utils.data import DataLoader, Dataset
from typing import Any, Tuple
from sklearn.preprocessing import LabelEncoder

In [3]:
# Data loading

# Data_HM definition has to be here for pickle to understand what object it loads in
class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset
    no

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """

    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        batch_size: int,
        train_portion: float | None = None,
        test_portion: float | None = None,
        transform: Any = None,
        target_transform: Any = None,
    ) -> None:
        super().__init__()
        if train_portion is None:
            if test_portion is None:
                raise ValueError("Both train portion and test portion cannot be None.")
            self.train_portion = 1 - test_portion
        self.batch_size = batch_size
        self.df_id = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.train_portion = train_portion
        self.train, self.val = self.split_dataset()
        self.transform, self.target_transform = transform, target_transform

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = round(total_cases * (1 - portion_negatives))
        n_negative = total_cases - n_positive

        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]
        df_positive["label"] = 1

        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we make a 2-column dataframe on same form as `df_positive`

        df_np = df_transactions[["customer_id", "article_id"]].to_numpy()
        neg_np = np.empty((n_negative, df_np.shape[1]), dtype="<U64")
        for i in range(n_negative):
            legit = False
            while not legit:
                sample = [
                    np.random.choice(df_np[:, col]) for col in range(df_np.shape[1])
                ]
                legit = not (
                    (df_np[:, 0] == sample[0]) & (df_np[:, 1] == sample[1])
                ).any()
            neg_np[i, :] = sample
        neg_np = np.column_stack((neg_np, [0] * neg_np.shape[0]))
        df_negative = pd.DataFrame(neg_np, columns=df_positive.columns)
        # Return a shuffled concatenation of the two dataframes
        full_data = (
            pd.concat((df_positive, df_negative)).sample(frac=1).reset_index(drop=True)
        )

        # Make label encodings of the IDs
        le_cust = LabelEncoder()
        le_art = LabelEncoder()
        le_cust.fit(full_data["customer_id"])
        le_art.fit(full_data["article_id"])
        cust_encode = le_cust.transform(full_data["customer_id"])
        art_encode = le_art.transform(full_data["article_id"])
        return pd.DataFrame(
            data={
                "customer_id": cust_encode,
                "article_id": art_encode,
                "label": full_data["label"].astype(np.uint8),
            }
        )

    def __len__(self):
        return len(self.df_id.index)

    def __getitem__(self, idx):
        row, label = self.df_id.iloc[idx, :-1].values, self.df_id.iloc[idx, -1]
        label = int(label)  # Stored as str initially
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split_dataset(self):
        """Split full data to train and validation Subset-objects

        Returns:
            Tuple[Subset, Subset]: Train and validation subsets
        """
        length = len(self)
        train_size = int(length * self.train_portion)
        valid_size = length - train_size
        train, val = torch.utils.data.random_split(self, [train_size, valid_size])
        return train, val

    def get_data_from_subset(self, subset: torch.utils.data.Subset):
        """Not in use currently, but can retrieve data from Subset object directly"""
        return subset.dataset.df_id.iloc[subset.indices]

    def get_DataLoader(self, trainDL: bool = True):
        subset = self.train if trainDL else self.val
        return DataLoader(dataset=subset, batch_size=self.batch_size)


def read_dataset_obj(src: str) -> Any:

    with open(src, "rb") as f:
        data = pickle.load(f)
    return data

In [4]:
# Load things:)
dataset = read_dataset_obj("object_storage/dataset-2022.11.09.19.34.pckl")

In [35]:
# Other model formulation
class HM_neural(torch.nn.Module):
    def __init__(
        self,
        num_customer: int,
        num_articles: int,
        num_age: int,
        num_idxgroup: int,
        num_garmentgroup: int,
        embedding_size: int,
        bias_nodes: bool,
    ) -> None:
        super().__init__()
        self.customer_embed = torch.nn.Embedding(
            num_embeddings=num_customer, embedding_dim=embedding_size
        )
        self.age_embed = torch.nn.Embedding(num_age, embedding_size)
        self.article_embed = torch.nn.Embedding(
            num_embeddings=num_articles, embedding_dim=embedding_size
        )
        self.indexgroup_embed = torch.nn.Embedding(num_idxgroup, embedding_size)
        self.garmentgroup_embed = torch.nn.Embedding(num_garmentgroup, embedding_size)
        if bias_nodes:
            self.customer_bias = torch.nn.Embedding(num_customer, 1)
            self.article_bias = torch.nn.Embedding(num_articles, 1)
        else:
            # They're added lienarly so this should give no effect
            self.customer_bias = lambda row: 0
            self.article_bias = lambda row: 0
        self.article_MLP = torch.nn.Linear(embedding_size * 3, embedding_size)
        self.customer_MLP = torch.nn.Linear(embedding_size * 2, embedding_size)

        # self.layers = torch.nn.Sequential(
        #     # 3 features: age (customer), index_group and garment_group (article)
        #     # n_activations deaults to e.g. 100?
        #     torch.nn.Linear(int((batch_size := 64) * 3), int(n_activations)),
        #     torch.nn.ReLU(),
        #     torch.nn.Linear(int(n_activations), 1),
        # )

    def article_transform(self, article, garment_group, index_group):
        embeds = (
            self.article_embed(article),
            self.indexgroup_embed(index_group),
            self.garmentgroup_embed(garment_group),
        )
        final_embedding = self.article_MLP(torch.cat(embeds, 1))
        return torch.sigmoid(final_embedding)

    def customer_transform(self, customer, age):
        embeds = (self.customer_embed(customer), self.age_embed(age))
        final_embedding = self.customer_MLP(torch.cat(embeds, 1))
        return torch.sigmoid(final_embedding)

    def forward(self, row):
        customer, article, age, index_group, garment_group = [
            row[:, i] for i in range(5)
        ]
        # Ugly hack:
        garment_group = garment_group - 1001  # 1009 -> 8
        index_group = index_group - 1  # also to zero-index
        age = age - 1
        customer_matrix = self.customer_transform(customer, age)
        article_matrix = self.article_transform(article, garment_group, index_group)
        biases = self.customer_bias(customer), self.article_bias(article)
        x = (customer_matrix * article_matrix).sum(1, keepdim=True)
        x = x + biases[0] + biases[1]
        return torch.sigmoid(x)

In [6]:
np.unique(dataset.get_data_from_subset(dataset.val).to_numpy()[:, 0])

array([    1,     2,     9, ..., 88568, 88569, 88571], dtype=int32)

In [7]:
def _extend_row_data(
    self: Data_HM, customer_rows: list[str], article_rows: list[str]
) -> None:
    customer_rows = ["customer_id"] + customer_rows
    article_rows = ["article_id"] + article_rows

    # Find original customer and article IDs present in dataset
    df_decoded = self.df_id.copy()
    df_decoded["article_id"] = self.le_art.inverse_transform(df_decoded["article_id"])
    df_decoded["customer_id"] = self.le_cust.inverse_transform(
        df_decoded["customer_id"]
    )

    enc_customers = pd.read_csv("dataset/customers.csv")
    enc_customers = enc_customers[
        enc_customers["customer_id"].isin(df_decoded["customer_id"])
    ]
    enc_articles = pd.read_csv("dataset/articles.csv", dtype={"article_id": str})
    enc_articles = enc_articles[
        enc_articles["article_id"].isin(df_decoded["article_id"])
    ]

    enc_customers["customer_id"] = self.le_cust.transform(enc_customers["customer_id"])
    enc_articles["article_id"] = self.le_art.transform(enc_articles["article_id"])
    df_ext = self.df_id.merge(enc_customers[customer_rows]).merge(
        enc_articles[article_rows]
    )
    # Ensure that last column is the label
    ordered_columns = df_ext.columns[df_ext.columns != "label"].append(
        pd.Index(["label"])
    )
    return df_ext[ordered_columns]

In [37]:
# Train fncs...
from tqdm import tqdm
from datetime import datetime
import os
from dataclasses import dataclass


def train_one_epoch(
    model: HM_neural,
    data: Data_HM,
    epoch_num: int,
    optimizer,
    loss,
    lr_scheduler,
    verbose: bool = False,
):
    epoch_loss = 0
    device = "cuda" if torch.cuda.is_available() else "cpu"
    for item in data.get_DataLoader(trainDL=True):
        item = tuple(t.to(device) for t in item)
        row, label = item
        row = row.int()  # Assumes all values are ints, which they currently are:)
        optimizer.zero_grad()
        print(row)
        pred = model(row)
        loss_value = loss(pred.view(-1), torch.FloatTensor(label.tolist()).to(device))
        loss_value.backward()
        optimizer.step()
        epoch_loss += loss_value
    if verbose:
        print(f"\t| Training loss for epoch {epoch_num+1}: {epoch_loss}")
    return epoch_loss


def train(model, data, params):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Uses binary cross entropy at the moment
    loss_metric = torch.nn.BCELoss().to(device)
    optimizer = params.optimizer(
        model.parameters(), lr=params.lr_rate, weight_decay=params.weight_decay
    )
    # Adjust lr once model stops improving using scheduler
    lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    save_loss = params.save_loss
    if save_loss:
        train_losses = []
        valid_losses = []
        # Settings not in use atm but we can get the hyperparams from it:))
        settings = ",".join([str(v) for v in params.__dict__.values()])
    for epoch in tqdm(range(params.epochs)):
        model.train()
        epoch_loss = train_one_epoch(
            model, data, epoch, optimizer, loss_metric, lr_scheduler, params.verbose
        )
        if not epoch % params.validation_frequency:
            # Validate step
            model.eval()
            valid_loss = 0.0
            for item in data.get_DataLoader(trainDL=False):
                item = tuple(t.to(device) for t in item)
                row, label = item
                pred = model(row)
                loss = loss_metric(
                    pred.view(-1), torch.FloatTensor(label.tolist()).to(device)
                )
                valid_loss = loss.item() * row.size(0)

            lr_scheduler.step(valid_loss)  # Update lr scheduler
            if params.verbose:
                print(f"Provisory results for epoch {epoch+1}:")
                print(
                    "Loss for training set",
                    epoch_loss.tolist() / len(data.get_DataLoader(trainDL=True)),
                    sep="\t",
                )
                print(
                    "Loss for validation set",
                    valid_loss / len(data.get_DataLoader(trainDL=False)),
                    sep="\t",
                )
                print("-" * 20)
            if save_loss:
                train_losses.append(
                    epoch_loss.tolist() / len(data.get_DataLoader(trainDL=True))
                )
                valid_losses.append(
                    valid_loss / len(data.get_DataLoader(trainDL=False))
                )
    if save_loss:
        fn_append = save_loss if isinstance(save_loss, str) else ""
        save_dir = os.path.join("results", datetime.today().strftime("%Y.%m.%d.%H.%M"))
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
        np.savetxt(
            os.path.join(save_dir, f"losses_{fn_append}.csv"),
            np.transpose([train_losses, valid_losses]),
            delimiter=",",
        )
    return valid_loss / len(data.get_DataLoader(trainDL=False))

In [9]:
dataset = read_dataset_obj("object_storage/dataset-2022.11.09.19.34.pckl")
dataset.df_id = _extend_row_data(
    dataset, ["age"], ["index_group_no", "garment_group_no"]
)
print(dataset.df_id.columns)

Index(['customer_id', 'article_id', 'age', 'index_group_no',
       'garment_group_no', 'label'],
      dtype='object')


In [39]:
@dataclass
class Hyperparameters:
    lr_rate: float = 1e-3
    weight_decay: str = 1e-4
    epochs: int = 20
    validation_frequency: int = 1
    optimizer: Any = torch.optim.Adam
    embedding_size: int = 500
    save_loss: bool | str = True
    verbose: bool = False
    dataset_cases: int = 2000
    dataset_portion_negatives: float = 0.9
    dataset_train_portion: float = 0.7
    datset_batch_size: int = 5
    # Add more here...


n_cust, n_art, *_ = dataset.df_id.nunique()
# TODO age/idxgroup/ggroup might have to be the max instead of nunique?
n_age, n_idxgroup, n_garmentgroup = dataset.df_id.max()[2:5].values.astype(int)

params = Hyperparameters(save_loss=False)
model = HM_neural(
    num_customer=n_cust,
    num_articles=n_art,
    num_age=n_age,
    num_idxgroup=n_idxgroup,
    num_garmentgroup=n_garmentgroup,
    embedding_size=params.embedding_size,
    bias_nodes=True,
)
train(model, dataset, params)

  0%|          | 0/20 [00:00<?, ?it/s]

tensor([[70004,  2622,    30,     2,  1003],
        [68836,   711,    32,     4,  1017],
        [48753, 33276,    21,     2,  1005],
        [65452, 11829,    21,     1,  1017],
        [61414, 17627,    51,     2,  1012],
        [34837, 11272,    36,     2,  1009],
        [49202, 24143,    56,     1,  1017],
        [24634, 26212,    27,     1,  1002],
        [68780,  9163,    49,     1,  1020],
        [83390,  7573,    25,     2,  1017],
        [46564,  2984,    53,     1,  1002],
        [51569, 19214,    26,     1,  1017],
        [76082, 18864,    33,    26,  1001],
        [86062, 31449,    53,     3,  1002],
        [43916, 18669,    22,     1,  1005],
        [ 4291, 27595,    58,     1,  1008],
        [46541,  2610,    43,     1,  1003],
        [66929,  2747,    33,     1,  1021],
        [14692, 18083,    49,     1,  1018],
        [43243,  5472,    52,     1,  1018],
        [17062, 29765,    26,     1,  1005],
        [57784,  4555,    53,     1,  1010],
        [8

In [29]:
dataset.df_id.max()[2:5].values.astype(int)

array([  94,   26, 1025])

In [None]:
# try fitting with fastai first time just to check..
import fastai.collab as collab

device = "cuda" if torch.cuda.is_available() else "cpu"
fast_dl = collab.CollabDataLoaders.from_df(dataset.df_id, bs=64).to(device)
print(list(fast_dl.classes.keys()))

rner = collab.Learner(fast_dl, model, loss_func=collab.BCELossFlat())
learner.fit_one_cycle(5, 1e-3)

['customer_id', 'article_id']


epoch,train_loss,valid_loss,time


tensor([[67733, 13250],
        [ 4364, 14105],
        [33799, 11353],
        [61938, 28231],
        [38447,  4359],
        [65622, 27478],
        [72319, 14892],
        [ 6674, 29150],
        [16674,  3047],
        [70781,  4192],
        [36597, 13065],
        [68027, 14725],
        [49277, 17021],
        [44302, 13063],
        [57910, 18014],
        [38933, 28771],
        [12289, 17547],
        [54754, 12828],
        [25302, 25277],
        [ 9240, 21895],
        [25368, 18145],
        [51623, 10671],
        [38869, 24578],
        [70131, 20900],
        [43371, 16372],
        [10913, 19067],
        [45549,  6233],
        [ 3024, 24365],
        [32436,  2092],
        [50554, 17617],
        [35551, 11656],
        [20245,  9074],
        [69901, 28416],
        [34453,  7288],
        [34321, 24949],
        [18795, 18534],
        [47795, 22395],
        [66444, 16701],
        [18639, 25061],
        [54742,  8556],
        [ 2901,  3261],
        [12062, 

ValueError: too many values to unpack (expected 5)