In [13]:
%reload_ext jupyter_black

**Key takeaways from the dataset**

* Some articles have no image
* Some customers don't buy anything
* The complete transaction data has 31 788 325 rows, just short of 32 million (!!)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from typing import Union, Tuple
from types import NoneType
import random, shutil, os, itertools, black, jupyter_black

jupyter_black.load()

## Sampling methods

We need to be able to pull out realistic samples of the dataset. To do this, we first sample $n$ customers at random and include every transaction that they have done - these are the positive labels. In addition, we want to obtain additional transactions that are not related to the customers in the sample, working as a negative label. We implement this by saying that $k$% of the data are true labels, defaulting $k=10$%. Lastly, we pull out the article IDs in all the transactions and obtain the images for said article.

In [None]:
def naive_csv_sampler(
    csv_path: str,
    sample_size: int,
    num_records: int | NoneType = None,
    header: str | NoneType = "infer",
) -> pd.DataFrame:
    """Read samples of rows from csv file

    Args:
        csv_path (str): Path to file including file extensions
        sample_size (int): Number of rows to sample
        num_records (int | NoneType, optional): Total records in file, defaults to None. If None, the file will be scanned (costly)
        header (str | NoneType, optional): 'header'-parameter for pandas, defaults to 'infer'. Set to None if file has no header.

    Returns:
        pd.DataFrame: Dataframe with sampled entries (and potentially header)
    """
    if num_records is None:
        num_records = newlines_in_csv(csv_path)
    indices_skip = sorted(
        random.sample(range(1, num_records + 1), num_records - sample_size)
    )
    return pd.read_csv(csv_path, skiprows=indices_skip, header=header)


def newlines_in_csv(csv_path: str, chunk_size: int = 1024) -> int:
    """Counts number of newlines in csv file without loading entire file to memory.
    The number of newlines is the same as number of rows assuming,
        * EITHER csv has a header and last entry does not end with newline
        * OR csv does not have a header, but last entry ends with newline
        * ALWAYS data does not have any nested newline madness
    Originally from orlp, https://stackoverflow.com/a/64744699

    Args:
        csv_path (str): Path of csv file
        chunk_size (int, optional): How many KB to process at at a time. Defaults to 1024 = 1 MB.

    Returns:
        int: Number of newlines
    """
    chunk = chunk_size**2
    f = np.memmap(csv_path)
    number_newlines = sum(
        np.sum(f[i : i + chunk] == ord("\n")) for i in range(0, len(f), chunk)
    )
    del f
    return number_newlines


In [16]:
def copy_img_from_article(df: pd.DataFrame, outpath):
    for id in df['article_id']:
        id0 = "0" + str(id)
        img_path = f"./dataset/images/{id0[:3]}/{id0}.jpg"
        if not os.path.isfile(img_path):
            continue # ID has no image (happens for some cases)
        out_dir = f"./{outpath}/images/{id0[:3]}/"
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)
            shutil.copy(img_path, out_dir)
copy_img_from_article(df_art, "dataset_sample")

## Data loading and preprocessing

In [5]:
# Cleaning up the datasets
from typing import Iterable


def load_min_data(filename: str | Iterable):
    dfs = []
    if isinstance(filename, str):
        filename = [filename]
    for fn in filename:
        df = pd.read_csv(fn)
        # All min-datasets have an index column which has to be dropped:
        dfs.append(df.drop(df.columns[0], axis=1))
    return dfs

def clean_customer_data(df):
    # df = df.drop("FN", axis=1) # I they're not exactly equal
    df.loc[
        ~df["fashion_news_frequency"].isin(["Regularly", "Monthly"]),
        "fashion_news_frequency",
    ] = "None"
    return df


In [9]:
# New data loading principle
import pandas as pd
import os
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset


class Data_HM(Dataset):
    """This is the general HM Dataset class whose children are train-dataset and validation-dataset

    Args:
        Dataset: Abstract Dataset class from pyTorch
    """
    def __init__(
        self,
        total_cases: int,
        portion_negatives: float,
        df_transactions: pd.DataFrame,
        df_articles: pd.DataFrame,
        df_customers: pd.DataFrame,
        train_portion: float | None = None,
        test_portion: float | None = None,
    ) -> None:
        super().__init__()  # TODO not sure if we need this
        self.pos, self.neg = self.generate_dataset(
            total_cases, portion_negatives, df_transactions
        )
        self.df = pd.concat(
            [
                self.merge_dfs_add_label(
                    self.pos,
                    df_articles,
                    df_customers,
                    positive=True,
                ),
                self.merge_dfs_add_label(
                    self.neg,
                    df_articles,
                    df_customers,
                    positive=False,
                ),
            ]
        ).reset_index(drop=True)
        self.train, self.test = self.split(train_portion, test_portion)

    def generate_dataset(
        self, total_cases: int, portion_negatives: float, df_transactions: pd.DataFrame
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Produce DataFrames for positive labels and generated negative samples

        Args:
            total_cases (int): Total number of transactions
            portion_negatives (float): The portion of the `total_cases` that should be negative. Balanced 0/1 when 0.5
            df_transactions (pd.DataFrame): Transactions to pull samples/generate samples from

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: _description_
        """
        assert (
            0 <= portion_negatives <= 1
        ), r"portion negatives must be a float between 0%=0.0 and 100%=1.0!"
        n_positive = int(total_cases * (1 - portion_negatives))
        n_negative = int(total_cases * portion_negatives)
        df_positive = df_transactions.sample(n=n_positive).reset_index(drop=True)
        df_positive = df_positive[["customer_id", "article_id"]]

        
        # Sampling negative labels:
        #   We select a random combination of `customer_id`, `article_id`, and ensure that this is not a true transaction.
        #   Then we write this tuple to a csv which is transformed into a DataFrame similar to `df_positive`

        num_written = 0
        tmpStr = "customer_id,article_id\n"
        while num_written < n_negative:
            # Choose random customer and article
            selection = np.array(  # TODO this can probably be optimized further
                [
                    df_transactions["customer_id"].sample().values,
                    df_transactions["article_id"].sample().values,
                ]
            ).flatten()
            if not (
                (df_transactions["customer_id"] == selection[0])
                & (df_transactions["article_id"] == selection[1])
            ).any():
                tmpStr += f"{selection[0]}, {selection[1]}\n"
                num_written += 1
        with open("tmp.csv", "w") as f:
            f.write(tmpStr)
        df_negative = pd.read_csv("tmp.csv")
        os.remove("tmp.csv")
        return df_positive, df_negative

    def merge_dfs_add_label(
        self, df_transactions: pd.DataFrame, df_articles: pd.DataFrame, df_customers: pd.DataFrame, positive: bool = False
    ) -> pd.DataFrame:
        """Merge customer and article data to the sampled data `df_transactions`, excluding customer/article IDs

        Args:
            df_transactions (pd.DataFrame): DataFrame from `generate_dataset`
            df_articles (pd.DataFrame): Articles DataFrame
            df_customers (pd.DataFrame): Customers DataFrame
            positive (bool, optional): Wether or not df_transactions represent positive labels. Defaults to False.

        Returns:
            pd.DataFrame: DF with all columns included
        """
        columns_articles = [
            "article_id",
            "prod_name",
            "product_type_name",
            "product_group_name",
            "graphical_appearance_name",
            "colour_group_name",
            "perceived_colour_value_name",
            "perceived_colour_master_name",
            "department_name",
            "index_name",
            "index_group_name",
            "section_name",
            "garment_group_name",
            "detail_desc",
        ]
        # TODO consider storing blacklisted cols instead of whitelisted

        df_articles = df_articles[columns_articles]

        df = pd.merge(
            df_transactions, df_customers, how="inner", on=["customer_id"]
        ).drop(["customer_id"], axis=1)
        df = pd.merge(df, df_articles, how="inner", on=["article_id"]).drop(
            ["article_id"], axis=1
        )
        df["label"] = 1 if positive else 0
        return df

    def __len__(self):
        return len(self.df.index)

    def __getitem__(self, idx):
        row, label = self.df.iloc[idx, :-1], self.df.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def split(
        self, train_portion: float | None = None, test_portion: float | None = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Split full dataset into training and validation set. Note that only one of train_portion or
            test_portion are required (test_portion = 100% - test_portion)

        Args:
            train_portion (float | None, optional): Percentage of rows assigned to training set. Defaults to None.
            test_portion (float | None, optional): Percentage of rows assigned to validation set. Defaults to None.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Train-set and validation-set
        """
        assert any(
            [train_portion, test_portion]
        ), "At least one of train or test portion must be float"
        if train_portion is None:
            train_portion = 1-test_portion
        train = self.df.sample(frac=train_portion)
        test = (
            pd.merge(self.df, train, indicator=True, how="outer")
            .query('_merge=="left_only"')
            .drop("_merge", axis=1)
        )
        return train.reset_index(drop=True), test.reset_index(drop=True)


class HM_train(Data_HM):
    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
    ) -> None:
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
        )

    def __getitem__(self, idx):
        row, label = self.train.iloc[idx, :-1], self.train.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label


class HM_val(Data_HM):
    def __init__(
        self,
        total_cases,
        portion_negatives,
        df_transactions,
        df_articles,
        df_customers: pd.DataFrame,
        train_portion=None,
        test_portion=None,
    ) -> None:
        super().__init__(
            total_cases,
            portion_negatives,
            df_transactions,
            df_articles,
            df_customers,
            train_portion,
            test_portion,
        )

    def __getitem__(self, idx):
        row, label = self.test.iloc[idx, :-1], self.test.iloc[idx, -1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

In [11]:
my_dataset = Data_HM(20, 0.8, df_t, df_a, df_c, train_portion=0.7)
my_test = HM_val(20, 0.8, df_t, df_a, df_c, train_portion=0.7)
my_test.test

Unnamed: 0,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,prod_name,product_type_name,product_group_name,graphical_appearance_name,colour_group_name,perceived_colour_value_name,perceived_colour_master_name,department_name,index_name,index_group_name,section_name,garment_group_name,detail_desc,label
0,1.0,1.0,ACTIVE,,47.0,e28a643283c2b517d3e63d17291ae4b339d79827e32dd3...,Tilly,T-shirt,Garment Upper body,All over pattern,Off White,Light,White,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,T-shirt in lightweight jersey with a rounded h...,1
1,1.0,1.0,ACTIVE,Regularly,22.0,2887d1d82d975bf335461f8c87ce835677844b5cba34c5...,Magaluf lace detail dress,Dress,Garment Full body,Solid,Black,Dark,Black,Dress,Ladieswear,Ladieswear,Womens Casual,Dresses Ladies,Knee-length dress in a viscose weave with embr...,0
2,1.0,1.0,ACTIVE,Regularly,34.0,fe814f9d1b8a657f20a30666c230cba0ad41a3172d5a6d...,Everlacing Love Top,Bikini top,Swimwear,Stripe,Blue,Medium,Blue,Swimwear,Lingeries/Tights,Ladieswear,"Womens Swimwear, beachwear",Swimwear,Fully lined bikini top with lacing at the fron...,0
3,1.0,1.0,ACTIVE,Regularly,31.0,9c97073caea2e274c9ef34e0d90e43fb40078bcb9d15db...,SUPREME RW tights,Leggings/Tights,Garment Lower body,Colour blocking,Grey,Dark,Grey,Ladies Sport Bottoms,Sport,Sport,Ladies H&M Sport,Jersey Fancy,Sports tights in fast-drying functional fabric...,0
4,,,ACTIVE,,31.0,71b88711bd37db08ac1549d73432852f664cb48be8d893...,Trudy Cardigan,Cardigan,Garment Upper body,Solid,Light Green,Dusty Light,Green,Tops Knitwear,Divided,Divided,Divided Collection,Knitwear,"Short, fitted cardigan in a fine knit with a V...",0
5,,,ACTIVE,,48.0,0f23f9b1e451204de97aca27b98e60c089c8f7ed13e6d8...,Lee (1),Top,Garment Upper body,Melange,Blue,Medium Dusty,Blue,Jersey Basic,Ladieswear,Ladieswear,Womens Everyday Basics,Jersey Basic,Long-sleeved top in soft jersey.,0


In [81]:
df = naive_csv_sampler("dataset/transactions_train.csv", sample_size=200)

In [82]:
df

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,aaf7a4cf881cc71b8cf97cd8e9c88ce300eb4fe2a279de...,649445003,0.059305,1
1,2018-09-21,31287b3d29b025cf00822b66b462a415e9c58d65385627...,620337036,0.016932,2
2,2018-09-23,04ebf0daa6de941f870109b5536bc226f574264bd13b25...,637673005,0.033881,2
3,2018-09-28,2e25374e1dd6141985ef534edabbe3ff436b395d1ce8d1...,672498003,0.025407,2
4,2018-10-12,93cb3a871d8997d85f8d765d37d5526b2eabab693919e4...,677219003,0.033881,2
...,...,...,...,...,...
195,2020-08-30,c6ae7c8e763d1127d6991e86a37d4e6fef69742ef2661c...,907527001,0.041441,2
196,2020-08-31,3296834ebcbd763dbd8d854f0883998bcf397cc02e6abb...,805947003,0.042356,2
197,2020-09-06,cfdc06ef05cf8e982bad3ce856bdcdbf4b141b35c2e1ad...,570189003,0.025407,2
198,2020-09-20,d4003b0349e30d5569547bb11ccd69669cdc9db6463c81...,715828028,0.033881,1


Number of rows in complete transactions csv:

In [33]:
newlines_in_csv("dataset/transactions_train.csv")

31788325

<hr>

### Trying out user-user collaborative filtering

In [44]:
"""Psudeocode
* Create customer profiles for all customers in (sampled) dataset
    * i.e. each customer ID has a vector r_ID whose elements represent items purchased
* Compute Jaccard similarity between all r_IDs, independent of position
* For a given customer x, choose the k customers closest to x
* For an article i, wether or not to recommend is based on the recommendation score
    r(x, i) = mean( [rel(y, i) for y in top k] )
"""


def position_indep_jaccard(x: list | set, y: list | set) -> float:
    # Position-independent jaccard-similarity
    x, y = set(x), set(y)
    return len(x.intersection(y)) / len(x.union(y))



def find_customer_similarity(
    df_customer: pd.DataFrame, df_transactions: pd.DataFrame
) -> Tuple[pd.DataFrame, dict]:
    articles_dict = {}
    for cust_ID in df_customer["customer_id"]:
        articles_dict[cust_ID] = df_transactions["article_id"][
            df_transactions["customer_id"] == cust_ID
        ].to_list()
        # Pop customers without purchase history
        if len(articles_dict[cust_ID]) == 0:
            articles_dict.pop(cust_ID)
    num_customers = len(df_customer)
    print(f"{num_customers = }")
    similarity_matrix = np.zeros((num_customers, num_customers))
    # Iterate over customers:
    for r, cust in enumerate(articles_dict.keys()):
        for c, second in enumerate(articles_dict.keys()):
            sim = position_indep_jaccard(articles_dict[cust], articles_dict[second])
            similarity_matrix[r, c] = sim

    return (
        pd.DataFrame(
            similarity_matrix, index=articles_dict.keys(), columns=articles_dict.keys()
        ),
        articles_dict,
    )


def get_recommendation(
    similarity_matrix: pd.DataFrame,
    articles_dict: dict,
    customer_ID: str,
    article_ID: int,
    k: int,
) -> float:
    """Produce recommendation score of an item based on its k closest customer behaviors

    Args:
        similarity_matrix (pd.DataFrame): nxn matrix of similarities between customers
        articles_dict (dict): Dictionary of customer purchases on form {customer_id: [item1, item2, ...]}
        customer_ID (str): The customer the recommendation score is based on
        article_ID (int): The article the score is based on
        k (int): How many (closest) customer-neighbors to include in computation.

    Returns:
        float: Measure of how well the item would fit the customer in question, between [0,1]
    """
    # The k most similar customers IDs:
    closest_customers = (
        similarity_matrix[customer_ID].sort_values(ascending=False)[:k].index
    )
    return (
        sum(1 if article_ID in articles_dict[cust] else 0 for cust in closest_customers)
        / k
    )

In [27]:
# Load sample data
df_cust = pd.read_csv("dataset_sample/customer_min.csv")
df_tr = pd.read_csv("dataset_sample/transactions_min.csv")
df_art = pd.read_csv("dataset_sample/articles_min.csv")
sim_matr, art_dict = find_customer_similarity(df_cust, df_tr)

num_customers = 200


In [37]:
def get_n_recommendations(
    n: int,
    similarity_matrix: pd.DataFrame,
    articles_dict: dict,
    customer_ID: str,
    k: int,
    ignore_purchased: bool = True,
) -> list:
    """Get the n 'best' recommended items for a specific customer ID

    Args:
        n (int): How many items to recommend
        similarity_matrix (pd.DataFrame): Customer similarity matrix
        articles_dict (dict): Dictionary of customer purchases on form {customer_id: [item1, item2, ...]}
        customer_ID (str): _description_
        k (int): _description_
        ignore_purchased (bool, optional): _description_. Defaults to True.

    Returns:
        list: _description_
    """
    # Get rec. score for all cases and choose n with highest score
    # ignore_purchased to ignore those articles customer has already bought
    blacklisted_articles = (
        set(articles_dict[customer_ID]) if ignore_purchased else set()
    )
    art_IDs = set(itertools.chain(*articles_dict.values())) - blacklisted_articles
    score_dict = {
        art_ID: get_recommendation(
            similarity_matrix, articles_dict, customer_ID, art_ID, k
        )
        for art_ID in art_IDs
    }
    n_best_items = {
        k: v for k, v in sorted(score_dict.items(), key=lambda el: el[1], reverse=True)
    }
    # Return entire dict for debug purposes, but otherwise just the article IDs (not scores)
    return list(itertools.islice(n_best_items.items(), n))
    return list(n_best_items.keys())[:n]

In [43]:
get_n_recommendations(
    n=5,
    similarity_matrix=sim_matr,
    articles_dict=art_dict,
    customer_ID="008068b49b6bdd622ed406e30c8603270770174ebf300dbac0f5beac522921e0",
    k=5,
)

[(770851001, 0.2),
 (806388003, 0.2),
 (615154002, 0.2),
 (830702001, 0.2),
 (677561001, 0.2)]

In [26]:

get_recommendation(
    similarity_matrix=sim_matr,
    articles_dict=art_dict,
    customer_ID='008068b49b6bdd622ed406e30c8603270770174ebf300dbac0f5beac522921e0',
    article_ID=556255001,
    k=5
)


0.4

In this case, two of the $k$ closest customers (including the customer itself) has bought the article in question. Thus we get a score of $\frac25=0.4$

<hr>

### Methods for metric evaluation (MAP@12)

In [31]:
def prec(k: int, preds: np.ndarray, true: np.ndarray) -> float:
    """Precision function with cutoff (k). Used for MAP@12 metric.

    Args:
        k (int): Cutoff point for prediction array
        preds (np.ndarray): Prediction array
        true (np.ndarray): Ground truth

    Returns:
        float: Precision, i.e. portion of correctly predicted values

    """
    # Assumes that preds and true are 1d arrays ['a','b',...]
    return len(np.intersect1d(preds[:k], true))/k

def rel(k: int, preds: np.ndarray, true: np.ndarray) -> int:
    assert 0 < k <= len(preds), "k must be able to index preds!"
    return int(preds[k-1] in true)

def MAPk(k, preds, true) -> float:
    return np.mean([
        np.sum([prec(i,p,t)*rel(i,p,t) for i in range(1,k+1)])/\
            min(k, len(true))\
                for t, p in zip(true, preds)
    ])

In [29]:
# Tests
import unittest
class TestMetricFunctions(unittest.TestCase):
    def __init__(self, methodName: str = 'runTest') -> None:
        self.gt = np.array(['a', 'b', 'c', 'd', 'e'])
        self.preds1 = np.array(['b', 'c', 'a', 'd', 'e'])
        self.preds2 = np.array(['a', 'b', 'c', 'd', 'e'])
        self.preds3 = np.array(['f', 'b', 'c', 'd', 'e'])
        self.preds4 = np.array(['a', 'f', 'e', 'g', 'b'])
        self.preds5 = np.array(['a', 'f', 'c', 'g', 'b'])
        self.preds6 = np.array(['d', 'c', 'b', 'a', 'e'])
        super().__init__(methodName)

    def test_prec(self):
        self.assertAlmostEqual(prec(1, self.preds1, self.gt), 1.0)
        self.assertAlmostEqual(prec(1, self.preds2, self.gt), 1.0)
        self.assertAlmostEqual(prec(1, self.preds3, self.gt), 0.0)
        self.assertAlmostEqual(prec(2, self.preds4, self.gt), 0.5)
        self.assertAlmostEqual(prec(3, self.preds5, self.gt), 2/3)
        self.assertAlmostEqual(prec(3, self.preds6, self.gt), 1.0)
    
    def test_rel(self):
        self.assertAlmostEqual(rel(1, self.preds1, self.gt), 1.0)
        self.assertAlmostEqual(rel(1, self.preds2, self.gt), 1.0)
        self.assertAlmostEqual(rel(1, self.preds3, self.gt), 0.0)
        self.assertAlmostEqual(rel(2, self.preds4, self.gt), 0.0)
        self.assertAlmostEqual(rel(3, self.preds5, self.gt), 1.0)
        self.assertAlmostEqual(rel(3, self.preds6, self.gt), 1.0)
    
    def test_mapk(self):
        all_true = np.array([self.gt for i in range(6)])
        all_pred = np.array([self.preds1, self.preds2, self.preds3,\
                            self.preds4, self.preds5, self.preds6])
        self.assertAlmostEqual(MAPk(k=4, preds=all_pred, true=all_true), 0.71875)
unittest.main(argv=[''], verbosity=2, exit=False)

test_mapk (__main__.TestMetricFunctions) ... ok
test_prec (__main__.TestMetricFunctions) ... ok
test_rel (__main__.TestMetricFunctions) ... ok

----------------------------------------------------------------------
Ran 3 tests in 0.003s

OK


<unittest.main.TestProgram at 0x1bd1ab12110>

<hr>

### Playing around with Torch embedding

In [14]:
# First network: let the labels determine if a customer has purchased it or not. Ignore also images for now
import torch, os
import pandas as pd
from torch.utils.data import Dataset, DataLoader


class Dataset_HM(Dataset):
    def __init__(
        self,
        customer_id,
        transactions_file,
        customers_file,
        articles_file,
        transform=None,
        target_transform=None,
    ) -> None:
        # I guess we need the customer ID to get the labels for all articles...
            # Alternatively we need to repeat that process for each customer ID in dataset
        self.df_articles = pd.read_csv(articles_file)
        self.df_customers = pd.read_csv(customers_file)
        self.df_transactions = pd.read_csv(transactions_file)

        


        # for c_id in self.df_customers['customer_id']:
        #     purchased_train = self.df_transactions[self.df_transactions["customer_id"] == c_id][
        #         "article_id" # Articles bought by customer with ID `id`
        #     ]

        #     with open("tmp.csv", "a") as f:
        #         for a_id in self.df_articles["article_id"]:
        #             f.write(f"{c_id}, {a_id}, {1 if a_id in purchased_train.values else 0}\n")

        # self.labels_train = pd.read_csv("tmp.csv")
        # os.remove("tmp.csv")

    def __len__(self):
        return len(self.labels_train)

    def __getitem__(self, idx):
        row = self.df_article.iloc[idx]  # Here we only use the info in the articles lol
        label = self.labels_train.iloc[idx, 1]
        if self.transform:
            row = self.transform(row)
        if self.target_transform:
            label = self.target_transform(label)
        return row, label

    def get_loader(self, test: bool = False):
        data = self.test_data if test else self.train_data
        return DataLoader(data, batch_size=64, shuffle=True)


data_testing = Dataset_HM(
    "a301a140b47463f6daf3d9ca358729889e407581b02ce98ce05771d1028d75a3",
    "dataset_sample/transactions_min.csv",
    "dataset_sample/articles_min.csv",
)
len(data_testing)

4272

In [19]:
import torch
import os
from torchvision.io import read_image
import pandas as pd


class Model_HM(torch.nn.Module):
    def __init__(self, num_users, num_items, embedding_size):
        super(Model_HM, self).__init__()
        self.user_embeddings = torch.nn.Embedding(
            num_embeddings=num_users, embedding_dim=embedding_size
        )
        self.article_embeddings = torch.nn.Embedding(
            num_embeddings=num_items, embedding_dim=embedding_size
        )

    def forward(self, users, items):
        user_embeddings = self.user_embeddings(users)
        article_embeddings = self.article_embeddings(items)
        dot_prod = torch.sum(torch.mul(user_embeddings, article_embeddings), 1)
        return torch.sigmoid(dot_prod)

In [15]:
data_testing = Dataset_HM(
    "a301a140b47463f6daf3d9ca358729889e407581b02ce98ce05771d1028d75a3",
    "dataset_sample/transactions_min.csv",
    "dataset_sample/articles_min.csv",
)
dl_train = data_testing.get_loader()
dl_test = data_testing.get_loader(test=True)

AttributeError: 'Dataset_HM' object has no attribute 'train_data'