###### lets first start with setting up the data 

import libraries

In [1]:
import gc
import numpy as np
import pandas as pd
import os
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

In [2]:
# helper functions

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

loading dataset

In [3]:
%%time

articles_df = pd.read_csv("../Dataset/HandM/articles.csv",dtype={"article_id": "str"})
articles_df.article_id = article_id_str_to_int(articles_df.article_id)

customers_df = pd.read_csv("../Dataset/HandM/customers.csv")
customers_df['customer_id'] = customer_hex_id_to_int(customers_df['customer_id'])
for col in ['FN', 'Active', 'age']:
    customers_df[col].fillna(-1, inplace=True)
    customers_df[col] = customers_df[col].astype('int8')


sample_submission_df = pd.read_csv("../Dataset/HandM/sample_submission.csv")

transaction_train = pd.read_csv("../Dataset/HandM/transactions_train.csv",dtype={"article_id": "str"})
transaction_train['customer_id'] = customer_hex_id_to_int(transaction_train['customer_id'])
transaction_train.t_dat = pd.to_datetime(transaction_train.t_dat, format='%Y-%m-%d')
transaction_train.article_id = article_id_str_to_int(transaction_train.article_id)
transaction_train['week'] = 104 - (transaction_train.t_dat.max() - transaction_train.t_dat).dt.days // 7

transaction_train["t_dat"] = pd.to_datetime(transaction_train["t_dat"])
#taking only the first year
transaction_train = transaction_train[transaction_train["t_dat"] > "2019-09-01"]

transaction_train["week"] = (transaction_train["t_dat"].max() - transaction_train["t_dat"]).dt.days // 7
transaction_train["week"].value_counts()

transaction_train.week = transaction_train.week.astype('int8')
transaction_train.sales_channel_id = transaction_train.sales_channel_id.astype('int8')
transaction_train.price = transaction_train.price.astype('float32')

Wall time: 48.2 s


In [4]:
ALL_USERS = customers_df['customer_id'].unique().tolist()

ALL_ITEMS = articles_df['article_id'].unique().tolist()

user_to_customer_map = {user_id : customer_id for user_id, customer_id in enumerate(ALL_USERS)}
customer_to_user_map = {customer_id : user_id for user_id, customer_id in enumerate(ALL_USERS)}

item_to_article_map = {item_id : article_id for item_id, article_id in enumerate(ALL_ITEMS)}
article_to_item_map = {article_id : item_id for item_id, article_id in enumerate(ALL_ITEMS)}

In [5]:
customers_df['user_id'] = customers_df['customer_id'].map(customer_to_user_map)
customers_df['user_id'] = customers_df['user_id'].astype('int8')

customers_df.club_member_status = Categorize().fit_transform(customers_df[['club_member_status']]).club_member_status
customers_df.postal_code = Categorize().fit_transform(customers_df[['postal_code']]).postal_code
customers_df.fashion_news_frequency = Categorize().fit_transform(customers_df[['fashion_news_frequency']]).fashion_news_frequency

In [6]:
for col in articles_df.columns:
    if articles_df[col].dtype == 'object':
        articles_df[col] = Categorize().fit_transform(articles_df[[col]])[col]

for col in articles_df.columns:
    if articles_df[col].dtype == 'int64':
        articles_df[col] = articles_df[col].astype('int32')        
        


In [7]:
def missing_data(data):
    total = data.isnull().sum().sort_values(ascending = False)
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False)
    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

    Since we have relatively clean data. We can start transforming our data.

In [8]:
missing_data(transaction_train)

Unnamed: 0,Total,Percent
t_dat,0,0.0
customer_id,0,0.0
article_id,0,0.0
price,0,0.0
sales_channel_id,0,0.0
week,0,0.0


Dimension reduction since we will be taking the last year of data. 

In [9]:
transaction_train.drop(columns='t_dat').info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15637202 entries, 16151122 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype  
---  ------            -----  
 0   customer_id       uint64 
 1   article_id        int32  
 2   price             float32
 3   sales_channel_id  int8   
 4   week              int8   
dtypes: float32(1), int32(1), int8(2), uint64(1)
memory usage: 387.7 MB


In [10]:
customers_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   customer_id             1371980 non-null  uint64
 1   FN                      1371980 non-null  int8  
 2   Active                  1371980 non-null  int8  
 3   club_member_status      1371980 non-null  int8  
 4   fashion_news_frequency  1371980 non-null  int8  
 5   age                     1371980 non-null  int8  
 6   postal_code             1371980 non-null  int32 
 7   user_id                 1371980 non-null  int8  
dtypes: int32(1), int8(6), uint64(1)
memory usage: 23.6 MB


In [17]:
print(articles_df.dtypes)
print(transaction_train.dtypes)
print(customers_df.dtypes)

article_id                      int32
product_code                    int32
prod_name                       int32
product_type_no                 int32
product_type_name               int16
product_group_name               int8
graphical_appearance_no         int32
graphical_appearance_name        int8
colour_group_code               int32
colour_group_name                int8
perceived_colour_value_id       int32
perceived_colour_value_name      int8
perceived_colour_master_id      int32
perceived_colour_master_name     int8
department_no                   int32
department_name                 int16
index_code                       int8
index_name                       int8
index_group_no                  int32
index_group_name                 int8
section_no                      int32
section_name                     int8
garment_group_no                int32
garment_group_name               int8
detail_desc                     int32
dtype: object
t_dat               datetime64[ns]
c

In [11]:
# %%time

# transaction_train.to_parquet('../Dataset/HandM/transaction_train.parquet')
# customers_df.to_parquet('../Dataset/HandM/customers.parquet')
# articles_df.to_parquet('../Dataset/HandM/articles.parquet')

In [12]:
# %%time
# # let's create a 5% sample of the entiriety of the data to speed up dev

# sample = 0.05
# customers_sample = customers.sample(frac=sample, replace=False)
# customers_sample_ids = set(customers_sample['customer_id'])
# transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
# articles_sample_ids = set(transactions_sample["article_id"])
# articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]

# customers_sample.to_parquet(f'data/customers_sample_{sample}.parquet', index=False)
# transactions_sample.to_parquet(f'data/transactions_train_sample_{sample}.parquet', index=False)
# articles_sample.to_parquet(f'data/articles_train_sample_{sample}.parquet', index=False)

##### create mapping from ids to incremental integers and viceversa


In [13]:
# week is between 0 - 104 # roughly two years
WEEK_HIST_MAX = 5
def create_dataset(transaction_train, articles_df, week):
    
    #encode article_id and customer_id
    transaction_train['item_id'] = transaction_train['article_id'].map(article_to_item_map)
    transaction_train['user_id'] = transaction_train['customer_id'].map(customer_to_user_map)
    
    cols = ['article_id','prod_name','department_name']
    articles = articles_df[cols]
    articles['item_id'] = articles['article_id'].map(article_to_item_map)
    
    
    
    
    hist_df = transaction_train[(transaction_train["week"] > week) & (transaction_train["week"] <= week + WEEK_HIST_MAX)]
    hist_df = transaction_train.merge(articles,how = 'left', left_on = 'item_id', right_on='item_id')
    
    hist_df = hist_df.groupby("user_id").agg({"item_id": list, "week": list, "prod_name": list}).reset_index()
    hist_df.rename(columns={"week": 'week_history'}, inplace=True)
    
    target_df = transaction_train[transaction_train["week"] == week]
    target_df = target_df.groupby("user_id").agg({"item_id": list}).reset_index()
    target_df.rename(columns={"item_id": "target"}, inplace=True)
    target_df["week"] = week
    
    return target_df.merge(hist_df, on="user_id", how="left")


In [14]:
%%time
val_weeks = [0]
train_weeks = [1, 2, 3, 4]

val_df = pd.concat([create_dataset(transaction_train, articles_df, w) for w in val_weeks]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles['item_id'] = articles['article_id'].map(article_to_item_map)


Wall time: 20.1 s


In [15]:
train_df = pd.concat([create_dataset(transaction_train, articles_df, w) for w in train_weeks]).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  articles['item_id'] = articles['article_id'].map(article_to_item_map)


In [16]:
train_df

Unnamed: 0,user_id,target,week,item_id,week_history,prod_name
0,2,[78503],1,"[59458, 1469, 1469, 60253, 60259, 93585, 91841...","[33, 33, 33, 24, 24, 22, 22, 22, 22, 22, 22, 2...","[5126, 894, 894, 394, 394, 21703, 452, 2303, 2..."
1,6,"[58295, 3091]",1,"[57068, 75961, 58295, 3091]","[48, 48, 1, 1]","[129, 6167, 3330, 259]"
2,38,[61916],1,"[39379, 75146, 35495, 52962, 86442, 771, 40074...","[54, 54, 54, 54, 54, 54, 54, 52, 52, 52, 52, 5...","[3962, 24441, 33658, 8707, 2286, 975, 6609, 13..."
3,86,"[33868, 27905, 98606, 98606, 100228]",1,"[86215, 1780, 96635, 61602, 97677, 76208, 9723...","[26, 26, 26, 12, 12, 7, 7, 7, 6, 6, 6, 2, 2, 2...","[28266, 249, 18750, 435, 5299, 14086, 7546, 75..."
4,90,"[97666, 97667, 97666, 97667]",1,"[33971, 69224, 80087, 33970, 62346, 62346, 708...","[51, 51, 51, 51, 47, 47, 47, 23, 23, 23, 23, 1...","[2358, 14565, 13390, 2358, 10395, 10395, 40631..."
...,...,...,...,...,...,...
300124,1371937,[59774],4,"[82437, 102944, 59774, 67261, 70640]","[20, 12, 4, 0, 0]","[16786, 1756, 1086, 884, 7488]"
300125,1371949,"[101374, 101368]",4,"[50455, 26106, 46399, 10745, 80900, 20089, 610...","[54, 51, 51, 51, 51, 51, 51, 51, 50, 46, 46, 4...","[15498, 5580, 70, 20677, 10291, 108, 1294, 206..."
300126,1371960,"[54341, 105185]",4,"[83093, 83093, 85501, 85501, 87812, 89382, 209...","[50, 50, 50, 50, 39, 20, 20, 20, 20, 20, 20, 2...","[11053, 11053, 27945, 27945, 9609, 24828, 307,..."
300127,1371963,"[96812, 96272]",4,"[72045, 64220, 90556, 90556, 67584, 93090, 681...","[52, 52, 36, 36, 36, 36, 36, 12, 11, 11, 10, 1...","[14704, 2002, 25534, 25534, 11349, 30282, 6125..."


In [17]:
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm


In [18]:
article_ids = np.concatenate([["placeholder"], np.unique(articles_df["article_id"].values)])

class HMDataset(Dataset):
    def __init__(self, df, seq_len, is_test=False):
        self.df = df.reset_index(drop=True)
        self.seq_len = seq_len
        self.is_test = is_test
    
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        if self.is_test:
            target = torch.zeros(2).float() # buy/not buy
        else: 
            target = torch.zeros(len(article_ids)).float()
            for t in row.target:
                target[t] = 1.0
            
        item_hist = torch.zeros(self.seq_len).long()
        week_hist = torch.ones(self.seq_len).float()
        prod_name_hist = torch.zeros(self.seq_len).long()
        
        # encoding each data point into tensors
        
        if isinstance(row.item_id, list):
            if len(row.item_id) >= self.seq_len:
                item_hist = torch.LongTensor(row.item_id[-self.seq_len:])
                week_hist = (torch.LongTensor(row.week_history[-self.seq_len:]) - row.week)/WEEK_HIST_MAX/2
                prod_name_hist = torch.LongTensor(row.prod_name[-self.seq_len:])
            else:
                item_hist[-len(row.item_id):] = torch.LongTensor(row.item_id)
                week_hist[-len(row.item_id):] = (torch.LongTensor(row.week_history) - row.week)/WEEK_HIST_MAX/2
                prod_name_hist[-len(row.item_id):] = torch.LongTensor(row.prod_name)
                
        return item_hist, week_hist, prod_name_hist, target
    
# seq_len is how long the tensor will be.  t

# why if it is test, we can have torch 
# leng of 2 while the other is seq_len 



In [19]:
HMDataset(val_df, 64)[1]

(tensor([     0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,  86215,
           1780,  96635,  61602,  97677,  76208,  97234,  97234, 104805, 104209,
         100630, 103583, 102472, 102710,  33868,  27905,  98606,  98606, 100228,
          87371]),
 tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 2.6000,
         

In [20]:
val = HMDataset(val_df, 64)
train = HMDataset(train_df, 64)

In [21]:
def adjust_lr(optimizer, epoch):
    if epoch < 1:
        lr = 5e-5
    elif epoch < 6:
        lr = 1e-3
    elif epoch < 9:
        lr = 1e-4
    else:
        lr = 1e-5

    for p in optimizer.param_groups:
        p['lr'] = lr
    return lr
    
def get_optimizer(net):
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=3e-4, betas=(0.9, 0.999),
                                 eps=1e-08)
    return optimizer

In [22]:
import torch.nn as nn
import torch.nn.functional as F

class HMModel(nn.Module):
    def __init__(self, article_shape):
        super(HMModel, self).__init__()
        
        self.article_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])
#         self.prod_emb = nn.Embedding(article_shape[0], embedding_dim=article_shape[1])
        
        self.article_likelihood = nn.Parameter(torch.zeros(article_shape[0]), requires_grad=True)
        self.top = nn.Sequential(nn.Conv1d(4, 32, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(32, 8, kernel_size=1), nn.LeakyReLU(),
                                 nn.Conv1d(8, 1, kernel_size=1))
#         why is the out put here 1?

    def forward(self, inputs):
        article_hist, week_hist = inputs[0], inputs[1]
        
        print("ARTICLE HISTORY")
        print(article_hist)
        print("article_hist shape")
        print(article_hist.shape)
        
        
        prod_hist = inputs[2]
        
        
        print("prod_hist HISTORY")
        print(prod_hist)
        print("prod_hist shape")
        print(prod_hist.shape)
        
        x = self.article_emb(article_hist)
        x = F.normalize(x, dim=2)
        x = x@F.normalize(self.article_emb.weight).T
        x, indices = x.max(axis=1)
        x = x.clamp(1e-3, 0.999)
        x = -torch.log(1/x - 1)
        
        
#         x1 = self.prod_emb(prod_hist)
#         x1 = F.normalize(x1, dim=2)
#         x1 = x@F.normalize(self.prod_emb.weight).T
#         x1, indices = x1.max(axis=1)
#         x1 = x.clamp(1e-3, 0.999)
#         x1 = -torch.log(1/x1 - 1)
        
        max_week = week_hist.unsqueeze(2).repeat(1, 1, x.shape[-1]).gather(1, indices.unsqueeze(1).repeat(1, week_hist.shape[1], 1))
        max_week = max_week.mean(axis=1).unsqueeze(1)
        
        max_prod = prod_hist.unsqueeze(2).repeat(1, 1, x.shape[-1]).gather(1, indices.unsqueeze(1).repeat(1, prod_hist.shape[1], 1))
        max_prod = max_prod.mean(axis=1).unsqueeze(1)
        
        x = torch.cat([x.unsqueeze(1), max_week, max_prod,
                       self.article_likelihood[None, None, :].repeat(x.shape[0], 1, 1)], axis=1)

        
        x = self.top(x).squeeze(1)
        return x
    
    
model = HMModel((len(transaction_train.item_id.unique()), 512))

model = model.cuda()

In [23]:
import sys

def calc_map(topk_preds, target_array, k=12):
    metric = []
    tp, fp = 0, 0
    
    for pred in topk_preds:
        if target_array[pred]:
            tp += 1
            metric.append(tp/(tp + fp))
        else:
            fp += 1
            
    return np.sum(metric) / min(k, target_array.sum())

def read_data(data):
    return tuple(d.cuda() for d in data[:-1]), data[-1].cuda()


def validate(model, val_loader, k=12):
    model.eval()
    
    tbar = tqdm(val_loader, file=sys.stdout)
    
    maps = []
    
    with torch.no_grad():
        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            logits = model(inputs)

            _, indices = torch.topk(logits, k, dim=1)

            indices = indices.detach().cpu().numpy()
            target = target.detach().cpu().numpy()

            for i in range(indices.shape[0]):
                maps.append(calc_map(indices[i], target[i]))
        
    
    return np.mean(maps)

SEQ_LEN = 16

BS = 256
NW = 8

val_dataset = HMDataset(val_df, SEQ_LEN)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=NW,
                          pin_memory=False, drop_last=False)

In [None]:
def dice_loss(y_pred, y_true):
    y_pred = y_pred.sigmoid()
    intersect = (y_true*y_pred).sum(axis=1)
    
    return 1 - (intersect/(intersect + y_true.sum(axis=1) + y_pred.sum(axis=1))).mean()


def train(model, train_loader, val_loader, epochs):
    np.random.seed(SEED)
    
    optimizer = get_optimizer(model)
    scaler = torch.cuda.amp.GradScaler()

    criterion = torch.nn.BCEWithLogitsLoss()
    
    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)
        
        lr = adjust_lr(optimizer, e)
        
        loss_list = []

        for idx, data in enumerate(tbar):
            inputs, target = read_data(data)

            optimizer.zero_grad()
            
            with torch.cuda.amp.autocast():
                logits = model(inputs)
                loss = criterion(logits, target) + dice_loss(logits, target)
            
            
            #loss.backward()
            scaler.scale(loss).backward()
            #optimizer.step()
            scaler.step(optimizer)
            scaler.update()
            
            loss_list.append(loss.detach().cpu().item())
            
            avg_loss = np.round(100*np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e+1} Loss: {avg_loss} lr: {lr}")
            
        val_map = validate(model, val_loader)

        log_text = f"Epoch {e+1}\nTrain Loss: {avg_loss}\nValidation MAP: {val_map}\n"
            
        print(log_text)
        
        #logfile = open(f"models/{MODEL_NAME}_{SEED}.txt", 'a')
        #logfile.write(log_text)
        #logfile.close()
    return model


MODEL_NAME = "exp001"
SEED = 0

train_dataset = HMDataset(train_df, SEQ_LEN)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=NW,
                          pin_memory=False, drop_last=True)

# model = train(model, train_loader, val_loader, epochs=10)

  0%|          | 0/1172 [00:00<?, ?it/s]

In [None]:
val_dataset

In [None]:
articles_df.shape

In [None]:
articles_df.prod_name.nunique()