In [1]:
# helper function
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import pickle as pkl

def convert_c_id(series, dictionary):
    """map customer id to int, this will reduce the memory usage by 16x"""
    return series.map(dictionary).astype('int32')

def a_id_str_to_int(series):
    """map article id to int, this will reduce the memory usage by 2.5x"""
    return series.astype('int32')

def a_id_int_to_str(series):
    """convert back to original article id for submission"""
    return '0' + series.astype('str')

def codes_cat(series):
    """reduce data memory by converting categorical obj to int"""
    return pd.Series(pd.Categorical(series, categories=series.dropna().unique()).codes)

In [2]:
from pathlib import Path

data_path = Path('../input/h-and-m-personalized-fashion-recommendations')

t_df = pd.read_csv(data_path/'transactions_train.csv')
t_df['t_dat'] = pd.to_datetime(t_df['t_dat'])

c_df = pd.read_csv(data_path/'customers.csv')

a_df = pd.read_csv(data_path/'articles.csv')

In [3]:
t_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 5 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       object        
 2   article_id        int64         
 3   price             float64       
 4   sales_channel_id  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 4.5 GB


In [4]:
# for memory efficiency
id_to_index_dict = dict(zip(c_df["customer_id"], c_df.index))
index_to_id_dict = dict(zip(c_df.index, c_df["customer_id"]))

t_df["customer_id"] = convert_c_id(t_df["customer_id"], id_to_index_dict)
t_df['article_id'] = a_id_str_to_int(t_df['article_id'])
t_df['sales_channel_id'] = t_df['sales_channel_id'].astype('int8')

t_df['week'] = 104 - (t_df.t_dat.max() - t_df.t_dat).dt.days // 7
t_df['week'] = t_df['week'].astype('int8')

t_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31788324 entries, 0 to 31788323
Data columns (total 6 columns):
 #   Column            Dtype         
---  ------            -----         
 0   t_dat             datetime64[ns]
 1   customer_id       int32         
 2   article_id        int32         
 3   price             float64       
 4   sales_channel_id  int8          
 5   week              int8          
dtypes: datetime64[ns](1), float64(1), int32(2), int8(2)
memory usage: 788.2 MB


In [5]:
c_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   customer_id             1371980 non-null  object 
 1   FN                      476930 non-null   float64
 2   Active                  464404 non-null   float64
 3   club_member_status      1365918 non-null  object 
 4   fashion_news_frequency  1355971 non-null  object 
 5   age                     1356119 non-null  float64
 6   postal_code             1371980 non-null  object 
dtypes: float64(3), object(4)
memory usage: 512.3 MB


In [6]:
# Handle missing value for customer df and reduce memory space
c_df['customer_id'] = convert_c_id(c_df['customer_id'], id_to_index_dict)

c_df['FN'] = c_df['FN'].fillna(-1).astype('int8')

c_df['Active'] = c_df['Active'].fillna(-1).astype('int8')

c_df['club_member_status'] = codes_cat(c_df['club_member_status'])

c_df['fashion_news_frequency'] = codes_cat(c_df['fashion_news_frequency'])

c_df['age'] = c_df['age'].fillna(-1).astype('int8')

c_df['postal_code'] = codes_cat(c_df['postal_code'])

c_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1371980 entries, 0 to 1371979
Data columns (total 7 columns):
 #   Column                  Non-Null Count    Dtype
---  ------                  --------------    -----
 0   customer_id             1371980 non-null  int32
 1   FN                      1371980 non-null  int8 
 2   Active                  1371980 non-null  int8 
 3   club_member_status      1371980 non-null  int8 
 4   fashion_news_frequency  1371980 non-null  int8 
 5   age                     1371980 non-null  int8 
 6   postal_code             1371980 non-null  int32
dtypes: int32(2), int8(5)
memory usage: 17.0 MB


In [7]:
a_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-null  int64 
 13 

In [8]:
for col in a_df.columns:
    if a_df[col].dtype == 'object':
        a_df[col] = codes_cat(a_df[col])
        
a_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype
---  ------                        --------------   -----
 0   article_id                    105542 non-null  int64
 1   product_code                  105542 non-null  int64
 2   prod_name                     105542 non-null  int32
 3   product_type_no               105542 non-null  int64
 4   product_type_name             105542 non-null  int16
 5   product_group_name            105542 non-null  int8 
 6   graphical_appearance_no       105542 non-null  int64
 7   graphical_appearance_name     105542 non-null  int8 
 8   colour_group_code             105542 non-null  int64
 9   colour_group_name             105542 non-null  int8 
 10  perceived_colour_value_id     105542 non-null  int64
 11  perceived_colour_value_name   105542 non-null  int8 
 12  perceived_colour_master_id    105542 non-null  int64
 13  perceived_colo

The data train was huge, 30 GB of RAM won't enough to process it. With a memory trick (credit: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/308635), we can get enough RAM to work with it. We use the last 9 weeks for training.

# Candidates Generation

Candidates generated below can be used as candidates for evaluation by our ranker (for the test week) and as negative examples (for training data). In this notebook, I generate candidates from 4 sources:
- Customer expenses on transactions, aggregate price
- Customer last item purchased
- Bestseller weekly
- Bestseller last week (week 104)


In [9]:
# only use 9 week data train because limited resource

test_week = t_df['week'].max() + 1
t_df = t_df[t_df['week'] > test_week - 9]

In [10]:
t_df.sort_values(['t_dat', 'customer_id'], inplace=True)

In [11]:
# Add detail about the prices from transactions of each customer
agg_cust_df = t_df \
    .groupby('customer_id')['price'] \
    .agg(
        mean_transactions = 'mean',
        max_transactions = 'max',
        min_transactions = 'min',
        median_transactions = 'median',
        sum_transactions = 'sum'
    ) \
    .astype('float32')

In [12]:
c_weeks = t_df.groupby('customer_id')['week'].unique()

c_weeks_shifted_weeks = {}

for c_id, weeks in c_weeks.items():
    c_weeks_shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c_weeks_shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c_weeks_shifted_weeks[c_id][weeks[-1]] = test_week

In [13]:
candidates_repurchase_df = t_df.copy()

weeks = []
for _, (c_id, week) in enumerate(zip(t_df['customer_id'], t_df['week'])):
    weeks.append(c_weeks_shifted_weeks[c_id][week])
    
candidates_repurchase_df['week'] = weeks

In [14]:
mean_price_df = t_df.groupby(['week', 'article_id'])['price'].mean()

In [15]:
top_item_weekly_df = t_df \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank')

In [16]:
bs_prev_week_df = pd.merge(top_item_weekly_df, mean_price_df, on=['week', 'article_id']).reset_index()
bs_prev_week_df['week'] += 1

In [17]:
unique_t_df = t_df \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [18]:
candidates_bs_df = pd.merge(
    unique_t_df,
    bs_prev_week_df,
    on='week',
)

In [19]:
# candidates : top item last week (on w104) for every customer
test_set_t_df = unique_t_df.drop_duplicates('customer_id').reset_index(drop=True)
test_set_t_df.week = test_week

In [20]:
candidates_bs_test_week_df = pd.merge(
    test_set_t_df,
    bs_prev_week_df,
    on='week'
)

In [21]:
candidates_bs_df = pd.concat([candidates_bs_df, candidates_bs_test_week_df])
candidates_bs_df.drop(columns='bestseller_rank', inplace=True)
candidates_bs_df.drop_duplicates(inplace=True)

In [22]:
t_df['label'] = 1
train_df = pd.concat([t_df, candidates_repurchase_df, candidates_bs_df])
train_df['label'].fillna(0, inplace=True)

In [23]:
train_df.drop(train_df[train_df['week'] == train_df['week'].min()].index, inplace=True)

In [24]:
# add rank info 
train_df = pd.merge(
    train_df,
    bs_prev_week_df[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [25]:
train_df['bestseller_rank'].fillna(999, inplace=True)

In [26]:
# add price info
train_df = pd.merge(train_df, agg_cust_df, on='customer_id', how='left')

In [27]:
train_df = pd.merge(train_df, a_df, on='article_id', how='left')
train_df = pd.merge(train_df, c_df, on='customer_id', how='left')

train_df.sort_values(['week', 'customer_id'], ignore_index=True, inplace=True)

In [28]:
test_df = train_df[train_df['week'] == test_week].drop_duplicates(['customer_id', 'article_id']).copy()
train_df = train_df[train_df['week'] != test_week]

In [29]:
train_baskets = train_df.groupby(['week', 'customer_id'])['article_id'].count().values

In [30]:
features = [
    'article_id', 'product_type_no', 
    'graphical_appearance_no', 'colour_group_code', 
    'perceived_colour_value_id','perceived_colour_master_id', 
    'department_no', 'index_code', 
    'index_group_no', 'section_no', 
    'garment_group_no', 'FN', 
    'Active', 'club_member_status', 
    'fashion_news_frequency', 'age', 
    'postal_code', 'bestseller_rank',
    'mean_transactions', 'max_transactions',
    'min_transactions', 'median_transactions',
    'sum_transactions'
]

In [31]:
from lightgbm.sklearn import LGBMRanker

In [32]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    learning_rate=0.03,
    verbose=10
)

In [33]:
%%time

ranker = ranker.fit(
    train_df[features],
    train_df['label'],
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.846213
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.120591
[LightGBM] [Debug] init for col-wise cost 0.209476 seconds, init for row-wise cost 1.169032 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 2316
[LightGBM] [Info] Number of data points in the train set: 8699068, number of used features: 23
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
CPU times: user 27.7 s, sys: 1.43 s, total: 29.1 s
Wall time: 9.63 s


In [34]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(features[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9912160886261228
sum_transactions 0.0073073334638925165
mean_transactions 0.0007370371889232798
median_transactions 0.00013321677857702075
section_no 0.00012890969560645408
max_transactions 0.00012571537765782524
department_no 0.00011042208061311655
article_id 9.233818653610492e-05
min_transactions 4.332317927643898e-05
colour_group_code 3.900684716798114e-05
perceived_colour_value_id 3.636831319026236e-05
garment_group_no 3.0240262436190884e-05
fashion_news_frequency 0.0
club_member_status 0.0
Active 0.0
age 0.0
postal_code 0.0
index_group_no 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
product_type_no 0.0
FN 0.0


In [35]:
%%time

test_df['preds'] = ranker.predict(test_df[features])

c_id_to_predicted_article_ids = test_df \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bs_prev_week_df[bs_prev_week_df['week'] == bs_prev_week_df['week'].max()]['article_id'].tolist()

CPU times: user 12.9 s, sys: 978 ms, total: 13.9 s
Wall time: 12 s


In [36]:
sub = pd.read_csv(data_path/'sample_submission.csv')

In [37]:
%%time
preds = []

for c_id in convert_c_id(sub['customer_id'], id_to_index_dict):
    pred = c_id_to_predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 5.09 s, sys: 145 ms, total: 5.24 s
Wall time: 5.23 s


In [38]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub['prediction'] = preds

In [39]:
sub.to_csv('submission.csv', index=False)