### import library

In [8]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
# from wordcloud import WordCloud, STOPWORDS
from datetime import datetime
from PIL import Image
from plotnine import *
from sklearn.preprocessing import OrdinalEncoder
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
import implicit

### import data

In [2]:
articles_df = pd.read_csv("./Dataset/HandM/articles.csv",dtype={'article_id': str})
customers_df = pd.read_csv("./Dataset/HandM/customers.csv")
transactions_df = pd.read_csv("./Dataset/HandM/transactions_train.csv",dtype={'article_id': str}, parse_dates=['t_dat'])
sample_submission_df = pd.read_csv("./Dataset/HandM/sample_submission.csv")

taking fewer samples due to memory limitation

In [3]:
df = transactions_df[transactions_df['t_dat'] > '2020-08-21']

remapped all the item and user 

In [5]:
ALL_USERS = customers_df['customer_id'].unique().tolist()
ALL_ITEMS = articles_df['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

transactions_df['user_id'] = transactions_df['customer_id'].map(user_map)
transactions_df['item_id'] = transactions_df['article_id'].map(item_map)

del articles_df, customers_df

Create coo_matrix (user x item) and csr matrix (user x item)

In [6]:
row = transactions_df['user_id'].values
col = transactions_df['item_id'].values
data = np.ones(transactions_df.shape[0])
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train

<1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
	with 31788324 stored elements in COOrdinate format>

In [9]:
%%time
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)



  0%|          | 0/2 [00:00<?, ?it/s]

Wall time: 4.23 s


In [10]:
def to_user_item_coo(df):
    """ Turn a dataframe with transactions into a COO sparse items x users matrix"""
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo


def split_data(df, validation_days=7):
    """ Split a pandas dataframe into training and validation data, using <<validation_days>>
    """
    validation_cut = df['t_dat'].max() - pd.Timedelta(validation_days)

    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df, validation_days=7):
    """ Split into training and validation and create various matrices
        
        Returns a dictionary with the following keys:
            coo_train: training data in COO sparse format and as (users x items)
            csr_train: training data in CSR sparse format and as (users x items)
            csr_val:  validation data in CSR sparse format and as (users x items)
    
    """
    df_train, df_val = split_data(df, validation_days=validation_days)
    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val
          }


def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension) 
    for <<iterations>> over matrices and validate with MAP@12
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
    
    model = implicit.als.AlternatingLeastSquares(factors=factors, 
                                                 iterations=iterations, 
                                                 regularization=regularization, 
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    
    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@12 to a library that allows repeated items in prediction
    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12

In [11]:
matrices = get_val_matrices(transactions_df)

In [None]:
%%time
best_map12 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")

Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00315
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}
Factors:  40 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00362
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 12, 'regularization': 0.01}
Factors:  40 - Iterations: 14 - Regularization: 0.010 ==> MAP@12: 0.00367
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 14, 'regularization': 0.01}
Factors:  40 - Iterations: 15 - Regularization: 0.010 ==> MAP@12: 0.00369
Best MAP@12 found. Updating: {'factors': 40, 'iterations': 15, 'regularization': 0.01}
Factors:  40 - Iterations: 20 - Regularization: 0.010 ==> MAP@12: 0.00368
Factors:  50 - Iterations:  3 - Regularization: 0.010 ==> MAP@12: 0.00354
Factors:  50 - Iterations: 12 - Regularization: 0.010 ==> MAP@12: 0.00390
Best MAP@12 found. Updating: {'factors': 50, 'iterations': 12, 'regularization': 0.01}
Factors:  50 - Iterations: 14 - Regularization: 

In [14]:
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()

KeyError: 'user_id'

### i want to take age, fashion new frequency as my two parameter for customer

In [41]:
customers_df.columns

Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')

In [50]:
users = customers_df[['customer_id','fashion_news_frequency','age']]
users

Unnamed: 0,customer_id,fashion_news_frequency,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,NONE,49.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,NONE,25.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,NONE,24.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,NONE,54.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,Regularly,52.0
...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,NONE,24.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,NONE,21.0
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,Regularly,21.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,Regularly,18.0


In [52]:
articles_df.columns

Index(['article_id', 'product_code', 'prod_name', 'product_type_no',
       'product_type_name', 'product_group_name', 'graphical_appearance_no',
       'graphical_appearance_name', 'colour_group_code', 'colour_group_name',
       'perceived_colour_value_id', 'perceived_colour_value_name',
       'perceived_colour_master_id', 'perceived_colour_master_name',
       'department_no', 'department_name', 'index_code', 'index_name',
       'index_group_no', 'index_group_name', 'section_no', 'section_name',
       'garment_group_no', 'garment_group_name', 'detail_desc'],
      dtype='object')

In [53]:
'article_id', 'product_code', 'product_type_no','graphical_appearance_no','colour_group_code','perceived_colour_value_id','perceived_colour_master_id', 'perceived_colour_master_name',
'department_no', 'index_code','index_group_no', 'section_no','garment_group_no'

0         253
1         253
2         253
3         306
4         306
         ... 
105537    302
105538    253
105539    265
105540     72
105541    265
Name: product_type_no, Length: 105542, dtype: int64

In [54]:
articles_df.department_name

0              Jersey Basic
1              Jersey Basic
2              Jersey Basic
3            Clean Lingerie
4            Clean Lingerie
                ...        
105537            Socks Bin
105538               Jersey
105539               Jersey
105540    Small Accessories
105541               Jersey
Name: department_name, Length: 105542, dtype: object

In [59]:
items = articles_df[['article_id', 'product_code', 'product_type_no','graphical_appearance_no','colour_group_code','perceived_colour_value_id','perceived_colour_master_id',
'department_no', 'index_code','index_group_no', 'section_no','garment_group_no']]

items.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,108775015,108775,253,1010016,9,4,5,1676,A,1,16,1002
1,108775044,108775,253,1010016,10,3,9,1676,A,1,16,1002
2,108775051,108775,253,1010017,11,1,9,1676,A,1,16,1002
3,110065001,110065,306,1010016,9,4,5,1339,B,1,61,1017
4,110065002,110065,306,1010016,10,3,9,1339,B,1,61,1017


In [67]:
items = items.set_index('article_id')

In [68]:
items

Unnamed: 0_level_0,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0108775015,108775,253,1010016,9,4,5,1676,A,1,16,1002
0108775044,108775,253,1010016,10,3,9,1676,A,1,16,1002
0108775051,108775,253,1010017,11,1,9,1676,A,1,16,1002
0110065001,110065,306,1010016,9,4,5,1339,B,1,61,1017
0110065002,110065,306,1010016,10,3,9,1339,B,1,61,1017
...,...,...,...,...,...,...,...,...,...,...,...
0953450001,953450,302,1010014,9,4,5,7188,F,3,26,1021
0953763001,953763,253,1010016,9,4,5,1919,A,1,2,1005
0956217002,956217,265,1010016,9,4,5,1641,A,1,18,1005
0957375001,957375,72,1010016,9,4,5,3946,D,2,52,1019


In [61]:
encoder = OrdinalEncoder()

In [77]:
items_encoded = encoder.fit_transform(items)

In [80]:
pd.DataFrame(items_encoded)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.0,48.0,16.0,9.0,4.0,5.0,47.0,0.0,0.0,8.0,1.0
1,0.0,48.0,16.0,10.0,3.0,9.0,47.0,0.0,0.0,8.0,1.0
2,0.0,48.0,17.0,11.0,1.0,9.0,47.0,0.0,0.0,8.0,1.0
3,1.0,90.0,16.0,9.0,4.0,5.0,12.0,1.0,0.0,43.0,14.0
4,1.0,90.0,16.0,10.0,3.0,9.0,12.0,1.0,0.0,43.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...
105537,47219.0,86.0,14.0,9.0,4.0,5.0,224.0,4.0,2.0,18.0,18.0
105538,47220.0,48.0,16.0,9.0,4.0,5.0,66.0,0.0,0.0,0.0,3.0
105539,47221.0,60.0,16.0,9.0,4.0,5.0,34.0,0.0,0.0,10.0,3.0
105540,47222.0,11.0,16.0,9.0,4.0,5.0,120.0,3.0,1.0,36.0,16.0


In [47]:
articles_df.prod_name.nunique()

45875

In [46]:
all_product = articles_df.product_code.unique().tolist()

time range from 2018-09-20 to 2020-09-20

#### Assign autoincrementing ids starting from 0 to both users and items

In [19]:
all_customer = customers_df['customer_id'].unique().tolist()
all_articles = articles_df['article_id'].unique().tolist()

all_customer2 = dict(list(enumerate(all_customer)))
all_articles2 = dict(list(enumerate(all_articles)))

customer_map = {u: uidx for uidx, u in all_customer2.items()}
articles_map = {i: iidx for iidx, i in all_articles2.items()}

In [23]:
transactions_df['user_id'] = transactions_df['customer_id'].map(customer_map)
transactions_df['item_id'] = transactions_df['article_id'].map(articles_map)

In [33]:
transactions_df.head(3)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_id,item_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2,2,40179
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2,2,10520
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2,7,6387


In [35]:
customers_df.fashion_news_frequency.unique()

array(['NONE', 'Regularly', nan, 'Monthly', 'None'], dtype=object)