In [None]:
!pip install --upgrade implicit

In [1]:
import os; os.environ['OPENBLAS_NUM_THREADS']='10'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import ParameterGrid

In [2]:
base_path = 'C://Users//darli//Desktop//527//group project//h-and-m-personalized-fashion-recommendations//'
csv_train = f'{base_path}transactions_train.csv'
csv_sub = f'{base_path}sample_submission.csv'
csv_users = f'{base_path}customers.csv'
csv_items = f'{base_path}articles.csv'

df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
df_sub = pd.read_csv(csv_sub)
dfu = pd.read_csv(csv_users)
dfi = pd.read_csv(csv_items, dtype={'article_id': str})

In [3]:
print(df['t_dat'].shape)
print(df['t_dat'].max())
print(df['t_dat'].min())
df1 = df[df['sales_channel_id'] ==1]
print(df1.shape)

(31788324,)
2020-09-22 00:00:00
2018-09-20 00:00:00
(9408462, 5)


In [4]:
print(df.isnull().sum())

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
dtype: int64


In [5]:
# use the data after 2020-08-21 to find the best parameters (factors, regularization and interations)
df = df[df['t_dat'] > '2020-08-21']
df.shape

(1190911, 5)

In [6]:
print(df.columns)
print(dfu.columns)

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id'], dtype='object')
Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'postal_code'],
      dtype='object')


In [7]:
ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

df['user_id'] = df['customer_id'].map(user_map)
df['item_id'] = df['article_id'].map(item_map)

del dfu, dfi

In [8]:
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_id,item_id
30597413,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597414,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597415,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,923460001,0.042356,2,38,104483
30597416,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,934380001,0.050831,2,38,105214
30597417,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688001,0.033881,2,38,103593


In [9]:
row = df['user_id'].values
col = df['item_id'].values
data = np.ones(df.shape[0])  #if purchase, set as one
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train

<1371980x105542 sparse matrix of type '<class 'numpy.float64'>'
	with 1190911 stored elements in COOrdinate format>

In [10]:
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)



  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
def to_user_item_coo(df):
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo


def split_data(df, validation_days=5):
    validation_cut = df['t_dat'].max() - pd.Timedelta(validation_days)

    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val

def get_val_matrices(df, validation_days=7):
    df_train, df_val = split_data(df, validation_days=validation_days)
    coo_train = to_user_item_coo(df_train)
    coo_val = to_user_item_coo(df_val)

    csr_train = coo_train.tocsr()
    csr_val = coo_val.tocsr()
    
    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val          }


In [12]:
matrices = get_val_matrices(df)
coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
best_map12 = 0

grid = ParameterGrid({
    "factors": [50, 100, 200, 500, 1000],
    'regularization':[0.01, 0.05, 0.1, 1],
    'iterations':[2, 3, 5, 12, 15, 20]
})

for params in grid:
    model = implicit.als.AlternatingLeastSquares(**params)
    model.fit(coo_train,show_progress=False)
    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12,show_progress=False)
    print(f"Factors: {params['factors']} - Iterations: {params['iterations']} - Regularization: {params['regularization']} ==> MAP@12: {map12}")
    if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': params['factors'], 'iterations': params['iterations'], 'regularization': params['regularization']}
                print(f"The best MAP@12 is unpdated. : {best_params}")

Factors: 50 - Iterations: 2 - Regularization: 0.01 ==> MAP@12: 0.0032847028329455017
The best MAP@12 is unpdated. : {'factors': 50, 'iterations': 2, 'regularization': 0.01}
Factors: 50 - Iterations: 2 - Regularization: 0.05 ==> MAP@12: 0.0031477183326164286
Factors: 50 - Iterations: 2 - Regularization: 0.1 ==> MAP@12: 0.0035058996230396358
The best MAP@12 is unpdated. : {'factors': 50, 'iterations': 2, 'regularization': 0.1}
Factors: 50 - Iterations: 2 - Regularization: 1 ==> MAP@12: 0.0033902043415958696
Factors: 50 - Iterations: 3 - Regularization: 0.01 ==> MAP@12: 0.003909433692749615
The best MAP@12 is unpdated. : {'factors': 50, 'iterations': 3, 'regularization': 0.01}
Factors: 50 - Iterations: 3 - Regularization: 0.05 ==> MAP@12: 0.004435833714978221
The best MAP@12 is unpdated. : {'factors': 50, 'iterations': 3, 'regularization': 0.05}
Factors: 50 - Iterations: 3 - Regularization: 0.1 ==> MAP@12: 0.004534878732087537
The best MAP@12 is unpdated. : {'factors': 50, 'iterations': 3

Factors: 500 - Iterations: 5 - Regularization: 0.05 ==> MAP@12: 0.0064433531433832295
Factors: 500 - Iterations: 5 - Regularization: 0.1 ==> MAP@12: 0.006326732010854404
Factors: 500 - Iterations: 5 - Regularization: 1 ==> MAP@12: 0.006781994974971307
Factors: 500 - Iterations: 12 - Regularization: 0.01 ==> MAP@12: 0.006027062144123001
Factors: 500 - Iterations: 12 - Regularization: 0.05 ==> MAP@12: 0.005956772030569571
Factors: 500 - Iterations: 12 - Regularization: 0.1 ==> MAP@12: 0.0058292315564137205
Factors: 500 - Iterations: 12 - Regularization: 1 ==> MAP@12: 0.005918812851312583
Factors: 500 - Iterations: 15 - Regularization: 0.01 ==> MAP@12: 0.005811170069991786
Factors: 500 - Iterations: 15 - Regularization: 0.05 ==> MAP@12: 0.0058581136365285875
Factors: 500 - Iterations: 15 - Regularization: 0.1 ==> MAP@12: 0.005822451450210537
Factors: 500 - Iterations: 15 - Regularization: 1 ==> MAP@12: 0.005853008454828772
Factors: 500 - Iterations: 20 - Regularization: 0.01 ==> MAP@12: 0

In [13]:
del matrices
best_params

{'factors': 1000, 'iterations': 2, 'regularization': 1}

In [14]:
base_path = 'C://Users//darli//Desktop//527//group project//h-and-m-personalized-fashion-recommendations//'
csv_train = f'{base_path}transactions_train.csv'
csv_sub = f'{base_path}sample_submission.csv'
csv_users = f'{base_path}customers.csv'
csv_items = f'{base_path}articles.csv'

df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
df_sub = pd.read_csv(csv_sub)
dfu = pd.read_csv(csv_users)
dfi = pd.read_csv(csv_items, dtype={'article_id': str})

In [15]:
# use the data after 2020-08-21 to find the best parameters (factors, regularization and interations)
df = df[df['t_dat'] > '2020-08-21']
df.shape

(1190911, 5)

In [16]:
ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()

user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}
# print('user_id 1:\n',df['customer_id'])

df['user_id'] = df['customer_id'].map(user_map)
# print('user_id 2:\n',df['user_id'])
df['item_id'] = df['article_id'].map(item_map)

del dfu, dfi

In [17]:
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()

model = implicit.als.AlternatingLeastSquares(factors=1000, 
                                            iterations=2, 
                                            regularization=1, 
                                            random_state=42)
model.fit(coo_train, show_progress=True)

  0%|          | 0/2 [00:00<?, ?it/s]

In [18]:
preds = []
batch_size = 2000
to_generate = np.arange(len(ALL_USERS))
for startidx in range(0, len(to_generate), batch_size):
    batch = to_generate[startidx : startidx + batch_size]
    ids, scores = model.recommend(batch, csr_train[batch], N=12, filter_already_liked_items=False)
    for i, userid in enumerate(batch):
        customer_id = user_ids[userid]
        user_items = ids[i]
        article_ids = [item_ids[item_id] for item_id in user_items]
        preds.append((customer_id, ' '.join(article_ids)))

df_preds = pd.DataFrame(preds, columns=['customer_id', 'prediction'])
df_preds.to_csv('submission.csv', index=False)
    
display(df_preds.head())
print(df_preds.shape)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601043 0858856005 0779781015 0716670009 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0112679048 0111609001 0111593001 0111586001 01...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0794321011 0805000001 0568601043 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0112679048 0111609001 0111593001 0111586001 01...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0112679048 0111609001 0111593001 0111586001 01...


(1371980, 2)
