In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd '/content/drive/MyDrive/Colab Notebooks/hnm'

/content/drive/MyDrive/Colab Notebooks/hnm


In [None]:
!ls

articles.csv   h-and-m-personalized-fashion-recommendations.zip  sample_submission.csv
customers.csv  images						 transactions_train.csv


In [None]:
!pip install --upgrade implicit
import os; os.environ['OPENBLAS_NUM_THREADS']='1'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k



# 데이터

In [None]:
csv_train = f'transactions_train.csv'
csv_users = f'customers.csv'
csv_items = f'articles.csv'

df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
dfu = pd.read_csv(csv_users)
dfi = pd.read_csv(csv_items, dtype={'article_id': str})

In [None]:
# explore the data shape (rows, columns) and the name of its coloumns
print("df shape:",df.shape)
print("df columns name:", df.columns)

df shape: (31788324, 5)
df.columns name: Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id'], dtype='object')


In [None]:
df.shape

(31788324, 5)

In [None]:
# 4주 데이터만 사용 (2020-08-22 ~ 2020-09-22)
df = df[df['t_dat'] > '2020-08-21']
df.shape

(1190911, 5)

In [None]:
df['t_dat'].max()

Timestamp('2020-09-22 00:00:00')

In [None]:
ALL_USERS = dfu['customer_id'].unique().tolist() #고유한 사용자ID 리스트
ALL_ITEMS = dfi['article_id'].unique().tolist() # 고유한 아이템ID 리스트

# 각 사용자와 아이템에 순차적으로 인덱스를 부여하고, 딕셔너리로 매핑
user_ids = dict(list(enumerate(ALL_USERS)))
item_ids = dict(list(enumerate(ALL_ITEMS)))

# 인덱스를 key, 사용자/아이템 ID를 value로 갖는 딕셔너리 생성
user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

# 데이터프레임의 'customer_id'/'article_id' 열을 각각 사용자/아이템 ID로 매핑한 'user_id'/'item_id' column 생성
df['user_id'] = df['customer_id'].map(user_map)
df['item_id'] = df['article_id'].map(item_map)

# del dfu, dfi

In [None]:
df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_id,item_id
30597413,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597414,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597415,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,923460001,0.042356,2,38,104483
30597416,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,934380001,0.050831,2,38,105214
30597417,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688001,0.033881,2,38,103593


In [None]:
# 'user_id' 및 'item_id' 열을 사용해 희소행렬 생성
row = df['user_id'].values
col = df['item_id'].values
data = np.ones(df.shape[0])
coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))

In [None]:
def to_user_item_coo(df): # 데이터프레임을 coo matrix로 변경
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo


def split_data(df, test_days=7): # 데이터프레임을 train, test로 나눔 (3주 training, 1주 test)
    test_cut = df['t_dat'].max() - pd.Timedelta(test_days)

    df_train = df[df['t_dat'] < test_cut]
    df_test = df[df['t_dat'] >= test_cut]
    return df_train, df_test

df_train, df_test = split_data(df, test_days=7)
coo_train = to_user_item_coo(df_train)
coo_test = to_user_item_coo(df_test)

csr_train = coo_train.tocsr()
csr_test = coo_test.tocsr()

In [None]:
df_train.shape

(1158045, 7)

In [36]:
df_train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_id,item_id
30597413,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597414,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688003,0.033881,2,38,103595
30597415,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,923460001,0.042356,2,38,104483
30597416,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,934380001,0.050831,2,38,105214
30597417,2020-08-22,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,913688001,0.033881,2,38,103593


In [None]:
df_test.shape

(32866, 7)

In [None]:
# check if model is ok
%%time
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)



  0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 2.9 s, sys: 89.4 ms, total: 2.99 s
Wall time: 3.64 s


# Training

In [None]:
# coo = to_user_item_coo(df)
# coo_train, coo_test = implicit.evaluation.train_test_split(coo, train_percentage=0.8, random_state=1)
# csr_train = coo_train.tocsr()
# csr_test = coo_test.tocsr()

In [None]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization,
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)
    return model

In [None]:
params = {'factors': 500, 'iterations': 3, 'regularization': 0.01}
params

{'factors': 500, 'iterations': 3, 'regularization': 0.01}

In [None]:
model = train(coo_train, **params)



  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
coo_train.T.tocsr().shape

(105542, 1371980)

In [None]:
auc = implicit.evaluation.AUC_at_k(model, csr_train, csr_test, K=10, show_progress=True, num_threads=1)

  0%|          | 0/10528 [00:00<?, ?it/s]

In [None]:
auc


0.5066205385293692

In [None]:
map = implicit.evaluation.mean_average_precision_at_k(model, csr_train, csr_test, K=10, show_progress=True, num_threads=1)

  0%|          | 0/10528 [00:00<?, ?it/s]

In [None]:
map

0.0069233291720943784