In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd '/content/drive/MyDrive/Colab Notebooks/hnm'

/content/drive/MyDrive/Colab Notebooks/hnm


In [3]:
!ls

articles.csv   h-and-m-personalized-fashion-recommendations.zip  sample_submission.csv
customers.csv  images						 transactions_train.csv


In [4]:
!pip install lightfm



In [36]:
import os
import cv2
import tqdm
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
%matplotlib inline
SEED = 42
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)

In [37]:
K = 10 # 각 사용자에 대한 추천 아이템의 수
EPOCHS = 50
LEARNING_RATE = 0.01
NO_COMPONENTS = 20 # latent factors의 수
NO_THREADS = 32 # 모델을 학습할 때 사용되는 스레드의 수

# regularization parameters
ITEM_ALPHA = 0.01
USER_ALPHA = 0.01

# 데이터

In [38]:
dfu = pd.read_csv("customers.csv")
dfi = pd.read_csv("articles.csv", dtype={'article_id': str})
df = pd.read_csv('transactions_train.csv',  dtype={'article_id': str}, parse_dates=['t_dat'])

In [39]:
df.shape

(31788324, 5)

In [40]:
# 4주 간의 데이터만 사용 (2020-08-22 ~ 2020-09-22)
df = df[df['t_dat'] > '2020-08-21']
df.shape

(1190911, 5)

In [41]:
df['t_dat'].max()

Timestamp('2020-09-22 00:00:00')

In [42]:
dataset = Dataset()
dataset.fit(users=dfu['customer_id'],
            items=dfi['article_id'])

num_users, num_topics = dataset.interactions_shape()
print(f'Number of users: {num_users}, Number of topics: {num_topics}.')

Number of users: 1371980, Number of topics: 105542.


In [44]:
def split_data(df, test_days=7): # 데이터프레임을 train, test로 나눔 (3주 training, 1주 test)
    test_cut = df['t_dat'].max() - pd.Timedelta(test_days)

    df_train = df[df['t_dat'] < test_cut]
    df_test = df[df['t_dat'] >= test_cut]
    return df_train, df_test

df_train, df_test = split_data(df, test_days=7)

In [45]:
df_train.shape

(1158045, 5)

In [46]:
df_test.shape

(32866, 5)

In [47]:
(interactions, weights) = dataset.build_interactions(df_train.iloc[:, 1:3].values)
(test_interactions, test_weights) = dataset.build_interactions(df_test.iloc[:, 1:3].values)

print(interactions.shape, test_interactions.shape) #두 행렬의 크기가 동일

(1371980, 105542) (1371980, 105542)


# LightFM model

In [49]:
model = LightFM(loss='warp', no_components=NO_COMPONENTS, k=K,
                 learning_rate=LEARNING_RATE,
                 random_state=np.random.RandomState(SEED))
model.fit(interactions=interactions, epochs=EPOCHS, verbose=1)

Epoch: 100%|██████████| 50/50 [03:36<00:00,  4.33s/it]


<lightfm.lightfm.LightFM at 0x7ce3df9dbbb0>

In [50]:
map = precision_at_k(model, test_interactions, k=K).mean()

In [51]:
map

0.0058415653

In [52]:
auc = auc_score(model, test_interactions, num_threads=NO_THREADS).mean()

In [53]:
auc

0.966731