In [1]:
from datetime import timedelta

import implicit
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from scipy.sparse import csr_matrix

from utils import train_test_split

In [2]:
df = pd.read_csv("./transactions_train.csv")
articles = pd.read_csv("./articles.csv")

In [3]:
df.t_dat = pd.to_datetime(df.t_dat)

### 최근 3개월로 한정

In [4]:
print(f"기존 데이터 수: {len(df):,}")
last_90_days = df.t_dat.max() - timedelta(days=90)
df = df[df.t_dat >= last_90_days]
print(f"최근 3개월 데이터 수: {len(df):,}")

기존 데이터 수: 31,788,324
최근 3개월 데이터 수: 3,904,391


In [5]:
train_df, test_df = train_test_split(df)

train: 2020-06-24 ~ 2020-09-14
test: 2020-09-15 ~ 2020-09-22


### Preprocess

In [6]:
%%time
transaction_matrix = train_df.groupby(["customer_id", "article_id"]).size().reset_index(name="buy_count")

CPU times: user 1.67 s, sys: 156 ms, total: 1.83 s
Wall time: 1.95 s


In [7]:
%%time
uid_to_idx = {uid: idx for (idx, uid) in enumerate(transaction_matrix.customer_id.unique().tolist())}
iid_to_idx = {iid: idx for (idx, iid) in enumerate(transaction_matrix.article_id.unique().tolist())}

idx_to_iid = {idx:iid for iid, idx in iid_to_idx.items()}
idx_to_uid = {idx:uid for uid, idx in uid_to_idx.items()}

CPU times: user 622 ms, sys: 8.28 ms, total: 630 ms
Wall time: 643 ms


In [8]:
%%time
row = transaction_matrix.customer_id.tolist()
col = transaction_matrix.article_id.tolist()
dat = transaction_matrix.buy_count.tolist()

## id convert
row = [uid_to_idx[r] for r in row]
col = [iid_to_idx[c] for c in col]

CPU times: user 593 ms, sys: 31.2 ms, total: 624 ms
Wall time: 638 ms


In [9]:
%%time
train_matrix = csr_matrix((dat, (row,col)), shape=(1 + np.max(row), 1 + np.max(col)))
print(train_matrix.shape)

(503219, 41256)
CPU times: user 1.42 s, sys: 28.6 ms, total: 1.44 s
Wall time: 1.51 s


In [10]:
train_matrix_csr = train_matrix.tocsr()

### ALS

In [11]:
model_als = implicit.als.AlternatingLeastSquares(factors=20, use_gpu=False)
model_als.fit(train_matrix_csr)



HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [12]:
als_reco = {} 
for uid in tqdm(train_df.customer_id.unique()):
    items, scores = model_als.recommend(uid_to_idx[uid], train_matrix[uid_to_idx[uid]], 12)
    als_reco[uid] = items

100%|██████████| 503219/503219 [03:26<00:00, 2440.74it/s]


### Evaluation

In [13]:
%%time
test_dic = {customer_id:set(group["article_id"]) for customer_id, group in tqdm(test_df.groupby("customer_id"))}

100%|██████████| 75481/75481 [00:03<00:00, 20052.68it/s]

CPU times: user 4.63 s, sys: 570 ms, total: 5.2 s
Wall time: 4.13 s





In [14]:
catch_count = []
for uid, reco in tqdm(als_reco.items()):
    ## 차주에 아예 구매가 없음
    if uid not in test_dic:
        continue
        
    truth = test_dic[uid]
    temp_reco = [idx_to_iid[idx] for idx in reco]
    catch_count_temp = len(set(temp_reco) & truth)
    catch_count.append(catch_count_temp)

100%|██████████| 503219/503219 [00:01<00:00, 383454.06it/s]


In [15]:
acc = sum(catch_count) / len(test_df)

In [16]:
## 전체로 돌렸을 때 ALS 정확도: 0.0079
print(f"ALS 정확도: {acc:.4f}")

ALS 정확도: 0.0080
