In [12]:
# Turn off Jedi in IPython's tab completer for this session.
%config Completer.use_jedi = False

## MovieLens data download
[ml-25m.zip](https://grouplens.org/datasets/movielens/)


In [2]:
import pandas as pd
# Read the ratings table from the local ./data directory into a DataFrame.
data = pd.read_csv('./data/ratings.csv')
# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [5]:
# Number of rating rows that were loaded.
len(data)

25000095

In [3]:
# Column dtypes and memory usage of the raw ratings frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [4]:
# dtype transform: keep only userId / movieId / rating and cast all three to str.
# The ids are written to plain-text key files further down and are queried as
# strings afterwards (e.g. topk_recommendation('1')), so they are stringified here.
data = data[['userId', 'movieId', 'rating']].astype(str)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 3 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   userId   object
 1   movieId  object
 2   rating   object
dtypes: object(3)
memory usage: 572.2+ MB


## buffalo 
- 공식 문서(<a href="https://buffalo-recsys.readthedocs.io/en/latest/">링크</a>)
- github(<a href="https://github.com/kakao/buffalo">링크</a>)

In [17]:
# buffalo library import
from buffalo.algo.als import ALS, inited_CUALS
from buffalo.algo.options import ALSOption
import buffalo.data
from buffalo.misc import aux
from buffalo.data.mm import MatrixMarketOptions
import numpy as np
from scipy.io import mmwrite
from scipy.io import mmread
from scipy.sparse import csr_matrix
import scipy.sparse as sp

In [7]:
inited_CUALS # True means GPU training is available

True

In [8]:
# Build the user x item matrix
def get_df_matrix_mappings(df, row_name, col_name):
    """Build forward and reverse index lookups for two DataFrame columns.

    The distinct values of each column are numbered 0..n-1 in the order
    returned by ``Series.unique()`` (order of first appearance).

    Args:
        df: DataFrame that contains both columns.
        row_name: Name of the column whose values are indexed as row ids.
        col_name: Name of the column whose values are indexed as column ids.

    Returns:
        Tuple ``(rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid)`` of dicts:
        value -> index and index -> value for the row column, then the same
        pair for the column column.
    """
    def _index_pair(column):
        # index -> value first, then invert it to get value -> index.
        idx_to_value = dict(enumerate(df[column].unique().tolist()))
        value_to_idx = {value: idx for idx, value in idx_to_value.items()}
        return value_to_idx, idx_to_value

    rid_to_idx, idx_to_rid = _index_pair(row_name)
    cid_to_idx, idx_to_cid = _index_pair(col_name)
    return rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid

def df_to_matrix(df, row_name, col_name):
    """Turn two id columns of a DataFrame into a sparse interaction matrix.

    Every DataFrame row contributes 1.0 to the cell addressed by its
    (row id, column id) pair; repeated pairs are summed when the COO matrix
    is converted to CSR.

    Args:
        df: DataFrame holding one interaction per row.
        row_name: Column whose distinct values become matrix rows.
        col_name: Column whose distinct values become matrix columns.

    Returns:
        Tuple ``(interactions, rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid)``
        where ``interactions`` is a float64 CSR matrix of shape
        ``(n_distinct_rows, n_distinct_cols)`` and the four dicts are the
        id <-> index lookups produced by ``get_df_matrix_mappings``.
    """
    rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid = get_df_matrix_mappings(df, row_name, col_name)

    # Series.map with a dict does the lookup without calling a Python function
    # per element, which matters on tens of millions of rows. Every value is a
    # key of its mapping by construction; the int64 cast fails loudly otherwise.
    row_idx = df[row_name].map(rid_to_idx).to_numpy(dtype=np.int64)
    col_idx = df[col_name].map(cid_to_idx).to_numpy(dtype=np.int64)
    values = np.ones(row_idx.shape[0])

    # Pass the shape explicitly so it always matches the mappings instead of
    # being inferred from the largest index (inference fails on empty input).
    shape = (len(rid_to_idx), len(cid_to_idx))
    interactions = sp.coo_matrix((values, (row_idx, col_idx)), shape=shape, dtype=np.float64)
    interactions = interactions.tocsr()

    return interactions, rid_to_idx, idx_to_rid, cid_to_idx, idx_to_cid

In [9]:
import os

# Build the sparse user x movie matrix together with the id <-> index lookups.
user_items, uid_to_idx, idx_to_uid, mid_to_idx, idx_to_mid = df_to_matrix(data, 'userId', 'movieId')

# The output directory has to exist before the file can be written; create it
# so the notebook also runs from a fresh checkout.
os.makedirs('./train', exist_ok=True)
mmwrite('./train/main.mtx', user_items)

In [10]:
# Original ids in the iteration order of the index -> id lookups.
uid = [user_id for _, user_id in idx_to_uid.items()]
iid = [movie_id for _, movie_id in idx_to_mid.items()]

In [11]:
# Write one id per line. These files are handed to buffalo through
# data_opt.input.uid / data_opt.input.iid in the cells below.
# The `with` block closes each file on exit, so no explicit close() is needed.
with open("./train/uid", "w") as uid_file:
    uid_file.writelines(f"{val}\n" for val in uid)
with open("./train/iid", "w") as iid_file:
    iid_file.writelines(f"{val}\n" for val in iid)

## Parameter optimization
- hyperopt 사용하여 최적 파라미터 서치

In [19]:
# ALS training options, starting from the library defaults.
opt = ALSOption().get_default_option()
opt.num_workers = 6
opt.num_iters = 20
opt.evaluation_period = 20
opt.evaluation_on_learning = True
opt.save_best = True
opt.accelerator = True # GPU option

# MatrixMarket data options pointing at the files written above.
data_opt = MatrixMarketOptions().get_default_option()
data_opt.input.main = './train/main.mtx'
data_opt.input.iid = './train/iid'
data_opt.input.uid = './train/uid'
# Was `data_opt.data.ath`: the misspelled attribute left the real `path`
# option at its default, so this location was never applied.
data_opt.data.path = './train/mm.h5py'
data_opt.data.validation.p = 0.1
data_opt.data.validation.max_samples = 5000

In [20]:
# Configure validation, tensorboard logging and the optimizer's search space.
opt.validation = aux.Option({'topk': 10})
opt.tensorboard = aux.Option({'root': './train/als-validation', 'name': 'als-new'})
opt.optimize = aux.Option(dict(
    loss='val_ndcg',
    max_trials=100,
    deployment=True,
    start_with_default_parameters=False,
    # Each entry is [sampler name, [label, low, high]].
    space=dict(
        d=['randint', ['d', 10, 128]],
        reg_u=['uniform', ['reg_u', 0.1, 1.0]],
        reg_i=['uniform', ['reg_i', 0.1, 1.0]],
        alpha=['randint', ['alpha', 1, 10]],
    ),
))

In [21]:
# Build the ALS model from the training options and the MatrixMarket data options.
als = ALS(opt, data_opt = data_opt)
als.initialize()

# Set the model file location before starting the search.
als.opt.model_path = './train/als-best-model.bin'
als.optimize() # run the search configured in opt.optimize
# Returns a dict with 'trials', 'best' and 'best_parameters' (see the cell output).
# NOTE(review): the best_parameters recorded in this output differ from the
# reg_u / reg_i shown when the saved model is loaded further down — the stored
# outputs look like they come from different runs; re-run top to bottom to confirm.
als.get_optimization_data()

[INFO    ] 2022-01-25 18:20:17 [mm.py:245] Create the database from matrix market file.
[INFO    ] 2022-01-25 18:20:18 [mm.py:258] Creating working data...
[PROGRESS] 100.00% 37.1/37.1secs 22,122,287.42it/s
[INFO    ] 2022-01-25 18:20:56 [mm.py:263] Building data part...
[INFO    ] 2022-01-25 18:20:56 [base.py:405] Building compressed triplets for rowwise...
[INFO    ] 2022-01-25 18:20:56 [base.py:406] Preprocessing...
[INFO    ] 2022-01-25 18:20:56 [base.py:409] In-memory Compressing ...
[INFO    ] 2022-01-25 18:20:59 [base.py:290] Load triplet files. Total job files: 13
[INFO    ] 2022-01-25 18:20:59 [base.py:439] Finished
[INFO    ] 2022-01-25 18:20:59 [base.py:405] Building compressed triplets for colwise...
[INFO    ] 2022-01-25 18:20:59 [base.py:406] Preprocessing...
[INFO    ] 2022-01-25 18:20:59 [base.py:409] In-memory Compressing ...
[INFO    ] 2022-01-25 18:21:02 [base.py:290] Load triplet files. Total job files: 13
[INFO    ] 2022-01-25 18:21:02 [base.py:439] Finished
[INFO 

{'trials': <hyperopt.base.Trials at 0x7fd0f36220f0>,
 'best': {'train_loss': 0.04939169654752978,
  'val_ndcg': 0.1234076712927393,
  'val_map': 0.09617349333762493,
  'val_accuracy': 0.20863258695958314,
  'val_auc': 0.6042390495244309,
  'val_rmse': 0.6066403147516816,
  'val_error': 0.5343958109974861,
  'loss': -0.1234076712927393,
  'status': 'ok'},
 'best_parameters': {'alpha': 1,
  'd': 80,
  'reg_i': 0.6986845087564665,
  'reg_u': 0.9846552410221874}}

In [22]:
# Fresh MatrixMarket data options for the final training run.
data_opt = MatrixMarketOptions().get_default_option()
data_opt.data.path = './train/mm.h5py'
data_opt.data.validation.max_samples = 10000
data_opt.data.validation.p = 0.1
data_opt.input.uid = './train/uid'
data_opt.input.iid = './train/iid'
data_opt.input.main = './train/main.mtx'

# NOTE: this rebinds `data` (until now the ratings DataFrame) to the buffalo
# data object; the training cell below relies on that name.
data = buffalo.data.load(data_opt)
data.create()

[INFO    ] 2022-01-26 08:42:49 [mm.py:245] Create the database from matrix market file.
[INFO    ] 2022-01-26 08:42:51 [mm.py:258] Creating working data...
[PROGRESS] 100.00% 39.2/39.2secs 20,977,265.04it/s
[INFO    ] 2022-01-26 08:43:31 [mm.py:263] Building data part...
[INFO    ] 2022-01-26 08:43:31 [base.py:405] Building compressed triplets for rowwise...
[INFO    ] 2022-01-26 08:43:31 [base.py:406] Preprocessing...
[INFO    ] 2022-01-26 08:43:31 [base.py:409] In-memory Compressing ...
[INFO    ] 2022-01-26 08:43:34 [base.py:290] Load triplet files. Total job files: 13
[INFO    ] 2022-01-26 08:43:34 [base.py:439] Finished
[INFO    ] 2022-01-26 08:43:34 [base.py:405] Building compressed triplets for colwise...
[INFO    ] 2022-01-26 08:43:34 [base.py:406] Preprocessing...
[INFO    ] 2022-01-26 08:43:34 [base.py:409] In-memory Compressing ...
[INFO    ] 2022-01-26 08:43:37 [base.py:290] Load triplet files. Total job files: 13
[INFO    ] 2022-01-26 08:43:37 [base.py:439] Finished
[INFO 

In [24]:
# Unbind the model used for the parameter search.
del als
# Create an ALS instance with default options and load the model file that was
# written to model_path during the search.
als_opt = ALS()
als_opt.load('./train/als-best-model.bin')
# Show the options stored in the loaded model.
als_opt.opt

[INFO    ] 2022-01-26 08:45:52 [als.py:57] ALS({
  "evaluation_on_learning": true,
  "compute_loss_on_training": true,
  "early_stopping_rounds": 0,
  "save_best": false,
  "evaluation_period": 1,
  "save_period": 10,
  "random_seed": 0,
  "validation": {},
  "adaptive_reg": false,
  "save_factors": false,
  "accelerator": false,
  "d": 20,
  "num_iters": 10,
  "num_workers": 1,
  "hyper_threads": 256,
  "num_cg_max_iters": 3,
  "reg_u": 0.1,
  "reg_i": 0.1,
  "alpha": 8,
  "optimizer": "manual_cg",
  "cg_tolerance": 1e-10,
  "eps": 1e-10,
  "model_path": "",
  "data_opt": {}
})


{'evaluation_on_learning': True,
 'compute_loss_on_training': True,
 'early_stopping_rounds': 0,
 'save_best': True,
 'evaluation_period': 20,
 'save_period': 10,
 'random_seed': 0,
 'validation': {'topk': 10},
 'adaptive_reg': False,
 'save_factors': False,
 'accelerator': True,
 'd': 80,
 'num_iters': 20,
 'num_workers': 6,
 'hyper_threads': 256,
 'num_cg_max_iters': 3,
 'reg_u': 0.76870304240029,
 'reg_i': 0.18059745401234717,
 'alpha': 1,
 'optimizer': 'manual_cg',
 'cg_tolerance': 1e-10,
 'eps': 1e-10,
 'model_path': './train/als-best-model.bin',
 'data_opt': {},
 'tensorboard': None,
 'optimize': {'loss': 'val_ndcg',
  'max_trials': 100,
  'deployment': True,
  'start_with_default_parameters': False,
  'space': {'d': ['randint', ['d', 10, 128]],
   'reg_u': ['uniform', ['reg_u', 0.1, 1.0]],
   'reg_i': ['uniform', ['reg_i', 0.1, 1.0]],
   'alpha': ['randint', ['alpha', 1, 10]]}}}

## model train

In [25]:
# Create a new ALS model from the loaded options and the prepared data object.
model = ALS(als_opt.opt, data= data)
model.initialize()
# The value displayed below the log output is what train() returns
# (train loss and validation metrics).
model.train()

[INFO    ] 2022-01-26 08:46:37 [als.py:57] ALS({
  "evaluation_on_learning": true,
  "compute_loss_on_training": true,
  "early_stopping_rounds": 0,
  "save_best": true,
  "evaluation_period": 20,
  "save_period": 10,
  "random_seed": 0,
  "validation": {
    "topk": 10
  },
  "adaptive_reg": false,
  "save_factors": false,
  "accelerator": true,
  "d": 80,
  "num_iters": 20,
  "num_workers": 6,
  "hyper_threads": 256,
  "num_cg_max_iters": 3,
  "reg_u": 0.76870304240029,
  "reg_i": 0.18059745401234717,
  "alpha": 1,
  "optimizer": "manual_cg",
  "cg_tolerance": 1e-10,
  "eps": 1e-10,
  "model_path": "./train/als-best-model.bin",
  "data_opt": {},
  "tensorboard": null,
  "optimize": {
    "loss": "val_ndcg",
    "max_trials": 100,
    "deployment": true,
    "start_with_default_parameters": false,
    "space": {
      "d": [
        "randint",
        [
          "d",
          10,
          128
        ]
      ],
      "reg_u": [
        "uniform",
        [
          "reg_u",
      

{'train_loss': 0.049260287558788134,
 'val_ndcg': 0.12672110195986627,
 'val_map': 0.09802290104881599,
 'val_accuracy': 0.21319396051103368,
 'val_auc': 0.6065194910348387,
 'val_rmse': 0.6009071995885302,
 'val_error': 0.5284293065428733}

## Predict next movie (recommendation)

In [27]:
# Top 5 recommended movie ids for userId '1' (ids are strings because the
# columns were cast with astype(str) before the key files were written).
model.topk_recommendation('1',topk=5)

['4973', '6016', '7361', '306', '307']

In [28]:
# The 5 movies most similar to movieId '4973', returned as (movieId, score) pairs.
model.most_similar('4973',topk=5)

[('7361', 0.79582864),
 ('4226', 0.75482124),
 ('6711', 0.7251757),
 ('6016', 0.7054305),
 ('5618', 0.6970457)]