In [1]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

In [2]:
# Reproducibility: seed both random number generators (`random` and
# `np.random`) that the data-generation cells of this notebook draw from.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# The two item categories of the synthetic catalogue.
code_1 = "formal"
code_2 = "casual"
codes = [code_1, code_2]

# Each category is centred at +/-1 on the first axis of a 2-D latent space.
# np.asarray(..., dtype=float) replaces np.asfarray, which was removed in
# NumPy 2.0; both yield float64 arrays.
CATEGORY_CENTERS = {
    code_1: np.asarray([1.0, 0.0], dtype=float),
    code_2: np.asarray([-1.0, 0.0], dtype=float),
}

# Price varies along the second latent axis, orthogonal to the category axis.
PRICE_AXIS = np.asarray([0.0, 1.0], dtype=float)
PRICE_MIN = 5
PRICE_MAX = 1000
# Geometric mean of 15 and 1000; used as the multiplicative centre of prices.
# NOTE(review): this hard-codes 15 while PRICE_MIN is 5, and PRICE_MIN /
# PRICE_MAX are not referenced by the visible cells -- confirm which lower
# bound is intended before changing it (doing so would shift every price).
PRICE_LOG_MEAN = (15 * 1000) ** .5

In [3]:
# Sizes and noise level of the synthetic data set.
N_ITEMS = 300
N_USERS = 500
N_BUY_AVG = 10
NOISE_SCALE = 0.2

# Generate N_ITEMS items. Each item gets a random category, a price, and a
# 2-D latent vector = category centre + price offset + Gaussian noise.
item_data = []
item_vectors = []
for id_ in range(1, N_ITEMS + 1):
    code = random.choice(codes)
    center_1 = CATEGORY_CENTERS[code]
    # Position along the price axis, uniform in [-1, 1].
    price_component = random.uniform(-1, 1)
    center = PRICE_AXIS * price_component + center_1
    # Price is log-linear in the price component: PRICE_LOG_MEAN * e^component,
    # truncated to an integer.
    price = int(np.exp(price_component) * PRICE_LOG_MEAN)
    item_data.append(dict(item_id=id_, category=code, price=price))

    item_vectors.append(
        center + NOISE_SCALE * np.random.randn(2)
    )

item_df = pd.DataFrame(item_data)
# One row per item, in the same order as item_df. np.asarray(..., dtype=float)
# replaces np.asfarray, which was removed in NumPy 2.0.
item_vectors = np.asarray(item_vectors, dtype=float)

In [4]:
# Simulate a purchase log: every user has a random 2-D taste vector, and the
# items they interact with are sampled from a softmax over the dot products
# between that vector and the item vectors.
item_ids = item_df['item_id'].to_numpy()
log_df = []
enhancement = 2  # multiplier applied to the scores before exponentiation
for uid in range(1, N_USERS + 1):
    # At least one draw per user: 1 + Poisson(N_BUY_AVG).
    n_buy = 1 + np.random.poisson(N_BUY_AVG)
    uvec = np.random.randn(2)
    score = item_vectors.dot(uvec)
    p = np.exp(enhancement * score)
    p = p / p.sum()
    # Items drawn more than once still produce a single log row, because only
    # "count > 0" is kept.
    interactions = item_ids[np.random.multinomial(n_buy, p) > 0]
    log_df.extend(dict(user_id=uid, item_id=iid) for iid in interactions)

In [5]:
# Turn the accumulated list of {user_id, item_id} records into a DataFrame.
# Note that this rebinds `log_df` from a list to a DataFrame.
log_df = pd.DataFrame(log_df)

In [6]:
from irspack.utils import df_to_sparse
from irspack import rowwise_train_test_split, autopilot, Evaluator

In [7]:
# Build the interaction matrix X from the purchase log. `uids` and `iids` are
# returned alongside it; later cells use them to map positions in X back to
# ids (presumably rows -> user ids, columns -> item ids -- confirm against the
# irspack documentation).
X, uids, iids = df_to_sparse(log_df, user_colname="user_id", item_colname="item_id")

In [8]:
# Split the interactions into a training matrix and a held-out test matrix
# (test_ratio=.2), then build an evaluator on the held-out part.
# NOTE(review): no seed is passed here, so the split changes between runs --
# confirm whether the installed irspack version accepts a seed argument.
X_train, X_test = rowwise_train_test_split(X, ceil_n_test=True, test_ratio=.2)
evaluator = Evaluator(X_test)

In [9]:
# Run irspack's automatic algorithm / hyperparameter search for 40 trials on
# the training matrix, scored by `evaluator`. The returned recommender class
# and parameter dict are used further down to refit on the full matrix.
rec_class, best_parameter, log = autopilot(X_train, evaluator, n_trials=40)

[32m[IRSPACK:I 2021-08-08 02:16:16,545][0m [34mTrying the following algorithms: ['RP3betaOptimizer', 'IALSOptimizer', 'DenseSLIMOptimizer', 'AsymmetricCosineKNNOptimizer', 'SLIMOptimizer'][0m
[32m[I 2021-08-08 02:16:16,690][0m A new study created in RDB with name: autopilot-9c8b3fcf-f7ee-11eb-ab96-04d4c4542edc[0m
[32m[IRSPACK:I 2021-08-08 02:16:16,780][0m [34mTrial 0:[0m
[32m[IRSPACK:I 2021-08-08 02:16:16,782][0m [34mparameter = {'beta': 0.0001542763011327778, 'normalize_weight': True, 'top_k': 130}[0m
[32m[IRSPACK:I 2021-08-08 02:16:16,797][0m [34mConfig 0 obtained the following scores: {'appeared_item': 220.0, 'entropy': 4.657871887340325, 'gini_index': 0.7380893333333334, 'hit': 0.416, 'map': 0.09376296296296298, 'n_items': 300.0, 'ndcg': 0.14946847642761668, 'precision': 0.0512, 'recall': 0.21966666666666665, 'total_user': 500.0, 'valid_user': 500.0} within 0.042337 seconds.[0m
[32m[I 2021-08-08 02:16:16,828][0m Trial 0 finished with value: -0.14946847642761668

[32m[IRSPACK:I 2021-08-08 02:16:17,631][0m [34mConfig 4 obtained the following scores: {'appeared_item': 297.0, 'entropy': 5.521184774258946, 'gini_index': 0.324588, 'hit': 0.102, 'map': 0.014035383597883599, 'n_items': 300.0, 'ndcg': 0.02699121336951469, 'precision': 0.010600000000000002, 'recall': 0.043, 'total_user': 500.0, 'valid_user': 500.0} within 0.328474 seconds.[0m
[32m[I 2021-08-08 02:16:17,672][0m Trial 4 finished with value: -0.02699121336951469 and parameters: {'optimizer_name': 'IALSOptimizer', 'IALSOptimizer.alpha': 17.926357574329035, 'IALSOptimizer.reg': 4.927843612403619e-09, 'IALSOptimizer.n_components': 204}. Best is trial 0 with value: -0.14946847642761668.[0m
[32m[IRSPACK:I 2021-08-08 02:16:17,771][0m [34mTrial 5:[0m
[32m[IRSPACK:I 2021-08-08 02:16:17,773][0m [34mparameter = {'shrinkage': 644.1844855581431, 'feature_weighting': 'BM_25', 'alpha': 0.274622045445714, 'top_k': 964}[0m
[32m[IRSPACK:I 2021-08-08 02:16:17,788][0m [34mConfig 5 obtained 

[32m[IRSPACK:I 2021-08-08 02:16:18,598][0m [34mConfig 9 obtained the following scores: {'appeared_item': 256.0, 'entropy': 5.255965018247246, 'gini_index': 0.5031386666666666, 'hit': 0.334, 'map': 0.07094563492063491, 'n_items': 300.0, 'ndcg': 0.11386526237644679, 'precision': 0.0386, 'recall': 0.1658333333333333, 'total_user': 500.0, 'valid_user': 500.0} within 0.274684 seconds.[0m
[32m[I 2021-08-08 02:16:18,633][0m Trial 9 finished with value: -0.11386526237644679 and parameters: {'optimizer_name': 'IALSOptimizer', 'IALSOptimizer.alpha': 1.8043014887278936, 'IALSOptimizer.reg': 9.982834578521491e-05, 'IALSOptimizer.n_components': 27}. Best is trial 0 with value: -0.14946847642761668.[0m
[32m[IRSPACK:I 2021-08-08 02:16:18,719][0m [34mTrial 10:[0m
[32m[IRSPACK:I 2021-08-08 02:16:18,720][0m [34mparameter = {'reg': 19.17108949777185}[0m
[32m[IRSPACK:I 2021-08-08 02:16:18,910][0m [34mConfig 10 obtained the following scores: {'appeared_item': 276.0, 'entropy': 5.161240526

[32m[I 2021-08-08 02:16:23,893][0m Trial 29 pruned. [0m
[32m[IRSPACK:I 2021-08-08 02:16:23,979][0m [34mTrial 30:[0m
[32m[IRSPACK:I 2021-08-08 02:16:23,981][0m [34mparameter = {'reg': 48.98215262507169}[0m
[32m[IRSPACK:I 2021-08-08 02:16:24,162][0m [34mConfig 30 obtained the following scores: {'appeared_item': 262.0, 'entropy': 4.975080262187239, 'gini_index': 0.6392213333333333, 'hit': 0.396, 'map': 0.08833300264550263, 'n_items': 300.0, 'ndcg': 0.14104245524403186, 'precision': 0.04819999999999999, 'recall': 0.20766666666666664, 'total_user': 500.0, 'valid_user': 500.0} within 0.196244 seconds.[0m
[32m[I 2021-08-08 02:16:24,193][0m Trial 30 finished with value: -0.14104245524403186 and parameters: {'optimizer_name': 'DenseSLIMOptimizer', 'DenseSLIMOptimizer.reg': 48.98215262507169}. Best is trial 23 with value: -0.15352854786722003.[0m
[32m[IRSPACK:I 2021-08-08 02:16:24,283][0m [34mTrial 31:[0m
[32m[IRSPACK:I 2021-08-08 02:16:24,285][0m [34mparameter = {'reg':

In [32]:
# Refit the configuration selected by autopilot on the full interaction
# matrix X (train + test).
rec = rec_class(X, **best_parameter).learn()

# Number of recommendations shown for the sampled user.
N_RECOMMEND = 5

# Pick one random row of X and score all items for it.
# NOTE(review): presumably get_score_cold_user_remove_seen masks the items
# already present in the row -- confirm against the irspack documentation.
uindex = random.randint(0, len(uids) - 1)
score_ = rec.get_score_cold_user_remove_seen(X[uindex])[0]
# Column positions of the N_RECOMMEND highest scores, best first.
c = np.argsort(score_)[::-1][:N_RECOMMEND]

# Item ids at the nonzero column positions of this row.
iids_seen = iids[X[uindex].nonzero()[1]]

# Index the item table by id once, then look up both id lists against it.
item_info = item_df.set_index('item_id')
print('seen')
display(item_info.reindex(iids_seen))
print('rec')
display(item_info.reindex(iids[c]))

seen


Unnamed: 0,category,price
5,casual,106
23,casual,227
60,casual,310
77,casual,240
113,casual,183
150,casual,136
161,casual,219
289,casual,332


rec


Unnamed: 0,category,price
286,casual,270
169,casual,238
174,casual,311
137,casual,271
279,casual,202


In [36]:
# Persist the simulated purchase log as CSV in the current working directory,
# without writing the DataFrame index.
log_df.to_csv('purchase_log.csv', index=False)

In [40]:
# Persist the item table (item_id, category, price) as a pickle file in the
# current working directory.
item_df.to_pickle('item_info.pkl')