In [None]:
# !pip install implicit

In [1]:
import pandas as pd
import tqdm
import implicit
from implicit.evaluation import mean_average_precision_at_k

In [127]:
train = pd.read_parquet('processed_train.parquet')
val = pd.read_parquet('processed_val.parquet')

In [128]:
val.head()

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-02,1659470229,100667,ae863aa1dee04c2e86e5b6c1edaacdbf,,show_vacancy
1,2022-09-27,1664281122,237747,524d61cb9f9d4e0cb55475507f68c10c,09bd0baa7f9040de91eddc5f7d25fe10,show_vacancy
2,2022-09-03,1662226287,200510,5ce0081b0a0b4181b901fb6742f8995b,4241bf4b2a4b4f58957c5b0625da0bad,show_vacancy
3,2022-08-15,1660584754,259855,53966f5e759c45fbaa3efdf86080eb38,,show_vacancy
4,2022-09-30,1664541111,258065,9cdbfdcb9ac54958bf38f72515c739d8,,show_vacancy


In [129]:
# найдем топ-5 вакансий (бейзлайн)
reactions = {
    'show_vacancy': 0, 
    'preview_click_vacancy': 0, 
    'click_response': 1,
    'click_contacts': 1, 
    'preview_click_response': 1, 
    'click_favorite': 0,
    'preview_click_favorite': 0, 
    'preview_click_contacts': 1, 
    'click_phone': 1,
    'preview_click_phone': 1
}

cookie_event_df = train[['vacancy_id_', 'event_type']].copy()
cookie_event_df['action'] = cookie_event_df['event_type'].apply(lambda x: reactions[x])


# вакансии в порядке популярности по целевым действиям
sorted_vacancy = cookie_event_df[cookie_event_df['action'] == 1]['vacancy_id_'].value_counts()
top_5 = list(sorted_vacancy.iloc[:5].index)
top_5

[116823, 182870, 207423, 198114, 174953]

In [130]:
train['action'] = train['event_type'].apply(lambda x: reactions[x])
val['action'] = val['event_type'].apply(lambda x: reactions[x])


In [131]:
train['event_type'] = train['action'].astype('uint8')
train['vacancy_id_'] = train['vacancy_id_'].astype('category')
train['cookie_id'] = train['cookie_id'].astype('category')

val['event_type'] = val['action'].astype('uint8')
val['vacancy_id_'] = val['vacancy_id_'].astype('category')
val['cookie_id'] = val['cookie_id'].astype('category')

In [132]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9779404 entries, 0 to 9779403
Data columns (total 7 columns):
 #   Column           Dtype   
---  ------           -----   
 0   event_date       object  
 1   event_timestamp  int64   
 2   vacancy_id_      category
 3   cookie_id        category
 4   user_id          object  
 5   event_type       uint8   
 6   action           int64   
dtypes: category(2), int64(2), object(2), uint8(1)
memory usage: 398.2+ MB


In [8]:
from scipy.sparse import csr_matrix

In [42]:
# sparse_item_user = csr_matrix((train['action'].astype(float), 
#                                (train['vacancy_id_'].cat.codes, 
#                                 train['cookie_id'].cat.codes)))

sparse_user_item = csr_matrix((train['action'].astype(float), 
                               (train['cookie_id'].cat.codes, 
                                train['vacancy_id_'].cat.codes)))

# val_sparse_item_user = csr_matrix((val['action'].astype(float), 
#                                (val['vacancy_id_'].cat.codes, 
#                                 val['cookie_id'].cat.codes)))

val_sparse_user_item = csr_matrix((val['action'].astype(float), 
                               (val['cookie_id'].cat.codes, 
                                val['vacancy_id_'].cat.codes)))

In [84]:
# Словарь, в котором cookie_id ключ, а его категория - значение
cookie_cat_dict = dict(zip( train['cookie_id'], train['cookie_id'].cat.codes ))
cat_cookie_dict = dict(zip( train['cookie_id'].cat.codes, train['cookie_id'] ))
vacancy_cat_dict = dict(zip( train['vacancy_id_'].cat.codes, train['vacancy_id_'] ))

In [66]:
vacancy_cat_dict

{38530: 139779,
 57494: 159237,
 86993: 189558,
 81402: 183798,
 113752: 217242,
 67634: 169652,
 143247: 248153,
 36506: 137696,
 83373: 185819,
 89889: 192562,
 15871: 116402,
 153157: 258369,
 19306: 119913,
 140806: 245634,
 145969: 250946,
 123643: 227566,
 57899: 159647,
 86415: 188962,
 106207: 209422,
 19248: 119855,
 55506: 157197,
 102237: 205345,
 101856: 204957,
 15421: 115936,
 45840: 147302,
 115384: 218939,
 124616: 228577,
 34940: 136075,
 76495: 178754,
 73007: 175173,
 138955: 243725,
 95306: 198181,
 10078: 110421,
 146646: 251651,
 154894: 260154,
 145366: 250327,
 34918: 136053,
 51879: 153475,
 141793: 246652,
 37244: 138463,
 52978: 154605,
 10337: 110687,
 114298: 217807,
 147900: 252945,
 27194: 128093,
 87641: 190219,
 34728: 135856,
 48937: 150462,
 64655: 166590,
 8620: 108924,
 145714: 250685,
 154352: 259594,
 98499: 201477,
 62741: 164602,
 138290: 243039,
 105436: 208619,
 4353: 104502,
 106028: 209234,
 99598: 202630,
 7870: 108154,
 77088: 179362,
 715

In [13]:
sparse_user_item.shape

(330179, 154908)

In [14]:
len(cookie_cat_dict)

330179

In [152]:
#Building the model
model = implicit.als.AlternatingLeastSquares(factors=20, 
                                             regularization=0.05, 
                                             iterations=20,
                                             alpha=40
                                            )
# alpha_val = 40
# data_conf = (sparse_user_item * alpha_val).astype('double')
model.fit(sparse_user_item)

  0%|          | 0/20 [00:00<?, ?it/s]

In [50]:
# map12 = mean_average_precision_at_k(model, sparse_user_item, val_sparse_user_item, K=5, show_progress=True)

  0%|          | 0/321347 [00:00<?, ?it/s]

In [51]:
map12

2.1099552128315424e-05

In [52]:
cookie_id = 154909
model.recommend(cookie_id, 
              sparse_user_item[cookie_id], 
#               N=5,
#               filter_already_liked_items=True # не рекомендовать просмотренные
             )

(array([41848, 78102, 32276, 59851, 46309, 96031, 65809, 18967, 50216,
        80501], dtype=int32),
 array([0.5812351 , 0.44192925, 0.43181863, 0.33391142, 0.3337102 ,
        0.31531382, 0.2989831 , 0.29778242, 0.2889478 , 0.27705458],
       dtype=float32))

In [69]:
# функция, которая делает предсказание для одного куки
def make_prediction(cookie_id, top_n=5):
    # если этого юзера видим впервые, то выдаем для него бейзлайн
    if not cookie_id in cookie_cat_dict:
        return top_5
    
    cookie_id = cookie_cat_dict[cookie_id]
    
    recommended = model.recommend(cookie_id, 
                              sparse_user_item[cookie_id], 
                              N=top_n,
                              filter_already_liked_items=True # не рекомендовать просмотренные
                             )
    recommended = [vacancy_cat_dict[rec] for rec in recommended[0]]
    return recommended

In [70]:
make_prediction(cookie_id = '000cd76cd33f43d4a1ac1d16d10f8bf7')

[146630, 246815, 164698, 215620, 117525]

In [71]:
# Делаем предсказание для списка куки
def make_prediciton_column(cookies):
    lst = []
    for cookie in tqdm.tqdm(cookies):
        prediction = make_prediction(cookie)
        lst.append(prediction)
    return lst

Попробуем **на валидации**

In [117]:
val['predicted'] = make_prediciton_column(val['cookie_id'])

In [None]:
val['true_positive'] = val.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])), axis=1)
val['precision@5'] = val.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])) / 5, axis=1)

In [None]:
val.head()

In [None]:
# усредним метрику
val['precision@5'].mean() 

In [None]:
# сколько ненулевых?
val[val['true_positive'] != 0]

In [80]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.10.4-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.4 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.1


In [136]:
# Подбираем гиперпараметры
import optuna
from sklearn.model_selection import train_test_split
import implicit

# Определяем пространство поиска для гиперпараметров
def objective(trial):
    
    factors = trial.suggest_int('factors', 50, 200, step=50)
    regularization = trial.suggest_loguniform('regularization', 1e-3, 1e-2)
    iterations = trial.suggest_int('iterations', 50, 200, step=50)
    
    # Используем гиперпараметры
    model = implicit.als.AlternatingLeastSquares(factors=factors, regularization=regularization, iterations=iterations)
    model.fit(train_df)
    
    # Оцениваем модель на val (5 рандомных юзеров)
    userids = [250946, 227566, 159647, 188962, 209422]
    precs5 = []

    for id_ in userids:
        ids = model.recommend(id_, sparse_user_item[id_], N=5, filter_already_liked_items=True)[0]
        # что порекомендовали
        recommended = [vacancy_cat_dict[rec] for rec in ids]

        # правильный ответ
        cookie = cat_cookie_dict[id_]
        answers = list(train[(train['cookie_id'] == cookie) 
                             & (train['action'] == 1)]['vacancy_id_'].unique())

        prec5 = len(set(answers) & set(recommended)) / 5
        precs5.append(prec5)

    score = sum(precs5) / len(precs5)
    
    return score

# Делим
train_df, val_df = train_test_split(sparse_user_item, test_size=0.2, random_state=42)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)

print("Best Parameters: ", study.best_params)
print("Best Score: ", study.best_value)

[32m[I 2023-05-01 02:30:38,924][0m A new study created in memory with name: no-name-7a5a1164-6300-425e-af12-572a57443c57[0m
  regularization = trial.suggest_loguniform('regularization', 1e-3, 1e-1)


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2023-05-01 02:44:48,516][0m Trial 0 finished with value: 0.0 and parameters: {'factors': 50, 'regularization': 0.0019835724808245506, 'iterations': 100}. Best is trial 0 with value: 0.0.[0m


Best Parameters:  {'factors': 50, 'regularization': 0.0019835724808245506, 'iterations': 100}
Best Score:  0.0


0.0

Попробуем **на тесте**

In [153]:
test_df = pd.read_parquet('test_public_mfti.parquet', engine='pyarrow')

In [154]:
test_df['predicted'] = make_prediciton_column(test_df['cookie_id'])

100%|████████████████████████████████████████| 772/772 [00:01<00:00, 726.04it/s]


In [155]:
test_df['true_positive'] = test_df.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])), axis=1)
test_df['precision@5'] = test_df.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])) / 5, axis=1)

In [156]:
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...","[111867, 146630, 246815, 207423, 164698]",0,0.0
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...","[171332, 190928, 101462, 209422, 205630]",0,0.0
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...","[244077, 181745, 113305, 154411, 187662]",0,0.0
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[193331, 114583, 120252, 210628, 220718]",1,0.2
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]","[253946, 138634, 137702, 128278, 104528]",0,0.0


In [157]:
# усредним метрику
test_df['precision@5'].mean() 

0.03471502590673576

In [158]:
# сколько ненулевых?
test_df[test_df['true_positive'] != 0]

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[193331, 114583, 120252, 210628, 220718]",1,0.2
8,029c6b8042064d0899fd73f3290da565,"[227966, 177500, 232571, 230260, 238550, 25626...","[253946, 208761, 104528, 252155, 256268]",1,0.2
12,043b9752285e491ea4d34807dcc94065,"[207423, 240649, 116651, 119608, 221920, 21530...","[207423, 230707, 237341, 203404, 220718]",3,0.6
32,0a8b14d1a6ef450d90033d85effab74a,"[113189, 142272, 213146, 235254, 238136, 23496...","[207423, 252639, 173306, 184858, 138634]",1,0.2
47,0feb43e6f85c47bfa245b599b62669ce,"[103028, 225472, 243145, 253119, 238572]","[106729, 183704, 211897, 191702, 225472]",1,0.2
...,...,...,...,...,...
723,ee828a37d71a4963a0bccf0c76297545,"[207423, 230707, 217606, 237288, 214513, 18208...","[210628, 179082, 221920, 214513, 212325]",2,0.4
732,f3b9473d542a4b4284ed7aeb1d1100c9,"[234893, 236510, 235946, 238691, 108345, 19999...","[148714, 149024, 105907, 108242, 247276]",1,0.2
750,f891793414134632b5d3f6478c5f308a,"[203404, 111837, 210628, 214513, 230707, 234305]","[198114, 203404, 164602, 237341, 111867]",1,0.2
753,f9372a43299b47478848db2f1af3f00e,"[237631, 240455, 107024, 227451, 237341, 240045]","[237341, 207423, 198114, 138634, 203404]",1,0.2


**Приватный датасет**

In [111]:
test_private_df = pd.read_parquet('test_private_users_mfti.parquet', engine='pyarrow')

In [112]:
test_private_df['vacancy_id_'] = make_prediciton_column(test_private_df['cookie_id'])

100%|██████████████████████████████████████| 3086/3086 [00:05<00:00, 556.90it/s]


In [113]:
test_private_df

Unnamed: 0,cookie_id,vacancy_id_
0,0018914ba3e54011b28fa715583d3354,"[253946, 259320, 208761, 138634, 104528]"
1,0035c298d8c64f368ae730a9cca9bb20,"[138123, 135430, 202608, 214466, 109360]"
2,00956458877448ec9fba87fb97443fdf,"[260154, 253678, 171332, 116900, 246285]"
3,0099387c921b41e7bae6c99dd8254b60,"[260154, 103881, 153970, 173337, 250327]"
4,009f65e8ae99413a8da94a491320580a,"[138123, 214466, 135430, 146444, 120188]"
...,...,...
3081,ffadd195859444d2ade2479b0611c5c1,"[158242, 110421, 182870, 174953, 207423]"
3082,ffbc08b528c64f22996873fc63872202,"[113707, 153245, 119337, 150283, 129787]"
3083,ffdeaf3c34544529880aebf17c103f6c,"[257631, 260154, 111867, 181976, 123467]"
3084,ffefa79a74804ee69e6c131e0d05b948,"[260154, 250327, 253678, 117532, 140184]"


In [114]:
test_private_df.to_parquet('test_private_sample_submission_mfti.parquet')

In [115]:
pd.read_parquet('test_private_sample_submission_mfti.parquet')

Unnamed: 0,cookie_id,vacancy_id_
0,0018914ba3e54011b28fa715583d3354,"[253946, 259320, 208761, 138634, 104528]"
1,0035c298d8c64f368ae730a9cca9bb20,"[138123, 135430, 202608, 214466, 109360]"
2,00956458877448ec9fba87fb97443fdf,"[260154, 253678, 171332, 116900, 246285]"
3,0099387c921b41e7bae6c99dd8254b60,"[260154, 103881, 153970, 173337, 250327]"
4,009f65e8ae99413a8da94a491320580a,"[138123, 214466, 135430, 146444, 120188]"
...,...,...
3081,ffadd195859444d2ade2479b0611c5c1,"[158242, 110421, 182870, 174953, 207423]"
3082,ffbc08b528c64f22996873fc63872202,"[113707, 153245, 119337, 150283, 129787]"
3083,ffdeaf3c34544529880aebf17c103f6c,"[257631, 260154, 111867, 181976, 123467]"
3084,ffefa79a74804ee69e6c131e0d05b948,"[260154, 250327, 253678, 117532, 140184]"
