In [45]:
import pandas as pd
import numpy as np
import tqdm

In [2]:
df = pd.read_parquet('train_mfti.parquet', engine='pyarrow')

In [6]:
df.head()

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy


In [3]:
df['event_type'].value_counts()

show_vacancy              6198889
preview_click_vacancy     4781280
click_response             384090
click_contacts             277584
preview_click_response     190635
click_favorite             155844
preview_click_favorite     107016
preview_click_contacts     102050
click_phone                 79191
preview_click_phone         16009
Name: event_type, dtype: int64

In [6]:
df['cookie_id'].nunique()

330180

In [5]:
df['user_id'].nunique()

209335

In [9]:
df['vacancy_id_'].nunique()

160167

In [10]:
df['event_date'].min(), df['event_date'].max()

('2022-08-01', '2022-10-01')

In [14]:
# вакансии в порядке популярности
sorted_vacancy = df['vacancy_id_'].value_counts()
sorted_vacancy

260154    59105
198114    50187
203404    45634
202608    41620
116823    40023
          ...  
208174        1
207219        1
208297        1
207097        1
136818        1
Name: vacancy_id_, Length: 160167, dtype: int64

In [12]:
# возьмем 5 слуйчаных идентификаторов для тестирования (валидационный датасет)
random_cookies = np.random.choice(df['cookie_id'].unique(), size=5)
random_cookies

array(['9ed5e109294444afbef12eb1f26edd35',
       '181e9903b0984201aa691722a01df6c1',
       'f93ab7afe74f49deb8ea5548a318d30d',
       '933375bf6a794a6a96e78fbcec177315',
       'af1d4ee823ac4944a19237934fe30b20'], dtype=object)

In [36]:
def make_prediction(cookie_id, size=5):
    # какие вакансии уже смотрел этот пользователь
    viewed_vac = df[df['cookie_id'] == cookie_id]['vacancy_id_']
    
    # удалим из предсказания
    predictions = sorted_vacancy.drop(viewed_vac)
    
    return list(predictions.iloc[:size].index)

In [37]:
# предсказания для одного айдишника
make_prediction('9ed5e109294444afbef12eb1f26edd35')

[198114, 203404, 202608, 116823, 207423]

In [38]:
# предсказания для нескольких айдишников в словарь
dct_predictions = {}

for cookie in random_cookies:
    dct_predictions[cookie] = make_prediction(cookie)
    
dct_predictions

{'9ed5e109294444afbef12eb1f26edd35': [198114, 203404, 202608, 116823, 207423],
 '181e9903b0984201aa691722a01df6c1': [260154, 198114, 203404, 202608, 116823],
 'f93ab7afe74f49deb8ea5548a318d30d': [260154, 198114, 203404, 202608, 116823],
 '933375bf6a794a6a96e78fbcec177315': [260154, 198114, 203404, 202608, 116823],
 'af1d4ee823ac4944a19237934fe30b20': [260154, 198114, 202608, 116823, 207423]}

In [39]:
# Проверим на precision@5. Будем считать как кол-во угаданных / 5
test_df = pd.read_parrquet('test_public_mfti.parquet', engine='pyarrow')

In [44]:
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812..."
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348..."
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065..."
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"


In [49]:
def make_prediciton_column(cookies):
    lst = []
    for cookie in tqdm.tqdm(cookies):
        prediction = make_prediction(cookie)
        lst.append(prediction)
    return lst

In [50]:
test_df['predicted'] = make_prediciton_column(test_df['cookie_id'])

100%|█████████████████████████████████████████| 772/772 [08:54<00:00,  1.45it/s]


In [51]:
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_,predicted
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...","[260154, 198114, 203404, 202608, 116823]"
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...","[198114, 203404, 202608, 116823, 207423]"
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...","[260154, 203404, 202608, 116823, 207423]"
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[260154, 198114, 202608, 164602, 148714]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]","[260154, 198114, 203404, 202608, 116823]"


In [70]:
test_df['true_positive'] = test_df.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])), axis=1)
test_df['precision@5'] = test_df.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])) / 5, axis=1)

In [71]:
test_df

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...","[260154, 198114, 203404, 202608, 116823]",0,0.0
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...","[198114, 203404, 202608, 116823, 207423]",0,0.0
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...","[260154, 203404, 202608, 116823, 207423]",0,0.0
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[260154, 198114, 202608, 164602, 148714]",0,0.0
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]","[260154, 198114, 203404, 202608, 116823]",0,0.0
...,...,...,...,...,...
767,fdbcda17f22f406486837059e76c7fed,"[207851, 254989, 213344, 214180, 222146]","[260154, 198114, 203404, 202608, 116823]",0,0.0
768,fe6193ab26494ace9be5aae36e507618,"[115352, 230546, 225527, 120188, 109360, 23212...","[260154, 198114, 203404, 202608, 116823]",0,0.0
769,fe95b2826ee1452b81201ed3f4c3294d,"[240362, 114852, 253946, 251081, 127546, 244688]","[260154, 198114, 203404, 202608, 116823]",0,0.0
770,ff1aef256a49481698bb2e938510ff36,"[231194, 236363, 220747, 244688, 100094, 24052...","[260154, 198114, 203404, 202608, 116823]",0,0.0


In [74]:
# сколько ненулевых?
test_df[test_df['true_positive'] != 0]

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
5,020c7d1d9a44451c9ef107fc2fe726f5,"[231017, 231243, 238691, 171153, 249934, 23537...","[260154, 198114, 202608, 116823, 164602]",1,0.2
36,0c9bc6e28eb74a65890ab8f35587f7c8,"[232857, 182084, 203404, 207423, 230419]","[260154, 198114, 203404, 202608, 116823]",1,0.2
66,15cee09b2f5a43fca6092907245e5d87,"[225976, 248720, 179082, 203404, 210628, 21229...","[260154, 198114, 203404, 202608, 116823]",1,0.2
75,186cae2dd58a42bb965fce95ff93e6d5,"[232857, 207108, 228866, 228253, 240695, 16460...","[260154, 198114, 203404, 116823, 207423]",2,0.4
77,18ea8b315fe847549c339647baf0bfee,"[260154, 138634, 229432, 257379, 246162, 24104...","[260154, 198114, 203404, 202608, 116823]",1,0.2
...,...,...,...,...,...
716,eb2d7fea87b643a19764052610dd2bfd,"[164602, 248852, 182870, 220718, 207423, 23070...","[260154, 203404, 202608, 116823, 207423]",1,0.2
732,f3b9473d542a4b4284ed7aeb1d1100c9,"[234893, 236510, 235946, 238691, 108345, 19999...","[116823, 207423, 148714, 182870, 111867]",1,0.2
734,f3ff3a77eff94997ad7b15e82e0c9976,"[238046, 211189, 207108, 210503, 203404, 11183...","[260154, 198114, 203404, 202608, 116823]",1,0.2
741,f5c67ff429f64b4b9478e577432a6a3c,"[240614, 240075, 107369, 223446, 230707, 23489...","[260154, 198114, 116823, 207423, 164602]",1,0.2


In [73]:
# усредним метрику
test_df['precision@5'].mean() 

0.019689119170984433