In [1]:
import pandas as pd
import numpy as np
import tqdm

In [2]:
reactions = {
    'show_vacancy': 0, 
    'preview_click_vacancy': 0, 
    'click_response': 1,
    'click_contacts': 1, 
    'preview_click_response': 1, 
    'click_favorite': 0,
    'preview_click_favorite': 0, 
    'preview_click_contacts': 1, 
    'click_phone': 1,
    'preview_click_phone': 1
}

In [3]:
df = pd.read_parquet('train_mfti.parquet', engine='pyarrow')

In [4]:
df['target'] = df['event_type'].apply(lambda x: reactions[x])

In [5]:
df.head()

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type,target
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy,0
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0


In [3]:
df['event_type'].value_counts()

show_vacancy              6198889
preview_click_vacancy     4781280
click_response             384090
click_contacts             277584
preview_click_response     190635
click_favorite             155844
preview_click_favorite     107016
preview_click_contacts     102050
click_phone                 79191
preview_click_phone         16009
Name: event_type, dtype: int64

In [6]:
df['cookie_id'].nunique()

330180

In [5]:
df['user_id'].nunique()

209335

In [9]:
df['vacancy_id_'].nunique()

160167

In [10]:
df['event_date'].min(), df['event_date'].max()

('2022-08-01', '2022-10-01')

In [7]:
# вакансии в порядке популярности + с целевым действием
sorted_vacancy = df[df['target'] == 1]['vacancy_id_'].value_counts()
sorted_vacancy

116823    5722
182870    5194
207423    4722
198114    3658
174953    3509
          ... 
228155       1
228777       1
228834       1
229036       1
136818       1
Name: vacancy_id_, Length: 121036, dtype: int64

In [6]:
# вакансии в порядке популярности 
sorted_vacancy = df['vacancy_id_'].value_counts()
sorted_vacancy

260154    59105
198114    50187
203404    45634
202608    41620
116823    40023
          ...  
208174        1
207219        1
208297        1
207097        1
136818        1
Name: vacancy_id_, Length: 160167, dtype: int64

In [12]:
# возьмем 5 слуйчаных идентификаторов для тестирования (валидационный датасет)
random_cookies = np.random.choice(df['cookie_id'].unique(), size=5)
random_cookies

array(['9ed5e109294444afbef12eb1f26edd35',
       '181e9903b0984201aa691722a01df6c1',
       'f93ab7afe74f49deb8ea5548a318d30d',
       '933375bf6a794a6a96e78fbcec177315',
       'af1d4ee823ac4944a19237934fe30b20'], dtype=object)

In [17]:
def make_prediction(cookie_id, size=5):
    # какие вакансии уже смотрел этот пользователь
    viewed_vac = df[df['cookie_id'] == cookie_id]['vacancy_id_']
    
    # удалим из предсказания
    predictions = sorted_vacancy.drop(viewed_vac, errors='ignore')
    
    return list(predictions.iloc[:size].index)

In [37]:
# предсказания для одного айдишника
make_prediction('9ed5e109294444afbef12eb1f26edd35')

[198114, 203404, 202608, 116823, 207423]

In [38]:
# предсказания для нескольких айдишников в словарь
dct_predictions = {}

for cookie in random_cookies:
    dct_predictions[cookie] = make_prediction(cookie)
    
dct_predictions

{'9ed5e109294444afbef12eb1f26edd35': [198114, 203404, 202608, 116823, 207423],
 '181e9903b0984201aa691722a01df6c1': [260154, 198114, 203404, 202608, 116823],
 'f93ab7afe74f49deb8ea5548a318d30d': [260154, 198114, 203404, 202608, 116823],
 '933375bf6a794a6a96e78fbcec177315': [260154, 198114, 203404, 202608, 116823],
 'af1d4ee823ac4944a19237934fe30b20': [260154, 198114, 202608, 116823, 207423]}

In [10]:
# Проверим на precision@5. Будем считать как кол-во угаданных / 5
test_df = pd.read_parquet('test_public_mfti.parquet', engine='pyarrow')

In [11]:
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812..."
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348..."
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065..."
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"


In [19]:
def make_prediciton_column(cookies):
    lst = []
    for cookie in tqdm.tqdm(cookies):
        prediction = make_prediction(cookie)
        lst.append(prediction)
    return lst

In [20]:
test_df['predicted'] = make_prediciton_column(test_df['cookie_id'])

100%|█████████████████████████████████████████| 772/772 [08:45<00:00,  1.47it/s]


In [21]:
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_,predicted
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...","[116823, 182870, 207423, 198114, 174953]"
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...","[116823, 182870, 207423, 198114, 174953]"
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...","[116823, 182870, 207423, 174953, 260154]"
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[198114, 260154, 202608, 148714, 164602]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]","[116823, 182870, 207423, 198114, 174953]"


In [22]:
test_df['true_positive'] = test_df.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])), axis=1)
test_df['precision@5'] = test_df.apply(lambda x: len(set(x['vacancy_id_']) & set(x['predicted'])) / 5, axis=1)

In [23]:
test_df

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...","[116823, 182870, 207423, 198114, 174953]",0,0.0
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...","[116823, 182870, 207423, 198114, 174953]",0,0.0
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...","[116823, 182870, 207423, 174953, 260154]",0,0.0
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[198114, 260154, 202608, 148714, 164602]",0,0.0
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]","[116823, 182870, 207423, 198114, 174953]",0,0.0
...,...,...,...,...,...
767,fdbcda17f22f406486837059e76c7fed,"[207851, 254989, 213344, 214180, 222146]","[116823, 182870, 207423, 198114, 174953]",0,0.0
768,fe6193ab26494ace9be5aae36e507618,"[115352, 230546, 225527, 120188, 109360, 23212...","[116823, 182870, 207423, 198114, 174953]",0,0.0
769,fe95b2826ee1452b81201ed3f4c3294d,"[240362, 114852, 253946, 251081, 127546, 244688]","[116823, 182870, 207423, 198114, 174953]",0,0.0
770,ff1aef256a49481698bb2e938510ff36,"[231194, 236363, 220747, 244688, 100094, 24052...","[116823, 182870, 207423, 198114, 174953]",0,0.0


In [24]:
# сколько ненулевых?
test_df[test_df['true_positive'] != 0]

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
12,043b9752285e491ea4d34807dcc94065,"[207423, 240649, 116651, 119608, 221920, 21530...","[116823, 182870, 207423, 198114, 174953]",1,0.2
36,0c9bc6e28eb74a65890ab8f35587f7c8,"[232857, 182084, 203404, 207423, 230419]","[116823, 182870, 207423, 198114, 174953]",1,0.2
58,13ca760a7b3948d0aa48adbed26f3b67,"[239744, 226061, 182870, 220718, 110793, 22760...","[116823, 182870, 198114, 174953, 260154]",1,0.2
74,18604a58ee7d49d3a75780f2b1999238,"[230707, 257698, 201370, 182870, 110578, 22808...","[116823, 182870, 198114, 174953, 260154]",1,0.2
75,186cae2dd58a42bb965fce95ff93e6d5,"[232857, 207108, 228866, 228253, 240695, 16460...","[116823, 182870, 207423, 198114, 174953]",1,0.2
...,...,...,...,...,...
723,ee828a37d71a4963a0bccf0c76297545,"[207423, 230707, 217606, 237288, 214513, 18208...","[116823, 182870, 207423, 198114, 174953]",1,0.2
734,f3ff3a77eff94997ad7b15e82e0c9976,"[238046, 211189, 207108, 210503, 203404, 11183...","[116823, 182870, 207423, 198114, 174953]",1,0.2
741,f5c67ff429f64b4b9478e577432a6a3c,"[240614, 240075, 107369, 223446, 230707, 23489...","[116823, 207423, 198114, 174953, 260154]",1,0.2
747,f71abd9d3c9c41ce8db075767d9c3494,"[182870, 198413, 205010, 212325, 230906, 23173...","[116823, 182870, 174953, 260154, 158242]",1,0.2


In [25]:
# усредним метрику
test_df['precision@5'].mean() 

0.017616580310880828