In [62]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

import tqdm

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Binary relevance label per logged event type: 1 for the response / contacts / phone
# clicks (including their `preview_` variants), 0 for `show_vacancy`,
# `preview_click_vacancy` and the favourite clicks.
reactions = dict(
    show_vacancy=0,
    preview_click_vacancy=0,
    click_response=1,
    click_contacts=1,
    preview_click_response=1,
    click_favorite=0,
    preview_click_favorite=0,
    preview_click_contacts=1,
    click_phone=1,
    preview_click_phone=1,
)

In [4]:
# Load the training event log from a parquet file in the working directory.
train_data = pd.read_parquet('train_mfti.parquet', engine='pyarrow')

In [5]:
# Label every event with its binary relevance via a vectorised dictionary lookup.
# `Series.map` leaves NaN for keys that are missing from `reactions`, so check up front
# and keep failing loudly on unknown event types, as a per-row `reactions[x]` would.
unknown_events = set(train_data['event_type'].unique()) - set(reactions)
if unknown_events:
    raise KeyError(f'event types missing from `reactions`: {sorted(unknown_events, key=str)}')
train_data['target'] = train_data['event_type'].map(reactions).astype('int64')

In [6]:
# Preview the first rows to confirm the derived `target` column is present.
train_data.head()

Unnamed: 0,event_date,event_timestamp,vacancy_id_,cookie_id,user_id,event_type,target
0,2022-08-01,1659323026,129850,97990f1a021d4be19aa3f955b7eacab4,951f53de61764ea0b51317200a0dbbfc,show_vacancy,0
1,2022-08-01,1659377255,108347,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0
2,2022-08-01,1659376695,109069,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0
3,2022-08-01,1659376722,171425,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0
4,2022-08-01,1659374929,252384,03bf8c511fa949c79845a5d81b09aa1d,f5a2326a17484330aa8cb4019f1b1960,show_vacancy,0


In [7]:
# Class balance of the binary target (number of events labelled 0 vs 1).
train_data['target'].value_counts()

0    11243029
1     1049559
Name: target, dtype: int64

In [8]:
# Number of logged events per cookie_id, most active first.
train_data['cookie_id'].value_counts()

924398d361a0454c8c30845c2b4c5747    4898
7d8cc5a0fef44378a2d90a237cda288e    3343
5934d5b8a0b348829d8efabe69c733eb    3315
b7dd2f20fdd6472ab62f8d86a739cd5d    3312
353b56c4fa6447d0ba0f08b42d86e51f    2795
                                    ... 
1b4240c448df47f29e00ef38b8215ea2       5
99c776f7ff1d4e73919342365bc0de0a       5
f84c882a92d74646bfc6eb4447b11d3c       5
9b86c05707bc43d19dfaf60563627c2d       5
694ea921f0d14d56853bf9856866110d       5
Name: cookie_id, Length: 330180, dtype: int64

In [9]:
# Install once if needed (the `surprise` module is distributed on PyPI as scikit-surprise):
# %pip install scikit-surprise

In [10]:
from surprise import Dataset, Reader

from surprise.prediction_algorithms.matrix_factorization import SVD

from surprise import accuracy

In [11]:
# Collapse the event log to one row per (cookie_id, vacancy_id_) pair; `target` becomes the
# sum of the 0/1 labels, i.e. the number of positive events recorded for that pair.
# NOTE(review): no later cell visible in this notebook reads `disctinct_data` — confirm
# whether the model was meant to be trained on it.
disctinct_data = train_data[['cookie_id', 'vacancy_id_', 'target']].groupby(by=['cookie_id', 'vacancy_id_']).sum().reset_index()

In [12]:
# `load_from_df` takes the ratings straight from the DataFrame, so the Reader only has to
# carry the rating scale; the file-parsing options (`line_format`, `sep`, `skip_lines`)
# have no effect on this path and were dropped.
reader = Reader(rating_scale=(0, 5))

# The DataFrame columns must be ordered user, item, rating.
# NOTE(review): this feeds the raw event log, so a (cookie, vacancy) pair that occurs in
# several events contributes several ratings. `disctinct_data` (one row per pair) is built
# earlier but not used here — confirm which input was intended, and whether the (0, 5)
# scale should instead match the 0/1 `target`.
data = Dataset.load_from_df(train_data[['cookie_id', 'vacancy_id_', 'target']], reader=reader)

# Build the trainset from every rating; no hold-out split is made here.
trainset = data.build_full_trainset()

In [13]:
# Fixed seed so the randomly initialised factors, and therefore the recommendations
# derived from them, are reproducible across kernel restarts.
svd = SVD(random_state=42)

# Fit on the full trainset (no cross-validation or hold-out is performed here).
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1118065b0>

In [91]:
def make_prediction(cookie_id, top_n=5):
    """Rank the vacancies a cookie has not interacted with by predicted rating.

    Relies on the notebook-level `train_data` frame and the fitted `svd` model.

    Args:
        cookie_id: Value of `train_data['cookie_id']` identifying the user.
        top_n: How many of the best-scoring vacancies to return.

    Returns:
        A list of `(vacancy_id, estimated_rating)` tuples ordered from the highest
        estimate to the lowest, truncated to `top_n` entries.
    """
    # Vacancies this cookie already appears with in the training log.
    seen = train_data.loc[train_data['cookie_id'] == cookie_id, 'vacancy_id_']

    # Every distinct vacancy in the training log, minus the ones already seen.
    catalogue = pd.Series(train_data['vacancy_id_'].unique())
    candidates = catalogue[~catalogue.isin(seen)]

    # Score each remaining vacancy with the model's estimated rating for this cookie.
    scored = [(vac_id, svd.predict(cookie_id, vac_id).est) for vac_id in candidates]

    # Stable sort: vacancies with equal estimates keep their catalogue order.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [92]:
# Smoke test: top-5 (vacancy_id, estimated rating) pairs for a single cookie.
make_prediction(cookie_id='000cd76cd33f43d4a1ac1d16d10f8bf7')

[(208243, 0.6578583649496866),
 (224057, 0.6426922320947608),
 (235718, 0.6302035268161327),
 (164086, 0.6249818752923296),
 (200806, 0.6143384842706172)]

In [74]:
# Evaluate with precision@5: load the public test set from a parquet file.
test_df = pd.read_parquet('test_public_mfti.parquet', engine='pyarrow')

In [75]:
# Preview the test frame (the output below shows a cookie_id and a list of vacancy ids per row).
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812..."
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348..."
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065..."
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"


In [76]:
def make_prediciton_column(cookies):
    """Run `make_prediction` for every cookie, showing a tqdm progress bar.

    Args:
        cookies: Iterable of cookie ids.

    Returns:
        A list holding one `make_prediction` result per cookie, in input order.
    """
    return [make_prediction(cookie) for cookie in tqdm.tqdm(cookies)]

In [77]:
# Generate recommendations for every test cookie.
# Slow: the recorded run took about 18 minutes for 772 cookies (~1.4 s per cookie).
test_df['predicted'] = make_prediciton_column(test_df['cookie_id'])

100%|█████████████████████████████████████████| 772/772 [17:41<00:00,  1.37s/it]


In [85]:
def _predicted_ids(predicted):
    """Return the set of vacancy ids in a prediction list.

    `make_prediction` yields `(vacancy_id, estimated_rating)` tuples; intersecting those
    tuples with the true vacancy ids can never match, so the score is stripped first.
    Entries that are already bare ids are passed through unchanged.
    """
    return {item[0] if isinstance(item, tuple) else item for item in predicted}


# Hits per cookie: recommended vacancies that also appear in the true list.
test_df['true_positive'] = test_df.apply(
    lambda row: len(set(row['vacancy_id_']) & _predicted_ids(row['predicted'])),
    axis=1,
)
# precision@5 = hits / 5, reusing the hit count instead of recomputing the intersection.
test_df['precision@5'] = test_df['true_positive'] / 5

In [86]:
test_df.head()

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812...","[208243, 224057, 235718, 164086, 200806]",0,0.0
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348...","[201679, 224232, 164086, 236160, 100947]",0,0.0
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065...","[201679, 236964, 207806, 229449, 225003]",0,0.0
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]","[164086, 188067, 113155, 112103, 235907]",0,0.0
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]","[201679, 235718, 191031, 139216, 239294]",0,0.0


In [81]:
# Average the metric over all test cookies.
test_df['precision@5'].mean() 

0.0

In [84]:
# How many cookies got at least one hit? Show the rows with a non-zero `true_positive`.
test_df[test_df['true_positive'] != 0]

Unnamed: 0,cookie_id,vacancy_id_,predicted,true_positive,precision@5
