In [1]:
import numpy as np
import pandas as pd
import tqdm
from scipy import sparse
import scipy
from sklearn.metrics.pairwise import cosine_similarity

# Prepare

In [None]:
# Load the raw training events.
df = pd.read_parquet('train_mfti.parquet', engine='pyarrow')

# For every cookie that was seen together with a known user_id, collect that
# cookie's distinct user_ids in a single column named 'unique'.
# The grouping key is deliberately left in the index (no `as_index=False`):
# the lookup below goes through `users_cookies.index` and `users_cookies.loc[cookie]`,
# which only works when the frame is indexed by cookie_id.
users_cookies = (
    df[df['user_id'].notna()]
    .groupby('cookie_id')['user_id']
    .agg(['unique'])
)
users_cookies_list = users_cookies.index

In [None]:
def set_id_unknown(cookie):
    """Resolve a cookie to the identifier used for the rest of the notebook.

    Parameters
    ----------
    cookie
        A value from the ``cookie_id`` column.

    Returns
    -------
    The first user_id recorded for the cookie in ``users_cookies`` when the
    cookie is present in ``users_cookies_list``; otherwise the cookie itself.
    """
    if cookie in users_cookies_list:
        # Address the 'unique' column by label: positional `row[0]` on a
        # label-indexed Series is deprecated in pandas.
        return users_cookies.loc[cookie, 'unique'][0]
    return cookie
    
# Map every event's cookie to its resolved identifier (user_id when known, cookie otherwise).
arr_new_id = [set_id_unknown(cookie) for cookie in df['cookie_id']]

df.insert(6, "new_id", arr_new_id, True)
# Sanity check: prints True only when no row is left without a new_id.
# (Summing the NaN entries themselves is always 0, so it could never fail.)
print(df['new_id'].isna().sum() == 0)

In [None]:
# Persist the frame, now carrying `new_id`, without the row index.
df.to_csv(path_or_buf='with_new_ids.csv', index=False)

---

In [None]:
# Reload the saved events and drop the user_id / event_date columns in one chain.
df = pd.read_csv('with_new_ids.csv', engine='c').drop(columns=['user_id', 'event_date'])
df

# Main

In [None]:
# default
def create_massive(original_df, file_name):
    """Build a dense binary user x vacancy interaction matrix.

    A cell is 1 when the user has at least one event on the vacancy whose
    type is neither ``preview_click_vacancy`` nor ``show_vacancy``; every
    other cell is 0.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Event log with the columns ``new_id``, ``vacancy_id_``,
        ``event_type`` and ``event_timestamp``.
    file_name : str
        Not used by the function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        int8 frame whose rows follow ``original_df['new_id'].unique()`` and
        whose columns follow ``original_df['vacancy_id_'].unique()``.
    """
    uniq_users_id = original_df['new_id'].unique()
    uniq_vacancy = original_df['vacancy_id_'].unique()

    interactions = (
        original_df
        .query('event_type != "preview_click_vacancy" and event_type != "show_vacancy"')
        .groupby(['new_id', 'vacancy_id_', 'event_type'], as_index=False)
        .agg({'event_timestamp': 'count'})
    )

    massive = np.zeros((len(uniq_users_id), len(uniq_vacancy)), dtype='int8')

    # Translate each (user, vacancy) pair to its (row, column) position and set
    # all cells in one assignment, instead of filtering the frame once per user
    # and scanning every vacancy in Python.
    row_pos = pd.Index(uniq_users_id).get_indexer(interactions['new_id'])
    col_pos = pd.Index(uniq_vacancy).get_indexer(interactions['vacancy_id_'])
    massive[row_pos, col_pos] = 1

    return pd.DataFrame(data=massive, index=uniq_users_id, columns=uniq_vacancy)

# def create_massive(original_df, file_name):
#     uniq_users_id = original_df['new_id'].unique()
#     uniq_vacancy = original_df['vacancy_id_'].unique()
#     size = len(uniq_vacancy)
    
#     interactions = original_df.groupby(['new_id', 'vacancy_id_', 'event_type'], as_index=False) \
#     .agg({'event_timestamp': 'count'})

#     massive = np.zeros((len(uniq_users_id), size), dtype='int8')  
#     for k1, i in enumerate(tqdm.tqdm(uniq_users_id)):
#         user_vacancies = set(interactions[interactions['new_id'] == i]['vacancy_id_'])
#         for k2, w in enumerate(uniq_vacancy):
#             if w in user_vacancies: massive[k1][k2] = 1

#     massive = pd.DataFrame(data = massive, index=uniq_users_id, columns=uniq_vacancy)
#     return massive

In [None]:
# Get vectorized users
vectors = create_massive(df, 'vectors_wout_show_preview')

sparse_vectors = sparse.csr_matrix(vectors)
scipy.sparse.save_npz('sparse_massive.npz', sparse_vectors)

In [None]:
# Pairwise cosine similarity between the user rows, kept sparse.
similarity = cosine_similarity(sparse_vectors, dense_output=False)
# Store a float16 copy (presumably to shrink the saved matrix).
# NOTE(review): float16 is not among the dtypes scipy.sparse documents as
# supported; confirm this still works on the scipy version in use.
changed_similarity = similarity.astype('float16')
sparse.save_npz('changed_similarity_results.npz', changed_similarity)

In [None]:
# Reload the saved similarity matrix and wrap it in a sparse-backed DataFrame.
changed_similarity = sparse.load_npz('changed_similarity_results.npz')
A = pd.DataFrame.sparse.from_spmatrix(changed_similarity)
display(A)  # user x user matrix; each cell holds the similarity of the two users

In [None]:
# f5 = 0.0189

# Cookies and the user ids they correspond to. The cookie stays in the index
# (no `as_index=False`) because get_new_id_by_cookie looks rows up with `.loc[cookie]`.
users_cookies = df.groupby('cookie_id')['new_id'].agg(['unique'])

# Every user action, counted per (user, vacancy, event type), most frequent first.
interactions = (
    df.groupby(['new_id', 'vacancy_id_', 'event_type'], as_index=False)
    .agg({'event_timestamp': 'count'})
    .sort_values(['event_timestamp'], ascending=False)
)

# Default vacancies to hand out: the five with the highest ratio of
# click-type events to `show_vacancy` events.
CLICK_EVENTS = [
    'preview_click_vacancy', 'click_response', 'click_contacts',
    'preview_click_response', 'click_favorite', 'preview_click_favorite',
    'preview_click_contacts', 'click_phone', 'preview_click_phone',
]
event_counts = df.pivot_table(index='vacancy_id_', columns='event_type', values='event_timestamp', aggfunc='count', fill_value=0)
# An event type that never occurs in the data becomes an all-zero column
# instead of raising KeyError when it is referenced.
event_counts = event_counts.reindex(columns=CLICK_EVENTS + ['show_vacancy'], fill_value=0)
clicks = event_counts[CLICK_EVENTS].sum(axis=1)
shows = event_counts['show_vacancy']
# Vacancies that were never shown get a ctr of 0 rather than the result of a division by zero.
event_counts['ctr'] = (clicks / shows).where(shows != 0, 0)
top_vacancies = list(event_counts.sort_values(['ctr'], ascending=False).index[:5])

def get_new_id_by_cookie(cookie):
    """Return the new_id / user_id stored for a cookie.

    Looks the cookie up in the index of ``users_cookies`` and returns the
    first entry of its 'unique' array.

    Raises
    ------
    KeyError
        If the cookie is not present in ``users_cookies``.
    """
    # Address the 'unique' column by label: positional `row[0]` on a
    # label-indexed Series is deprecated in pandas.
    return users_cookies.loc[cookie, 'unique'][0]

# All unique users, in the order in which the matrix rows were built
# (create_massive derives its row order from the same `new_id.unique()` call,
# so positions in this array line up with matrix row positions).
user_unique_columns = df['new_id'].unique()
def get_user_index(user_id):
    """Return the position of ``user_id`` inside ``user_unique_columns``.

    Raises IndexError when the id does not occur in the array.
    """
    is_match = user_unique_columns == user_id
    positions, = np.where(is_match)
    return positions[0]

def get_similar_users_ordered(user_index):
    """Return the users similar to the given one, most similar first.

    Parameters
    ----------
    user_index : int
        Column label of the user in the similarity frame ``A``; it is also
        the row position that is excluded from the result.

    Returns
    -------
    list of (int, float)
        ``(position, similarity)`` pairs for every other user whose
        similarity is strictly positive, sorted by similarity descending.
        Users with equal similarity stay in ascending position order.
    """
    # Densify only this one column and work on it with numpy instead of
    # iterating over the whole column element by element in Python.
    scores = A[user_index].to_numpy(dtype=float)
    candidates = np.flatnonzero(scores > 0)
    candidates = candidates[candidates != user_index]
    # A stable ascending sort of the negated scores is a descending sort that
    # leaves ties in their original (ascending position) order.
    ranking = candidates[np.argsort(-scores[candidates], kind='stable')]
    return list(zip(ranking.tolist(), scores[ranking].tolist()))

def get_vacancies_ordered_by_user(new_id):
    """Rank the vacancies a user interacted with by total number of events.

    Relies on ``interactions`` holding one row per (user, vacancy, event type)
    with the event count in ``event_timestamp``.

    Returns
    -------
    list of [vacancy_id, score]
        One pair per vacancy of the user, highest total first. Vacancies with
        equal totals stay in ascending vacancy_id order. Empty list when the
        user has no rows in ``interactions``.
    """
    user_interaction = interactions[interactions['new_id'] == new_id]
    # Summing the per-event-type counts per vacancy gives the same total as
    # pivoting by event type and adding the columns up row by row, without the
    # row-wise apply / iterrows pass (iterrows does not preserve dtypes across
    # a row, so vacancy ids could come back converted).
    totals = user_interaction.groupby('vacancy_id_')['event_timestamp'].sum()
    ranked = sorted(totals.items(), key=lambda pair: pair[1], reverse=True)
    return [[vacancy, score] for vacancy, score in ranked]

def get_ordered_vacancies(similar_users_list, top_n=15):
    """Collect the ranked vacancies of the most similar users.

    Parameters
    ----------
    similar_users_list : list
        Entries whose first element is a position into ``user_unique_columns``,
        ordered from most to least similar.
    top_n : int, default 15
        How many of the leading similar users to take vacancies from.

    Returns
    -------
    list
        Vacancy ids in the order they were first encountered, without duplicates.
    """
    recommend_vac = []
    for entry in similar_users_list[:top_n]:
        user = user_unique_columns[entry[0]]
        recommend_vac.extend(ranked[0] for ranked in get_vacancies_ordered_by_user(user))
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(recommend_vac))


def get_used_vacancies(new_id):
    """Return the distinct vacancy ids found in ``interactions`` for the given user."""
    belongs_to_user = interactions['new_id'] == new_id
    return interactions.loc[belongs_to_user, 'vacancy_id_'].unique()
    
    
def get_user_recommendation(cookie, size=5):
    """Recommend up to ``size`` vacancies for a cookie.

    Candidates are the vacancies of the most similar users, in ranked order,
    minus the vacancies the user already interacted with.

    Parameters
    ----------
    cookie
        Cookie identifier to recommend for.
    size : int, default 5
        Maximum number of vacancies to return.

    Returns
    -------
    list
        At most ``size`` vacancy ids. The default ``top_vacancies`` (cut to
        ``size``) are returned when the cookie or its user is unknown, or when
        the similar users yield no candidates. ``top_vacancies`` holds at most
        five entries, so the fallback never returns more than that.
    """
    try:
        user_id = get_new_id_by_cookie(cookie)
        index = get_user_index(user_id)
    except (KeyError, IndexError):
        # Cookie / user not present in the lookup tables: nothing to base a
        # personal recommendation on, so use the defaults instead of crashing.
        return top_vacancies[:size]

    all_recommended_vac = get_ordered_vacancies(get_similar_users_ordered(index))
    if len(all_recommended_vac) == 0:
        return top_vacancies[:size]

    # A set gives constant-time membership checks while filtering.
    users_used_vacs = set(get_used_vacancies(user_id))
    recommend_result = [vac for vac in all_recommended_vac if vac not in users_used_vacs]
    return recommend_result[:size]

In [None]:
# Load the public test split and show it.
test_df = pd.read_parquet(path='test_public_mfti.parquet', engine='pyarrow')
test_df

In [None]:
def make_prediciton(cookies):
    """Return one recommendation list per cookie, in input order, with a tqdm progress bar."""
    return [get_user_recommendation(cookie) for cookie in tqdm.tqdm(cookies)]

# Store the recommendation list produced for each test cookie next to that row.
test_df['predicted'] = make_prediciton(test_df['cookie_id'])

In [None]:
# Per row: how many predicted vacancies also occur among the row's `vacancy_id_` values.
test_df['true_positive'] = test_df.apply(
    lambda row: len(set(row['vacancy_id_']).intersection(row['predicted'])),
    axis=1,
)
# Same hit count divided by 5, derived from the column computed just above.
test_df['f5'] = test_df['true_positive'] / 5

In [None]:
# Rows where at least one predicted vacancy was a hit.
test_df.loc[test_df['true_positive'].ne(0)]

In [None]:
# Average of the per-row f5 column over the whole test set.
test_df['f5'].mean() 

In [None]:
# Write the test frame, including predictions and metrics, to disk.
test_df.to_csv(path_or_buf='result_0_1.csv')