In [1]:
from __future__ import annotations

from typing import List

from itertools import cycle, chain
from itertools import islice

import polars as pl
import pandas as pd
import numpy as np
import os
from os.path import join as path_join
from tqdm import tqdm

from collections import Counter

In [2]:
# Preprocessed interaction tables (parquet).
train_data_path = "../data/processed/train.parquet"
test_data_path = "../data/processed/test_inference.parquet"
# Raw vacancy catalogue; in the visible cells it is referenced only by commented-out code.
vacancies_path = "../data/raw/hh_recsys_vacancies.pq"
# Vacancy table read back further down; presumably written by the commented-out caching cells.
test_vacancies_path = "../data/processed/test_vacancies.parquet"
# Per-user history dump that is left-joined onto the test frame on "user_id".
user_history_data_path = "dumps/user_history/history_data.parquet"

In [3]:
# Read the three input tables in one pass: train sessions, test sessions and
# the per-user history dump (same order as the paths are listed).
train, test, user_history_data = (
    pd.read_parquet(path)
    for path in (train_data_path, test_data_path, user_history_data_path)
)

In [4]:
# Gather every vacancy id the notebook can meet: test targets first, then the
# items of the test sessions and of the train sessions; finally de-duplicate.
referenced_ids = test["target_vacancy_id"].to_list()
for sessions in (test["vacancy_id"], train["vacancy_id"]):
    referenced_ids.extend(chain.from_iterable(sessions.values))
test_vacancies = pd.Series(list(set(referenced_ids)))

len(test_vacancies)

1199909

In [5]:
# vacancies = pl.read_parquet(vacancies_path).to_pandas()

In [6]:
# vacancies[vacancies["vacancy_id"] == "v_2691293"]

In [7]:
# vacancies.columns 

In [8]:
# vacancies = vacancies.merge(
#     test_vacancies.rename('vacancy_id'),
#     "inner",
#     "vacancy_id")

In [9]:
# vacancies.to_parquet(test_vacancies_path)

In [10]:
# Load the cached vacancy table. This rebinds `test_vacancies`, replacing the
# Series of ids that was built from the train/test frames earlier.
test_vacancies = pd.read_parquet(test_vacancies_path)

In [11]:
# Attach each user's history columns to the test rows (left join on "user_id").
# The cell rebinds `test`, so it is guarded: running it a second time would
# otherwise join the history again and leave suffixed duplicate columns.
_history_columns = [col for col in user_history_data.columns if col != "user_id"]
_history_already_merged = bool(_history_columns) and set(_history_columns).issubset(test.columns)
if not _history_already_merged:
    test = test.merge(user_history_data, how="left", on="user_id")

In [12]:
# Peek at the first row of the merged frame.
test.head(1)

Unnamed: 0,user_id,session_id,target_session_id,vacancy_id,action_type,action_dt,target_vacancy_id,items,item_actions
0,u_1000060,s_19856666,s_6481076,[v_1962314],[2],[2023-11-10T14:21:18.628000000],v_76636,"[[v_1500295, v_1500295], [v_1500295, v_524850]...","[[2, 1], [2, 2], [2], [2, 2], [2], [2, 2, 2], ..."


In [13]:
# Re-shape the vacancy table into nested dicts addressed as
# test_vacancies[column][vacancy_id]; the lookup helpers below rely on it.
test_vacancies = test_vacancies.set_index("vacancy_id").to_dict(orient="dict")

In [14]:
# Spot-check one lookup: the "name" attribute of a single vacancy id.
test_vacancies["name"]["v_1446558"]

'–ë—É—Ö–≥–∞–ª—Ç–µ—Ä –ø–æ –±–∞–Ω–∫–æ–≤—Å–∫–∏–º –æ–ø–µ—Ä–∞—Ü–∏—è–º'

In [15]:
# Share of test rows whose "items" cell is a numpy array, i.e. rows that
# actually received history from the left join.
test["items"].map(lambda items: isinstance(items, np.ndarray)).astype(int).mean()

0.7887047347749148

In [16]:
def get_action_name(action_type):
    """Translate a numeric action code into its display label.

    Codes 1, 2 and 3 have dedicated labels (presumably "responded", "viewed"
    and "added to favourites" -- the literals are non-English text); every
    other value falls back to a catch-all label.
    """
    labelled_codes = (
        (1, "ü•∞ –û—Ç–∫–ª–∏–∫–Ω—É–ª—Å—è"),
        (2, "–ü—Ä–æ—Å–º–æ—Ç—Ä–µ–ª"),
        (3, "–î–æ–±–∞–≤–∏–ª –≤ –∏–∑–±—Ä–∞–Ω–Ω–æ–µ"),
    )
    # Equality (not hashing) is used on purpose so any value comparable to an
    # int is matched exactly as an if/elif chain would match it.
    for code, label in labelled_codes:
        if action_type == code:
            return label
    return "–ß—Ç–æ-—á—Ç–æ —Å–¥–µ–ª–∞–ª?"

def pprint_vacancy(vacancy_id):
    """Print an indented, bulleted summary of one vacancy.

    Attributes are read from the module-level ``test_vacancies`` mapping as
    ``test_vacancies[column][vacancy_id]``. When the id is absent from the
    "name" column a single placeholder line is printed instead. Attributes
    whose value is ``None`` are skipped; salary bounds are additionally
    skipped when they are NaN.
    """
    bullet = "        -"

    def attr(column):
        # Single place that knows how the nested mapping is addressed.
        return test_vacancies[column][vacancy_id]

    if vacancy_id not in test_vacancies["name"]:
        print("        - [–¢–∞–∫–æ–π –≤–∞–∫–∞–Ω—Å–∏–∏ –Ω–µ –±—ã–ª–æ –≤ —Ç–µ—Å—Ç]")
        return

    print(bullet, attr("name"))

    company_id = attr("company.id")
    if company_id is not None:
        print(bullet, "Id –∫–æ–º–ø–∞–Ω–∏–∏", company_id)

    key_skills = attr("keySkills.keySkill")
    if key_skills is not None:
        print(bullet, ", ".join(key_skills))

    # Salary bounds share one rule: present, not NaN, printed with the currency.
    for column, label in (("compensation.from", "–û—Ç"), ("compensation.to", "–î–æ")):
        amount = attr(column)
        if amount is not None and not np.isnan(amount):
            print(bullet, label, amount, attr("compensation.currencyCode"))

    # Remaining attributes are printed verbatim whenever they are not None.
    labelled_columns = (
        ("area.id", "Id –æ–±–ª–∞—Å—Ç–∏"),
        ("area.regionId", "Id —Ä–µ–≥–∏–æ–Ω–∞"),
        ("employment", "–£—Å—Ç—Ä–æ–π—Å—Ç–≤–æ"),
        ("workSchedule", "–ì—Ä–∞—Ñ–∏–∫ —Ä–∞–±–æ—Ç—ã:"),
        ("workExperience", "–û–ø—ã—Ç"),
    )
    for column, label in labelled_columns:
        value = attr(column)
        if value is not None:
            print(bullet, label, value)

In [17]:
# for idx, row in test_sample.iterrows():
#     print("====================================================================================================")
#     print("====================================================================================================")

#     print("–Æ–∑–µ—Ä:", row["user_id"])
#     print("----")
#     print("    - –í–∞–∫–∞–Ω—Å–∏–∏ –≤ –ø—Ä–æ—à–ª—ã—Ö —Å–µ—Å—Å–∏—è:")

#     if isinstance(row["items"], np.ndarray):
#         for session in zip(row["items"], row["item_actions"]):
#             for vacancy_id, action_type in zip(session[0], session[1]):
#                 print(get_action_name(action_type), vacancy_id)
#                 pprint_vacancy(vacancy_id)
#     else:
#         print("        - –ò—Ö –Ω–µ –±—ã–ª–æ")
#     print("----")  
#     print("    - –í–∞–∫–∞–Ω—Å–∏–∏ –≤ —Ç–µ—Å—Ç–æ–≤–æ–π —Å–µ—Å—Å–∏–∏:")
#     for dt, vacancy_id, action_type in zip(row["action_dt"], row["vacancy_id"], row["action_type"]):
#         print(get_action_name(action_type), vacancy_id)
#         pprint_vacancy(vacancy_id)

#     print("")
#     print("^^^^^^^^^^^^^^^^^^^^^")
#     print("–í —Å–ª–µ–¥—É—é—â–µ–π —Å–µ—Å—Å–∏–∏ –æ—Ç–∫–ª–∏–∫–Ω—É–ª—Å—è –Ω–∞", row["target_vacancy_id"])
#     pprint_vacancy(row["target_vacancy_id"])

#     if idx > 10:
#         break

In [18]:
"""
    –ú—ã—Å–ª–∏

    - –í–æ—Ç –µ—Å–ª–∏ —á–µ–ª–æ–≤–µ–∫ –≥–ª—è–Ω—É–ª –∫–∞–∫—É—é-—Ç–æ –∫–æ–º–ø–∞–Ω–∏—é, —Ç–æ –∫–∞–∫–æ–π —à–∞–Ω—Å, —á—Ç–æ –æ–Ω –æ—Ç–∫–ª–∏–∫–Ω–µ—Ç—Å—è –Ω–∞ —ç—Ç—É –∫–æ–º–ø–∞–Ω–∏—é –∏–º–µ–Ω–Ω–æ
"""

'\n    –ú—ã—Å–ª–∏\n\n    - –í–æ—Ç –µ—Å–ª–∏ —á–µ–ª–æ–≤–µ–∫ –≥–ª—è–Ω—É–ª –∫–∞–∫—É—é-—Ç–æ –∫–æ–º–ø–∞–Ω–∏—é, —Ç–æ –∫–∞–∫–æ–π —à–∞–Ω—Å, —á—Ç–æ –æ–Ω –æ—Ç–∫–ª–∏–∫–Ω–µ—Ç—Å—è –Ω–∞ —ç—Ç—É –∫–æ–º–ø–∞–Ω–∏—é –∏–º–µ–Ω–Ω–æ\n'

In [19]:
def _vacancy_attribute_getter(column):
    """Build a ``vacancy_id -> test_vacancies[column][vacancy_id]`` lookup.

    The returned function reads the module-level ``test_vacancies`` at call
    time and raises ``KeyError`` for an unknown column or vacancy id.
    """
    def lookup(vacancy_id):
        return test_vacancies[column][vacancy_id]
    return lookup


get_company = _vacancy_attribute_getter("company.id")
get_area = _vacancy_attribute_getter("area.id")
get_region = _vacancy_attribute_getter("area.regionId")
get_employment = _vacancy_attribute_getter("employment")
get_workSchedule = _vacancy_attribute_getter("workSchedule")
get_workExperience = _vacancy_attribute_getter("workExperience")

# test["is_same_company"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_company(row["target_vacancy_id"]) in set(filter(lambda x: x, map(get_company, row["vacancy_id"])))),
#     axis=1
# )
# test["is_same_area"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_area(row["target_vacancy_id"]) in set(filter(lambda x: x, map(get_area, row["vacancy_id"])))),
#     axis=1
# )
# test["is_same_region"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_region(row["target_vacancy_id"]) in set(filter(lambda x: x, map(get_region, row["vacancy_id"])))),
#     axis=1
# )
# test["is_same_employment"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_employment(row["target_vacancy_id"]) in set(filter(lambda x: x, map(get_employment, row["vacancy_id"])))),
#     axis=1
# )
# test["is_same_workSchedule"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_workSchedule(row["target_vacancy_id"]) in set(filter(lambda x: x, map(get_workSchedule, row["vacancy_id"])))),
#     axis=1
# )
# test["is_same_workExperience"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_workExperience(row["target_vacancy_id"]) in set(filter(lambda x: x, map(get_workExperience, row["vacancy_id"])))),
#     axis=1
# )

# test["y_in_x"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(row["target_vacancy_id"] in set(row["vacancy_id"])),
#     axis=1
# )

def most_freq(arr):
    """Return the most frequent element of ``arr``, or ``""`` when it is empty.

    ``len`` is used for the emptiness check so numpy arrays are accepted too.
    """
    if not len(arr):
        return ""
    (winner, _count), = Counter(arr).most_common(1)
    return winner

# test["is_most_pop_area_from_test"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_area(row["target_vacancy_id"]) == most_freq(list(filter(lambda x: x, map(get_area, row["vacancy_id"]))))),
#     axis=1
# )
# test["is_most_pop_region_from_test"] = test[["vacancy_id", "target_vacancy_id"]].apply(
#     lambda row: int(get_region(row["target_vacancy_id"]) == most_freq(list(filter(lambda x: x, map(get_region, row["vacancy_id"]))))),
#     axis=1
# )

def print_false_example(vacancy_ids, actions, target_vacancy_id):
    """Occasionally dump one history/target pair for manual inspection.

    Each call draws ``np.random.rand()`` once and prints only when the draw is
    below 0.01. When it fires, every (vacancy, action) pair is printed with its
    action label and vacancy summary, followed by the target vacancy summary.
    """
    if not np.random.rand() < 0.01:
        return

    print("=-=-=-=-=-=-=-=-=")
    for vacancy, action in zip(vacancy_ids, actions):
        print(get_action_name(action), vacancy)
        pprint_vacancy(vacancy)
        print("")
    print("^^^^^^^^^^^^^^^^^^^^^^^^^^")
    pprint_vacancy(target_vacancy_id)

def most_freq_or_remote(vacancy_ids, actions, target_vacancy_id, freq_treshold):
    """Check whether simple schedule/region heuristics would cover the target.

    The target counts as covered when at least one rule fires:

    1. the history contains a "remote" or "flyInFlyOut" vacancy and the target
       itself has one of those two schedules;
    2. the target region equals the most frequent region among the "fullDay"
       vacancies the user responded to (action code 1);
    3. the target region equals the most frequent region among all history
       vacancies that are neither "remote" nor "flyInFlyOut".

    Args:
        vacancy_ids: vacancy ids of the user's history. Their order matters
            when two regions are equally frequent (the first one seen wins).
        actions: action codes aligned with ``vacancy_ids``.
        target_vacancy_id: vacancy whose coverage is checked.
        freq_treshold: currently unused; kept so existing call sites keep
            working.

    Returns:
        A truthy value when any rule fires, a falsy value otherwise.

    Relies on the module-level ``get_workSchedule`` and ``get_region`` lookups.

    Experiments that were tried and left switched off (not implemented here):
    always recommending "flexible" schedules, accepting any region of a
    responded office vacancy instead of only the most frequent one, and always
    recommending region "ar_41".
    """
    remote_schedules = ("remote", "flyInFlyOut")

    # Materialise once so the ids can be walked several times, and look each
    # schedule up a single time for all three rules.
    vacancy_ids = list(vacancy_ids)
    schedules = [get_workSchedule(vacancy) for vacancy in vacancy_ids]

    target_work_schedule = get_workSchedule(target_vacancy_id)
    target_region = get_region(target_vacancy_id)

    # Regions of every non-remote vacancy in the history.
    regions_not_remote = Counter(
        get_region(vacancy)
        for vacancy, schedule in zip(vacancy_ids, schedules)
        if schedule not in remote_schedules
    )
    # Regions of responded "fullDay" vacancies; "fullDay" is never a remote
    # schedule, so no additional non-remote filter is needed.
    regions_responded_not_remote = Counter(
        get_region(vacancy)
        for vacancy, schedule, action in zip(vacancy_ids, schedules, actions)
        if action == 1 and schedule == "fullDay"
    )

    # Rule 1: remote seen in the history and the target is remote.
    remote = (
        any(schedule in remote_schedules for schedule in schedules)
        and target_work_schedule in remote_schedules
    )

    # Rule 2: most frequent region among responded office vacancies.
    office_responded = (
        bool(regions_responded_not_remote)
        and target_region == regions_responded_not_remote.most_common(1)[0][0]
    )

    # Rule 3: most frequent region among all office vacancies.
    office = (
        bool(regions_not_remote)
        and target_region == regions_not_remote.most_common(1)[0][0]
    )

    # Debug hook that was used while tuning (kept as a pointer only):
    # if regions_responded_not_remote and not office_responded and len(vacancy_ids) < 15:
    #     print_false_example(vacancy_ids, actions, target_vacancy_id)

    return remote or office_responded or office

def _covered_by_heuristics(row):
    """Run ``most_freq_or_remote`` on the current session followed by the history.

    History is appended only when the row's "items" cell is a numpy array
    (rows without history hold a non-array value there).
    """
    vacancy_ids = list(row["vacancy_id"])
    actions = list(row["action_type"])
    if isinstance(row["items"], np.ndarray):
        vacancy_ids += list(chain.from_iterable(row["items"]))
        actions += list(chain.from_iterable(row["item_actions"]))
    return int(most_freq_or_remote(vacancy_ids, actions, row["target_vacancy_id"], 30))


_heuristic_inputs = ["vacancy_id", "action_type", "target_vacancy_id", "items", "item_actions"]
test["is_custom_area_remote_60"] = test[_heuristic_inputs].apply(_covered_by_heuristics, axis=1)

# Fraction of test rows whose target is covered by the heuristics.
test["is_custom_area_remote_60"].mean()

0.9219215330829988

In [20]:
# List the columns currently present on the test frame.
test.columns

Index(['user_id', 'session_id', 'target_session_id', 'vacancy_id',
       'action_type', 'action_dt', 'target_vacancy_id', 'items',
       'item_actions', 'is_custom_area_remote_60'],
      dtype='object')

In [21]:
# Work on a copy so the analysis columns added to `anal` are not added to `test`.
anal = test.copy()

def merge_history(current, historical):
    """Prepend flattened history to the current session's values.

    ``historical`` is expected to be a list/array of per-session sequences.
    When it is neither a list nor a numpy array (e.g. a missing value from the
    left join), ``current`` is returned untouched -- same object, same type.
    Otherwise a new list is returned: all historical values first, then the
    current ones.
    """
    if not isinstance(historical, (np.ndarray, list)):
        return current
    return [*chain.from_iterable(historical), *current]

def get_region(vacancy_id):
    """Region id of a vacancy, or ``None`` when the id is not in ``test_vacancies``.

    Rebinds the name ``get_region`` with a lookup that tolerates unknown ids.
    """
    return test_vacancies["area.regionId"].get(vacancy_id)


def get_workSchedule(vacancy_id):
    """Work schedule of a vacancy, or ``None`` when the id is not in ``test_vacancies``.

    Rebinds the name ``get_workSchedule`` with a lookup that tolerates unknown ids.
    """
    return test_vacancies["workSchedule"].get(vacancy_id)


def is_remote(vacancy_id):
    """True when the vacancy's schedule is "remote" or "flyInFlyOut"."""
    return get_workSchedule(vacancy_id) in ("remote", "flyInFlyOut")

# Build the merged per-row sequences with merge_history: history values first,
# then the values of the current test session. Ids and action codes are merged
# the same way so they stay aligned.
_merged_sources = {
    "history": ("vacancy_id", "items"),
    "history_actions": ("action_type", "item_actions"),
}
for _merged_col, (_current_col, _past_col) in _merged_sources.items():
    anal[_merged_col] = anal[[_current_col, _past_col]].apply(
        lambda row: merge_history(row[_current_col], row[_past_col]), axis=1
    )

In [22]:
def has_remote_in_history(history):
    """Return True as soon as one vacancy in ``history`` satisfies ``is_remote``."""
    for vacancy_id in history:
        if is_remote(vacancy_id):
            return True
    return False

def has_remote_responded_in_history(history, history_actions):
    """Like ``has_remote_in_history`` but restricted to vacancies with action code 1."""
    responded = (
        vacancy_id
        for vacancy_id, action in zip(history, history_actions)
        if action == 1
    )
    return has_remote_in_history(responded)

def get_most_freq_responded_region(history, history_actions):
    """Most frequent region among vacancies with action code 1, or ``""`` if none.

    Regions come from ``get_region``; whatever it returns is counted as-is.
    """
    region_counts = Counter(
        get_region(vacancy_id)
        for vacancy_id, action in zip(history, history_actions)
        if action == 1
    )
    if not region_counts:
        return ""
    return region_counts.most_common(1)[0][0]

def get_most_freq_region(history, history_actions):
    """Most frequent region over the whole ``history``, or ``""`` when it is empty.

    ``history_actions`` is accepted for signature parity with
    ``get_most_freq_responded_region`` but is not used.
    """
    region_counts = Counter(get_region(vacancy_id) for vacancy_id in history)
    if not region_counts:
        return ""
    return region_counts.most_common(1)[0][0]

# Remote-related flags: for the whole merged history, for the responded part
# of it, and for the target vacancy itself.
anal["has_remote_in_history"] = anal["history"].apply(has_remote_in_history)
anal["has_remote_responded_in_history"] = anal[["history", "history_actions"]]\
    .apply(lambda row: has_remote_responded_in_history(row["history"], row["history_actions"]), axis=1)
anal["is_target_remote"] = anal["target_vacancy_id"].apply(is_remote)

# Does the target's region occur anywhere among the history regions?
anal["is_target_region_in_history_regions"] = anal[["history", "target_vacancy_id"]].apply(
    lambda row: get_region(row["target_vacancy_id"]) in set(map(get_region, row["history"])), axis=1
)


def _region_or_blank(vacancy_id):
    """Region of a vacancy, with "" standing in for an unknown (None) region."""
    region = get_region(vacancy_id)
    # The check has to look at the looked-up value: testing the function
    # object (`get_region is not None`) is always true and never yields "".
    return region if region is not None else ""


anal["target_region"] = anal["target_vacancy_id"].apply(_region_or_blank)

# Most frequent region over the full history and over responded vacancies only.
anal["most_freq_region"] = anal[["history", "history_actions"]]\
    .apply(lambda row: get_most_freq_region(row["history"], row["history_actions"]), axis=1)
anal["most_freq_responded_region"] = anal[["history", "history_actions"]]\
    .apply(lambda row: get_most_freq_responded_region(row["history"], row["history_actions"]), axis=1)

In [31]:
# Boolean masks shared by the conditional rates below.
_saw_remote = anal["has_remote_in_history"].astype(bool)
_responded_remote = anal["has_remote_responded_in_history"].astype(bool)
_target_remote = anal["is_target_remote"].astype(bool)

# P(target is remote | ...) for three history conditions.
remote_conversion = anal.loc[_saw_remote, "is_target_remote"].mean()
remote_responded_conversion = anal.loc[_responded_remote, "is_target_remote"].mean()
not_remote_remote_conversion = anal.loc[~_saw_remote, "is_target_remote"].mean()
is_target_region_in_history_regions = anal["is_target_region_in_history_regions"].mean()

# Region agreement, computed only on rows that have a most-frequent region.
_with_region = anal[anal["most_freq_region"] != ""]
is_most_freq_region = (_with_region["most_freq_region"] == _with_region["target_region"]).mean()

_with_responded_region = anal[anal["most_freq_responded_region"] != ""]
is_most_freq_responded_region = (
    _with_responded_region["most_freq_responded_region"] == _with_responded_region["target_region"]
).mean()
is_equel_most_regions = (
    _with_responded_region["most_freq_region"] == _with_responded_region["most_freq_responded_region"]
).mean()

# Combined hit count: remote targets with remote in the history, plus
# non-remote targets whose region equals the most frequent history region.
total_guess = anal[_target_remote & _saw_remote].shape[0]

tmp = anal[~_target_remote & (anal["most_freq_region"] != "")]
total_guess += (tmp["most_freq_region"] == tmp["target_region"]).sum()

total_guess / len(anal)

0.8986361085933791

In [24]:
# Report the rates computed in the aggregation cell (labels are non-English text).
# P(target is remote | remote present in the history)
print("–ï—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –±—ã–ª remote –≤ –∏—Å—Ç–æ—Ä–∏–∏, —Ç–æ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Ç–æ–≥–æ, —á—Ç–æ —Ç–∞—Ä–≥–µ—Ç remote", remote_conversion)
# P(target is remote | remote present among responded vacancies)
print("–ï—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –±—ã–ª remote –≤ –æ—Ç–∫–ª–∏–∫–∞—Ö –≤ –∏—Å—Ç–æ—Ä–∏–∏, —Ç–æ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Ç–æ–≥–æ, —á—Ç–æ —Ç–∞—Ä–≥–µ—Ç remote", remote_responded_conversion)
# P(target is remote | no remote in the history)
print("–ï—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –Ω–µ –±—ã–ª–æ remote –≤ –∏—Å—Ç–æ—Ä–∏–∏, —Ç–æ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Ç–æ–≥–æ, —á—Ç–æ —Ç–∞—Ä–≥–µ—Ç remote", not_remote_remote_conversion)
# Share of rows whose target region occurs among the history regions
print("–†–µ–≥–∏–æ–Ω —Ç–∞—Ä–≥–µ—Ç–∞ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –æ–¥–Ω–∏–º –∏–∑ —Ä–µ–≥–∏–æ–Ω–æ–≤ –≤ –∏—Å—Ç–æ—Ä–∏–∏", is_target_region_in_history_regions)
# Share of rows whose target region equals the most frequent history region
print("–†–µ–≥–∏–æ–Ω —Ç–∞—Ä–≥–µ—Ç–∞ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Å–∞–º—ã–º –ø–æ–ø—É–ª—è—Ä–Ω—ã–º —Ä–µ–≥–∏–æ–Ω–æ–º –∏–∑ –∏—Å—Ç–æ—Ä–∏–∏", is_most_freq_region)
# Share of rows whose target region equals the most frequent responded region
print("–†–µ–≥–∏–æ–Ω —Ç–∞—Ä–≥–µ—Ç–∞ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Å–∞–º—ã–º –ø–æ–ø—É–ª—è—Ä–Ω—ã–º —Ä–µ–≥–∏–æ–Ω–æ–º –∏–∑ –∏—Å—Ç–æ—Ä–∏–∏ –æ—Ç–∫–ª–∏–∫–æ–≤", is_most_freq_responded_region)
# Share of rows where the two most-frequent regions (responded vs. all) agree
print("–°–∞–º—ã–π –ø–æ–ø—É–ª—è—Ä–Ω—ã–µ —Ä–µ–≥–∏–æ–Ω –∏–∑ –æ—Ç–∫–ª–∏–∫–æ–≤ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Å–∞–º—ã–º –ø–æ–ø—É–ª—è—Ä–Ω—ã–º —Ä–µ–≥–∏–æ–Ω–æ–º", is_equel_most_regions)

–ï—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –±—ã–ª remote –≤ –∏—Å—Ç–æ—Ä–∏–∏, —Ç–æ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Ç–æ–≥–æ, —á—Ç–æ —Ç–∞—Ä–≥–µ—Ç remote 0.4184280730556126
–ï—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –±—ã–ª remote –≤ –æ—Ç–∫–ª–∏–∫–∞—Ö –≤ –∏—Å—Ç–æ—Ä–∏–∏, —Ç–æ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Ç–æ–≥–æ, —á—Ç–æ —Ç–∞—Ä–≥–µ—Ç remote 0.5288314078315101
–ï—Å–ª–∏ —É —á–µ–ª–æ–≤–µ–∫–∞ –Ω–µ –±—ã–ª–æ remote –≤ –∏—Å—Ç–æ—Ä–∏–∏, —Ç–æ –≤–µ—Ä–æ—è—Ç–Ω–æ—Å—Ç—å —Ç–æ–≥–æ, —á—Ç–æ —Ç–∞—Ä–≥–µ—Ç remote 0.07557045882064284
–†–µ–≥–∏–æ–Ω —Ç–∞—Ä–≥–µ—Ç–∞ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å –æ–¥–Ω–∏–º –∏–∑ —Ä–µ–≥–∏–æ–Ω–æ–≤ –≤ –∏—Å—Ç–æ—Ä–∏–∏ 0.9088868747032673
–†–µ–≥–∏–æ–Ω —Ç–∞—Ä–≥–µ—Ç–∞ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Å–∞–º—ã–º –ø–æ–ø—É–ª—è—Ä–Ω—ã–º —Ä–µ–≥–∏–æ–Ω–æ–º –∏–∑ –∏—Å—Ç–æ—Ä–∏–∏ 0.7713539643489145
–†–µ–≥–∏–æ–Ω —Ç–∞—Ä–≥–µ—Ç–∞ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Å–∞–º—ã–º –ø–æ–ø—É–ª—è—Ä–Ω—ã–º —Ä–µ–≥–∏–æ–Ω–æ–º –∏–∑ –∏—Å—Ç–æ—Ä–∏–∏ –æ—Ç–∫–ª–∏–∫–æ–≤ 0.7553457688808007
–°–∞–º—ã–π –ø–æ–ø—É–ª—è—Ä–Ω—ã–µ —Ä–µ–≥–∏–æ–Ω –∏–∑ –æ—Ç–∫–ª–∏–∫–æ–≤ —Å–æ–≤–ø–∞–¥–∞–µ—Ç —Å —Å–∞–º—ã–º –ø–æ–ø—É–ª—è—Ä–Ω—ã–º 

In [25]:
# Overall share of rows whose target vacancy satisfies is_remote.
anal["is_target_remote"].mean()

0.3279597738357288

In [26]:
"""
    1) Remote –Ω–∞–¥–æ –≤—Å–µ–≥–¥–∞ –ø—Ä–µ–¥–ª–∞–≥–∞—Ç—å? +0.006%
    2) Flex –Ω–∞–¥–æ –≤—Å–µ–≥–¥–∞ –ø—Ä–µ–¥–ª–∞–≥–∞—Ç—å? +0.003%
    3) –í—Å–µ —Ä–µ–≥–∏–æ–Ω—ã –Ω–∞ –≤–∞–∫–∞–Ω—Å–∏–∏ –∫–æ—Ç–æ—Ä—ã—Ö –æ—Ç–∫–ª–∏–∫–Ω—É–ª—Å—è –ø–æ fullDay +0.008%
    5) –†–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞—Ç—å –º–æ—Å–∫–≤—É +0.02%
"""

()

In [41]:
import pickle

# Vacancy ids of the i2i dump and their positions; presumably position i
# corresponds to row i of the embedding matrix loaded below -- TODO confirm.
vacancies_names = pd.read_parquet("dumps/production/i2i/tmp_vacancies")["vacancy_id"]
vacancy_to_idx = {vacancy: idx for idx, vacancy in enumerate(vacancies_names)}

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load dumps produced by this project.
# The context manager closes the file handle, which a bare open() call
# passed straight to pickle.load would leave open.
with open("dumps/production/i2i/tmp_vacancies_embeddings", "rb") as _embeddings_file:
    embeddings = pickle.load(_embeddings_file)

# Scale every row to unit L2 norm, then store as float32.
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = np.float32(embeddings)

# Spot-check one vacancy: its row index and the first three components.
vacancy_to_idx["v_1817099"], embeddings[vacancy_to_idx["v_1817099"]][:3]

(2540552, array([-0.0199932 ,  0.05978687, -0.06663361], dtype=float32))

In [58]:
def cos(u, v):
    """Cosine similarity of two vectors: ``u . v / (|u| * |v|)``.

    No guard is applied for zero-length vectors.
    """
    numerator = np.dot(u, v)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    return numerator / denominator

def dot(u, v):
    """Return ``np.dot(u, v)``; thin wrapper with the same call shape as ``cos``."""
    return np.dot(u, v)

# Sanity check: the cosine of a vector with itself (here the mean of rows 0 and 2).
cos(embeddings[np.array([0, 2])].sum(axis=0) / 2, embeddings[np.array([0, 2])].sum(axis=0) / 2)

1.0

In [70]:
def foo(vacancy_id, action_type, items, item_actions, target, n_recent=30):
    """Cosine similarity between a user's history embedding and the target's.

    The user's embedding is the sum of the embeddings of the last ``n_recent``
    entries of the sequence "history sessions followed by the current session".

    Args:
        vacancy_id: vacancy ids of the current session.
        action_type: action codes of the current session. Accepted for
            interface compatibility; not used in the computation.
        items: per-session vacancy ids from the user's history (list or numpy
            array of sequences). Any other value means "no history".
        item_actions: action codes matching ``items``. Accepted for interface
            compatibility; not used in the computation.
        target: vacancy id the user embedding is compared with.
        n_recent: how many entries, counted from the end of the merged
            sequence, contribute to the user embedding. ``None`` uses all.

    Returns:
        ``cos(user_embedding, target_embedding)``.

    Raises:
        KeyError: if ``target`` or one of the selected entries is missing from
            the module-level ``vacancy_to_idx``. Entries outside the window
            are never looked up.

    Relies on the module-level ``vacancy_to_idx``, ``embeddings`` and ``cos``.
    """
    if isinstance(items, (np.ndarray, list)):
        vacancy_id = list(chain.from_iterable(items)) + list(vacancy_id)

    # Walk the sequence from its end and keep only the window that is used,
    # so the index lookup is not done for the whole history.
    recent = list(vacancy_id)[::-1][:n_recent]
    idxs = np.array([vacancy_to_idx[vac] for vac in recent])

    user_embedding = embeddings[idxs].sum(axis=0)
    target_embedding = embeddings[vacancy_to_idx[target]]

    return cos(user_embedding, target_embedding)

def _row_cos(row):
    """Unpack one test row into the positional arguments of ``foo``."""
    return foo(
        row["vacancy_id"],
        row["action_type"],
        row["items"],
        row["item_actions"],
        row["target_vacancy_id"],
    )


_cos_inputs = ["vacancy_id", "action_type", "items", "item_actions", "target_vacancy_id"]

# Score every test row on a copy of the frame and summarise the distribution.
data = test.copy()
data["cos"] = data[_cos_inputs].apply(_row_cos, axis=1)
data["cos"].describe()

count    46338.000000
mean         0.740390
std          0.169083
min         -0.047915
25%          0.637574
50%          0.767928
75%          0.870499
max          1.000000
Name: cos, dtype: float64

In [None]:
def roundrobin(*iterables):
    """Yield items from the iterables in turn until all are exhausted.

    Example: ``roundrobin('ABC', 'D', 'EF')`` yields ``A D E B F C``.
    """
    iterators = map(iter, iterables)
    # Each pass cycles over the iterators that are still alive; the pass ends
    # when one of them runs dry, and the next pass continues with one fewer.
    for num_active in range(len(iterables), 0, -1):
        iterators = cycle(islice(iterators, num_active))
        yield from map(next, iterators)

def get_recos(n, *iterables):
    """Return at most ``n`` items drawn round-robin from ``iterables``.

    Args:
        n: maximum number of items to return (an integer). Zero or a negative
            value yields an empty list.
        *iterables: candidate sources, interleaved with ``roundrobin``.

    Returns:
        A list with the first ``n`` interleaved items, or fewer when the
        sources run out earlier. No more than ``n`` items are consumed.
    """
    # islice stops after exactly n items; comparing a growing list's length
    # with n would never match for n <= 0 and would return everything.
    return list(islice(roundrobin(*iterables), max(n, 0)))