In [1]:
import pandas as pd
import polars as pl
import numpy as np

In [2]:
path = "../data/raw/hh_recsys_train_hh.pq"

train_path = "../data/processed/train.parquet"
test_inference_path = "../data/processed/test_inference.parquet"
val_inference_path = "../data/processed/val_inference.parquet"

In [3]:
df = pl.read_parquet(path).to_pandas()

In [4]:
df.columns

Index(['user_id', 'session_id', 'vacancy_id', 'action_type', 'action_dt'], dtype='object')

In [5]:
df.head(3)

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt
0,u_332060,s_28301374,"[v_2571684, v_488179, v_2389179, v_1393783, v_...","[2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, ...","[2023-11-01T00:40:58.105000000, 2023-11-01T00:..."
1,u_1057881,s_33868982,[v_665861],[2],[2023-11-01T00:23:51.452000000]
2,u_1036784,s_32474802,[v_2594840],[2],[2023-11-01T00:52:34.023000000]


In [6]:
df["user_id"].shape[0] - df["user_id"].drop_duplicates().shape[0]

2580655

In [7]:
df["user_id"].drop_duplicates().shape[0]

882409

In [8]:
def is_session_has_more_then_one_date(action_dt):    
    dates = set(map(lambda x: np.datetime64(x, 'D'), action_dt))
    return len(dates) > 1

In [9]:
df["is_session_has_more_then_one_date"] = df["action_dt"].apply(is_session_has_more_then_one_date)

In [10]:
df[df["is_session_has_more_then_one_date"] == True].head(2)

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt,is_session_has_more_then_one_date
126,u_623716,s_8070830,"[v_945273, v_2467032, v_1627727, v_270293, v_1...","[2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, ...","[2023-11-01T21:50:38.056000000, 2023-11-01T21:...",True
265,u_749629,s_15090787,"[v_938131, v_2063968, v_2063968, v_1520585, v_...","[2, 1, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 1, 2, ...","[2023-11-02T20:59:10.336000000, 2023-11-02T20:...",True


In [11]:
df[df["is_session_has_more_then_one_date"] == True].shape[0] / df.shape[0] * 100, "%", df.shape[0], df[df["is_session_has_more_then_one_date"] == True].shape[0]

(0.4446929077833965, '%', 3463064, 15400)

In [12]:
# Пол процента сессий лежит на пересечении дат. Дропнем их, чтобы разделить на train, val, test

In [13]:
df = df[df["is_session_has_more_then_one_date"] == False]

In [14]:
df["session_day"] = df["action_dt"].apply(
    lambda x: int(str(x[0])[:10][-2:])
)

In [15]:
df["session_day"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [16]:
df_train = df[df["session_day"].isin([1,  2,  3,  4,  5,  6,  7,  8,  9])]
df_test_eval = df[df["session_day"].isin([10, 11, 12, 13, 14])]

In [20]:
def prepare_dataset_for_inference(data):
    data = data.copy()[["user_id", "session_id", "vacancy_id", "action_type", "action_dt"]]

    def get_session_min_start(action_dt):
        return str(action_dt[0])[:19]

    data["session_min_start"] = data["action_dt"].apply(get_session_min_start)
    data = data.sort_values(by="session_min_start", ascending=True)
    data["has_reaction"] = data["action_type"].apply(lambda x: 1 in x)

    sessions = data[["user_id", "session_id", "has_reaction"]]
    sessions["session_id"] = sessions["session_id"] + ":" + sessions["has_reaction"].apply(str)
    sessions = sessions[["user_id", "session_id"]]
    sessions = sessions.groupby("user_id", as_index=False)["session_id"].apply(list)
    sessions = sessions[sessions["session_id"].apply(len) >= 2]

    def sample_2_sessions(session_ids):
        good_target_session_idxs = []
    
        for idx in range(1, len(session_ids)):
            if session_ids[idx].endswith("True"):
                good_target_session_idxs.append(idx)
    
        if not len(good_target_session_idxs):
            return []
    
        good_target_session_num = len(good_target_session_idxs)

        target_session_idx = np.random.randint(0, good_target_session_num)
        target_session_idx = good_target_session_idxs[target_session_idx]

        input_session_idx = target_session_idx - 1
    
        return [
            session_ids[input_session_idx].split(":")[0],
            session_ids[target_session_idx].split(":")[0],
        ]
    
    sessions["data"] = sessions["session_id"].apply(sample_2_sessions)
    sessions = sessions[sessions["data"].apply(len) >= 2]

    sessions = sessions[["user_id", "data"]]
    sessions["session_id"] = sessions["data"].apply(lambda x: x[0])
    sessions["target_session_id"] = sessions["data"].apply(lambda x: x[1])
    sessions = sessions[["user_id", "session_id", "target_session_id"]]

    input_info = data[["user_id", "session_id", "vacancy_id", "action_type", "action_dt"]]

    def get_first_reaction(row):
        actions = map(
            lambda x: (x[1], x[2]),
            sorted(zip(row["action_dt"], row["vacancy_id"], row["action_type"])))
        
        for vacancy_id, action_type in actions:
            if action_type == 1:
                return vacancy_id
        return "unknown"

    label_info = data[["user_id", "session_id", "vacancy_id", "action_type", "action_dt"]]
    label_info["target_vacancy_id"] = label_info[["vacancy_id", "action_type", "action_dt"]].apply(get_first_reaction, axis=1)
    label_info = label_info[label_info["target_vacancy_id"] != "unknown"]
    label_info = label_info[["user_id", "session_id", "target_vacancy_id"]]
    label_info = label_info.rename(columns={"session_id": "target_session_id"})

    print(sessions.shape[0])
    dataset = pd.merge(sessions, input_info, "inner", ["user_id", "session_id"])
    print(dataset.shape[0])
    dataset = pd.merge(dataset, label_info, "inner", ["user_id", "target_session_id"])
    print(dataset.shape[0])

    return dataset

In [21]:
inference_dataset = prepare_dataset_for_inference(df_test_eval)
inference_dataset.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sessions["session_id"] = sessions["session_id"] + ":" + sessions["has_reaction"].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_info["target_vacancy_id"] = label_info[["vacancy_id", "action_type", "action_dt"]].apply(get_first_reaction, axis=1)


92806
92806
92806


Unnamed: 0,user_id,session_id,target_session_id,vacancy_id,action_type,action_dt,target_vacancy_id
0,u_1000006,s_20792345,s_24205016,"[v_528266, v_1145710, v_1145710, v_521801, v_9...","[2, 2, 2, 2, 2, 2]","[2023-11-10T17:57:45.200000000, 2023-11-10T17:...",v_2252717
1,u_1000060,s_19856666,s_6481076,[v_1962314],[2],[2023-11-10T14:21:18.628000000],v_76636


In [22]:
inference_dataset.shape[0]

92806

In [23]:
import hashlib

def split_by_col(df, col, random_state):
    def is_test(value):
        return (int(hashlib.md5(value.encode("utf-8")).hexdigest(), 16) + random_state) % 2

    df["__is_test"] = df[col].apply(is_test)

    test_dataset = df[df["__is_test"] == 1].drop(columns=["__is_test"])
    val_dataset = df[df["__is_test"] == 0].drop(columns=["__is_test"])

    return test_dataset, val_dataset

In [24]:
test_inference_dataset, val_inference_dataset = split_by_col(inference_dataset, "user_id", 137137)
test_inference_dataset.shape[0], val_inference_dataset.shape[0]

(46338, 46468)

In [25]:
df_train.to_parquet(train_path)
test_inference_dataset.to_parquet(test_inference_path)
val_inference_dataset.to_parquet(val_inference_path)

In [26]:
df_train.head(10)

Unnamed: 0,user_id,session_id,vacancy_id,action_type,action_dt,is_session_has_more_then_one_date,session_day
0,u_332060,s_28301374,"[v_2571684, v_488179, v_2389179, v_1393783, v_...","[2, 2, 2, 2, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2, ...","[2023-11-01T00:40:58.105000000, 2023-11-01T00:...",False,1
1,u_1057881,s_33868982,[v_665861],[2],[2023-11-01T00:23:51.452000000],False,1
2,u_1036784,s_32474802,[v_2594840],[2],[2023-11-01T00:52:34.023000000],False,1
3,u_786220,s_14060785,"[v_1473781, v_1622905, v_1621959, v_2289180, v...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 3, 2, ...","[2023-11-01T00:58:20.793000000, 2023-11-01T01:...",False,1
4,u_639152,s_23205986,"[v_695738, v_22433, v_1590524, v_502496, v_200...","[2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, ...","[2023-11-01T01:14:20.828000000, 2023-11-01T00:...",False,1
5,u_456512,s_6053713,"[v_2267837, v_1724756, v_990702]","[2, 2, 2]","[2023-11-01T01:37:09.157000000, 2023-11-01T01:...",False,1
6,u_584261,s_11620144,"[v_902380, v_1488750, v_574794, v_212567, v_84...","[2, 2, 2, 2, 2]","[2023-11-01T01:09:49.508000000, 2023-11-01T01:...",False,1
7,u_733524,s_13468241,"[v_1460424, v_996518, v_505107, v_2638088, v_2...","[2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, ...","[2023-11-01T02:12:43.970000000, 2023-11-01T02:...",False,1
8,u_1114237,s_23351853,"[v_695101, v_154777]","[2, 2]","[2023-11-01T04:18:48.790000000, 2023-11-01T04:...",False,1
9,u_407319,s_23566251,"[v_268677, v_692096, v_1928237, v_2578812, v_1...","[2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2]","[2023-11-01T04:56:41.605000000, 2023-11-01T04:...",False,1
