In [None]:
import os
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow.dataset as ds
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Folder holding events.pq, clickstream.pq and test_users.pq; submit.csv is written here too.
# Set the AVITO_DATA_DIR environment variable to run the notebook on another machine;
# the original local path is kept as the default so existing runs behave the same.
DATA_DIR = Path(os.environ.get("AVITO_DATA_DIR", r"C:\Users\Denis\Desktop\avito"))

# Загрузка events и фильтрация clickstream по contact

In [None]:
# Event dictionary: keep the ids of the events flagged as contacts.
events = pd.read_parquet(DATA_DIR / "events.pq")
contact_event_ids = events.loc[events["is_contact"] == 1, "event"].unique()

# Let pyarrow apply the filter while reading, so only contact rows and the
# needed columns are materialised.
clicks_ds = ds.dataset(str(DATA_DIR / "clickstream.pq"), format="parquet")
contact_filter = ds.field("event").isin(contact_event_ids)
click_columns = ["cookie", "node", "item", "event_date", "platform"]
clicks_table = clicks_ds.to_table(columns=click_columns, filter=contact_filter)
clicks = clicks_table.to_pandas()

# Делим на train / eval по времени: последние 14 дней — eval

In [None]:
# The last 14 days of the log form the evaluation window; everything up to and
# including the threshold is training history.
latest_event = clicks["event_date"].max()
threshold = latest_event - timedelta(days=14)
train_clicks = clicks.loc[clicks["event_date"].le(threshold)]
eval_clicks = clicks.loc[clicks["event_date"].gt(threshold)]

# Кандидаты: топ-1000 популярных node, которых пользователь ещё не видел в train

In [None]:
# Candidate generation: every cookie active in the eval window is paired with each
# of the 1000 most frequent training nodes it has NOT already contacted in training.
top_nodes = train_clicks["node"].value_counts().head(1000).index
popular_nodes = top_nodes.tolist()

# Per-user set of nodes contacted in the training window. The candidate table below
# no longer needs it, but the name is kept for any other cell that relies on it.
user_history = train_clicks.groupby("cookie")["node"].apply(set).to_dict()

# Vectorised replacement for the per-cookie / per-node Python loop:
# cross join (cookie x popular node), then anti-join against the pairs seen in training.
# Row order is unchanged: cookies in order of first appearance, nodes by popularity.
# Unlike a list of dicts, the result always has "cookie"/"node" columns, even when empty.
eval_cookies = pd.DataFrame({"cookie": eval_clicks["cookie"].unique()})
popular_frame = pd.DataFrame({"node": top_nodes})
seen_pairs = train_clicks[["cookie", "node"]].drop_duplicates()

all_pairs = eval_cookies.merge(popular_frame, how="cross")
flagged_pairs = all_pairs.merge(seen_pairs, on=["cookie", "node"], how="left", indicator=True)
is_unseen = flagged_pairs["_merge"] == "left_only"
df_candidates = flagged_pairs.loc[is_unseen, ["cookie", "node"]].reset_index(drop=True)

# Целевая переменная (target)

In [None]:
# Ground truth: each distinct (cookie, node) contact in the eval window is a positive.
true_contacts = (
    eval_clicks.loc[:, ["cookie", "node"]]
    .drop_duplicates()
    .assign(target=1)
)

# Label the candidates: pairs that match a real contact get 1, all others 0.
df_data = pd.merge(df_candidates, true_contacts, how="left", on=["cookie", "node"])
df_data = df_data.assign(target=df_data["target"].fillna(0).astype("int"))

# Фичи: user_node_count, node_popularity, user_unique_nodes, platform

In [None]:
# Feature engineering. Every feature is computed from the training window only and
# attached to the candidate table with left merges, so no candidate row is lost.

# Make the cell safe to re-run: drop features left over from a previous execution,
# otherwise the merges below would create *_x / *_y duplicate columns.
feature_columns = ["user_node_count", "node_popularity", "user_unique_nodes", "platform"]
df_data = df_data.drop(columns=feature_columns, errors="ignore")

# user_node_count: how many times this cookie contacted this node in training.
# NOTE(review): candidates exclude nodes already present in the user's training
# history, so this count is 0 for every candidate row and carries no signal as is.
interaction_counts = train_clicks.groupby(["cookie", "node"]).size().reset_index(name="user_node_count")
df_data = df_data.merge(interaction_counts, on=["cookie", "node"], how="left")
df_data["user_node_count"] = df_data["user_node_count"].fillna(0)

# node_popularity: total number of training contacts on the node.
node_pop = train_clicks["node"].value_counts().reset_index()
node_pop.columns = ["node", "node_popularity"]
df_data = df_data.merge(node_pop, on="node", how="left")

# user_unique_nodes: number of distinct nodes the cookie contacted in training.
# Cookies with no training history get 0, consistent with user_node_count.
user_node_diversity = train_clicks.groupby("cookie")["node"].nunique().reset_index()
user_node_diversity.columns = ["cookie", "user_unique_nodes"]
df_data = df_data.merge(user_node_diversity, on="cookie", how="left")
df_data["user_unique_nodes"] = df_data["user_unique_nodes"].fillna(0)

# platform: the cookie's most recent non-null platform in training.
# CatBoost accepts only integers or strings as categorical values, so the column is
# cast to str before the merge and cookies without history get an explicit "unknown"
# level instead of NaN, which would otherwise make Pool(...) fail.
platforms = train_clicks.sort_values("event_date").groupby("cookie")["platform"].last().reset_index()
platforms["platform"] = platforms["platform"].astype(str)
df_data = df_data.merge(platforms, on="cookie", how="left")
df_data["platform"] = df_data["platform"].fillna("unknown")

# Готовим данные для CatBoost

In [None]:
# Design matrix and labels taken from the candidate table.
features = ["user_node_count", "node_popularity", "user_unique_nodes", "platform"]
X = df_data.loc[:, features]
y = df_data.loc[:, "target"]

# Random 80/20 split over candidate rows, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# "platform" is the only column handed to CatBoost as categorical.
cat_features = ["platform"]
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

# Progress is logged every 50 iterations; Recall is tracked on the validation pool.
model = CatBoostClassifier(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    eval_metric="Recall",
    verbose=50,
)
model.fit(train_pool, eval_set=val_pool)

# Предсказания и Recall@40

In [None]:
# Score every candidate row with the probability of the positive class.
# NOTE(review): X also contains the rows the model was fitted on, so a metric
# computed from these scores is optimistic.
class_proba = model.predict_proba(X)
df_data["pred_proba"] = class_proba[:, 1]

# Rank all rows by score, then keep the first 40 rows of each cookie.
ranked_rows = df_data.sort_values(by="pred_proba", ascending=False)
topk = ranked_rows.groupby("cookie").head(40).loc[:, ["cookie", "node"]]

def recall_at_k(df_eval, df_pred, k=40):
    """Mean per-user Recall@k.

    Parameters
    ----------
    df_eval : DataFrame with "cookie" and "node" columns holding the ground-truth pairs.
    df_pred : DataFrame with "cookie" and "node" columns holding the predictions;
        rows of each cookie must already be ordered from best to worst, because
        only the first ``k`` rows per cookie are used.
    k : number of top predictions per cookie to evaluate.

    Returns
    -------
    The average, over every cookie present in ``df_eval``, of
    ``|top-k predictions ∩ true nodes| / |true nodes|``.
    A cookie that has ground truth but no predictions contributes 0 instead of being
    dropped from the average, which would inflate the score.
    Returns 0.0 when ``df_eval`` has no cookies.
    """
    true = df_eval.groupby("cookie")["node"].apply(set)
    pred = df_pred.groupby("cookie")["node"].apply(list).to_dict()

    recalls = []
    for cookie, y_true in true.items():
        # A missing cookie yields an empty prediction list, i.e. recall 0 for that user.
        y_pred = pred.get(cookie, [])[:k]
        recalls.append(len(y_true.intersection(y_pred)) / len(y_true))

    if not recalls:
        # np.mean([]) would emit a RuntimeWarning and return NaN.
        return 0.0
    return np.mean(recalls)

# Compare each cookie's top-40 predictions with its eval-window contacts.
recall = recall_at_k(df_eval=true_contacts, df_pred=topk, k=40)
print(f"\n Локальный Recall@40: {recall:.5f}")

# Формирование submit.csv

In [None]:
# Load the cookies that must be present in the submission.
test_users = pd.read_parquet(DATA_DIR / "test_users.pq")
test_cookies = test_users["cookie"].drop_duplicates()

# Model-ranked predictions: the 40 best-scored candidate rows for each test cookie.
ranked_topk = (
    df_data[df_data["cookie"].isin(test_cookies)]
    .sort_values("pred_proba", ascending=False)
    .groupby("cookie")
    .head(40)[["cookie", "node"]]
)

# Fallback: df_data only covers cookies seen in the eval window, so any other test
# cookie would silently get no rows at all. Give those the 40 most popular training
# nodes instead (their own history is not excluded here).
cold_cookies = test_cookies[~test_cookies.isin(ranked_topk["cookie"])]
if len(cold_cookies) > 0:
    fallback_topk = pd.DataFrame({"cookie": cold_cookies.to_numpy()}).merge(
        pd.DataFrame({"node": popular_nodes[:40]}), how="cross"
    )
    submit_topk = pd.concat([ranked_topk, fallback_topk], ignore_index=True)
else:
    submit_topk = ranked_topk

# Write the submission file.
submit_topk.to_csv(DATA_DIR / "submit.csv", index=False)