# 03. Моделирование для VK EdTech ML Challenge

В этом ноутбуке:
* заново строим признаки кампаний `features_df` по рабочему коду (1008 строк);
* обучаем RandomForest-классификатор по 3 классам;
* обучаем stacking-регрессии по каждому таргету.

# ЭТАП 1. Импорт и загрузка исходных данных

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
from google.colab import drive
drive.mount("/content/drive")

import os
import random
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

SEED = 42
np.random.seed(SEED)
random.seed(SEED)

FOLDER = "/content/drive/MyDrive/VK_Project_v2"

users = pd.read_csv(os.path.join(FOLDER, "users.tsv"), sep="\t")
history = pd.read_csv(os.path.join(FOLDER, "history.tsv"), sep="\t")
validate = pd.read_csv(os.path.join(FOLDER, "validate.tsv"), sep="\t")
validate_answers = pd.read_csv(os.path.join(FOLDER, "validate_answers.tsv"), sep="\t")

print("users.shape:", users.shape)
print("history.shape:", history.shape)
print("validate.shape:", validate.shape)
print("validate_answers.shape:", validate_answers.shape)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
users.shape: (27769, 4)
history.shape: (1147857, 4)
validate.shape: (1008, 6)
validate_answers.shape: (1008, 3)


# ЭТАП 2. Кластеризация кампаний по демографии аудитории

In [6]:
def get_campaign_clusters(validate_df, users_df, n_clusters=5):
    features = []
    for _, row in validate_df.iterrows():
        user_ids = eval(row["user_ids"]) if isinstance(row["user_ids"], str) else row["user_ids"]
        user_df = users_df[users_df["user_id"].isin(user_ids)]
        sex_mean = user_df["sex"].mean()
        age_mean = user_df["age"].mean()
        features.append([sex_mean, age_mean])

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)

    kmeans = KMeans(n_clusters=n_clusters, random_state=SEED)
    clusters = kmeans.fit_predict(X_scaled)
    return clusters

campaign_clusters = get_campaign_clusters(validate, users, n_clusters=5)
print("Количество кластеров:", len(np.unique(campaign_clusters)))
print("Пример кластеров первых 10 кампаний:", campaign_clusters[:10])

Количество кластеров: 5
Пример кластеров первых 10 кампаний: [3 3 3 3 1 3 1 3 1 3]


# ЭТАП 3. Rolling-статистики по истории показов (окно 24 часа)

In [7]:
def rolling_stats(df, win):
    res = []
    for h in df["hour"].unique():
        win_df = df[(df["hour"] >= h - win) & (df["hour"] <= h + win)]
        res.append(
            {
                "hour": h,
                "roll_cpm_mean": win_df["cpm"].mean(),
                "roll_cpm_median": win_df["cpm"].median(),
                "roll_cpm_std": win_df["cpm"].std(),
            }
        )
    return pd.DataFrame(res).set_index("hour")

roll_features = rolling_stats(history, win=24)
print("roll_features.shape:", roll_features.shape)
display(roll_features.head())

roll_features.shape: (1488, 3)


Unnamed: 0_level_0,roll_cpm_mean,roll_cpm_median,roll_cpm_std
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,177.381738,102.0,383.516447
8,177.020917,102.125,382.822419
7,177.014755,102.0,384.489807
18,175.375959,100.625,357.635086
5,176.039587,101.33,387.721526


# ЭТАП 4. Формирование признаков кампаний (features_df, 1008 строк)

In [8]:
import numpy as np

def campaign_features_final(campaign, users_df, history_df, roll_df, cluster):
    user_ids = eval(campaign["user_ids"]) if isinstance(campaign["user_ids"], str) else campaign["user_ids"]
    user_df = users_df[users_df["user_id"].isin(user_ids)]

    h_window = history_df[
        (history_df["hour"] >= campaign["hour_start"])
        & (history_df["hour"] <= campaign["hour_end"])
        & (history_df["user_id"].isin(user_ids))
    ]

    h_inds = list(range(campaign["hour_start"], campaign["hour_end"] + 1))
    try:
        roll_cpm_mean = roll_df.loc[h_inds, "roll_cpm_mean"].median()
        roll_cpm_std = roll_df.loc[h_inds, "roll_cpm_std"].median()
        roll_cpm_median = roll_df.loc[h_inds, "roll_cpm_median"].median()
    except Exception:
        roll_cpm_mean = history_df["cpm"].mean()
        roll_cpm_median = history_df["cpm"].median()
        roll_cpm_std = history_df["cpm"].std()

    sex_mean = user_df["sex"].mean()
    age_mean = user_df["age"].mean()
    city_nunique = user_df["city_id"].nunique()

    in_history_frac = user_df["user_id"].isin(h_window["user_id"]).mean()
    n_shows_mean = h_window.groupby("user_id").size().reindex(user_ids, fill_value=0).mean()
    n_unique_pub_mean = (
        h_window.groupby("user_id")["publisher"].nunique().reindex(user_ids, fill_value=0).mean()
    )
    mean_cpm_mean = (
        h_window.groupby("user_id")["cpm"].mean().reindex(user_ids, fill_value=campaign["cpm"]).mean()
    )
    median_cpm_mean = (
        h_window.groupby("user_id")["cpm"].median().reindex(user_ids, fill_value=campaign["cpm"]).mean()
    )

    quantile_cpm_10 = np.percentile(h_window["cpm"], 10) if len(h_window) else campaign["cpm"]
    quantile_cpm_90 = np.percentile(h_window["cpm"], 90) if len(h_window) else campaign["cpm"]

    audience_size = campaign["audience_size"]
    window_hours = campaign["hour_end"] - campaign["hour_start"] + 1
    cpm = campaign["cpm"]
    n_publishers = (
        len(campaign["publishers"].split(","))
        if isinstance(campaign["publishers"], str)
        else len(campaign["publishers"])
    )

    feats = dict(
        campaign_cluster=cluster,
        sex_mean=sex_mean,
        age_mean=age_mean,
        city_nunique=city_nunique,
        in_history_frac=in_history_frac,
        n_shows_mean=n_shows_mean,
        n_unique_pub_mean=n_unique_pub_mean,
        mean_cpm_mean=mean_cpm_mean,
        median_cpm_mean=median_cpm_mean,
        quantile_cpm_10=quantile_cpm_10,
        quantile_cpm_90=quantile_cpm_90,
        roll_cpm_mean=roll_cpm_mean,
        roll_cpm_median=roll_cpm_median,
        roll_cpm_std=roll_cpm_std,
        audience_size=audience_size,
        window_hours=window_hours,
        cpm=cpm,
        n_publishers=n_publishers,
    )
    return feats


features = []
for i, row in enumerate(validate.iterrows()):
    _, campaign = row
    cluster = campaign_clusters[i]
    if (i + 1) % 100 == 0 or i == 0:
        print(f"Кампания {i+1}/{len(validate)}")
    feats = campaign_features_final(campaign, users, history, roll_features, cluster)
    features.append(feats)

features_df = pd.DataFrame(features)
print("Формирование признаков завершено. Размерность features_df:", features_df.shape)
display(features_df.head())

Кампания 1/1008
Кампания 100/1008
Кампания 200/1008
Кампания 300/1008
Кампания 400/1008
Кампания 500/1008
Кампания 600/1008
Кампания 700/1008
Кампания 800/1008
Кампания 900/1008
Кампания 1000/1008
Формирование признаков завершено. Размерность features_df: (1008, 18)


Unnamed: 0,campaign_cluster,sex_mean,age_mean,city_nunique,in_history_frac,n_shows_mean,n_unique_pub_mean,mean_cpm_mean,median_cpm_mean,quantile_cpm_10,quantile_cpm_90,roll_cpm_mean,roll_cpm_median,roll_cpm_std,audience_size,window_hours,cpm,n_publishers
0,3,1.462225,28.721406,7,0.524134,2.743442,0.588667,258.807057,242.285923,32.0,450.088,200.270121,116.935,366.52211,1906,96,220.0,2
1,3,1.467391,28.611594,372,0.177536,0.331884,0.180435,297.761332,296.874891,34.94,445.8,201.463053,125.02,336.921666,1380,7,312.0,2
2,3,1.467342,29.05518,280,0.275901,0.685811,0.291667,110.309948,107.825845,30.832,370.592,204.441962,128.21,357.198092,888,21,70.0,6
3,3,1.454545,27.763636,7,0.484091,1.956818,0.522727,243.399133,236.889795,30.0,391.5,192.687558,120.0,305.660511,440,83,240.0,2
4,1,1.386856,44.76084,416,0.615854,5.747967,0.765583,298.016818,279.796704,35.0,452.262,181.858295,105.0,320.003712,1476,239,262.0,4


# ЭТАП 5. Сохранение features_df для повторного использования

In [9]:
features_path = os.path.join(FOLDER, "features_df_baseline.parquet")
features_df.to_parquet(features_path)
print("features_df_baseline сохранён в", features_path)

features_df_baseline сохранён в /content/drive/MyDrive/VK_Project_v2/features_df_baseline.parquet


# ЭТАП 6. Подготовка X, y и классов

In [10]:
target_cols = ["at_least_one", "at_least_two", "at_least_three"]

X = features_df.reset_index(drop=True)
y = validate_answers[target_cols].reset_index(drop=True)

print("X.shape:", X.shape)
print("y.shape:", y.shape)

def make_classes(df_targets: pd.DataFrame) -> np.ndarray:
    cond0 = (df_targets < 0.01).all(axis=1)
    cond1 = (df_targets < 0.05).all(axis=1)
    classes = np.full(len(df_targets), 2, dtype=int)
    classes[cond1] = 1
    classes[cond0] = 0
    return classes

y_class = make_classes(y)

unique, counts = np.unique(y_class, return_counts=True)
print("Распределение классов:", dict(zip(unique, counts)))

X.shape: (1008, 18)
y.shape: (1008, 3)
Распределение классов: {np.int64(0): np.int64(172), np.int64(1): np.int64(308), np.int64(2): np.int64(528)}


# ЭТАП 7. RandomForest-классификатор по 3 классам

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X_train, X_valid, y_class_train, y_class_valid = train_test_split(
    X, y_class, test_size=0.2, random_state=SEED, stratify=y_class
)

param_dist = {
    "n_estimators": [200, 300, 500],
    "max_depth": [None, 8, 12, 16],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", 0.5],
}

rf_base = RandomForestClassifier(random_state=SEED, n_jobs=-1)

search = RandomizedSearchCV(
    rf_base,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring="f1_macro",
    random_state=SEED,
    n_jobs=-1,
)

search.fit(X, y_class)

print("Лучшие параметры RF:", search.best_params_)
rf_clf = search.best_estimator_

y_class_pred = rf_clf.predict(X_valid)
print("Классификатор RandomForest, отчёт по валидации:")
print(classification_report(y_class_valid, y_class_pred))

Классификатор RandomForest, отчёт по валидации:
              precision    recall  f1-score   support

           0       0.79      0.68      0.73        34
           1       0.57      0.50      0.53        62
           2       0.79      0.89      0.84       106

    accuracy                           0.73       202
   macro avg       0.72      0.69      0.70       202
weighted avg       0.72      0.73      0.73       202



# ЭТАП 8. Stacking-регрессии + калибровка для каждого таргета

In [12]:
from sklearn.ensemble import HistGradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.isotonic import IsotonicRegression

EPS = 0.005

base_estimators = [
    ("hgb", HistGradientBoostingRegressor(random_state=SEED)),
    ("ridge", RidgeCV(alphas=(0.1, 1.0, 10.0))),
]

meta_estimator = RidgeCV(alphas=(0.1, 1.0, 10.0))

def make_stacking_regressor():
    return StackingRegressor(
        estimators=base_estimators,
        final_estimator=meta_estimator,
        n_jobs=-1,
        passthrough=True,
    )

stack_models = {}
calibrators = {}

for col in target_cols:
    print(f"\n=== Таргет: {col} ===")
    y_col = y[col].values
    y_log = np.log(y_col + EPS)

    X_tr, X_val, y_tr, y_val = train_test_split(
        X, y_log, test_size=0.2, random_state=SEED
    )

    stack = make_stacking_regressor()
    stack.fit(X_tr, y_tr)

    y_val_pred_log = stack.predict(X_val)
    y_val_pred = np.exp(y_val_pred_log) - EPS
    y_val_pred = np.clip(y_val_pred, 0, None)

    calib = IsotonicRegression(out_of_bounds="clip")
    calib.fit(y_val_pred, np.exp(y_val) - EPS)

    stack_models[col] = stack
    calibrators[col] = calib

print("\nОбучение стеков и калибровщиков завершено.")


=== Таргет: at_least_one ===

=== Таргет: at_least_two ===

=== Таргет: at_least_three ===

Обучение стеков и калибровщиков завершено.


# ЭТАП 9. Функция предсказания для одной кампании

In [13]:
CLASS0_TEMPLATE = np.array([0.0, 0.0, 0.0])
CLASS1_TEMPLATE = np.array([0.02, 0.01, 0.005])

def predict_for_features_row(x_row: pd.Series) -> np.ndarray:
    x_row_2d = x_row.values.reshape(1, -1)

    class_pred = rf_clf.predict(x_row_2d)[0]

    if class_pred == 0:
        return CLASS0_TEMPLATE.copy()
    elif class_pred == 1:
        return CLASS1_TEMPLATE.copy()
    else:
        preds = []
        for col in target_cols:
            stack = stack_models[col]
            calib = calibrators[col]

            y_log_pred = stack.predict(x_row_2d)[0]
            y_pred = np.exp(y_log_pred) - EPS
            y_pred = max(y_pred, 0.0)

            y_pred_calib = calib.transform([y_pred])[0]
            y_pred_calib = max(y_pred_calib, 0.0)

            preds.append(y_pred_calib)

        return np.array(preds)

# Быстрая проверка
print("Пример предсказания для кампании 0:", predict_for_features_row(X.iloc[0]))

Пример предсказания для кампании 0: [0.02  0.01  0.005]


# ЭТАП 10. Применяем модель ко всем кампаниям и сохраняем предсказания

In [18]:
preds_matrix = np.zeros((len(X), 3))

for i in range(len(X)):
    if (i + 1) % 100 == 0 or i == 0:
        print(f"Кампания {i+1}/{len(X)}")
    preds_matrix[i, :] = predict_for_features_row(X.iloc[i])

preds_df = pd.DataFrame(preds_matrix, columns=target_cols)
print("preds_df.shape:", preds_df.shape)
display(preds_df.head())

preds_path = os.path.join(FOLDER, "preds_validate.parquet")
preds_df.to_parquet(preds_path)
print("preds_df сохранён в", preds_path)

Кампания 1/1008
Кампания 100/1008
Кампания 200/1008
Кампания 300/1008
Кампания 400/1008
Кампания 500/1008
Кампания 600/1008
Кампания 700/1008
Кампания 800/1008
Кампания 900/1008
Кампания 1000/1008
preds_df.shape: (1008, 3)


Unnamed: 0,at_least_one,at_least_two,at_least_three
0,0.02,0.01,0.005
1,0.02235,0.000286,1.734723e-18
2,0.104817,0.018036,0.0001
3,0.210367,0.106725,0.05915806
4,0.3593,0.267008,0.1819818


preds_df сохранён в /content/drive/MyDrive/VK_Project_v2/preds_validate.parquet


# Внутренняя оценка качества по всей выборке (приблизительная)

In [20]:
from sklearn.metrics import mean_squared_log_error

def smlogaccratio(y_true, y_pred):
    eps = 1e-9
    rmsle = np.sqrt(mean_squared_log_error(y_true + eps, y_pred + eps))
    return (1 - rmsle) * 100

# y — это validate_answers[target_cols], X — features_df
y_true_full = y.values           # (1008, 3)
y_pred_full = preds_df.values    # (1008, 3)

internal_score = smlogaccratio(y_true_full, y_pred_full)
print("Internal SMLogAccRatio (approx):", internal_score)

Internal SMLogAccRatio (approx): 95.32988660935068
