In [1]:
import os
from pathlib import Path
import sys
import polars as pl
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import bootstrap
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [2]:
df_train = pl.read_parquet("C:\\Users\\User\\Downloads\\train_features.parquet")
df_test = pl.read_parquet("C:\\Users\\User\\Downloads\\test_features.parquet")


In [17]:
feature_columns = [f"feature_{i}" for i in range(1, 12)]

## Заполнитель пропусков

In [4]:
class MeanPLImputer:
    def __init__(self):
        self.feature_means = {}
        
    def fit(self, x: pl.DataFrame): 
        self.feature_means = x.mean().to_dicts()[0]
        return self
    
    def transform(self, x: pl.DataFrame) -> pl.DataFrame:
        return (
            x
            .with_columns(*[
                pl.col(col).fill_null(val)
                for col, val in self.feature_means.items()
            ])
        )
    
    
imputer = MeanPLImputer().fit(df_train.select(*feature_columns))


df_test = imputer.transform(df_test)
df_train = imputer.transform(df_train)

## Разделим на трейн и вал

In [5]:
df_train_pd = df_train.to_pandas()
df_test_pd = df_test.to_pandas()

train_query_ids, val_query_ids = train_test_split(
    np.arange(df_train.shape[0]),
    random_state=42, 
    test_size=0.2
)


# Отбор тренировочных и валидационных данных
train_df = df_train_pd[df_train_pd["query_id"].isin(train_query_ids)]
val_df = df_train_pd[df_train_pd["query_id"].isin(val_query_ids)]
# Преобразование DataFrame из Pandas в Polars
train_df = pl.from_pandas(train_df)
val_df = pl.from_pandas(val_df)

In [6]:
train_df_pd=train_df.to_pandas()
val_df_pd=val_df.to_pandas()
train_df_pd['target'] = train_df_pd['target'].astype(int)
val_df_pd['target'] = val_df_pd['target'].astype(int)
df_test_pd['target'] = df_test_pd['target'].astype(int)

# LambdaMart модель в реализации от LightGBM

### Подготовим данные для обучения модели Lgbm

In [12]:
qids_train = train_df_pd.groupby("query_id")["query_id"].count().to_numpy()
X_train = train_df_pd.drop(["query_id", "report_date", "target", "rn"], axis=1)
y_train = train_df_pd["target"]
qids_validation = val_df_pd.groupby("query_id")["query_id"].count().to_numpy()
X_validation = val_df_pd.drop(["query_id", "report_date", "target", "rn"], axis=1)
y_validation = val_df_pd["target"]

In [13]:
a=[int(i) for i in range(int(max(y_train.max(), y_validation.max())) + 1)]

### Обучение модели

In [14]:
import time
import lightgbm as lgb

start_time = time.time()

model = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg", label_gain=a
)

model.fit(
    X=X_train,
    y=y_train,
    group=qids_train,
    eval_set=[(X_validation, y_validation)],
    eval_group=[qids_validation]
)

end_time = time.time()
training_time = end_time - start_time
print("Training time:", training_time, "seconds")

model.predict(X_train)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.203238 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 18147234, number of used features: 11
Training time: 83.54031729698181 seconds


array([ 1.96051147, -0.36472838,  0.12885933, ..., -1.48282238,
       -0.92608394, -0.1198773 ])

In [16]:
y_pred = model.predict(X_validation)


In [20]:
val_df_pd['rank'] = (val_df_pd.assign(pred=y_pred).groupby('query_id')['pred']
                            .rank(method='dense', ascending=False).astype(int))


In [18]:

# Эта функция считает Hitrate
from scipy.stats import bootstrap


def ap_at_k(relevances, k=10):
    total_relevant = sum(relevances[:k])
    
    if total_relevant == 0:
        return 0
    
    ap_ = 0
    for k_ in range(1, k+1):
        ap_ += sum(relevances[:k_]) * relevances[k_ - 1] / k_
        
    return ap_ / total_relevant

def calc_metrics(df, rn_col):
    return (
        df
        .sort_values(rn_col)
        .groupby("query_id")
        .agg(
            hit_at_1=("target", lambda x: x[:1].sum() > 0),
            hit_at_5=("target", lambda x: x[:5].sum() > 0),
            hit_at_10=("target", lambda x: x[:10].sum() > 0)
        )
    )


## Вычисляем средние метрики получившейся модели

In [22]:
metrics = calc_metrics(val_df_pd[['query_id', 'rank', 'target']], 'rank')
mean_metrics = metrics.mean()
print(mean_metrics)

hit_at_1     0.149047
hit_at_5     0.400858
hit_at_10    0.551560
dtype: float64


## Подбор гиперпараметров Optuna

In [27]:
import optuna
import lightgbm as lgb

# Определение функции для оптимизации гиперпараметров
def objective(trial):
    # Определение пространства поиска гиперпараметров
    param = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 30),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }
    
    # Создание модели LightGBM
    model = lgb.LGBMRanker(**param)
    
    # Обучение модели
    model.fit(X_train, y_train, group=qids_train)
    
    # Предсказание на валидационном наборе
    y_pred = model.predict(X_validation)
    
    # Ранжирование предсказаний
    val_df_pd['rank'] = (val_df_pd.assign(pred=y_pred)
                             .groupby('query_id')['pred']
                             .rank(method='dense', ascending=False)
                             .astype(int))
    
    # Вычисление метрики качества hitrate@1
    metrics = calc_metrics(val_df_pd[['query_id', 'rank', 'target']], 'rank')
    hitrate_at_1 = metrics['hit_at_1'].mean()
    
    # Возвращаем метрику hitrate@1 для оптимизации
    return hitrate_at_1


# Создание экземпляра Study для оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Вывод наилучших гиперпараметров и оценки
print("Best hyperparameters found:")
print(study.best_params)
print("Best score found (hitrate@1):")
print(study.best_value)


[I 2024-05-07 22:25:28,397] A new study created in memory with name: no-name-69f3f38f-9138-4e0a-9fdd-8a6a9fe494bb


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.203382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 18147234, number of used features: 11


[I 2024-05-07 22:28:18,635] Trial 0 finished with value: 0.14946117244362528 and parameters: {'learning_rate': 0.06875170926881327, 'n_estimators': 236, 'max_depth': 4, 'min_child_samples': 18, 'subsample': 0.8630416602886275, 'colsample_bytree': 0.9755814739522546, 'reg_alpha': 0.45545064469608687, 'reg_lambda': 0.6994553032621015}. Best is trial 0 with value: 0.14946117244362528.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.208139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 18147234, number of used features: 11


[I 2024-05-07 22:30:10,150] Trial 1 finished with value: 0.14885934061800607 and parameters: {'learning_rate': 0.08131652825331241, 'n_estimators': 127, 'max_depth': 6, 'min_child_samples': 14, 'subsample': 0.8774124567490154, 'colsample_bytree': 0.9970996043346079, 'reg_alpha': 0.7155440496045693, 'reg_lambda': 0.11224494248446559}. Best is trial 0 with value: 0.14946117244362528.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.197221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 18147234, number of used features: 11


[I 2024-05-07 22:33:55,491] Trial 2 finished with value: 0.1523386808598672 and parameters: {'learning_rate': 0.07869101549616986, 'n_estimators': 291, 'max_depth': 7, 'min_child_samples': 28, 'subsample': 0.941158485417517, 'colsample_bytree': 0.907007175934875, 'reg_alpha': 0.7032758879741805, 'reg_lambda': 0.9472434341970295}. Best is trial 2 with value: 0.1523386808598672.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.144736 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 18147234, number of used features: 11


[I 2024-05-07 22:36:21,395] Trial 3 finished with value: 0.14417633672490643 and parameters: {'learning_rate': 0.022145564070304075, 'n_estimators': 203, 'max_depth': 4, 'min_child_samples': 25, 'subsample': 0.7185560705175433, 'colsample_bytree': 0.7120844785242483, 'reg_alpha': 0.6956489364970787, 'reg_lambda': 0.9996230289137155}. Best is trial 2 with value: 0.1523386808598672.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 18147234, number of used features: 11


[I 2024-05-07 22:38:09,608] Trial 4 finished with value: 0.14411991499125462 and parameters: {'learning_rate': 0.020452950017027665, 'n_estimators': 140, 'max_depth': 4, 'min_child_samples': 12, 'subsample': 0.9958644905849818, 'colsample_bytree': 0.7333147264330144, 'reg_alpha': 0.9974141619195724, 'reg_lambda': 0.217466201544519}. Best is trial 2 with value: 0.1523386808598672.


Best hyperparameters found:
{'learning_rate': 0.07869101549616986, 'n_estimators': 291, 'max_depth': 7, 'min_child_samples': 28, 'subsample': 0.941158485417517, 'colsample_bytree': 0.907007175934875, 'reg_alpha': 0.7032758879741805, 'reg_lambda': 0.9472434341970295}
Best score found (hitrate@1):
0.1523386808598672


## Вычислим метрики после кросс-валидации

In [30]:
from sklearn.model_selection import StratifiedKFold, GroupKFold
import numpy as np

def get_metrics_cv(df, n_splits=5, **model_params):
    skf = GroupKFold(n_splits=n_splits)
    metrics = []

    for train_index, val_index in skf.split(df.drop(['target'],axis=1), y=df['target'], groups=df["query_id"]):
        train_data = df.iloc[train_index]
        val_data = df.iloc[val_index]

        qids_train = train_data.groupby("query_id")["query_id"].count().to_numpy()
        X_train = train_data.drop(["query_id", "report_date", "target", "rn"], axis=1)
        y_train = train_data["target"]

        qids_validation = val_data.groupby("query_id")["query_id"].count().to_numpy()
        X_validation = val_data.drop(["query_id", "report_date", "target", "rn"], axis=1)
        y_validation = val_data["target"]

        model = lgb.LGBMRanker(objective="lambdarank", metric="ndcg", **model_params)
        model.fit(X=X_train, y=y_train, group=qids_train, eval_set=[(X_validation, y_validation)], eval_group=[qids_validation])

        y_pred = model.predict(X_validation)
        val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))

        fold_metrics = calc_metrics(val_data[['query_id', 'rank', 'target']], 'rank')
        metrics.append(fold_metrics)

    return metrics

# Пример использования:
cv_metrics = get_metrics_cv(train_df_pd, **study.best_params)
cv_mean_metrics = pd.DataFrame.from_records(map(lambda x: x.mean(), cv_metrics))
print(cv_mean_metrics.agg(["mean", "std"]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.129794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1803
[LightGBM] [Info] Number of data points in the train set: 14517791, number of used features: 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.145314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1808
[LightGBM] [Info] Number of data points in the train set: 14517791, number of used features: 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.131320 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1810
[LightGBM] [Info] Number of data points in the train set: 14517791, number of used features: 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.154462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1812
[LightGBM] [Info] Number of data points in the train set: 14517791, number of used features: 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.137437 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1814
[LightGBM] [Info] Number of data points in the train set: 14517772, number of used features: 11


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))


      hit_at_1  hit_at_5  hit_at_10
mean  0.151902  0.407933   0.558840
std   0.002553  0.003089   0.002375


## Вывод

## Получаем среднее значение метрики Hit@1 после подбора гиперпараметров и кросс-валидации 0.151902

# Catboost

### Будем использовать YetiRank для предсказания рангов

In [8]:
import catboost as cb


In [9]:
train_pool=cb.Pool(train_df_pd.drop(["query_id", "report_date", "target", "rn"], axis=1),label=train_df_pd["target"],group_id=train_df_pd["query_id"],)


In [10]:
params_dict={
    "loss_function":"YetiRank",
    "iterations":20,
    "verbose":2,
    "max_ctr_complexity":1
}
model=cb.CatBoostRanker(**params_dict)
model.fit(train_pool)

0:	total: 5.68s	remaining: 1m 47s
2:	total: 16.5s	remaining: 1m 33s
4:	total: 27.4s	remaining: 1m 22s
6:	total: 38.2s	remaining: 1m 10s
8:	total: 48.9s	remaining: 59.8s
10:	total: 59.7s	remaining: 48.8s
12:	total: 1m 10s	remaining: 38s
14:	total: 1m 21s	remaining: 27.2s
16:	total: 1m 32s	remaining: 16.3s
18:	total: 1m 43s	remaining: 5.43s
19:	total: 1m 48s	remaining: 0us


<catboost.core.CatBoostRanker at 0x1be5763d350>

In [17]:
y_pred=model.predict(df_test_pd)


In [18]:
df_test_pd['rank'] = (df_test_pd.assign(pred=y_pred).groupby('query_id')['pred']
                            .rank(method='dense', ascending=False).astype(int))


### Вычислим средние метрики обученной модели

In [19]:
metrics = calc_metrics(df_test_pd[['query_id', 'rank', 'target']], 'rank')
mean_metrics = metrics.mean()
print(mean_metrics)

hit_at_1     0.149024
hit_at_5     0.387452
hit_at_10    0.530380
dtype: float64


## Подберём гиперпараметры

In [21]:
import optuna

# Определение функции для оптимизации гиперпараметров
def objective(trial):
    # Определение пространства поиска гиперпараметров
    param = {
        "loss_function": 'YetiRank',
        "iterations": 20,
        "verbose": 0,
        "max_ctr_complexity": 1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 3, 7),
    }
    
    # Создание модели Catboost
    model = cb.CatBoostRanker(**param)
    
    # Обучение модели
    model.fit(train_pool, verbose_eval=False)
    
    # Предсказание на валидационном наборе
    y_pred = model.predict(val_df_pd)
    
    # Ранжирование предсказаний
    val_df_pd['rank'] = (val_df_pd.assign(pred=y_pred).groupby('query_id')['pred']
                        .rank(method='dense', ascending=False).astype(int))
    
    # Вычисление метрики качества hitrate@1
    metrics = calc_metrics(val_df_pd[['query_id', 'rank', 'target']], 'rank')
    hitrate_at_1 = metrics['hit_at_1'].mean()
    
    # Возвращаем метрику hitrate@1 для оптимизации
    return hitrate_at_1


# Создание экземпляра Study для оптимизации
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Вывод наилучших гиперпараметров и оценки
print("Best hyperparameters found:")
print(study.best_params)
print("Best score found (hitrate@1):")
print(study.best_value)


[I 2024-05-08 00:41:34,382] A new study created in memory with name: no-name-aee2900f-5097-41d7-9061-6864389320e0
[I 2024-05-08 00:44:18,814] Trial 0 finished with value: 0.14440202365951366 and parameters: {'learning_rate': 0.04600910179950901, 'depth': 7}. Best is trial 0 with value: 0.14440202365951366.
[I 2024-05-08 00:47:03,044] Trial 1 finished with value: 0.1455492655771003 and parameters: {'learning_rate': 0.06452079358981212, 'depth': 7}. Best is trial 1 with value: 0.1455492655771003.
[I 2024-05-08 00:49:45,277] Trial 2 finished with value: 0.14270937164995956 and parameters: {'learning_rate': 0.017352630638510287, 'depth': 7}. Best is trial 1 with value: 0.1455492655771003.
[I 2024-05-08 00:52:29,135] Trial 3 finished with value: 0.14496624099603167 and parameters: {'learning_rate': 0.08208468813968268, 'depth': 6}. Best is trial 1 with value: 0.1455492655771003.
[I 2024-05-08 00:55:09,815] Trial 4 finished with value: 0.14203231084613793 and parameters: {'learning_rate': 0.

Best hyperparameters found:
{'learning_rate': 0.06452079358981212, 'depth': 7}
Best score found (hitrate@1):
0.1455492655771003


### Вычислим финальные метрики на кросс-валидации

In [12]:
from sklearn.model_selection import StratifiedKFold, GroupKFold
import numpy as np

def get_metrics_cv(df, n_splits=5, **model_params):
    skf = GroupKFold(n_splits=n_splits)
    metrics = []

    for train_index, val_index in skf.split(df.drop(['target'],axis=1), y=df['target'], groups=df["query_id"]):
        train_data = df.iloc[train_index]
        val_data = df.iloc[val_index]

        train_pool = cb.Pool(
            data=train_data.drop(["query_id", "report_date", "target", "rn"], axis=1),
            label=train_data["target"],
            group_id=train_data["query_id"]
        )
        
        val_pool = cb.Pool(
            data=val_data.drop(["query_id", "report_date", "target", "rn"], axis=1),
            label=val_data["target"],
            group_id=val_data["query_id"]
        )
        model = cb.CatBoostRanker(**model_params)
        model.fit(train_pool, eval_set=val_pool, verbose_eval=False)

        y_pred = model.predict(val_pool)
        val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))

        fold_metrics = calc_metrics(val_data[['query_id', 'rank', 'target']], 'rank')
        metrics.append(fold_metrics)

    return metrics
# Пример использования:
cv_metrics = get_metrics_cv(train_df_pd, **params_dict)
cv_mean_metrics = pd.DataFrame.from_records(map(lambda x: x.mean(), cv_metrics))
print(cv_mean_metrics.agg(["mean", "std"]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['rank'] = (val_data.assign(pred=y_pred).groupby('query_id')['pred'].rank(method='dense', ascending=False).astype(int))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_gu

      hit_at_1  hit_at_5  hit_at_10
mean  0.144099  0.382212   0.527405
std   0.001213  0.002965   0.003054


## Вывод

## Получаем среднее значение метрики Hit@1 после подбора гиперпараметров и кросс-валидации 0.144099

# RankNet

In [10]:
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
import polars as pl


In [7]:
qids_train = train_df_pd.groupby("query_id")["query_id"].count().to_numpy()
X_train = train_df_pd.drop(["query_id", "report_date", "target", "rn"], axis=1).to_numpy()
y_train = train_df_pd["target"].to_numpy()
qids_validation = val_df_pd.groupby("query_id")["query_id"].count().to_numpy()
X_validation = val_df_pd.drop(["query_id", "report_date", "target", "rn"], axis=1).to_numpy()
y_validation = val_df_pd["target"].to_numpy()

In [15]:
test_df=df_test_pd

In [30]:
from typing import List
from typing import Tuple
from sklearn.preprocessing import StandardScaler


In [34]:
class RankNetTrainer:
    def __init__(self, n_epochs=10, hidden_dim=22, lr=0.001):
        self._prepare_data()
        self.num_input_features = self.X_train.shape[1]
        self.n_epochs = n_epochs
        self.hidden_dim = hidden_dim
        self.lr = lr
        self.model = self._create_model(
            self.num_input_features, hidden_dim)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
        
    def _create_model(self, num_input_features:int, hidden_dim:int) -> torch.nn.Module:
        torch.manual_seed(123)
        net = RankNet(num_input_features=num_input_features, hidden_dim=hidden_dim)
        return net
    
    def _get_data(self, train_df=train_df, test_df=test_df, feature_column=feature_columns) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        train_df, test_df = train_df, test_df
        X_train, X_test = train_df[feature_column].to_numpy(), test_df[feature_column].to_numpy()
        y_train, y_test = train_df["target"].to_numpy(), test_df["target"].to_numpy()
        qids_train = train_df["query_id"].to_numpy()
        qids_test = test_df["query_id"].to_numpy()
        return X_train, y_train, qids_train, X_test, y_test, qids_test
    
    def _prepare_data(self) -> None:
        (X_train, y_train, self.qids_train, X_test, y_test, self.qids_test) = self._get_data()
        X_train_copy = np.copy(X_train)
        X_test_copy = np.copy(X_test)
        X_train_copy = self._scale_features_in_query_groups(X_train_copy, self.qids_train)
        X_test_copy = self._scale_features_in_query_groups(X_test_copy, self.qids_test)
        self.X_train = torch.FloatTensor(X_train_copy)
        self.X_test = torch.FloatTensor(X_test_copy)
        self.ys_train = torch.FloatTensor(y_train)
        self.ys_test = torch.FloatTensor(y_test)
    
        
    def _scale_features_in_query_groups(self, inp_feat_array:np.ndarray, inp_query_ids:np.ndarray) -> np.ndarray:
        scaler = StandardScaler()
        for cur_id in np.unique(inp_query_ids):
            mask = inp_query_ids == cur_id
            tmp_array = inp_feat_array[mask]
            inp_feat_array[mask] = scaler.fit_transform(tmp_array)
        return inp_feat_array
    
    def fit(self):
        criterion = nn.BCEWithLogitsLoss()
        for epoch in range(1, self.n_epochs+1):
            self.model.train()
            for cur_id in np.unique(self.qids_train):
                mask_train = self.qids_train == cur_id
                batch_X = self.X_train[mask_train]
                batch_ys = self.ys_train[mask_train]
                self.optimizer.zero_grad()
                logits = self.model(batch_X)
                loss = criterion(logits.squeeze(), batch_ys)
                loss.backward()
                self.optimizer.step()
                
    def predict(self, X_test):
        X_test_scaled = self._scale_features_in_query_groups(X_test, np.zeros(X_test.shape[0]))
        tensor_X_test = torch.FloatTensor(X_test_scaled)
        self.model.eval()
        with torch.no_grad():
            logits = self.model(tensor_X_test)
            probabilities = torch.sigmoid(logits.squeeze()).numpy()
        return probabilities
    def _eval_test_set(self, df_test):
        X_test = df_test.drop(["query_id", "report_date", "target", "rn"], axis=1).to_numpy()
        y_test = df_test["target"].to_numpy()
        predictions = self.predict(X_test)
        df_test["predictions"] = predictions
        metrics = calc_metrics(df_test, "predictions")
        return metrics



In [None]:
ranknet = RankNetTrainer(n_epochs=2, hidden_dim=22, lr=0.01)


In [None]:
ranknet.fit()
ranknet_test = ranknet._eval_test_set(df_test_pd)


In [None]:
from sklearn.model_selection import KFold

def cross_validate(model, X, y, cv=5):
    kf = KFold(n_splits=cv)
    metrics_list = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        
        # Обучение модели
        model.fit(X_train, y_train)
        
        # Получение предсказаний на валидационном наборе данных
        predictions = model.predict(X_val)
        
        # Преобразование в DataFrame
        df_val = pd.DataFrame({"target": y_val, "predictions": predictions})
        
        # Вычисление метрик
        metrics = calc_metrics(df_val, "predictions")
        
        # Добавление метрик в список
        metrics_list.append(metrics)
    
    return metrics_list

# Создание экземпляра модели
ranknet = RankNetTrainer(n_epochs=10, hidden_dim=22, lr=0.001)

# Выполнение кросс-валидации
cross_val_metrics = cross_validate(ranknet, X_train, y_train)

# Вывод результатов
for i, metrics in enumerate(cross_val_metrics, 1):
    print(f"Fold {i} metrics:")
    print(metrics)


# Выводы:
## В ходе решения были реализованы 3 модели (LambdaMART, YetiRank и RankNet)
## Лучший результат по метрике hit@1 показала модель LambdaMART
## Подбор гиперпараметров показал улучшение финальных метрик