# Импорт библиотек

In [142]:
import yaml
import joblib

import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

import optuna

import warnings
warnings.filterwarnings('ignore');

# Описание задачи

В данном jupyter-ноутбуке мы обучим бейзлан модели для определения пола пользователя. А затем подберем лучшие параметры для лучшей модели

https://ods.ai/competitions/mtsmlcup

Задача соревнования
- Определение пола и возраста владельца HTTP cookie по истории активности пользователя в интернете на основе синтетических данных.

Метрики соревнования:
* ROC-AUC – для определения пола

Предобработка, аггрегация и создание новых фич произведена в 0.Data_preparing.ipng.
Описание колонок аггрегированного файла с данными:
* 'part_of_day_day' – кол-во визитов пользователя днем
* 'part_of_day_evening' – кол-во визитов пользователя вечером
* 'part_of_day_morning' – кол-во визитов пользователя утром
* 'part_of_day_night' – кол-во визитов пользователя ночью
* 'sum_visits' – кол-во визитов пользователя
* 'day_pct' – доля визитов пользователя днем
* 'evening_pct' – доля визитов пользователя вечером
* 'morning_pct' – доля визитов пользователя утром
* 'night_pct' – доля визитов пользователя ночью
* 'act_days' – кол-во дней, в которые пользователь совершил визит пользователя
* 'request_cnt' - кол-во запросов пользователя
* 'avg_req_per_day' - среднее кол-во запросов пользователя
* 'period_days' - кол-во дней между первым и последним визитом пользователя
* 'act_days_pct' - доля дней, когда пользователь совершал визит
* 'cpe_type_cd - тип устройства
* 'cpe_model_os_type' - операционная система устройства
* 'cpe_manufacturer_name' -производитель устройства
* 'price' - цена устройства пользователя
* 'region_cnt' - кол-во уникальных регионов, из которых был совершен визит
* 'city_cnt' - кол-во уникальных городов, из которых был совершен визит
* 'url_host_cnt' - кол-во уникальных ссылок, с которых был совершен визит
* 'user_id' – ID пользователя

Описание колонок файла с таргетами:

* 'age' – Возраст пользователя
* 'is_male' – Признак пользователя : мужчина (1-Да, 0-Нет)
* 'user_id' – ID пользователя

# Загрузка данных

In [167]:
# загрузим конфигурационный файл
config_path = '../config/parameters.yaml'
config = yaml.load(open(config_path), Loader=yaml.FullLoader)

In [170]:
config_prep = config['preprocessing']
config_prep

{'raw_data_extension': '.parquet',
 'change_col_types': {'region_name': 'category',
  'city_name': 'category',
  'cpe_manufacturer_name': 'category',
  'cpe_model_name': 'category',
  'url_host': 'category',
  'cpe_type_cd': 'category',
  'cpe_model_os_type': 'category',
  'date': 'datetime64[ns]',
  'price': 'float32',
  'part_of_day': 'category',
  'request_cnt': 'int8',
  'user_id': 'int32'},
 'agg_columns_type': {'user_id': 'int32',
  'part_of_day_day': 'int16,',
  'part_of_day_evening': 'int16',
  'part_of_day_morning': 'int16',
  'part_of_day_night': 'int16',
  'sum_visits': 'int16',
  'day_pct': 'float32',
  'evening_pct': 'float32',
  'morning_pct': 'float32',
  'night_pct': 'float32',
  'act_days': 'int16',
  'request_cnt': 'int32',
  'avg_req_per_day': 'float32',
  'period_days': 'int16',
  'act_days_pct': 'float32',
  'cpe_type_cd': 'category',
  'cpe_model_os_type': 'category',
  'cpe_manufacturer_name': 'category',
  'price': 'float32',
  'region_cnt': 'int8',
  'city_cnt'

In [168]:
config_train = config['train']
config_train

{'target': 'is_male',
 'train_test_size': 0.2,
 'train_val_size': 0.16,
 'random_state': 10,
 'k_folds': 5,
 'n_trials': 10,
 'columns_to_drop': 'user_id',
 'target_type': {'is_male': 'int8'},
 'target_data_path': '../data/raw/public_train.pqt',
 'train_data_path': '../data/processed/train_data.csv',
 'model_path': '../models/model_clf.joblib',
 'study_path': '../models/study.joblib',
 'metrics_path': '../report/metrics.json',
 'train_split_path': '../data/processed/train.csv',
 'test_split_path': '../data/processed/test.csv'}

In [8]:
# путь до датасета для обучения модели
train_data_path = config_train['train_data_path']

In [9]:
train_data = pd.read_csv(train_data_path)
train_data.head(3)

Unnamed: 0,user_id,part_of_day_day,part_of_day_evening,part_of_day_morning,part_of_day_night,sum_visits,day_pct,evening_pct,morning_pct,night_pct,...,period_days,act_days_pct,cpe_type_cd,cpe_manufacturer_name,cpe_model_os_type,price,region_cnt,city_cnt,url_host_cnt,is_male
0,69,457,346,314,30,1147,0.398431,0.301656,0.273758,0.026155,...,60,0.933333,smartphone,Apple,iOS,16657.0,1,1,91,0
1,85,838,1067,602,333,2840,0.29507,0.375704,0.211972,0.117254,...,41,0.926829,smartphone,Apple,iOS,55809.004,2,6,174,0
2,106,165,221,168,142,696,0.237069,0.317529,0.241379,0.204023,...,63,0.952381,smartphone,Samsung,Android,8111.0,1,2,91,0


In [14]:
# типы данных для экономии памяти
data_types = config_prep['agg_columns_type']

In [15]:
# преобразуем типы колонок
train_data = train_data.astype(data_types)

In [16]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264325 entries, 0 to 264324
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype   
---  ------                 --------------   -----   
 0   user_id                264325 non-null  int32   
 1   part_of_day_day        264325 non-null  int16   
 2   part_of_day_evening    264325 non-null  int16   
 3   part_of_day_morning    264325 non-null  int16   
 4   part_of_day_night      264325 non-null  int16   
 5   sum_visits             264325 non-null  int16   
 6   day_pct                264325 non-null  float32 
 7   evening_pct            264325 non-null  float32 
 8   morning_pct            264325 non-null  float32 
 9   night_pct              264325 non-null  float32 
 10  act_days               264325 non-null  int16   
 11  request_cnt            264325 non-null  int32   
 12  avg_req_per_day        264325 non-null  float32 
 13  period_days            264325 non-null  int16   
 14  act_days_pct        

# Baseline model

In [17]:
def get_metrics_classification(y_test, y_pred, y_score, name) -> pd.DataFrame:
    """
    Генерация таблицы с метриками для задачи бинарной классификации
    :param y_test: истинные значения целевой переменной
    :param y_pred: предсказанный класс
    :param y_score: предсказанные вероятности классов
    :param name: имя модели
    :return: датафрейм с метриками
    """
    df_metrics = pd.DataFrame()

    df_metrics['model'] = [name]
    df_metrics['Precision'] = precision_score(y_test, y_pred)
    df_metrics['Recall'] = recall_score(y_test, y_pred)
    df_metrics['f1'] = f1_score(y_test, y_pred)
    df_metrics['ROC_AUC'] = roc_auc_score(y_test, y_score[:, 1])

    return df_metrics


def check_overfitting_classification(model, X_train, y_train, X_test, y_test) -> None:
    """
    Проверка на overfitting для бинарной классификации
    :param model: обученная модель
    :param X_train: матрица объект-признак train
    :param y_train: целевая переменная train
    :param X_test: матрица объект-признак test
    :param y_test: целевая переменная test
    :return: None
    """
    roc_auc_train = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
    roc_auc_test = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

    print("ROC-AUC train = %.3f" % roc_auc_train)
    print("ROC-AUC test = %.3f" % roc_auc_test)
    print(
        f'delta = {round((roc_auc_train - roc_auc_test) / roc_auc_test * 100, 2)}%'
    )

## Train/test split

In [80]:
# таргет
target_col = config_train['target']
# размер тестовой выборки
test_size = config_train['train_test_size']
# размер валидационной выборки
val_size = config_train['train_val_size']
# колонки, которые нужно дропнуть
col_to_drop = config_train['columns_to_drop']
RAND = config_train['random_state']

In [81]:
# сплит данных на train/validation/test
X = train_data.drop([col_to_drop, target_col], axis=1)
y = train_data[target_col]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=RAND)

X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=val_size,
                                                    shuffle=True,
                                                    stratify=y_train,
                                                    random_state=RAND)

# значение для scale_pos_weight
ratio = float(np.sum(y == 0)) / np.sum(y == 1)

## LGBM

In [82]:
# обучаем бейзлайн LGBM
lgbm_clf = LGBMClassifier(class_weight='balanced',
                          random_state=RAND,
                          scale_pos_weight=ratio)
eval_set = [(X_val, y_val)]

lgbm_clf.fit(X_train_,
             y_train_,
             eval_set=eval_set,
             early_stopping_rounds=100,
             verbose=False)

In [83]:
# проверка на переобучение
check_overfitting_classification(lgbm_clf, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.651
ROC-AUC test = 0.627
delta = 3.83%


In [84]:
# получим итоговую оценку
y_pred = lgbm_clf.predict(X_test)
y_score = lgbm_clf.predict_proba(X_test)

metrics = get_metrics_classification(y_test,
                           y_pred,
                           y_score,
                           name='LightGBM_Baseline')
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC
0,LightGBM_Baseline,0.616125,0.51585,0.561546,0.626908


## Catboost

In [85]:
# обучаем бейзлайн Catboost
cat_features = X.select_dtypes('category').columns.tolist()
cb_clf = CatBoostClassifier(allow_writing_files=False,
                            random_state=RAND,
                            cat_features=cat_features,
                            scale_pos_weight=ratio,
                            verbose=False)

cb_clf.fit(X_train, y_train)

<catboost.core.CatBoostClassifier at 0x176383340>

In [86]:
# проверка на переобучение
check_overfitting_classification(cb_clf, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.691
ROC-AUC test = 0.629
delta = 9.95%


In [87]:
# получим итоговую оценку
y_pred = cb_clf.predict(X_test)
y_score = cb_clf.predict_proba(X_test)

metrics = pd.concat([
    metrics,
    get_metrics_classification(y_test,
                               y_pred,
                               y_score,
                               name='CatBoost_Baseline')
])
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC
0,LightGBM_Baseline,0.616125,0.51585,0.561546,0.626908
0,CatBoost_Baseline,0.609918,0.558043,0.582828,0.628648


## XGBoost

In [88]:
# сплит данных на train/validation/test с one hot encoding
X = pd.get_dummies(train_data.drop([col_to_drop, target_col], axis=1))
y = train_data[target_col]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=RAND)

X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=val_size,
                                                    shuffle=True,
                                                    stratify=y_train,
                                                    random_state=RAND)

In [89]:
# обучаем бейзлайн XGBoost
xgb_clf = XGBClassifier(random_state=random_state)
eval_set = [(X_val, y_val)]

xgb_clf.fit(X_train_,
            y_train_,
            eval_set=eval_set,
            early_stopping_rounds=100,
            verbose=0)

In [90]:
# проверка на переобучение
check_overfitting_classification(xgb_clf, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.655
ROC-AUC test = 0.623
delta = 5.15%


In [91]:
# получим итоговую оценку
y_pred = xgb_clf.predict(X_test)
y_score = xgb_clf.predict_proba(X_test)

metrics = pd.concat([
    metrics,
    get_metrics_classification(y_test,
                               y_pred,
                               y_score,
                               name='XGBoost_Baseline')
])
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC
0,LightGBM_Baseline,0.616125,0.51585,0.561546,0.626908
0,CatBoost_Baseline,0.609918,0.558043,0.582828,0.628648
0,XGBoost_Baseline,0.599283,0.586973,0.593064,0.622826


## RandomForest

In [92]:
# обучение бейзлайна RandomForest
rf_clf = RandomForestClassifier(class_weight='balanced',
                                random_state=RAND)

rf_clf.fit(X_train, y_train)

In [93]:
# проверка на переобучение
check_overfitting_classification(rf_clf, X_train, y_train, X_test, y_test)

ROC-AUC train = 1.000
ROC-AUC test = 0.595
delta = 67.92%


In [94]:
# получим итоговую оценку
y_pred = rf_clf.predict(X_test)
y_score = rf_clf.predict_proba(X_test)

metrics = pd.concat([
    metrics,
    get_metrics_classification(y_test,
                               y_pred,
                               y_score,
                               name='RandomForest_Baseline')
])
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC
0,LightGBM_Baseline,0.616125,0.51585,0.561546,0.626908
0,CatBoost_Baseline,0.609918,0.558043,0.582828,0.628648
0,XGBoost_Baseline,0.599283,0.586973,0.593064,0.622826
0,RandomForest_Baseline,0.580521,0.584017,0.582264,0.595487


Вывод: лучшие метрики у Catboost. Попробуем подобрать параметры для этой модели и улучшить метрики

# Model tuning

## Train/test split

In [96]:
# сплит данных на train/validation/test
X = train_data.drop([col_to_drop, target_col], axis=1)
y = train_data[target_col]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=test_size,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=RAND)

X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=val_size,
                                                    shuffle=True,
                                                    stratify=y_train,
                                                    random_state=RAND)

# значение для scale_pos_weight
ratio = float(np.sum(y == 0)) / np.sum(y == 1)

## Tuning

In [129]:
# функция для поиска оптимальных параметров с помощью Optuna
def objective_cat(trial, X, y, N_FOLDS, random_state):

    cat_params = {
#         "iterations": trial.suggest_categorical("iterations", [i for i in range(1000, 3501, 100)]),
#         "learning_rate": trial.suggest_float("learning_rate",
#                                              0.001,
#                                              0.3,
#                                              log=True),
        "iterations":
        trial.suggest_categorical("iterations", [1300]),
        "learning_rate":
        trial.suggest_categorical("learning_rate", [0.048130802271454436]),
        "max_depth":
        trial.suggest_int("max_depth", 4, 10),
        "colsample_bylevel":
        trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "l2_leaf_reg":
        trial.suggest_uniform("l2_leaf_reg", 1e-5, 1e2),
        "random_strength":
        trial.suggest_uniform('random_strength', 10, 50),
        "bootstrap_type":
        trial.suggest_categorical("bootstrap_type",
                                  ["Bayesian", "Bernoulli", "MVS", "No"]),
        "border_count":
        trial.suggest_categorical('border_count', [128, 254]),
        "grow_policy":
        trial.suggest_categorical('grow_policy',
                                  ["SymmetricTree", "Depthwise", "Lossguide"]),
        "od_wait":
        trial.suggest_int('od_wait', 500, 2000),
        "leaf_estimation_iterations":
        trial.suggest_int('leaf_estimation_iterations', 1, 15),
        "use_best_model":
        trial.suggest_categorical("use_best_model", [True]),
        "eval_metric":
        trial.suggest_categorical("eval_metric", ['AUC']),
        "scale_pos_weight":
        trial.suggest_categorical("scale_pos_weight", [ratio]),
        "random_state":
        trial.suggest_categorical("random_state", [random_state])
    }

    if cat_params["bootstrap_type"] == "Bayesian":
        cat_params["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 100)
    elif cat_params["bootstrap_type"] == "Bernoulli":
        cat_params["subsample"] = trial.suggest_float("subsample",
                                                      0.1,
                                                      1,
                                                      log=True)

    cv = StratifiedKFold(n_splits=N_FOLDS,
                         shuffle=True,
                         random_state=random_state)

    cv_predicts = np.empty(N_FOLDS)

    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        cat_features = X.select_dtypes('category').columns.tolist()
        model = CatBoostClassifier(**cat_params, cat_features=cat_features)
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_test, y_test)],
                  early_stopping_rounds=100,
                  verbose=0)

        preds_proba = model.predict_proba(X_test)[:, 1]
        cv_predicts[idx] = roc_auc_score(y_test, preds_proba)

        return np.mean(cv_predicts)

In [135]:
# поиск оптимальных параметров

study_cat = optuna.create_study(
    direction="maximize",
    pruner=optuna.pruners.SuccessiveHalvingPruner(),
    study_name="Cat_01")

function = lambda trial: objective_cat(trial,
                                       X_train,
                                       y_train,
                                       N_FOLDS=config['train']['k_folds'],
                                       random_state=config['train'][
                                           'random_state'])

study_cat.optimize(func, n_trials=15, show_progress_bar=True)

  0%|          | 0/15 [00:00<?, ?it/s]

In [139]:
study_cat.best_params

{'iterations': 1300,
 'learning_rate': 0.048130802271454436,
 'max_depth': 6,
 'colsample_bylevel': 0.9268140879307578,
 'l2_leaf_reg': 82.20111388604167,
 'random_strength': 19.727940456658636,
 'bootstrap_type': 'No',
 'border_count': 254,
 'grow_policy': 'SymmetricTree',
 'od_wait': 1322,
 'leaf_estimation_iterations': 3,
 'use_best_model': True,
 'eval_metric': 'AUC',
 'scale_pos_weight': 0.9531740695036613,
 'random_state': 10}

## Train with best params

In [136]:
# обучаем CatBoost на лучших найденных параметрах
cat_features = X.select_dtypes('category').columns.tolist()
cat_gender = CatBoostClassifier(cat_features=cat_features, **study_cat.best_params)
eval_set = [(X_val, y_val)]

cat_gender.fit(X_train_,
               y_train_,
               eval_set=eval_set,
               verbose=False,
               early_stopping_rounds=100);

In [137]:
# проверим не переобучилась ли наша модель
check_overfitting_classification(cat_gender, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.643
ROC-AUC test = 0.629
delta = 2.19%


In [140]:
# получим итоговые метрики на тестовых данных
y_pred_test = cat_gender.predict(X_test)
y_score_test = cat_gender.predict_proba(X_test)

metrics = pd.concat([
    metrics,
    get_metrics_classification(y_test,
                               y_pred_test,
                               y_score_test,
                               name='CatBoost_Optuna_best_params')
])
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC
0,LightGBM_Baseline,0.616125,0.51585,0.561546,0.626908
0,CatBoost_Baseline,0.609918,0.558043,0.582828,0.628648
0,XGBoost_Baseline,0.599283,0.586973,0.593064,0.622826
0,RandomForest_Baseline,0.580521,0.584017,0.582264,0.595487
0,CatBoost_Optuna_best_params,0.609584,0.556972,0.582091,0.629386


# Save model and metrics

In [144]:
def get_metrics_dict(y_test: pd.Series, y_pred: np.array, y_score: np.array) -> dict:
    """Функция считает метрики и возвращает словарь"""
    metrics = {
        'roc_auc': round(roc_auc_score(y_test, y_score[:, 1]), 4),
        'precision': round(precision_score(y_test, y_pred), 4),
        'recall': round(recall_score(y_test, y_pred), 4),
        'f1': round(f1_score(y_test, y_pred), 4)
    }
    return metrics

In [148]:
# получим метрики в виде словаря, чтобы в дальнейшем использовать их для сравнения
metrics_dict = get_metrics_dict(y_test, y_pred_test, y_score_test)
print(metrics_dict)

{'roc_auc': 0.6294, 'precision': 0.6096, 'recall': 0.557, 'f1': 0.5821}

In [160]:
metrics_path = config_train['metrics_path']
metrics_path

'../report/metrics.json'

In [162]:
# сохраним метрики
with open(metrics_path, 'w') as f:
    json.dump(metrics_dict, f)

In [152]:
# сохраним лучшую модель
joblib.dump(cat_gender, config_train['model_path'])

['../models/model_clf.joblib']