In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd /content/drive/MyDrive/MTS ML Cup

/content/drive/MyDrive/MTS ML Cup


In [4]:
import warnings
import os
import sys

sys.path.insert(0, '/content/drive/MyDrive/MTS ML Cup/src')

os.environ['OPENBLAS_NUM_THREADS'] = '1'
warnings.filterwarnings('ignore')

In [None]:
!pip install catboost

In [None]:
!pip install lightgbm

In [7]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

import sklearn.metrics as m
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression

In [9]:
from metric_functions import get_metrics_classification, check_overfitting_classification

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
sns.set_style('darkgrid')

In [11]:
RAND = 42
N_FOLDS = 5

final_types = {
    'user_id': 'int32',
    'part_of_day_day': 'int16',
    'part_of_day_night': 'int16',
    'day_pct': 'float32',
    'evening_pct': 'float32',
    'morning_pct': 'float32',
    'night_pct': 'float32',
    'act_days': 'int16',
    'avg_req_per_day': 'float32',
    'requests_std': 'float32',
    'act_days_pct': 'float32',
    'cpe_type_cd': 'category',
    'cpe_model_os_type': 'category',
    'cpe_manufacturer_name': 'category',
    'price': 'float32',
    'region_cnt': 'int8',
    'city_cnt': 'int8',
    'url_host_cnt': 'int16',
}

# Сбор всех данных

In [12]:
id_to_submit = pd.read_parquet('submit.pqt')

In [13]:
targets = pd.read_parquet('public_train.pqt')

In [15]:
df = pd.read_csv('data_agg/df_final.csv')

In [16]:
df.head()

Unnamed: 0,user_id,part_of_day_day,part_of_day_night,day_pct,evening_pct,morning_pct,night_pct,act_days,avg_req_per_day,requests_std,act_days_pct,cpe_type_cd,cpe_model_os_type,cpe_manufacturer_name,price,region_cnt,city_cnt,url_host_cnt
0,4,199,10,0.336717,0.287648,0.358714,0.01692,20,38.85,0.647632,0.465,smartphone,Android,Huawei,12990.0,5,9,108
1,16,443,137,0.35987,0.260764,0.268075,0.111292,64,39.515625,1.397836,0.8647,smartphone,Android,Samsung,9583.0,1,1,50
2,18,566,34,0.503111,0.101333,0.365333,0.030222,32,60.9375,1.204972,0.8423,smartphone,Android,Samsung,22887.0,1,2,141
3,26,180,126,0.176817,0.400786,0.298625,0.123772,20,71.8,0.688696,0.909,smartphone,Android,Samsung,4990.0,1,1,126
4,27,808,342,0.302622,0.331461,0.237828,0.12809,67,66.07462,1.088583,0.9307,smartphone,Android,Xiaomi,12990.0,1,2,209


In [17]:
df = df.astype(final_types)
df.dtypes

user_id                     int32
part_of_day_day             int16
part_of_day_night           int16
day_pct                   float32
evening_pct               float32
morning_pct               float32
night_pct                 float32
act_days                    int16
avg_req_per_day           float32
requests_std              float32
act_days_pct              float32
cpe_type_cd              category
cpe_model_os_type        category
cpe_manufacturer_name    category
price                     float32
region_cnt                   int8
city_cnt                     int8
url_host_cnt                int16
dtype: object

In [18]:
df.shape

(415317, 18)

In [20]:
url_emb = pd.read_csv('embeddings/url_emb.csv')
url_emb = url_emb.add_suffix('_url').rename(columns={'user_id_url': 'user_id'})

In [21]:
reg_emb = pd.read_csv('embeddings/reg_emb.csv')
reg_emb = reg_emb.add_suffix('_reg').rename(columns={'user_id_reg': 'user_id'})

In [22]:
city_emb = pd.read_csv('embeddings/city_emb.csv')
city_emb = city_emb.add_suffix('_city').rename(
    columns={'user_id_city': 'user_id'})

In [23]:
model_emb = pd.read_csv('embeddings/model_emb.csv')
model_emb = model_emb.add_suffix('_model').rename(
    columns={'user_id_model': 'user_id'})

# Split data

In [24]:
df_train_cat = targets.merge(df, how='inner', on=['user_id']) \
                      .merge(url_emb, how='inner', on='user_id') \
                      .merge(city_emb, how='inner', on=['user_id']) \
                      .merge(reg_emb, how='inner', on=['user_id']) \
                      .merge(model_emb, how='inner', on=['user_id'])

df_train_cat = df_train_cat[df_train_cat['is_male'] != 'NA']
df_train_cat = df_train_cat.dropna()
df_train_cat['is_male'] = df_train_cat['is_male'].map(int)
df_train_cat['is_male'].value_counts()

1    135331
0    128994
Name: is_male, dtype: int64

In [25]:
%%time
X = df_train_cat.drop(['user_id', 'age', 'is_male'], axis=1)
y = df_train_cat['is_male']

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=RAND)

cat_features = X.select_dtypes('category').columns.tolist()
# значение для scale_pos_weight
ratio = float(np.sum(y == 0)) / np.sum(y == 1)

CPU times: user 645 ms, sys: 472 ms, total: 1.12 s
Wall time: 1.11 s


# Metrics

In [76]:
# параметры для моделей были подобраны в model_tuning.ipnb с помощью Optuna
lgbm_gender_best1 = {
    'n_estimators': 1600,
    'learning_rate': 0.04323025519185361,
    'num_leaves': 900,
    'max_depth': 7,
    'min_child_samples': 1200,
    'reg_alpha': 0,
    'reg_lambda': 9,
    'min_split_gain': 3,
    'subsample': 0.6909275901887844,
    'subsample_freq': 1,
    'colsample_bytree': 0.34716278523119637,
    'scale_pos_weight': 0.9531740695036613,
    'random_state': 42
}

lgbm_gender_params_2 = {
    'n_estimators': 2200,
    'learning_rate': 0.04667738692602193,
    'scale_pos_weight': 0.9531740695036613,
    'random_state': 42
}

lgbm_gender_params_3 = {
    'n_estimators': 2600,
    'learning_rate': 0.03825929958722602,
    'scale_pos_weight': 0.9531740695036613,
    'random_state': 42
}

cat_gender_best = {
    'iterations': 3000,
    'learning_rate': 0.051418504137325544,
    'max_depth': 4,
    'colsample_bylevel': 0.7594434568905333,
    'l2_leaf_reg': 57.13098420673366,
    'random_strength': 30.89358906898957,
    'bootstrap_type': 'No',
    'border_count': 128,
    'grow_policy': 'Lossguide',
    'od_wait': 1886,
    'leaf_estimation_iterations': 15,
    'eval_metric': 'AUC',
    'scale_pos_weight': 0.9531740695036613,
    'random_state': 42
}

In [77]:
clf1 = LGBMClassifier(**lgbm_gender_best1)
clf1.fit(X_train, y_train)

clf2 = LGBMClassifier(**lgbm_gender_params_2)
clf2.fit(X_train, y_train)

clf3 = LGBMClassifier(**lgbm_gender_params_3).fit(X_train, y_train)
clf3.fit(X_train, y_train)

clf4 = CatBoostClassifier(**cat_gender_best, cat_features=cat_features)
clf4.fit(X_train, y_train, verbose=False)

models_check = [clf1, clf2, clf3, clf4]

metrics = pd.DataFrame()
for model in models_check:
    y_pred = model.predict(X_test)
    y_score = model.predict_proba(X_test)

    metrics = metrics.append(
        get_metrics_classification(y_test,
                                   y_pred,
                                   y_score,
                                   name=f'{model.__class__.__name__}'))

In [81]:
metrics.reset_index(drop=True, inplace=True)

In [116]:
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC,GINI
0,LGBMClassifier,0.773767,0.751496,0.762469,0.848864,0.697729
1,LGBMClassifier,0.76984,0.753713,0.761691,0.846432,0.692865
2,LGBMClassifier,0.769871,0.751866,0.760762,0.847162,0.694323
3,CatBoostClassifier,0.77482,0.75375,0.76414,0.848991,0.697983


# LGBM Optuna 1

In [27]:
meta_X = pd.DataFrame()
meta_X_test = pd.DataFrame()

pred_val = []
pred_score_val = []

cv = StratifiedKFold(n_splits=N_FOLDS)

for fold, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    lgbm_1 = LGBMClassifier(**lgbm_gender_best1)
    lgbm_1.fit(X_train_,
               y_train_,
               eval_set=[(X_val, y_val)],
               eval_metric="auc",
               early_stopping_rounds=100,
               verbose=0)

    y_pred_val = lgbm_1.predict(X_val)
    y_score_val = lgbm_1.predict_proba(X_val)

    print("Fold:", fold + 1,
          "ROC-AUC SCORE %.3f" % m.roc_auc_score(y_val, y_score_val[:, 1]))
    print("---")

    # holdout list
    pred_val.append(y_pred_val)
    pred_score_val.append(y_score_val)

# обучаем модель для предсказания на test
lgbm_1.fit(X_train, y_train)

meta_X['lgbm_01'] = np.concatenate(pred_score_val)[:, 1]
meta_X_test['lgbm_01'] = lgbm_1.predict_proba(X_test)[:, 1]

Fold: 1 ROC-AUC SCORE 0.846
---
Fold: 2 ROC-AUC SCORE 0.845
---
Fold: 3 ROC-AUC SCORE 0.846
---
Fold: 4 ROC-AUC SCORE 0.846
---
Fold: 5 ROC-AUC SCORE 0.847
---


In [30]:
check_overfitting_classification(lgbm_1, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.906
ROC-AUC test = 0.849
delta = 6.71%


# LGBM Optuna 2

In [31]:
pred_val = []
pred_score_val = []

cv = StratifiedKFold(n_splits=N_FOLDS)

for fold, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    lgbm_2 = LGBMClassifier(**lgbm_gender_params_2)
    lgbm_2.fit(X_train_,
               y_train_,
               eval_set=[(X_val, y_val)],
               eval_metric="auc",
               early_stopping_rounds=100,
               verbose=0)

    y_pred_val = lgbm_2.predict(X_val)
    y_score_val = lgbm_2.predict_proba(X_val)

    print("Fold:", fold + 1,
          "ROC-AUC SCORE %.3f" % m.roc_auc_score(y_val, y_score_val[:, 1]))
    print("---")

    # holdout list
    pred_val.append(y_pred_val)
    pred_score_val.append(y_score_val)

# обучаем модель для предсказания на test
lgbm_2.fit(X_train, y_train)

meta_X['lgbm_02'] = np.concatenate(pred_score_val)[:, 1]
meta_X_test['lgbm_02'] = lgbm_2.predict_proba(X_test)[:, 1]

Fold: 1 ROC-AUC SCORE 0.842
---
Fold: 2 ROC-AUC SCORE 0.841
---
Fold: 3 ROC-AUC SCORE 0.842
---
Fold: 4 ROC-AUC SCORE 0.841
---
Fold: 5 ROC-AUC SCORE 0.843
---


In [34]:
check_overfitting_classification(lgbm_2, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.952
ROC-AUC test = 0.846
delta = 12.53%


# LGBM Optuna 3

In [83]:
pred_val = []
pred_score_val = []

cv = StratifiedKFold(n_splits=N_FOLDS)

for fold, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    lgbm_3 = LGBMClassifier(**lgbm_gender_params_3)
    lgbm_3.fit(X_train_,
               y_train_,
               eval_set=[(X_val, y_val)],
               eval_metric="auc",
               early_stopping_rounds=100,
               verbose=0)

    y_pred_val = lgbm_3.predict(X_val)
    y_score_val = lgbm_3.predict_proba(X_val)

    print("Fold:", fold + 1,
          "ROC-AUC SCORE %.3f" % m.roc_auc_score(y_val, y_score_val[:, 1]))
    print("---")

    # holdout list
    pred_val.append(y_pred_val)
    pred_score_val.append(y_score_val)

# обучаем модель для предсказания на test
lgbm_3.fit(X_train, y_train)

meta_X['lgbm_03'] = np.concatenate(pred_score_val)[:, 1]
meta_X_test['lgbm_03'] = lgbm_3.predict_proba(X_test)[:, 1]

Fold: 1 ROC-AUC SCORE 0.843
---
Fold: 2 ROC-AUC SCORE 0.841
---
Fold: 3 ROC-AUC SCORE 0.843
---
Fold: 4 ROC-AUC SCORE 0.842
---
Fold: 5 ROC-AUC SCORE 0.844
---


In [89]:
check_overfitting_classification(lgbm_3, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.951
ROC-AUC test = 0.847
delta = 12.2%


# Catboost Optuna

In [41]:
pred_val = []
pred_score_val = []

cv = StratifiedKFold(n_splits=N_FOLDS)

for fold, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    cat_1 = CatBoostClassifier(cat_features=cat_features, **cat_gender_best)
    cat_1.fit(X_train_,
              y_train_,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=100,
              verbose=False)

    y_pred_val = cat_1.predict(X_val)
    y_score_val = cat_1.predict_proba(X_val)

    print("Fold:", fold + 1,
          "ROC-AUC SCORE %.3f" % m.roc_auc_score(y_val, y_score_val[:, 1]))
    print("---")

    # holdout list
    pred_val.append(y_pred_val)
    pred_score_val.append(y_score_val)

# обучаем модель для предсказания на test
cat_1.fit(X_train, y_train, verbose=False)

meta_X['cat_01'] = np.concatenate(pred_score_val)[:, 1]
meta_X_test['cat_01'] = cat_1.predict_proba(X_test)[:, 1]

Fold: 1 ROC-AUC SCORE 0.845
---
Fold: 2 ROC-AUC SCORE 0.844
---
Fold: 3 ROC-AUC SCORE 0.845
---
Fold: 4 ROC-AUC SCORE 0.845
---
Fold: 5 ROC-AUC SCORE 0.846
---


In [44]:
check_overfitting_classification(cat_1, X_train, y_train, X_test, y_test)

ROC-AUC train = 0.912
ROC-AUC test = 0.849
delta = 7.42%


In [45]:
meta_X.to_csv('submissions/meta_X.csv', index=False)
meta_X_test.to_csv('submissions/meta_X_test.csv', index=False)

# Final meta model

## All

In [109]:
meta_X.drop('catboost_02', axis=1, inplace=True)
meta_X_test.drop('catboost_02', axis=1, inplace=True)

In [110]:
meta_model = LogisticRegression(random_state=RAND)
meta_model.fit(meta_X, y_train)

In [111]:
check_overfitting_classification(meta_model, meta_X, y_train, meta_X_test,
                                 y_test)

ROC-AUC train = 0.845
ROC-AUC test = 0.851
delta = -0.73%


In [117]:
y_pred_test = meta_model.predict(meta_X_test)
y_score_test = meta_model.predict_proba(meta_X_test)

y_pred_train = meta_model.predict_proba(meta_X)

metrics = metrics.append(
    get_metrics_classification(y_test,
                               y_pred_test,
                               y_score_test,
                               name='Stacking_hand'))
metrics

Unnamed: 0,model,Precision,Recall,f1,ROC_AUC,GINI
0,LGBMClassifier,0.773767,0.751496,0.762469,0.848864,0.697729
1,LGBMClassifier,0.76984,0.753713,0.761691,0.846432,0.692865
2,LGBMClassifier,0.769871,0.751866,0.760762,0.847162,0.694323
3,CatBoostClassifier,0.77482,0.75375,0.76414,0.848991,0.697983
0,Stacking_hand,0.76945,0.768455,0.768952,0.850829,0.701658


С помощью стекинга удалось улучшить метрики и снизить переобучение