In [1]:
import setproctitle

# Give the kernel process a recognisable title so it can be spotted in
# process listings (ps/top/htop) on a shared machine.
setproctitle.setproctitle("alex_permutation")

In [2]:
import gc
import os
import sys
import time

import catboost
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

# Register tqdm's `progress_*` methods (e.g. `DataFrame.progress_apply`) on pandas objects.
# NOTE(review): none of the visible cells call a `progress_*` method — confirm this is still needed.
tqdm.pandas()

In [3]:
# Columns handed to CatBoost as categorical features (passed as `cat_features=`
# whenever a Pool is built). Order is kept as originally declared.
cat_features = [
    "emp_title", "home_ownership", "verification_status", "purpose",
    "zip_code", "addr_state",
    "initial_list_status", "application_type", "verification_status_joint",
    "disbursement_method",
]

In [4]:
# Load the three data splits into a dict keyed by split name.
# NOTE(review): "oos"/"oot" presumably mean out-of-sample / out-of-time — confirm.
# The path is an absolute, machine-specific location.
samples = {}
for split in ("train", "oos", "oot"):
    samples[split] = pd.read_parquet(f"/home/asciishell/s3/jupyter.asciishell.ru/data_{split}.parquet")

In [5]:
# Model inputs: every column of the train split except the label and the
# three `*_d` columns (presumably dates — confirm), in alphabetical order.
excluded_columns = {"target", "issue_d", "last_pymnt_d", "next_pymnt_d"}
features = sorted(set(samples["train"].columns).difference(excluded_columns))

In [6]:
# One CatBoost Pool per split, built only from rows with a known label
# (rows whose target is -1 are excluded).
pools = {}
for split_name, frame in samples.items():
    labeled_rows = frame[frame["target"] != -1]
    pools[split_name] = catboost.Pool(
        labeled_rows[features],
        labeled_rows["target"],
        cat_features=cat_features,
    )
# Disabled: quantize the train pool and reuse its borders for the other splits.
# pools['train'].quantize()
# pools['train'].save_quantization_borders('borders.dat')
# pools['oos'].quantize(input_borders='borders.dat')
# pools['oot'].quantize(input_borders='borders.dat')

In [7]:
# CatBoost hyper-parameters shared by every classifier trained in this notebook.
params = dict(
    iterations=400,
    max_depth=4,
    learning_rate=0.1,
    rsm=0.7,
    od_wait=100,
    use_best_model=True,  # needs an eval_set at fit time
    eval_metric="AUC:hints=skip_train~false",
    logging_level="Silent",
)

# Train the baseline model on all features

In [8]:
# Baseline classifier: fitted on the train pool with the OOS pool as the
# evaluation set; `plot=True` renders CatBoost's interactive training chart.
clf = catboost.CatBoostClassifier(**params)
clf.fit(pools["train"], eval_set=pools["oos"], verbose=0, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7ff3502f9670>

In [9]:
# Column shuffling used for the uplift (permutation importance) computation
import contextlib


@contextlib.contextmanager
def column_shuffler(df, col, rs):
    """Temporarily replace ``df[col]`` with a shuffled copy of itself.

    The frame is modified in place: on entry the column's values are
    permuted, and on exit (normal or via an exception raised in the
    ``with`` body) the original column is put back.

    Args:
        df: DataFrame to modify in place.
        col: Name of the column to shuffle.
        rs: Seed / random state forwarded to ``Series.sample``.

    Yields:
        The same ``df`` object, with ``col`` shuffled.
    """
    saved = df[col].copy()
    permuted = df[col].sample(frac=1, random_state=rs)
    # `.values` drops the shuffled index so the assignment is positional;
    # assigning the Series itself would re-align on the index and undo the shuffle.
    df[col] = permuted.values
    try:
        yield df
    finally:
        df[col] = saved

In [10]:
# Permutation importance on the labeled OOS rows: shuffle one feature at a
# time (same seed for every feature) and record the AUC the baseline model
# achieves on the perturbed frame. A lower AUC after shuffling means the
# model depended more on that feature.
x = samples["oos"][samples["oos"]["target"] != -1].copy()
shuffled_auc = []
for f in tqdm(features):
    with column_shuffler(x, f, 42) as dfs:
        proba = clf.predict_proba(dfs[features])[:, 1]
        shuffled_auc.append({"feature": f, "imp": roc_auc_score(dfs["target"], proba)})
# Ascending sort: features whose shuffling hurts AUC the most come first.
imp = pd.DataFrame(shuffled_auc).sort_values("imp").reset_index(drop=True)
imp

  0%|          | 0/62 [00:00<?, ?it/s]

Unnamed: 0,feature,imp
0,int_rate,0.676174
1,term,0.701812
2,emp_title,0.723821
3,dti,0.727446
4,home_ownership,0.728288
...,...,...
57,application_type,0.732869
58,sec_app_mort_acc,0.732869
59,sec_app_open_acc,0.732869
60,tax_liens,0.732870


In [None]:
# Feature-selection curve: retrain the model on growing subsets of features
# (most important first) and record train / OOS AUC for each subset size.
uplift = []

# np.percentile expects percentiles in [0, 100]; passing 0..1 would place every
# threshold between the 0th and 1st percentile. The set() removes repeated
# thresholds caused by ties in `imp`.
thresholds = sorted(set(np.percentile(imp["imp"], np.linspace(0, 100, 20))))

# Labeled rows per split do not depend on the threshold, so filter them once.
labeled = {k: v[v["target"] != -1] for k, v in samples.items()}

for thr in tqdm(thresholds):
    # `imp` stores the AUC obtained AFTER shuffling a feature, so a LOWER value
    # means a MORE important feature; `<= thr` keeps the most important ones.
    # NOTE(review): the previous code used `>= thr`, which kept the least
    # important features — confirm the intended direction.
    features2 = imp.loc[imp["imp"] <= thr, "feature"].tolist()
    # Only declare categorical columns that are actually in the subset;
    # CatBoost rejects cat_features names missing from the data.
    cat_features2 = [c for c in cat_features if c in features2]
    pools2 = {
        k: catboost.Pool(v[features2], v["target"], cat_features=cat_features2)
        for k, v in labeled.items()
    }

    # Use a separate name so the baseline `clf` (trained on all features) is
    # not overwritten and earlier cells stay re-runnable.
    subset_clf = catboost.CatBoostClassifier(**params)
    subset_clf.fit(pools2["train"], eval_set=pools2["oos"], verbose=0, plot=False)

    scores = {}
    for split in ("train", "oos"):
        proba = subset_clf.predict_proba(pools2[split])[:, 1]
        scores[split] = roc_auc_score(labeled[split]["target"], proba)

    uplift.append(
        {
            "columns": len(features2),
            "features": features2,
            "train_score": scores["train"],
            "test_score": scores["oos"],
        }
    )
uplift = pd.DataFrame(uplift)
uplift

  0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Mean actual target rate vs. mean predicted score per score bin.
# NOTE(review): `y_true` / `y_score` were not defined in any visible cell, so
# this cell failed on a fresh kernel. They are derived here from the labeled
# OOS rows and the current `clf` — confirm this is the intended sample/model.
oos_labeled = samples["oos"][samples["oos"]["target"] != -1]
y_true = oos_labeled["target"].to_numpy()
# Select the columns the fitted model was trained on, whichever model `clf` is.
y_score = clf.predict_proba(oos_labeled[clf.feature_names_])[:, 1]

data = pd.DataFrame({'true': y_true, 'score': y_score})
# Up to 7 quantile bins of the score; duplicate bin edges are merged.
# observed=False is the historical groupby default, stated explicitly.
x = (
    data.assign(q=pd.qcut(data['score'], q=7, duplicates='drop'))
    .groupby('q', observed=False)
    .agg('mean')
)
xi = np.arange(len(x))
plt.figure(figsize=(20, 7), facecolor='w')
plt.plot(xi, x['true'].values, label='True')
plt.plot(xi, x['score'].values, label='Score')
# Interval bin labels are converted to strings for use as tick labels.
plt.xticks(xi, x.index.astype(str), rotation=45)

plt.xlabel('Бин предсказания')
plt.ylabel('Target rate')
plt.title('Динамика TR от бина')
plt.grid()
plt.legend()