In [15]:
import numpy as np
import pandas as pd
%matplotlib inline
import seaborn as sns
from sklearn.datasets import make_classification
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import *

import warnings
warnings.filterwarnings("ignore")

In [125]:
def evaluate_model(y_true, y_pred, y_prob=None):
    """Print a one-line summary of classification metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have the same length as ``y_true``.
    y_prob : array-like, optional
        Scores handed to ``roc_auc_score`` and ``roc_curve``; must have the
        same length as ``y_true``. When omitted (``None``), AUC and KS are
        reported as ``nan`` instead of being computed.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    AssertionError
        If the supplied inputs do not all have the same length.
    """
    assert len(y_true) == len(y_pred), 'y_true and y_pred must have the same length'

    acc = accuracy_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Decide on *presence* of probabilities, not on their truthiness: the
    # previous `all(y_prob)` check skipped this branch whenever a single
    # probability was exactly 0.0 and then crashed on the unbound auc/ks.
    if y_prob is None:
        auc = ks = float('nan')
    else:
        assert len(y_prob) == len(y_true), 'y_true and y_prob must have the same length'
        auc = roc_auc_score(y_true, y_prob)
        # KS statistic: largest vertical gap between the TPR and FPR curves.
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        ks = np.max(tpr - fpr)

    res_description = 'ACC:{:.3f}, Recall:{:.3f}, Precision:{:.3f}, F1:{:.3f}, AUC:{:.3f}, KS:{:.3f}'
    print(res_description.format(acc, recall, precision, f1, auc, ks))

In [126]:
# Evaluate the unweighted baseline classifier on the data it was fitted on.
# NOTE(review): y, y_pred and y_prob are only assigned in cells further down,
# so at this position the cell fails on a fresh kernel (Restart & Run All).
evaluate_model(y, y_pred, y_prob)

ACC:0.945, Recall:0.950, Precision:0.941, F1:0.945, AUC:0.985, KS:0.891


In [28]:
# Synthetic dataset: 25,000 samples, 15 features of which 12 are informative, fixed seed.
X, y = make_classification(n_samples=25000, n_features=15, n_informative=12, random_state=2019)
# Rescale every feature with MinMaxScaler using its default settings.
X = MinMaxScaler().fit_transform(X)

In [66]:
# Baseline classifier fitted without sample weights.
clf = XGBClassifier(random_state=2019)
clf.fit(X, y)
# NOTE(review): predictions are made on the same X used for fitting, so every
# metric derived from them is in-sample.
y_pred = clf.predict(X)
y_prob = clf.predict_proba(X)[:, -1]  # last column of predict_proba
# Turn the probability into a 0-1000 score; a higher probability gives a lower score.
scores = (1-y_prob)*1000

In [30]:
# Summary statistics of the baseline scores.
# NOTE(review): the quartiles are presumably the basis for the cut-offs in get_bins — confirm.
pd.Series(scores).describe()

count    25000.000000
mean       500.013916
std        399.333282
min          1.136422
25%         74.196098
50%        480.368225
75%        932.698151
max        999.175049
dtype: float64

In [31]:
def get_bins(val):
    """Map a score to one of the four letter bins 'A'-'D'.

    'A' for val < 82, 'B' for 82 <= val < 470, 'C' for 470 <= val < 934 and
    'D' otherwise. A value that fails every comparison (e.g. NaN) also lands
    in 'D'.
    """
    # Upper bounds are exclusive and ascending, so the first bound that
    # exceeds val identifies its bin.
    for upper, label in ((82, 'A'), (470, 'B'), (934, 'C')):
        if val < upper:
            return label
    return 'D'

In [32]:
# Assign every sample to a bin 'A'-'D' from its baseline score; later cells
# read `bins` as a global.
bins = pd.Series(scores).map(get_bins)

In [128]:
def get_weight_prob(a, b, c, d):
    """Refit the classifier with one sample weight per score bin.

    a, b, c and d are the weights applied to samples in bins 'A', 'B', 'C'
    and 'D' respectively. Reads the module-level ``bins``, ``X`` and ``y``.

    Returns a tuple ``(y_pred, y_prob)``: the predicted labels for ``X`` and
    the last column of ``predict_proba(X)``.
    """
    weight_by_bin = dict(zip('ABCD', (a, b, c, d)))
    weights = bins.map(weight_by_bin)

    model = XGBClassifier(random_state=2019)
    model.fit(X, y, sample_weight=weights)

    labels = model.predict(X)
    probabilities = model.predict_proba(X)[:, -1]
    return labels, probabilities

In [130]:
def display_weights_res(a, b, c, d):
    """Refit with the given bin weights, print metrics and summarise each bin.

    a, b, c and d are forwarded to ``get_weight_prob`` as the sample weights
    of bins 'A'-'D'. The metrics line is printed through ``evaluate_model``.
    Reads the module-level ``bins`` and ``y``.

    Returns a DataFrame indexed by bin ('class') holding the mean score
    ('scores') and the mean label ('y_mean') of the samples in that bin.
    """
    y_pred, y_prob = get_weight_prob(a, b, c, d)
    scores = 1000 * (1 - y_prob)

    per_sample = pd.DataFrame({'class': bins, 'scores': scores, 'y_mean': y})
    # Select the columns with a list: tuple-style selection on a GroupBy
    # (`[...]['scores', 'y_mean']`) was deprecated and raises in pandas >= 2.0.
    df_res = per_sample.groupby('class')[['scores', 'y_mean']].mean()

    evaluate_model(y, y_pred, y_prob)
    return df_res

In [131]:
# Uniform weights; per the recorded output this reproduces the baseline metrics.
display_weights_res(1, 1, 1, 1)

ACC:0.945, Recall:0.950, Precision:0.941, F1:0.945, AUC:0.985, KS:0.891


Unnamed: 0_level_0,scores,y_mean
class,Unnamed: 1_level_1,Unnamed: 2_level_1
A,37.840183,0.992234
B,211.441788,0.895416
C,788.337341,0.107439
D,970.631348,0.005176


#### Objective: maximize the gap between the mean scores of bins D and A, i.e. max(D − A)

In [49]:
def opt_func(a, b, c, d):
    """Objective for the weight search: mean score of bin 'D' minus bin 'A'.

    a, b, c and d are the sample weights of bins 'A'-'D', forwarded to
    ``get_weight_prob``. Reads the module-level ``bins``.

    Returns the difference between the mean refitted score of the samples in
    bin 'D' and the mean refitted score of the samples in bin 'A'.
    """
    # get_weight_prob returns (y_pred, y_prob); only the probabilities are
    # needed here. Using the whole tuple made `1 - y_prob` raise a TypeError.
    _, y_prob = get_weight_prob(a, b, c, d)
    scores = (1 - y_prob) * 1000
    score_a = np.extract(bins == 'A', scores).mean()
    score_d = np.extract(bins == 'D', scores).mean()
    return score_d - score_a

In [50]:
# Sanity check: objective value with uniform weights.
opt_func(1, 1, 1, 1)

932.79114

In [51]:
from bayes_opt import BayesianOptimization

In [58]:
# Search range for the sample weight of each bin (a -> 'A', b -> 'B', c -> 'C', d -> 'D').
# NOTE(review): every lower bound is 0, so the search may give a bin zero weight.
param_bound = {"a":(0, 100), 'b':(0, 20), 
              "c":(0, 20), "d":(0, 50)}

# Optimizer that maximizes opt_func within the bounds above, with a fixed seed.
optimizer = BayesianOptimization(f=opt_func, pbounds=param_bound, random_state=2019, verbose=2)

In [59]:
# Run the search: init_points initial evaluations followed by n_iter further iterations.
# Each evaluation refits the classifier on the full dataset.
num_iter = 30
init_points = 5
optimizer.maximize(init_points=init_points, n_iter=num_iter)

|   iter    |  target   |     a     |     b     |     c     |     d     |
-------------------------------------------------------------------------
| [0m 1       [0m | [0m 962.5   [0m | [0m 90.35   [0m | [0m 7.862   [0m | [0m 12.48   [0m | [0m 31.89   [0m |
| [95m 2       [0m | [95m 964.3   [0m | [95m 88.05   [0m | [95m 5.983   [0m | [95m 14.04   [0m | [95m 45.16   [0m |
| [0m 3       [0m | [0m 956.2   [0m | [0m 88.14   [0m | [0m 8.115   [0m | [0m 9.049   [0m | [0m 13.35   [0m |
| [0m 4       [0m | [0m 954.2   [0m | [0m 16.29   [0m | [0m 17.78   [0m | [0m 2.97    [0m | [0m 49.24   [0m |
| [0m 5       [0m | [0m 942.1   [0m | [0m 3.236   [0m | [0m 10.31   [0m | [0m 4.023   [0m | [0m 44.3    [0m |
| [0m 6       [0m | [0m 963.7   [0m | [0m 88.83   [0m | [0m 6.619   [0m | [0m 13.52   [0m | [0m 40.7    [0m |
| [0m 7       [0m | [0m 963.6   [0m | [0m 90.68   [0m | [0m 7.148   [0m | [0m 14.63   [0m | [0m 45.1   

In [60]:
# Best objective value found ('target') and the weights that produced it ('params').
optimizer.max

{'target': 976.4725341796875,
 'params': {'a': 92.99773468238647,
  'b': 0.0,
  'c': 0.0,
  'd': 47.69674462738514}}

In [133]:
# Uniform-weight baseline, repeated for comparison with the optimized weights in the next cell.
display_weights_res(1, 1, 1, 1)

ACC:0.945, Recall:0.950, Precision:0.941, F1:0.945, AUC:0.985, KS:0.891


Unnamed: 0_level_0,scores,y_mean
class,Unnamed: 1_level_1,Unnamed: 2_level_1
A,37.840183,0.992234
B,211.441788,0.895416
C,788.337341,0.107439
D,970.631348,0.005176


In [140]:
# Refit with the best weights found by the optimizer.
# Per the recorded output, the A/D score gap widens while ACC, AUC and KS drop versus the baseline.
display_weights_res(**optimizer.max['params'])

ACC:0.886, Recall:0.921, Precision:0.860, F1:0.890, AUC:0.962, KS:0.782


Unnamed: 0_level_0,scores,y_mean
class,Unnamed: 1_level_1,Unnamed: 2_level_1
A,11.218301,0.992234
B,141.664795,0.895416
C,730.933167,0.107439
D,987.690918,0.005176
