In [None]:
# Общее
import pandas as pd
import numpy as np
from scipy import stats
from dateutil.relativedelta import relativedelta
import datetime

# Графики
from matplotlib import pyplot as plt
import matplotlib
from pylab import rcParams
import seaborn

import telegram_send
    
%matplotlib inline

# Model

In [None]:
# featured_data = data

In [None]:
from xgboost import XGBClassifier
from xgboost import plot_importance

from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import time
from IPython.display import clear_output

In [None]:
# Count the objects of class `cl` and how many of them were predicted as `cl`

def count_class(true, pred, cl=1):
    '''
    Print how many objects truly belong to class cl and how many of those
    were also predicted as cl.

    true : true labels, read positionally through .iloc
    pred : predicted labels, read by integer index
    cl : class label to count
    '''
    # Positions whose true label is the requested class.
    members = [pos for pos in range(len(true)) if true.iloc[pos] == cl]
    # Of those, the positions where the prediction agrees.
    hits = sum(1 for pos in members if pred[pos] == cl)

    report = (
        'Classifier with this thres. assign to {} class:'.format(cl),
        'all_in_class = {}'.format(len(members)),
        'clf_pred = {}'.format(hits),
        '',
    )
    for line in report:
        print(line)

    
# Oversample the positive class (label 1) for the classifier

def oversampling(features, labels):
    '''
    Balance a binary data set by duplicating rows of the positive class (label 1).

    Rows with label 1 are repeated cyclically and appended until the number
    of label-1 rows equals the number of label-0 rows. The inputs are never
    modified: the balanced data is returned as new objects, so the caller
    has to use the return value.

    features : numpy array or DataFrame, shape = [n_samples, n_features]
    labels : numpy array or Series of 0 / 1 labels, shape = [n_samples]

    Returns a (features, labels) tuple. A DataFrame / Series input comes back
    as a DataFrame / Series with a fresh 0..n-1 index, anything else as a
    numpy array. When there are no label-1 rows, or label 1 is not the
    minority, the inputs are returned as they were passed in.
    '''
    label_values = np.asarray(labels)
    positive_idx = np.flatnonzero(label_values == 1)
    # How many extra positive rows are needed to match the negatives.
    n_missing = int((label_values == 0).sum()) - positive_idx.shape[0]
    if positive_idx.shape[0] == 0 or n_missing <= 0:
        return features, labels

    # np.resize repeats the positive positions cyclically up to n_missing items.
    extra_idx = np.resize(positive_idx, n_missing)

    if isinstance(features, pd.DataFrame):
        balanced_features = pd.concat([features, features.iloc[extra_idx]],
                                      ignore_index=True)
    else:
        feature_values = np.asarray(features)
        # axis=0 keeps the 2-D layout (np.append without axis would flatten).
        balanced_features = np.concatenate(
            [feature_values, feature_values[extra_idx]], axis=0)

    if isinstance(labels, pd.Series):
        balanced_labels = pd.concat([labels, labels.iloc[extra_idx]],
                                    ignore_index=True)
    else:
        balanced_labels = np.concatenate([label_values, label_values[extra_idx]])

    return balanced_features, balanced_labels


def _report_split(clf, X, y, prob_thres, show_class, title):
    '''Print class counts and ROC AUC of a fitted clf on one split and plot its ROC curve.

    Returns the predicted probabilities of class 1 for X.
    '''
    print(title)
    proba = clf.predict_proba(X)[:, 1]
    # Hard labels: class 1 only when its probability exceeds the threshold.
    hard_preds = (proba > prob_thres).astype(int)
    count_class(y, hard_preds, show_class)

    print('AUC =', roc_auc_score(y, proba))
    fpr, tpr, _ = roc_curve(y, proba)
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1])  # diagonal drawn for reference
    plt.show()
    return proba


def make_classification(featured_data, labels, clf, scaler, test_size=0.25,
                        show_feat_imp=None, make_cv=False, cv_folds = 5,
                        prob_thres=0.5, show_class=0):
    '''
    Make classification by clf classifier.

    The features are scaled with scaler (fitted on the whole data set), split
    into stratified train / test parts, clf is fitted on the train part and
    class counts, ROC AUC and a ROC curve are reported for the test part,
    the train part and the whole data set.

    featured_data : features (DataFrame; its column names label the importances)
    labels : labels of classes
    clf : classifier
    scaler : object with fit_transform, applied to featured_data
    test_size : share of the data held out for the test part
    show_feat_imp : 'log_reg' prints clf.coef_, 'xgb' prints
                    clf.feature_importances_ and plots them; None shows nothing
    make_cv : if True make cross-validation on cv_folds
    cv_folds : number of stratified folds for the cross-validation
    prob_thres : probability threshold to classify object as 1 class
    show_class : class to count in count_class func

    Returns the predicted probabilities of class 1 for every row of the
    scaled featured_data.
    '''
    # Scale and Split
    X = scaler.fit_transform(featured_data)
    y = labels
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, shuffle=True, stratify=y)

    # Only the train part is oversampled. Use the resampled pair when
    # oversampling returns one; keep the untouched split when it returns None.
    resampled = oversampling(X_train, y_train)
    if resampled is not None:
        X_train, y_train = resampled

    # Cross-validation
    if make_cv:
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
        cv_score = cross_val_score(clf, X, y, scoring='roc_auc', cv=cv)
        print('AUC on CV =', cv_score)
        print('Mean AUC on CV =', cv_score.mean())
        telegram_send.send(messages=['Score on cv: ' + str(cv_score.mean()),],
                   conf=r'C:\Users\shumilkinayu\Documents\tg_notification.conf')

    # Clf fit
    clf.fit(X_train, y_train)
    print('----- Clf fitted! -----')

    # Feature importance. Inside a function a bare expression is not rendered
    # by the notebook, so the tables are printed explicitly.
    if show_feat_imp == 'log_reg':
        print(pd.DataFrame(index=featured_data.columns.values,
                           data=clf.coef_.reshape([-1, 1])).sort_values(by=0).head())

    if show_feat_imp == 'xgb':
        print(pd.DataFrame(index=featured_data.columns.values,
                           data=clf.feature_importances_.reshape([-1, 1]))
              .sort_values(by=0, ascending=False).head())

        # Xgb plot feature importance
        fig, ax = plt.subplots(1, 1, figsize=(15, 45))
        plot_importance(clf, ax)
        plt.show()

    # Metrics on test, train and the whole data set
    _report_split(clf, X_test, y_test, prob_thres, show_class, 'TEST:')
    _report_split(clf, X_train, y_train, prob_thres, show_class, 'TRAIN:')
    return _report_split(clf, X, y, prob_thres, show_class, 'ALL:')

In [None]:
telegram_send.send(messages=['Classifier fit_predict began!',], 
                   conf=r'C:\Users\shumilkinayu\Documents\tg_notification.conf')
clf = XGBClassifier()
scaler = StandardScaler()

# XGBoost hyper-parameters for this run. They are defined in this cell so
# that it also works on a fresh kernel (Restart & Run All) instead of relying
# on a `params` variable left over from another cell.
params = {'base_score': 0.5,
     'colsample_bylevel': 1,
     'colsample_bytree': 1,
     'gamma': 0.4,
     'learning_rate': 0.015,
     'max_delta_step': 0,
     'max_depth': 3,
     'min_child_weight': 6,
     'missing': None,
     'n_estimators': 1000,
     'nthread': -1,
     'objective': 'binary:logistic',
     'reg_alpha': 10,
     'reg_lambda': 0.5,
     'scale_pos_weight': 1,
     'seed': 0,
     'silent': True,
     'subsample': 1}


clf.set_params(**params)

# Fit the classifier, print / plot its quality and keep the class-1
# probabilities predicted for the whole data set.
pred_proba_on_data = make_classification(featured_data, y, clf, scaler,
                                         test_size=0.25, show_feat_imp='xgb', 
                                         make_cv=True, cv_folds=5, show_class=1, prob_thres=0.5)

telegram_send.send(messages=['Classifier fit_predict ended!',], 
                   conf=r'C:\Users\shumilkinayu\Documents\tg_notification.conf')

# Feature selection

In [None]:
def feature_selection(featured_data, y, model, scaler, params):
    '''
    Pick the feature-importance threshold with the best cross-validated ROC AUC.

    Every distinct value of model.feature_importances_ is tried as a
    threshold: the features selected by SelectFromModel for that threshold
    are kept, a fresh XGBClassifier configured with params is cross-validated
    on them (3 folds, 'roc_auc'), and the threshold with the highest mean
    score wins (the lowest such threshold on ties). The winning threshold and
    score are printed and the score is sent through telegram_send.

    featured_data : features (DataFrame; its columns are used for the final printout)
    y : labels of classes
    model : already fitted estimator exposing feature_importances_
    scaler : object with fit_transform, applied to featured_data
    params : dict of parameters for the XGBClassifier evaluated at each threshold

    Returns a boolean mask over model.feature_importances_ that is True for
    the features whose importance is at least the winning threshold.
    '''
    importances = model.feature_importances_
    thresholds = np.sort(np.unique(importances))
    scores = []

    X = scaler.fit_transform(featured_data)

    for i, thresh in enumerate(thresholds):
        # prefit=True: reuse the fitted model's importances, do not refit it.
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X = selection.transform(X)

        selection_model = XGBClassifier()
        selection_model.set_params(**params)

        # No oversampling here: duplicating positive rows before
        # cross-validation would put copies of the same row into both the
        # training and the validation folds and inflate the score.
        cv_score = cross_val_score(selection_model, select_X, y,
                                   scoring='roc_auc', cv=3)
        if i % 10 == 0:
            print('step', i)
        scores.append(cv_score.mean())

    # argmax looks at every score, including the one for the last threshold,
    # and returns the first maximum.
    mx_id = int(np.argmax(scores))

    print(thresholds[mx_id], scores[mx_id])
    a = (importances >= thresholds[mx_id])
    telegram_send.send(messages=['Score after feat. sel.: ' + str(scores[mx_id]),], 
                   conf=r'C:\Users\shumilkinayu\Documents\tg_notification.conf')
    print(featured_data.columns[a].shape)
    return a

In [None]:
# XGBoost settings given to every candidate model during feature selection.
params = dict(
    base_score=0.5,
    colsample_bylevel=1,
    colsample_bytree=1,
    gamma=0.4,
    learning_rate=0.15,
    max_delta_step=0,
    max_depth=3,
    min_child_weight=6,
    missing=None,
    n_estimators=100,
    nthread=-1,
    objective='binary:logistic',
    reg_alpha=10,
    reg_lambda=0.5,
    scale_pos_weight=1,
    seed=0,
    silent=True,
    subsample=1,
)

# Notify, run the selection with the classifier fitted earlier, notify again.
telegram_send.send(
    messages=['Feature selection began',],
    conf=r'C:\Users\shumilkinayu\Documents\tg_notification.conf',
)
compreh_good_feats = feature_selection(featured_data, y, clf, scaler, params)
telegram_send.send(
    messages=['Feature selection end!',],
    conf=r'C:\Users\shumilkinayu\Documents\tg_notification.conf',
)

# Model params tuning

In [None]:
from sklearn.model_selection import GridSearchCV

class ParametersTuner:
    '''Tuning parameters for XGBoost step by step.

    Each step runs a grid search over a small group of parameters while the
    others stay fixed, then stores the best values found in model_params and
    on the model before the next step starts.

    Parameters
    ----------
    validation_set : matrix, shape = [n_samples, n_features]
    Featured object set to validate parameters on

    y : array, shape = [n_samples, 1]
    Labels for validation_set

    test_params_lists : dict with list values
    Dictionary with lists of xgb parameter values for Grid Search to try;
    overrides the built-in default lists key by key

    base_params : dict
    Dictionary with base params for XGBoost. It is copied, the caller's
    dictionary is never modified

    scoring : string, default='roc_auc'
    Sklearn metric name used by Grid Search to score parameters

    cv : integer or cross-validation generator, default=3,
    If integer, it is number of cv folds.
    Else cv.

    Attributes
    ----------
    model : XGBClassifier carrying the current parameters
    model_params : dict with the current parameter values
    '''

    def __init__(self, validation_set, y, test_params_lists,
                 base_params, scoring='roc_auc', cv=3):

        # Default candidate values; entries passed in test_params_lists win.
        self.test_params_lists = {
            'learning_rate' : [0.005, 0.01, 0.1, 0.2, 0.5],
            'n_estimators' : [50, 100, 500, 1000],
            'max_depth' : [2, 3, 5, 10],
            'min_child_weight':[1, 2, 3, 6],
            'gamma' : [i / 10.0 for i in range(0,5)],
            'reg_alpha' : [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000],
            'reg_lambda' : [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000],
            'seed' : 0  # not searched by any tuning step
        }
        if test_params_lists:
            self.test_params_lists.update(test_params_lists)

        self.val_set = validation_set
        self.y = y
        # Copy: tuning rewrites model_params and must not touch the caller's dict.
        self.model_params = dict(base_params or {})
        self.model = XGBClassifier()
        # Apply the base params right away so the first grid search already
        # starts from them instead of from the XGBClassifier defaults.
        self.model.set_params(**self.model_params)
        self.scoring = scoring
        self.cv = cv

    def _make_grid_search_and_param_change(self, test_params):
        '''Grid-search test_params on the validation set and keep the best values.

        test_params : dict mapping parameter name to the list of values to try
        '''
        # No `iid` argument: current scikit-learn GridSearchCV does not accept it.
        grid_search = GridSearchCV(estimator=self.model, param_grid=test_params,
                                   scoring=self.scoring, cv=self.cv, verbose=10)
        grid_search.fit(self.val_set, self.y)

        print('Best params:', grid_search.best_params_,
              'Score:', grid_search.best_score_)
        print()

        self.model_params.update(grid_search.best_params_)
        self.model.set_params(**self.model_params)

    def make_tuning(self):
        '''Run all tuning steps in order, updating model and model_params.'''

        # A list keeps the step order explicit.
        steps = [
            ('1. RATE', ['learning_rate']),
            ('2. N_OF_EST', ['n_estimators']),
            ('3. TREE_PARAMS', ['min_child_weight', 'max_depth', 'gamma']),
            ('4. REGURALIZATION', ['reg_alpha', 'reg_lambda']),
        ]

        for step, param_names in steps:
            print('--------STEP: {}--------'.format(step))
            test_params = {name: self.test_params_lists[name]
                           for name in param_names}
            # One parameter list per printed line.
            print('],\n'.join(str(test_params).split('],')))
            self._make_grid_search_and_param_change(test_params)

        # Final step: check whether a 10x smaller learning rate and / or
        # 10x more trees beats the values found so far.
        step = '5. REDUCE RATE'
        print('--------STEP {}--------'.format(step))
        rate = self.model_params['learning_rate']
        n_trees = self.model_params['n_estimators']
        test_params = {'learning_rate' : [rate, rate / 10],
                       'n_estimators' : [n_trees, n_trees * 10]}
        print('],\n'.join(str(test_params).split('],')))
        self._make_grid_search_and_param_change(test_params)

In [None]:
# Starting point for the tuner: values used until a step replaces them.
base_params = dict(
    learning_rate=0.01,
    n_estimators=100,
    max_depth=3,
    gamma=0,
    min_child_weight=1,
    reg_alpha=0,
    reg_lambda=1,
    seed=0,
)

# Candidate values per parameter for the grid-search steps.
params_list = dict(
    learning_rate=[0.2],
    n_estimators=[100],
    max_depth=[2, 3, 4, 5, 10],
    min_child_weight=[1, 2, 3, 4, 6],
    gamma=[step / 10.0 for step in range(5)],
    reg_alpha=[0.01, 0.1, 0.5, 1, 1.5, 10, 30, 50, 100, 1000],
    reg_lambda=[0.01, 0.1, 0.5, 1, 1.5, 10, 30, 50, 100, 1000],
    seed=0,
)

scaler = StandardScaler()
# Tune only on the columns kept by feature selection, scaled with a fresh scaler.
tuner = ParametersTuner(
    scaler.fit_transform(featured_data[featured_data.columns[compreh_good_feats]]),
    y,
    params_list,
    base_params,
)
tuner.make_tuning()
# Last expression of the cell: show the tuned model's parameters.
tuner.model.get_params()