In [1]:
import pandas as pd
import numpy as np

import itertools

# Model training and data preparation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import scale, label_binarize
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc, accuracy_score, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, classification_report
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp


# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Global plot styling for the whole notebook: select seaborn's "darkgrid"
# style, then set the default figure size to 12x8 inches through rc params.
sns.set(style="darkgrid")
sns.set(rc={'figure.figsize':(12,8)})

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings that
# may matter while tuning models below.
import warnings
warnings.filterwarnings('ignore')


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it as an annotated heat-map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are labelled "True label" and columns
        "Predicted label" on the plot.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        When True, every row is divided by its sum before the matrix is
        printed AND drawn, so the colours and the cell text agree.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None. Draws on the current matplotlib figure and prints the matrix.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing: otherwise the image would show raw counts
    # while the text annotations show per-row ratios.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows with no samples stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are shown as-is; ratios are rounded to two decimals so
    # the annotations stay readable inside the cells.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    # Switch the text colour on dark cells so the number remains visible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


def roc_curve(X, Y, model=None):
    """
    Plot the ROC curve of a fitted binary classifier evaluated on (X, Y).

    Parameters
    ----------
    X : array-like
        Feature matrix passed to ``model.predict_proba``.
    Y : array-like
        True labels; the positive class is the label ``1``.
    model : estimator with ``predict_proba``, optional
        Fitted classifier to evaluate. When omitted, the notebook-level
        variable ``lr`` is used, which keeps the original two-argument call
        working (``lr`` must then exist, otherwise a NameError is raised).

    Returns
    -------
    None. Draws on a new 10x8 matplotlib figure.
    """
    # This function has the same name as sklearn.metrics.roc_curve and
    # replaces it in the notebook namespace. Calling `roc_curve(...)` here
    # would recurse into this plotting function, so the scikit-learn
    # implementation is imported locally under a distinct alias.
    from sklearn.metrics import roc_curve as sk_roc_curve

    if model is None:
        model = lr

    sns.set(font_scale=1.5)
    sns.set_color_codes("muted")

    plt.figure(figsize=(10, 8))
    # Column 1 of predict_proba is used as the score of the positive class
    # (assumes the estimator's second class is the label 1 -- TODO confirm).
    positive_scores = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = sk_roc_curve(Y, positive_scores, pos_label=1)
    lw = 2
    plt.plot(fpr, tpr, lw=lw, label='ROC curve ')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
In [2]:
# Column subset kept from the dataset: 'TARGET' is the label, every other
# entry is used as a feature column.
best_col = [
    'TARGET', 'CREDIT', 'FACT_LIVING_TERM',
    'FST_PAYMENT', 'AGE', 'PERSONAL_INCOME',
    'WORK_TIME', 'LOAN_NUM_PAYM', 'TERM',
    'EDUCATION', 'CHILD_TOTAL', 'DEPENDANTS',
    'MARITAL_STATUS', 'FAMILY_INCOME', 'LOAN_NUM_CLOSED',
    'FACT_PHONE_FL', 'LOAN_AVG_DLQ_AMT', 'GPF_DOCUMENT_FL',
    'LOAN_MAX_DLQ_AMT', 'GENDER', 'FL_PRESENCE_FL',
    'LOAN_NUM_TOTAL', 'HS_PRESENCE_FL', 'REG_FACT_POST_TP_FL',
    'OWN_AUTO', 'REG_FACT_POST_FL', 'LOAN_DLQ_NUM',
    'AUTO_RUS_FL', 'REG_POST_FL', 'REG_FACT_FL',
    'PREVIOUS_CARD_NUM_UTILIZED', 'LAND_PRESENCE_FL', 'REG_PHONE_FL',
    'FACT_POST_FL', 'SOCSTATUS_PENS_FL', 'LOAN_MAX_DLQ',
    'ORG_TP_FCAPITAL', 'SOCSTATUS_WORK_FL', 'GAR_PRESENCE_FL',
    'GEN_PHONE_FL', 'COT_PRESENCE_FL',
]

In [3]:
# Read the tab-separated dataset and keep only the columns named in best_col.
data = pd.read_csv('credit.clean', sep='\t', index_col=None)[best_col]

# Label vector and feature matrix. Index.difference sorts its result by
# default, so the columns of X follow that sorted order rather than the
# order of best_col.
y = data['TARGET'].values
X = data[data.columns.difference(['TARGET'])].values

# Hold out 33% of the rows for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.33,
)

In [None]:
# Grid over the number of boosting stages (k) and the random seed (i).
# For every pair: 5-fold cross-validated ROC AUC on the training split, then a
# refit on the full training split to measure train/test misclassification.
# NOTE(review): the results are ranked by err_test, i.e. the held-out set is
# used for model selection -- confirm this is intended.
cv_scores_gbm = []

for k in range(85, 100, 2):
    for i in range(75, 90, 2):
        clf = GradientBoostingClassifier(learning_rate=1, n_estimators=k, random_state=i, verbose=0)
        kf = KFold(n_splits=5, random_state=i, shuffle=True)
        scores = cross_val_score(clf, X_train, y_train, cv=kf, scoring='roc_auc')
        clf.fit(X_train, y_train)
        err_train = np.mean(y_train != clf.predict(X_train))
        err_test  = np.mean(y_test  != clf.predict(X_test))
        # Store all six values: the DataFrame below declares six columns and
        # is sorted/plotted by err_test, so the error rates must be recorded.
        cv_scores_gbm.append([k, i, scores.mean(), scores.std(), err_train, err_test])

df_scores_gbm = pd.DataFrame(cv_scores_gbm, columns=['k', 'i', 'roc_auc_score', 'STD', 'err_train', 'err_test'])
print(df_scores_gbm.sort_values(by=['err_test'], ascending=True))
sns.lineplot(x="k", y="err_test", data=df_scores_gbm)