In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file that Kaggle mounted under the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import random

from sklearn.model_selection import train_test_split, cross_val_score, learning_curve
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (roc_auc_score, roc_curve, auc, confusion_matrix, \
                             accuracy_score, classification_report, plot_confusion_matrix, \
                             plot_precision_recall_curve, precision_recall_curve, precision_score,
                             recall_score, plot_roc_curve, f1_score)


from sklearn.model_selection import KFold, GridSearchCV
from seaborn import heatmap
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

def evaluate_preds(model, X_train, X_valid, y_train, y_valid):
    """Print the F1 score and the full classification reports for both splits.

    Args:
        model: fitted estimator exposing ``predict``.
        X_train, X_valid: feature matrices of the train and validation splits.
        y_train, y_valid: true labels matching the two feature matrices.
    """
    train_pred, valid_pred = model.predict(X_train), model.predict(X_valid)

    # Headline F1 per split, rounded to three decimals.
    for title, truth, guess in (("TRAIN\n", y_train, train_pred),
                                ("TEST\n", y_valid, valid_pred)):
        print(title, round(f1_score(truth, guess), 3))

    get_classification_report(y_train, train_pred, y_valid, valid_pred)

def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for train and test plus the test confusion matrix.

    Args:
        y_train_true, y_train_pred: true and predicted labels of the train split.
        y_test_true, y_test_pred: true and predicted labels of the test split.
    """
    sections = (
        ('TRAIN\n\n', y_train_true, y_train_pred),
        ('TEST\n\n', y_test_true, y_test_pred),
    )
    for header, truth, guess in sections:
        print(header + classification_report(truth, guess))

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)
    

### <b>Proba Calibration Plots

In [None]:
def show_proba_calibration_plots(y_predicted_probs, y_true_labels):
    """Plot precision/recall/F1 against the decision threshold and a probability histogram.

    The left panel shows the three metrics for thresholds 0.1 ... 0.9 together with
    a table of the same numbers sorted by F1; the right panel shows the predicted
    probabilities split by true class.

    Args:
        y_predicted_probs: predicted probabilities of the positive class
            (any 1-D array-like).
        y_true_labels: true binary labels (0/1), same length as the probabilities.
    """
    probs = np.asarray(y_predicted_probs)
    labels = np.asarray(y_true_labels)

    thresholds = []
    precisions = []
    recalls = []
    f1_scores = []

    for threshold in np.linspace(0.1, 0.9, 9):
        # Binarise once per threshold and reuse the result for all three metrics.
        predicted = (probs > threshold).astype(int)
        thresholds.append(threshold)
        precisions.append(precision_score(labels, predicted))
        recalls.append(recall_score(labels, predicted))
        f1_scores.append(f1_score(labels, predicted))

    scores_table = pd.DataFrame({'f1':f1_scores,
                                 'precision':precisions,
                                 'recall':recalls,
                                 'probability':thresholds}).sort_values('f1', ascending=False).round(3)

    figure = plt.figure(figsize = (15, 5))

    plt1 = figure.add_subplot(121)
    plt1.plot(thresholds, precisions, label='Precision', linewidth=4)
    plt1.plot(thresholds, recalls, label='Recall', linewidth=4)
    plt1.plot(thresholds, f1_scores, label='F1', linewidth=4)
    plt1.set_ylabel('Scores')
    plt1.set_xlabel('Probability threshold')
    plt1.set_title('Probabilities threshold calibration')
    plt1.legend(bbox_to_anchor=(0.25, 0.25))
    # The bbox places the table below the axes so it does not cover the curves.
    plt1.table(cellText = scores_table.values,
               colLabels = scores_table.columns,
               colLoc = 'center', cellLoc = 'center', loc = 'bottom', bbox = [0, -1.3, 1, 1])

    plt2 = figure.add_subplot(122)
    plt2.hist(probs[labels == 0],
              label='Another class', color='royalblue', alpha=1)
    plt2.hist(probs[labels == 1],
              label='Main class', color='darkcyan', alpha=0.8)
    plt2.set_ylabel('Number of examples')
    plt2.set_xlabel('Probabilities')
    plt2.set_title('Probability histogram')
    plt2.legend(bbox_to_anchor=(1, 1))

    plt.show()
    


### <b>Learning Curve Plot

In [None]:
def show_learning_curve_plot(estimator, X, y, cv=3, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot train and cross-validation F1 against the number of training examples.

    Args:
        estimator: estimator passed to ``learning_curve``.
        X, y: features and labels.
        cv: cross-validation setting forwarded to ``learning_curve``.
        n_jobs: parallelism setting forwarded to ``learning_curve``.
        train_sizes: training-set sizes forwarded to ``learning_curve``.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        scoring='f1',
        train_sizes=train_sizes,
        n_jobs=n_jobs,
    )

    # One entry per curve: (mean over folds, std over folds, colour, legend label).
    curves = (
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (cv_scores.mean(axis=1), cv_scores.std(axis=1), "g", "Cross-validation score"),
    )

    plt.figure(figsize=(15,8))
    # Shaded +/- one standard deviation bands first, then the mean lines on top.
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=label)

    plt.title(f"Learning curves ({type(estimator).__name__})")
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid()
    plt.show()

def show_feature_importances(feature_names, feature_importances, get_top=None):
    """Draw a horizontal bar chart of feature importances, most important first.

    Args:
        feature_names: sequence of feature names.
        feature_importances: importances aligned with ``feature_names``.
        get_top: when given, the number of top feature names to return.

    Returns:
        A list with the ``get_top`` most important feature names, or ``None``
        when ``get_top`` is ``None``.
    """
    ranking = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
    ranking = ranking.sort_values('importance', ascending=False)

    # Height grows with the number of features; keep it positive for empty input.
    plt.figure(figsize = (20, max(len(ranking), 1) * 0.355))

    # x/y are passed by keyword: recent seaborn releases reject them positionally.
    sns.barplot(x=ranking['importance'], y=ranking['feature'])

    plt.xlabel('Importance')
    plt.title('Importance of features')
    plt.show()

    if get_top is not None:
        return ranking['feature'][:get_top].tolist()



**Описание датасета**

* **Home Ownership** - домовладение
* **Annual Income** - годовой доход
* **Years in current job** - количество лет на текущем месте работы
* **Tax Liens** - налоговые обременения
* **Number of Open Accounts** - количество открытых счетов
* **Years of Credit History** - количество лет кредитной истории
* **Maximum Open Credit** - наибольший открытый кредит (максимальная сумма, которая когда-либо была доступна клиенту)
* **Number of Credit Problems** - количество проблем с кредитом
* **Months since last delinquent** - количество месяцев с последней просрочки платежа
* **Bankruptcies** - банкротства
* **Purpose** - цель кредита
* **Term** - срок кредита
* **Current Loan Amount** - текущая сумма кредита (сумма, которую еще предстоит выплатить клиенту)
* **Current Credit Balance** - текущий кредитный баланс (сумма, которую может тратить клиент с кредитного счета)
* **Monthly Debt** - ежемесячный долг
* **Credit Score** - баллы кредитного рейтинга
* **Credit Default** - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)

In [None]:
"""
df - DataFrame с данными
feature - имя исследуемой переменной
target = target(feature), 
"""
def bar_plot_for_cat_dep_on_target(df, feature, target):
    """Draw one bar chart of ``target`` value counts per category of ``feature``.

    Args:
        df: DataFrame holding the data.
        feature: name of the categorical column whose categories are compared.
        target: name of the column whose value counts are shown in each panel.
    """
    categories = df[feature].value_counts().index
    if len(categories) == 0:
        return  # nothing to plot; plt.subplots cannot create zero columns

    # squeeze=False always yields a 2-D array, so a single category works too.
    fig, axes = plt.subplots(1, len(categories), squeeze=False)
    fig.set_size_inches(20, 5)
    for ax, category in zip(axes.ravel(), categories):
        counts = df.loc[df[feature] == category, target].value_counts()
        ax.bar(x = counts.index, height = counts.values)
        ax.set_xlabel(target)
        ax.set_ylabel(category)

    plt.tight_layout()
    
def hist_plot_for_cat_dep_on_target(df, feature, target):
    """Draw one histogram of ``target`` per category of ``feature``.

    Args:
        df: DataFrame holding the data.
        feature: name of the categorical column whose categories are compared.
        target: name of the numeric column whose distribution is shown.
    """
    categories = df[feature].value_counts().index
    if len(categories) == 0:
        return  # nothing to plot; plt.subplots cannot create zero columns

    # squeeze=False always yields a 2-D array, so a single category works too.
    fig, axes = plt.subplots(1, len(categories), squeeze=False)
    fig.set_size_inches(20, 5)
    for ax, category in zip(axes.ravel(), categories):
        # Plot the requested column: the x-axis is labelled with `target`, so the
        # data must come from `target` rather than a fixed column.
        ax.hist(df.loc[df[feature] == category, target], color = 'orange', bins = 5)
        ax.set_xlabel(target)
        ax.set_ylabel(category)

    plt.tight_layout()
    
def box_plot_for_unique_values_of_feature(df, feature, target):
    """Draw a box plot of ``target`` for every unique value of ``feature``.

    Panels are laid out on a grid four columns wide.

    Args:
        df: DataFrame holding the data.
        feature: name of the categorical column that defines the panels.
        target: name of the numeric column shown in each box plot.
    """
    categories = df[feature].value_counts().index
    if len(categories) == 0:
        return  # nothing to plot

    cols = 4
    # Ceiling division: exactly as many rows as needed, with no spare row when
    # the number of categories is a multiple of `cols`.
    rows = -(-len(categories) // cols)
    fig, axes = plt.subplots(rows, cols, squeeze=False)
    axes = axes.ravel()
    fig.set_size_inches(15, 15)
    for ax, category in zip(axes, categories):
        # Drop NaNs first: a single missing value would otherwise blank out the box.
        ax.boxplot(df.loc[df[feature] == category, target].dropna())
        ax.set_xlabel(target)
        ax.set_ylabel(category)

    # Hide the grid cells that did not receive a category.
    for ax in axes[len(categories):]:
        ax.set_visible(False)

    plt.tight_layout()

In [None]:
TRAIN_DATASET_PATH = '/kaggle/input/gb-credit-default/train.csv'
TEST_DATASET_PATH = '/kaggle/input/gb-credit-default/test.csv'
SAMPLE_SUBMIT_PATH = '/kaggle/input/gb-credit-default/sample_submission.csv'

In [None]:
df = pd.read_csv(TRAIN_DATASET_PATH)

### <b>Признаки с пропусками

In [None]:
nan_features = ['Annual Income', 'Years in current job', 'Months since last delinquent', 'Bankruptcies', 'Credit Score']

### <b> Номинативные признаки

In [None]:
cat_features = ['Home Ownership', 'Years in current job', 'Purpose', 'Term']

### <b>Вещественные признаки
   

In [None]:
num_features = ['Annual Income', 'Tax Liens', 'Number of Open Accounts', 'Years of Credit History', 'Maximum Open Credit',\
                'Number of Credit Problems', 'Months since last delinquent', 'Bankruptcies', 'Current Loan Amount', 'Current Credit Balance',\
                'Monthly Debt', 'Credit Score', 'Credit Default']

In [None]:
df.hist(bins = 20, figsize = (16, 16))
plt.show()

In [None]:
df.info()

### <b>Подозрительные на выбросы
* Maximum Open Credit, наибольший открытый кредит (максимальная сумма, которая когда-либо была доступна клиенту)
* Months since last delinquent, количество месяцев с последней просрочки платежа
* Current Loan Amount, текущая сумма кредита (сумма, которую еще предстоит выплатить клиенту)
* Current Credit Balance, текущий кредитный баланс (сумма, которую может тратить клиент с кредитного счета)
* Credit Score, баллы кредитного рейтинга






<iframe src="https://www.kaggle.com/embed/pavelzagreba/credit-default?cellId=6&cellIds=6&kernelSessionId=58683825" height="300" style="margin: 0 auto;"></iframe>


### <b>Корреляционная матрица

In [None]:
fig, axes = plt.subplots(1, 1)
fig.set_size_inches(15, 10)
# Correlate numeric columns only: the frame still holds string columns at this
# point, and recent pandas raises on them instead of silently skipping them.
corr_matrix = df.select_dtypes(include='number').corr()
# Zero out everything below 0.3 so only the stronger positive links stand out.
# Note that this also hides negative correlations, whatever their strength.
corr_matrix[corr_matrix < 0.3] = 0
heatmap(corr_matrix, annot = True, cmap = 'Blues', ax = axes)


### <b>Линейные связи<br>
    
    
###    1. Bankruptcies ~ Number of Credit Problems <b>(0.73)</b>, количество банкротств и кол-во проблем с кредитом
    
###    2. Tax Liens ~ Number of Credit Problems <b>(0.6)</b>, налоговые долги и кол-во проблем с кредитом
    
###    3. Monthly Debt ~ Annual Income <b>(0.58)</b>, ежемесячный долг и годовой доход
     
###    4. Current Credit Balance ~ Monthly Debt <b>(0.5)</b>, текущий кредитный баланс (сумма, которую может тратить клиент с кредитного счета) и ежемесячный долг
    
###    5. Credit Score ~ Credit Default <b>(0.44)</b>, баллы кредитного рейтинга и факт невыполнения кредитных обязательств
    
    

### <b>Обработка выбросов

* Maximum Open Credit, наибольший открытый кредит (максимальная сумма, которая когда-либо была доступна клиенту)
* Months since last delinquent, количество месяцев с последней просрочки платежа
* Current Loan Amount, текущая сумма кредита (сумма, которую еще предстоит выплатить клиенту)
* Current Credit Balance, текущий кредитный баланс (сумма, которую может тратить клиент с кредитного счета)
* Credit Score, баллы кредитного рейтинга

### <b>Maximum Open Credit
    * У признака было 64 значения, равных 0, но при этом другие признаки не были согласованы с этим значением. Например, у большинства было указано: цель кредита, срок и т.д. Для начала просто удалим эти данные, а в дальнейшем попробуем придумать хоть какую-то обработку.
    * У признака было 75 значений, превышающих 0.99 квантиль — поступим с ними аналогичным образом.

In [None]:
df['Maximum Open Credit'].plot(kind = 'box')



In [None]:
# Rows with a zero limit and rows above the 0.99 quantile are both treated as outliers.
cond_1 = df['Maximum Open Credit'] == 0
cond_2 = df['Maximum Open Credit'] > df['Maximum Open Credit'].quantile(q = 0.99)

# Remove both groups in one step. Dropping them one after another would index the
# already-shortened frame with a mask built on the original index.
df = df.loc[~(cond_1 | cond_2)].copy()

In [None]:
df['Maximum Open Credit'].hist(bins = 20, figsize = (4,4))

In [None]:
df['Maximum Open Credit'].plot(kind = 'box')

### <b>Months since last delinquent
    1. Данные изначально имеют хорошее распределение (+)
    2. Более половины значений этого признака попросту отсутствуют (-)
    
   * Скорее всего эти данные придется удалить.

In [None]:
df['Months since last delinquent'].plot(kind = 'box')
print(df['Months since last delinquent'].quantile(0.001), df['Months since last delinquent'].quantile(0.99))

### <b>Current Loan Amount
    * У этого признака есть 850 одинаковых значений. Пока что заменим их на медиану.

In [None]:
df['Current Loan Amount'].plot(kind = 'box')

In [None]:
# Flag every row that carries the maximum loan amount, then replace those values
# with the column median.
max_loan_amount = df['Current Loan Amount'].max()
cond_1 = df['Current Loan Amount'].ge(max_loan_amount)
df['Current Loan Amount Outlier'] = cond_1.astype(int)
df.loc[cond_1, 'Current Loan Amount'] = df['Current Loan Amount'].median()
df['Current Loan Amount'].plot(kind = 'box')

### <b>Current Credit Balance
    * В целом уникальные значения признака Current Credit Balance имеют хорошие распределения. Большинство значений, из-за которых считалось, что в этом признаке есть выбросы, имеют одну и ту же цель кредита, а именно "debt consolidation". Вероятно, люди просто открыли кредит заранее и начали выплачивать сумму до того, как успели воспользоваться деньгами.

In [None]:
cond = df['Current Credit Balance'] > df['Current Loan Amount']
df_CCB = df.loc[cond, ]

In [None]:
box_plot_for_unique_values_of_feature(df_CCB, 'Purpose', 'Current Credit Balance')

### <b>Credit Score
    * Признак Credit Score должен содержаться в диапазоне [300, 850]. Меньшие значения заменим на 300. Значения, превышающие 850, содержатся в диапазоне [6000, 8000] — скорее всего, это ошибка: значения Credit Score были умножены на 10 или случайно записали лишний 0, поэтому разделим эти значения на 10.

In [None]:
df['Credit Score'].plot(kind = 'hist', bins = 20, figsize = (5,5))

In [None]:
# Valid scores lie in [300, 850]: flag everything outside, raise low values to
# 300 and scale the too-large ones back by a factor of ten.
cond_1 = df['Credit Score'].lt(300)
cond_2 = df['Credit Score'].gt(850)
df['Credit Score Outlier'] = (cond_1 | cond_2).astype(int)
df.loc[cond_1, 'Credit Score'] = 300
df.loc[cond_2, 'Credit Score'] /= 10


In [None]:
df['Credit Score'].plot(kind = 'hist', bins = 20, figsize = (5,5))

### <b>Обработка пропусков
    

In [None]:
df[nan_features].hist(bins = 20, figsize = (15,15))
plt.show()

In [None]:
groups = df['Years in current job'].value_counts().index
job_values = df['Years in current job'].value_counts().values
plt.barh(groups, job_values)

In [None]:
df.isna().sum()

* Annual Income
* Months since last delinquent
* Bankruptcies
* Credit Score
* Years in current job

### <b> Идеи для восстановления пропусков
    * SMOTE
    * Medians
    * Modes

### <b> Annual Income

In [None]:
df['Annual Income'].hist(bins = 20, figsize = (7,7))

In [None]:
df['Annual Income'].median()

In [None]:
# Remember which rows had no income before the gap is filled.
df['Annual Income NaN'] = df['Annual Income'].isna().astype(int)
# Assign the filled column back: an inplace fillna on a column selection is
# chained assignment and does not reliably update `df`.
df['Annual Income'] = df['Annual Income'].fillna(df['Annual Income'].median())

In [None]:
df['Annual Income'].hist(bins = 20, figsize = (7,7))

### <b> Months since last delinquent

In [None]:
df.drop('Months since last delinquent', axis = 1, inplace = True)


### <b> Bankruptcies

In [None]:
# Remember which rows had no bankruptcy count before the gap is filled.
df['Bankruptcies NaN'] = df['Bankruptcies'].isna().astype(int)
# Assign the filled column back: an inplace fillna on a column selection is
# chained assignment and does not reliably update `df`.
df['Bankruptcies'] = df['Bankruptcies'].fillna(df['Bankruptcies'].median())

### <b> Credit Score

In [None]:
# Remember which rows had no credit score before the gap is filled.
df['Credit Score NaN'] = df['Credit Score'].isna().astype(int)
# Assign the filled column back: an inplace fillna on a column selection is
# chained assignment and does not reliably update `df`.
df['Credit Score'] = df['Credit Score'].fillna(df['Credit Score'].median())
# Sanity check: should display 0.
df['Credit Score'].isna().sum()

### <b> Years in current job

Т.к этот признак является категориальным, то для замены пропусков в нем будем использовать моду.

In [None]:
# Remember which rows had no job tenure before the gap is filled.
df['Years in current job NaN'] = df['Years in current job'].isna().astype(int)
# Categorical column, so the most frequent value is used. Assign the filled
# column back: an inplace fillna on a column selection is chained assignment and
# does not reliably update `df`.
df['Years in current job'] = df['Years in current job'].fillna(df['Years in current job'].mode().values[0])

In [None]:
df.isna().sum()

### <b> Генерация новых признаков

 * Можно перевести признак 'Years in current job' в числовой, объединив некоторые значения в группы и присвоив каждой группе число. Обычно в вакансиях встречаются градации стажа: 0-2 года, 3-6 лет, больше 6 лет; дополнительно выделим отдельную группу "10+ лет". Каждая следующая группа повышает уверенность в стабильности человека, что хорошо для банка, который собирается выдать кредит, поэтому "0-2" = 0, "3-6" = 1, "7-9" = 2, "10+" = 3.
 * Признак 'Home Ownership' имеет мало уникальных значений, поэтому можно сделать из них dummy-переменные.
 * Признак 'Term' имеет всего два уникальных значения: 'Long Term', 'Short Term'. Можно сделать из него dummy-переменные, но кажется, что лучше сделать один бинарный признак 'Long Term', у которого '1' = 'Long Term', '0' = 'Short Term'.
 * Это позволит избавиться от категориальных признаков.

### <b> Years in current job

In [None]:
# Collapse the tenure labels into four ordinal groups:
# 0 = up to 2 years, 1 = 3-6 years, 2 = 7-9 years, 3 = 10+ years.
years_in_job_groups = {
    '< 1 year': 0, '1 year': 0, '2 years': 0,
    '3 years': 1, '4 years': 1, '5 years': 1, '6 years': 1,
    '7 years': 2, '8 years': 2, '9 years': 2,
    '10+ years': 3,
}
df['Years in current job'] = df['Years in current job'].map(years_in_job_groups)

### <b> Credit Score
    https://twitter.com/pmf_world/status/1395018934571704323

In [None]:
def credit_score_to_cat(X):
    """Add an ordinal 'Credit Score Cat' column with the credit-score band (0-4).

    Bands are [300, 579], (579, 669], (669, 739], (739, 799] and (799, 850].
    Scores outside [300, 850] get NaN.

    Args:
        X: DataFrame with a numeric 'Credit Score' column; modified in place.

    Returns:
        The same DataFrame with the new column added.
    """
    bins = [300, 579, 669, 739, 799, 850]
    # include_lowest=True keeps a score of exactly 300 (the value low scores are
    # clipped to) in the first band instead of turning it into NaN.
    X['Credit Score Cat'] = pd.cut(X['Credit Score'], bins=bins, labels=False, include_lowest=True)

    return X

df = credit_score_to_cat(df)


### <b> Home Ownership

In [None]:
df = pd.concat([df, pd.get_dummies(df['Home Ownership'])], axis=1)



### <b> Term

In [None]:
df['Long Term'] = df['Term'].map({'Long Term': 1, 'Short Term': 0})


### <b> Data Preprocessing
   

In [None]:
class DataPreprocessing:
    """Outlier handling and missing-value imputation driven by statistics from ``fit``.

    ``fit`` learns medians, modes and outlier limits from the training frame;
    ``transform`` applies them to any frame with the same columns and adds
    ``... Outlier`` / ``... NaN`` indicator columns.
    """

    def __init__(self):
        self.medians = None          # per-column medians of the numeric columns
        self.modes = None            # per-column most frequent values
        self.max_credit = None       # 0.99 quantile of 'Maximum Open Credit'
        self.min_credit = None       # 0.01 quantile of 'Maximum Open Credit'
        self.max_loan_amount = None  # largest 'Current Loan Amount' seen in fit
        self.max_score = 850         # upper bound of a valid credit score
        self.min_score = 300         # lower bound of a valid credit score

    def fit(self, X):
        """Store the statistics that ``transform`` needs.

        Args:
            X: training DataFrame (not modified).

        Returns:
            ``self``, so calls can be chained.
        """
        # numeric_only=True: string columns cannot be averaged and make recent
        # pandas raise instead of skipping them.
        self.medians = X.median(numeric_only=True)
        self.modes = X.mode().loc[0]
        self.max_credit = X['Maximum Open Credit'].quantile(q = 0.99)
        self.min_credit = X['Maximum Open Credit'].quantile(q = 0.01)
        self.max_loan_amount = np.max(X['Current Loan Amount'])

        # Take the credit-score median on the corrected scale (too-large values
        # divided by 10, too-small ones clipped), i.e. the scale it is imputed on.
        score = X['Credit Score']
        score = score.where(score <= self.max_score, score / 10).clip(lower=self.min_score)
        self.medians['Credit Score'] = score.median()

        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        Args:
            X: DataFrame with the raw columns (not modified).

        Returns:
            A new DataFrame with outliers replaced, gaps imputed, indicator
            columns added and 'Months since last delinquent' removed.
        """
        # Work on a copy so the caller's frame survives and the call can be repeated.
        X = X.copy()

        # Maximum Open Credit: zeros and values above the fitted 0.99 quantile
        cond_1 = X['Maximum Open Credit'] == 0
        cond_2 = X['Maximum Open Credit'] > self.max_credit

        X['Maximum Open Credit Outlier'] = (cond_1 | cond_2).astype(int)
        X.loc[cond_2, 'Maximum Open Credit'] = self.max_credit
        X.loc[cond_1, 'Maximum Open Credit'] = self.min_credit

        # Months since last delinquent: column is dropped altogether
        X = X.drop(columns='Months since last delinquent')

        # Current Loan Amount: ">=" so rows carrying the fitted maximum itself are
        # caught; with ">" nothing in the training data could ever match.
        cond_1 = X['Current Loan Amount'] >= self.max_loan_amount
        X['Current Loan Amount Outlier'] = cond_1.astype(int)
        X.loc[cond_1, 'Current Loan Amount'] = self.medians['Current Loan Amount']

        # Credit score
        cond_1 = X['Credit Score'] < self.min_score
        cond_2 = X['Credit Score'] > self.max_score
        X['Credit Score Outlier'] = (cond_1 | cond_2).astype(int)
        X['Credit Score NaN'] = X['Credit Score'].isna().astype(int)

        X.loc[cond_1, 'Credit Score'] = self.min_score
        X.loc[cond_2, 'Credit Score'] = X.loc[cond_2, 'Credit Score'] / 10
        X['Credit Score'] = X['Credit Score'].fillna(self.medians['Credit Score'])

        # Annual Income
        X['Annual Income NaN'] = X['Annual Income'].isna().astype(int)
        X['Annual Income'] = X['Annual Income'].fillna(self.medians['Annual Income'])

        # Bankruptcies
        X['Bankruptcies NaN'] = X['Bankruptcies'].isna().astype(int)
        X['Bankruptcies'] = X['Bankruptcies'].fillna(self.medians['Bankruptcies'])

        # Any remaining numeric gaps get the fitted medians.
        X = X.fillna(self.medians)

        # Years in current job: categorical, so the fitted mode is used
        X['Years in current job NaN'] = X['Years in current job'].isna().astype(int)
        X['Years in current job'] = X['Years in current job'].fillna(self.modes['Years in current job'])

        return X



### <b> Генерация новых признаков

In [None]:
class FeatureGenetator():
    """Derive model features from the cleaned columns.

    Adds an ordinal tenure group, a credit-score band, one indicator column per
    'Home Ownership' category and a binary 'Long Term' flag.
    """

    # Tenure label -> ordinal group (0: up to 2 years, 1: 3-6, 2: 7-9, 3: 10+).
    YEARS_IN_JOB_GROUPS = {'< 1 year':0,
                           '1 year':0,
                           '2 years':0,
                           '3 years':1,
                           '4 years':1,
                           '5 years':1,
                           '6 years':1,
                           '7 years':2,
                           '8 years':2,
                           '9 years':2,
                           '10+ years':3}

    def __init__(self):
        # 'Home Ownership' categories seen in fit; None until fit is called.
        self.home_ownership_categories = None

    def fit(self, X, y=None):
        """Remember the 'Home Ownership' categories present in the training data.

        Args:
            X: training DataFrame (not modified).
            y: unused; accepted for a scikit-learn-like signature.

        Returns:
            ``self``, so calls can be chained.
        """
        if 'Home Ownership' in X.columns:
            self.home_ownership_categories = sorted(X['Home Ownership'].dropna().unique())
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the generated feature columns added.

        Args:
            X: DataFrame with 'Years in current job', 'Credit Score',
                'Home Ownership' and 'Term' columns (not modified).

        Returns:
            A new DataFrame containing the original columns plus the new features.
        """
        # Work on a copy so the caller's frame survives and the call can be repeated.
        X = X.copy()

        # Years in current job
        X['Years in current job'] = X['Years in current job'].map(self.YEARS_IN_JOB_GROUPS)

        # Credit Score
        X = self.credit_score_to_cat(X)

        # Home Ownership: align the indicator columns with the categories seen in
        # fit, so every frame gets the same columns even when a category is absent.
        dummies = pd.get_dummies(X['Home Ownership'])
        if self.home_ownership_categories is not None:
            dummies = dummies.reindex(columns=self.home_ownership_categories, fill_value=0)
        X = pd.concat([X, dummies.astype(int)], axis=1)

        # Term
        X['Long Term'] = X['Term'].map({'Long Term': 1, 'Short Term': 0})

        return X

    def credit_score_to_cat(self, X):
        """Add the ordinal 'Credit Score Cat' band (0-4) to ``X`` and return it.

        Scores outside [300, 850] get NaN.
        """
        bins = [300, 579, 669, 739, 799, 850]
        # include_lowest=True keeps a score of exactly 300 in the first band
        # instead of turning it into NaN.
        X['Credit Score Cat'] = pd.cut(X['Credit Score'], bins=bins, labels=False, include_lowest=True)

        return X

### <b> Исходные признаки

In [None]:
features_names = ['Annual Income',
                'Tax Liens',
                'Number of Open Accounts',
                'Years of Credit History',
                'Maximum Open Credit',
                'Number of Credit Problems',
#                'Months since last delinquent',
                'Bankruptcies',
                'Current Loan Amount',
                'Current Credit Balance',
                'Monthly Debt',
                'Credit Score',
                'Years in current job',
#                'Purpose'
#               'Term'
#                'Home Ownership'
                ]

### <b> Новые признаки

In [None]:
new_feature_names = [
#                         'Credit Score Outlier',
#                         'Current Loan Amount Outlier',
#                         'Maximum Open Credit Outlier',
#                         'Annual Income NaN',
#                         'Years in current job NaN',
#                         'Bankruptcies NaN',
#                         'Credit Score NaN',
                         'Long Term',
                         'Have Mortgage',
                         'Home Mortgage',
                         'Own Home',
                         'Rent',
                         'Credit Score Cat'
                    ]

target_name = 'Credit Default'

### <b> Разбиение на train и test

In [None]:
# Read both datasets fresh from disk; the modelling pipeline starts from these
# frames rather than from the exploratory `df`.
df_train = pd.read_csv(TRAIN_DATASET_PATH)
df_test = pd.read_csv(TEST_DATASET_PATH)

# Separate the features from the target column.
X = df_train.drop(columns=target_name)
y = df_train[target_name]



# Hold out 33% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=21)


# Learn the statistics on the training part only, then apply them to all three sets.
preprocessor = DataPreprocessing()
preprocessor.fit(X_train)

X_train = preprocessor.transform(X_train)
X_valid = preprocessor.transform(X_valid)
df_test = preprocessor.transform(df_test)

# Sanity check: total number of remaining missing values and the shape of each set.
print(X_train.isna().sum().sum(), X_valid.isna().sum().sum(), df_test.isna().sum().sum())
print(X_train.shape, X_valid.shape, df_test.shape)



In [None]:
features_gen = FeatureGenetator()
features_gen.fit(X_train, y_train)

X_train = features_gen.transform(X_train)
X_valid = features_gen.transform(X_valid)
df_test = features_gen.transform(df_test)


X_train.shape, X_valid.shape, df_test.shape



In [None]:
X_train = X_train[features_names + new_feature_names]
X_valid = X_valid[features_names + new_feature_names]
df_test = df_test[features_names + new_feature_names]

X_train.isna().sum().sum(), X_valid.isna().sum().sum(), df_test.isna().sum().sum()

### <b> Сравнение бустинговых моделей.

* 1. XGBClassifier: f1 = 0.4680
* 2. LightGBM: f1 = 0.4698
* 3. Catboost: f1 = 0.4976

In [None]:
disbalance = y_train.value_counts()[0] / y_train.value_counts()[1]


In [None]:
# CatBoost classifier with the positive class weighted by `disbalance`
# (count of class 0 divided by count of class 1 in y_train).
# NOTE(review): use_best_model=True selects the iteration using eval_set, which is
# (X_valid, y_valid) here, and the report below is computed on that same set, so
# the TEST numbers are presumably optimistic - confirm on data the model never saw.
model_catb = catb.CatBoostClassifier(silent = True,
                                     random_state=21,
                                     eval_metric = 'F1',
                                     use_best_model = True,
                                     class_weights = [1, disbalance],
                                     max_depth = 5,
                                     min_data_in_leaf = 5
                                    )
model_catb.fit(X_train, y_train, eval_set = (X_valid, y_valid))

# Class predictions for both splits, then the train/validation reports.
y_train_pred = model_catb.predict(X_train)
y_valid_pred = model_catb.predict(X_valid)

get_classification_report(y_train, y_train_pred, y_valid, y_valid_pred)



### <b> Прогнозирование на тестовом датасете

In [None]:
# Fill the sample submission's target column with the test-set predictions
# and write it to the working directory.
submit = pd.read_csv(SAMPLE_SUBMIT_PATH)
pred = model_catb.predict(df_test)
submit['Credit Default'] = pred
# NOTE(review): the output name has no ".csv" extension - confirm the submission
# step accepts it, otherwise rename to 'final_sub.csv'.
submit.to_csv('final_sub', index = False)