Новые признаки:
1. Была ли раньше просрочка или нет (на основании Months since last delinquent)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestRegressor

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for both splits and a test confusion matrix.

    Args:
        y_train_true: True labels of the training split.
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels for the test split.
    """
    splits = (
        ('TRAIN', y_train_true, y_train_pred),
        ('TEST', y_test_true, y_test_pred),
    )
    for title, y_true, y_pred in splits:
        print(f'{title}\n\n{classification_report(y_true, y_pred)}')

    # Rows are the true classes, columns the predicted ones.
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print('CONFUSION MATRIX\n')
    print(confusion)


def evaluate_preds(model, X_train, X_test, y_train, y_test):
    """Predict on both splits with a fitted model and print the reports.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Feature matrix of the training split.
        X_test: Feature matrix of the test split.
        y_train: True labels of the training split.
        y_test: True labels of the test split.
    """
    get_classification_report(
        y_train, model.predict(X_train),
        y_test, model.predict(X_test),
    )

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv('data/train.csv')

<li><b>Home Ownership</b> - домовладение</li>
<li><b>Annual Income</b> - годовой доход</li>
<li><b>Years in current job</b> - количество лет на текущем месте работы</li>
<li><b>Tax Liens</b> - налоговые обременения</li>
<li><b>Number of Open Accounts</b> - количество открытых счетов</li>
<li><b>Years of Credit History</b> - количество лет кредитной истории</li>
<li><b>Maximum Open Credit</b> - наибольший открытый кредит</li>
<li><b>Number of Credit Problems</b> - количество проблем с кредитом</li>
<li><b>Months since last delinquent</b> - количество месяцев с последней просрочки платежа</li>
<li><b>Bankruptcies</b> - банкротства</li>
<li><b>Purpose</b> - цель кредита</li>
<li><b>Term</b> - срок кредита</li>
<li><b>Current Loan Amount</b> - текущая сумма кредита</li>
<li><b>Current Credit Balance</b> - текущий кредитный баланс</li>
<li><b>Monthly Debt</b> - ежемесячный долг</li>
<li><b>Credit Score</b> - баллы кредитного рейтинга</li>
<li><b>Credit Default</b> - факт невыполнения кредитных обязательств (0 - погашен вовремя, 1 - просрочка)</li>

### Корреляция признаков 

In [None]:
# Correlate numeric columns only: at this point the categorical columns have not been
# label-encoded yet, and recent pandas versions refuse to correlate non-numeric data.
# The target is selected by name instead of by position so column order does not matter.
target_corr = df.select_dtypes(include='number').corr()['Credit Default']
corr_with_target = target_corr.drop('Credit Default').sort_values(ascending=False)
plt.figure(figsize=(10, 8))
sns.barplot(x=corr_with_target.values, y=corr_with_target.index)
plt.title('Корреляция признаков с целевой переменной')
plt.show()

In [None]:
# Summary statistics of the raw credit score column.
df['Credit Score'].describe()

In [None]:
# Rows with target == 1 whose credit score lies strictly between 600 and 700.
df[(df['Credit Score'] > 600) & (df['Credit Score'] < 700) & (df['Credit Default'] == 1)]

In [None]:
# Target class counts among rows whose credit score exceeds 751.
dict(df[(df['Credit Score'] > 751)]['Credit Default'].value_counts())

In [None]:
# Histogram of credit scores below 1000; larger values are left out of the plot.
df[df['Credit Score'] < 1000]['Credit Score'].hist(bins = 10, figsize=(40,30))
plt.show()

In [None]:
# NOTE(review): in the visible notebook `cs_groups` is first assigned in the later
# "Mean Encoding" cell, so this cell raises NameError on a fresh top-to-bottom run.
cs_groups

In [None]:
plt.figure(figsize=(20, 8))

# Count rows per credit score value, split by target class; scores of 800 and above
# are excluded from the plot.
plot_rows = df[(df['Credit Score'] >= 0) & (df['Credit Score'] < 800)]
sns.countplot(x='Credit Score', hue='Credit Default', data=plot_rows)
# The title previously named 'PAY_1', a column this plot does not use.
plt.title('Credit Score grouped by target variable')
plt.legend(title='Target', loc='upper right')

plt.show()

In [None]:
# Column dtypes and non-null counts of the raw training frame.
df.info()

In [None]:
# Columns that are label-encoded later in the notebook.
categorial_features = [
    'Home Ownership',
    'Years in current job',
    'Purpose',
    'Term',
]
numerical_features = [
    'Annual Income',
    'Tax Liens',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Number of Credit Problems',
    'Months since last delinquent',
    'Bankruptcies',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]
# Snapshot of the combined list; later in-place changes to numerical_features do not reach it.
all_features = categorial_features + numerical_features
target_feature = ['Credit Default']

# No NaN numerical features
nnn_features = [
    'Tax Liens',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Number of Credit Problems',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
]

# The remaining numerical columns (presumably "with NaN" - confirm against df.info()).
wnn_features = [
    'Annual Income',
    'Months since last delinquent',
    'Bankruptcies',
    'Credit Score',
]

#### Классы несбалансированы, поэтому основная идея — удалить строки с NaN-значениями, которые относятся к нулевому классу, а потом применить SMOTE

### Удалим строки с пропусками, относящиеся к нулевому классу (пропуски в Months since last delinquent не учитываем)

In [None]:
# Rows of the negative class (target == 0) that miss any of the three imputed columns
# are dropped; positive-class rows are kept even when incomplete.
has_gap = df[['Annual Income', 'Bankruptcies', 'Credit Score']].isna().any(axis=1)
ft_na = has_gap & (df['Credit Default'] == 0)
df = df.drop(index=df.index[ft_na])

#### Считаю Months since last delinquent важным признаком, поэтому для хорошей разделимости заполним NaN-значения числом 999, а также создадим новый логический признак по этому параметру

In [None]:
def modyfy_MSL(df1):
    """Flag rows without a recorded delinquency and fill the gap with 999.

    Adds the 0/1 column 'Never delinquent' (1 where 'Months since last
    delinquent' is missing) and replaces those missing values with 999.
    The frame is modified in place.

    Args:
        df1: DataFrame containing 'Months since last delinquent'.

    Returns:
        The same DataFrame object, after modification.
    """
    msl = 'Months since last delinquent'
    never = df1[msl].isna()
    df1['Never delinquent'] = never.astype('int64')
    df1[msl] = df1[msl].fillna(999)
    return df1

# Add the delinquency flag to the training frame and register it as a numerical feature.
df = modyfy_MSL(df)
numerical_features.append('Never delinquent')

In [None]:
df.info() # TEMPORARY

### Заполнение пропусков при помощи регрессии

In [None]:
def fill_nan_rfr(X, y):
    """Search for the best random-forest regressor parameters on (X, y).

    Runs a 5-fold grid search over ``min_samples_leaf`` and
    ``min_samples_split`` (3..5 each) with 1000 trees of depth 9.

    Args:
        X: Feature matrix.
        y: Regression target.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_space = {
        'n_estimators': [1000],
        'max_depth': [9],
        'min_samples_leaf': range(3, 6),
        'min_samples_split': range(3, 6),
    }
    grid = GridSearchCV(RandomForestRegressor(), search_space, cv=5, n_jobs=-1)
    grid.fit(X, y)
    return grid

In [None]:
# Columns used when searching imputation hyperparameters. 'Credit Default' is contributed
# by target_feature; listing it a second time produced a duplicated column on selection.
X_features = nnn_features + ['Home Ownership', 'Purpose', 'Term'] + target_feature

In [None]:
# %%time
# a = fill_nan_rfr(df[~df['Annual Income'].isna()][X_features], df[~df['Annual Income'].isna()]['Annual Income'])
# Parameters passed to the imputation regressor; the values lie inside the grid searched
# by fill_nan_rfr (presumably the result of the commented-out search above - confirm).
best_params = {'max_depth': 9,
               'min_samples_leaf': 3,
               'min_samples_split': 3,
               'n_estimators': 1000,
               'n_jobs': -1}

In [None]:
def fix_outlier(df1):
    """Mark credit scores of 800 or more as outliers and blank them out.

    Adds the 0/1 column 'CS_out' and sets the flagged 'Credit Score' values
    to NaN. The frame is modified in place.

    Args:
        df1: DataFrame containing 'Credit Score'.

    Returns:
        The same DataFrame object, after modification.
    """
    too_high = df1['Credit Score'] >= 800
    df1['CS_out'] = too_high.astype('int64')
    df1.loc[too_high, 'Credit Score'] = np.nan
    return df1


def simple_label_encoder(df1, cats=categorial_features):
    """Replace the given columns with integer label codes, in place.

    Each column is cast to ``str`` first, so missing values become their own
    category. The encoding is learned from ``df1`` itself on every call.

    Args:
        df1: DataFrame to modify.
        cats: Names of the columns to encode.

    Returns:
        The same DataFrame object, after modification.
    """
    for column in cats:
        as_text = df1[column].astype(str)
        df1[column] = LabelEncoder().fit_transform(as_text)
    return df1


def simple_fill_na(df1):
    """Fill missing values of every column with that column's median.

    The frame is modified in place. All columns must support ``median()``.

    Args:
        df1: DataFrame to fill.

    Returns:
        The same DataFrame object, after modification.
    """
    for column in df1.columns:
        # Assign the filled column back: an in-place fillna on the selected column
        # works on a temporary object and does not reach df1 under Copy-on-Write.
        df1[column] = df1[column].fillna(df1[column].median())
    return df1

def pred_nan(X_df, target, cats=categorial_features, params=best_params):
    """Impute missing values of one column with a random-forest regressor.

    A label-encoded, median-filled copy of the frame supplies the predictors;
    the regressor is trained on rows where ``target`` is present and its
    predictions are written into the rows of ``X_df`` where it is missing.

    Args:
        X_df: DataFrame to impute; modified in place.
        target: Name of the column whose missing values are predicted.
        cats: Categorical columns to label-encode in the working copy.
        params: Keyword arguments for ``RandomForestRegressor``.

    Returns:
        The same DataFrame object, with ``target`` filled.
    """
    missing = X_df[target].isna()
    if not missing.any():
        # Nothing to impute; predicting on an empty matrix would raise.
        return X_df

    # Work on a copy so encoding and median filling never leak into the caller's frame.
    work = simple_label_encoder(X_df.copy(), cats)
    y = work[target]
    X = simple_fill_na(work.drop(target, axis=1))

    rfr = RandomForestRegressor(**params)
    rfr.fit(X[~missing], y[~missing])
    X_df.loc[missing, target] = rfr.predict(X[missing])
    return X_df


In [None]:
%%time
def fill_all_na(df1):
    """Run the full missing-value pipeline on a frame.

    Flags and blanks outlying credit scores, imputes 'Annual Income',
    'Bankruptcies' and 'Credit Score' by regression (in that order), and fills
    'Years in current job' with its most frequent value.

    Args:
        df1: DataFrame to process.

    Returns:
        The processed DataFrame.
    """
    df1 = fix_outlier(df1)
    for column in ('Annual Income', 'Bankruptcies', 'Credit Score'):
        df1 = pred_nan(df1, column)

    job = 'Years in current job'
    df1[job] = df1[job].fillna(df1[job].mode()[0])
    return df1
df = fill_all_na(df)

In [None]:
# Summary statistics after imputation.
df.describe()

### Mean Encoding

In [None]:
# Credit score bands as [low, high): one wide band below 670, 10-point bands up to 740,
# and one band for everything from 740 upwards.
cs_groups = [[0, 670]] + [[i, i+10] for i in range(670, 740, 10)] + [[740, 9999]]

# Fallback rate for a band that contains no training rows (avoids dividing by zero).
overall_default_rate = df['Credit Default'].mean()

# [low, high, default rate] per band; reused to encode the test set.
cs_groups_test = []
for low, high in cs_groups:
    in_band = (df['Credit Score'] >= low) & (df['Credit Score'] < high)
    band_target = df.loc[in_band, 'Credit Default']
    n_default = int((band_target == 1).sum())
    n_total = n_default + int((band_target == 0).sum())
    cd_perc = n_default / n_total if n_total else overall_default_rate
    cs_groups_test.append([low, high, cd_perc])
    df.loc[in_band, 'target_per_CS'] = cd_perc
    
def mean_test_cs_groups(df1, cs_groups=cs_groups_test):
    """Attach the per-band default rate as the column 'target_per_CS'.

    Args:
        df1: DataFrame containing 'Credit Score'; modified in place.
        cs_groups: Iterable of ``[low, high, rate]``; rows with
            ``low <= Credit Score < high`` receive ``rate``.

    Returns:
        The same DataFrame object. Rows that match no band keep 0.0.
    """
    # Start from a float column: the rates are floats, and writing them into an
    # integer column is an incompatible-dtype assignment.
    df1['target_per_CS'] = 0.0
    for low, high, rate in cs_groups:
        in_band = (df1['Credit Score'] >= low) & (df1['Credit Score'] < high)
        df1.loc[in_band, 'target_per_CS'] = rate
    return df1

In [None]:
# Column dtypes and non-null counts after imputation and mean encoding.
df.info()

### Кодирование категориальных признаков

In [None]:
# Label-encode the categorical columns of the training frame.
df = simple_label_encoder(df)

In [None]:
# Histograms of all columns of the prepared training frame.
df.hist(bins = 30, figsize=(20,15))
plt.show()

In [None]:
# Pairwise correlation heatmap of the prepared training frame.
sns.heatmap(df.corr());

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print train and test classification reports and the test confusion matrix.

    Args:
        y_train_true: True labels of the training split.
        y_train_pred: Predicted labels for the training split.
        y_test_true: True labels of the test split.
        y_test_pred: Predicted labels for the test split.
    """
    train_section = 'TRAIN\n\n' + classification_report(y_train_true, y_train_pred)
    print(train_section)
    test_section = 'TEST\n\n' + classification_report(y_test_true, y_test_pred)
    print(test_section)
    print('CONFUSION MATRIX\n')
    # True classes index the rows, predicted classes the columns.
    print(pd.crosstab(y_test_true, y_test_pred))


def evaluate_preds(model, X_train, X_test, y_train, y_test):
    """Print classification reports of a fitted model on both splits.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train: Feature matrix of the training split.
        X_test: Feature matrix of the test split.
        y_train: True labels of the training split.
        y_test: True labels of the test split.
    """
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)
    get_classification_report(y_train, predicted_train, y_test, predicted_test)

### Разбиение на трейн и тест

In [None]:
# Separate the target from the features, then hold out 20% of the rows with the class
# proportions of the target preserved in both parts.
y = df['Credit Default']
X = df.drop(columns='Credit Default')
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=1,
    stratify=y,
)
X_train.shape, X_test.shape

### SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# Seeded so the synthetic samples, and therefore the scores below, are reproducible.
smote = SMOTE(random_state=1)
# fit_resample is the current sampler API; fit_sample is no longer available in
# recent imbalanced-learn releases.
X_smote, y_smote = smote.fit_resample(X, y)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# df = pd.concat([X_smote, y_smote], axis=1)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Random forest fitted on the SMOTE-balanced training split. An earlier configuration
# (max_depth=3, n_estimators=512) used to be instantiated here and was overwritten on
# the next statement without ever being fitted; only the effective one is kept.
rfc = RandomForestClassifier(random_state=1, max_depth=5, n_estimators=99, n_jobs=-1)

rfc.fit(X_train_smote, y_train_smote)
y_test_pred = rfc.predict(X_test)
print(f1_score(y_test, y_test_pred))
# The reports use the original (non-resampled) training rows and the held-out rows.
evaluate_preds(rfc, X_train, X_test, y_train, y_test)

In [None]:
# Load the test set for the random-forest submission.
df_test = pd.read_csv('data/test.csv')

In [None]:
# Same delinquency flag and 999 fill as applied to the training frame.
df_test = modyfy_MSL(df_test)

In [None]:
%%time
# NOTE(review): the imputation regressors are re-trained on the test rows themselves here.
df_test = fill_all_na(df_test)

In [None]:
# Label-encode the categoricals, then attach the per-band default rates learned on train.
# NOTE(review): the label encoding is re-fitted on the test frame, so codes agree with
# the training codes only if both frames contain the same set of category values.
df_test = simple_label_encoder(df_test)
df_test = mean_test_cs_groups(df_test)

In [None]:
# Refit on the full resampled training data before predicting the submission.
rfc.fit(X_smote, y_smote)
df_test_pred = pd.DataFrame()
df_test_pred['Id'] = list(df_test.index)
# Select the training columns by name so the test frame matches the fitted feature
# set and order; a missing feature now fails with an explicit KeyError.
df_test_pred['Credit Default'] = rfc.predict(df_test[X.columns])

In [None]:
# Display the random-forest predictions.
df_test_pred

In [None]:
# Write the random-forest submission without the DataFrame index.
df_test_pred.to_csv('final_pred_v2-3.csv', index=False)

## LightGBM

In [None]:
# Class balance of the target in the prepared training frame.
df['Credit Default'].value_counts()

In [687]:
%%time
import lightgbm as lgbm
# Small LightGBM model (25 boosting rounds, 5 leaves per tree) fitted on the original,
# non-resampled training split; class imbalance is handled via is_unbalance=True
# instead of SMOTE.
model_lgbm = lgbm.LGBMClassifier(random_state=21, 
                                 is_unbalance=True,
                                 n_estimators=25,
                                 min_data_in_leaf=105,
                                 num_leaves=5,
                                 learning_rate=0.15,
                                )
model_lgbm.fit(X_train, y_train)

# Reports on the training and held-out rows.
evaluate_preds(model_lgbm, X_train, X_test, y_train, y_test)

TRAIN

              precision    recall  f1-score   support

           0       0.86      0.64      0.74      3481
           1       0.52      0.79      0.62      1690

    accuracy                           0.69      5171
   macro avg       0.69      0.71      0.68      5171
weighted avg       0.75      0.69      0.70      5171

TEST

              precision    recall  f1-score   support

           0       0.86      0.64      0.73       870
           1       0.51      0.78      0.62       423

    accuracy                           0.68      1293
   macro avg       0.68      0.71      0.67      1293
weighted avg       0.74      0.68      0.69      1293

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               555  315
1                93  330
Wall time: 59 ms


In [690]:
# Refit on all labelled rows, predict the prepared test frame and save the submission.
model_lgbm.fit(X, y)
df_test_pred = pd.DataFrame({
    'Id': list(df_test.index),
    'Credit Default': model_lgbm.predict(df_test),
})
df_test_pred.to_csv('final_pred_v2-3_LGBM.csv', index=False)



In [None]:
param_grid = {
    'num_leaves': [31, 127],
#     'reg_alpha': [0.1, 0.5],
    'min_data_in_leaf': [30, 50, 100, 300, 400],
#     'lambda_l1': [0, 1,1.5],
#     'lambda_l2': [0, 1]
    }

# 'f1' is not one of LightGBM's built-in metric names, so it is no longer passed to the
# estimator. Candidates are ranked by F1 through GridSearchCV's `scoring` instead;
# without it the search ranks them by the estimator's default score (accuracy).
lgb_estimator = lgbm.LGBMClassifier(boosting_type='gbdt',
                                    objective='binary',
                                    learning_rate=0.01)

gsearch = GridSearchCV(estimator=lgb_estimator, param_grid=param_grid, cv=5, scoring='f1')

In [None]:
# Run the grid search on all labelled rows and report the best parameters and CV score.
lgb_model = gsearch.fit(X=X, y=y)

print(lgb_model.best_params_, lgb_model.best_score_)

## XGBOOST

In [None]:
import xgboost as xgb

# Depth-1 trees (decision stumps) fitted on the SMOTE-balanced training split.
model_xgb = xgb.XGBClassifier(random_state=21, max_depth=1)
model_xgb.fit(X_train_smote, y_train_smote)

y_test_pred = model_xgb.predict(X_test)
print(f1_score(y_test, y_test_pred))
# Reports on the original (non-resampled) training rows and the held-out rows.
evaluate_preds(model_xgb, X_train, X_test, y_train, y_test)

In [None]:
# Reload the raw test set for the XGBoost submission.
df_test = pd.read_csv('data/test.csv')

In [None]:
# Same delinquency flag and 999 fill as applied to the training frame.
df_test = modyfy_MSL(df_test)

In [None]:
%%time
# Outlier flagging and imputation, re-fitted on the test rows.
df_test = fill_all_na(df_test)

In [None]:
df_test = simple_label_encoder(df_test)
# The training features include 'target_per_CS', so the freshly reloaded test frame
# needs the same column before any model can predict on it.
df_test = mean_test_cs_groups(df_test)

In [None]:
df_test_pred = pd.DataFrame()
df_test_pred['Id'] = list(df_test.index)
# Predict with the XGBoost model of this section; the random forest was used here by
# mistake although the result is saved as the XGB submission.
df_test_pred['Credit Default'] = model_xgb.predict(df_test)

In [None]:
# Display the predictions of this section.
df_test_pred

In [None]:
# Write the submission of this section without the DataFrame index.
df_test_pred.to_csv('final_pred_v2-1XGB.csv', index=False)

2 - добавил smote() # 0.43621<br><br><br>
2.1 - изменил параметры: rfc = RandomForestClassifier(random_state=1, max_depth=3, n_estimators=9, n_jobs=-1)<br>
\# 0.49063<br>
TRAIN

              precision    recall  f1-score   support

           0       0.75      0.96      0.84      3492
           1       0.94      0.68      0.79      3469

    accuracy                           0.82      6961
   macro avg       0.84      0.82      0.81      6961
weighted avg       0.84      0.82      0.81      6961

TEST

              precision    recall  f1-score   support

           0       0.75      0.95      0.84       859
           1       0.93      0.69      0.79       882

    accuracy                           0.82      1741
   macro avg       0.84      0.82      0.81      1741
weighted avg       0.84      0.82      0.81      1741

CONFUSION MATRIX

col_0             0    1
Credit Default          
0               815   44
1               276  606

### CatBoost

In [None]:
%%time
import catboost as catb
# Settings shared by every CatBoost run in this section.
frozen_params = dict(
    silent=True,
    random_state=21,
    cat_features=categorial_features,
    eval_metric='F1',
    early_stopping_rounds=20,
)

model_catb = catb.CatBoostClassifier(iterations=200, max_depth=5, **frozen_params)
# NOTE(review): the held-out split drives early stopping and is also the split the
# report below is computed on, so the reported test scores may be optimistic.
model_catb.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

In [None]:
# Reload the raw test set for the CatBoost submission.
df_test = pd.read_csv('data/test.csv')

In [None]:
# Test rows whose credit score exceeds 751.
df_test[df_test['Credit Score'] > 751]

In [None]:
# Same delinquency flag and 999 fill as applied to the training frame.
df_test = modyfy_MSL(df_test)

In [None]:
%%time
# Outlier flagging and imputation, re-fitted on the test rows.
df_test = fill_all_na(df_test)

In [None]:
df_test = simple_label_encoder(df_test)
# The training features include 'target_per_CS', so the freshly reloaded test frame
# needs the same column before the model can predict on it.
df_test = mean_test_cs_groups(df_test)

In [None]:
# Collect the row ids and the CatBoost predictions for the submission file.
df_test_pred = pd.DataFrame({
    'Id': list(df_test.index),
    'Credit Default': model_catb.predict(df_test),
})

In [None]:
# Display the CatBoost predictions.
df_test_pred

In [None]:
# Write the CatBoost submission without the DataFrame index.
df_test_pred.to_csv('final_pred_v2-2_catb.csv', index=False)