In [137]:
from pathlib import Path

import pandas as pd
import catboost as catb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report, r2_score
from sklearn.model_selection import train_test_split

In [138]:
DATA_DIR = Path('./data')
TRAIN_FILE = DATA_DIR / 'train.csv'
TEST_FILE = DATA_DIR / 'test.csv'

In [139]:
train_df = pd.read_csv(TRAIN_FILE)
test_df = pd.read_csv(TEST_FILE)
target_feature = 'Credit Default'
base_features = train_df.columns.drop(target_feature).tolist()

In [140]:
def predict_missing_values(data, dest_feature, exclude=''):
    '''

    :param data:
    :type data: pd.DataFrame
    :param dest_feature:
    :return:
    '''
    tmp = data.copy()
    if exclude:
        tmp.drop(columns=exclude, inplace=True)
    tmp['Id'] = tmp.index

    categorical_features = tmp.select_dtypes(include=[object]).columns.tolist()
    dummies = pd.get_dummies(tmp[categorical_features])
    dummies['Id'] = tmp['Id']
    tmp = tmp.drop(columns=categorical_features).merge(dummies, on=['Id'])

    features = tmp.columns

    tmp = tmp[features]

    train = tmp[~tmp[dest_feature].isna()]
    predict_data = tmp[tmp[dest_feature].isna()]

    X = train.drop(columns=dest_feature)
    y = train[dest_feature]

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        random_state=47)

    model = RandomForestRegressor(n_estimators=100,
                                  max_depth=10,
                                  random_state=47,
                                  verbose=0)
    model.fit(X_train, y_train)

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    print("R2 for {}".format(dest_feature))
    print(f"r2 на train: {r2_score(y_train, pred_train)}")
    print(f"r2 на test: {r2_score(y_test, pred_test)}")

    pred = model.predict(predict_data.drop(columns=dest_feature))

    data.loc[data[dest_feature].isna(), dest_feature] = list(pred)

    return data

In [154]:
def transform_data(data, is_train = True):
    transformed_data = data.copy()

    max_current_loan_amount = train_df['Current Loan Amount'].max()
    median_current_load_amount = train_df['Current Loan Amount'].median()
    train_df.loc[train_df['Current Loan Amount'] == max_current_loan_amount, 'Current Loan Amount'] = median_current_load_amount

    transformed_data.loc[transformed_data['Years in current job'].isin(['< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years']), 'Years in current job'] = '<= 5 years'
    transformed_data.loc[transformed_data['Years in current job'].isin(['6 years', '7 years', '8 years', '9 years']), 'Years in current job'] = '<= 10 years'
    transformed_data.loc[transformed_data['Years in current job'].isna(), 'Years in current job'] = transformed_data['Years in current job'].mode()[0]

    transformed_data.drop(columns=['Months since last delinquent', 'Bankruptcies'], inplace=True)

    transformed_data.loc[transformed_data['Purpose'].isin(['educational expenses', 'medical bills', 'buy house', 'wedding', 'moving', 'vacation']), 'Purpose'] = 'additionals'

    if is_train:
        transformed_data = predict_missing_values(transformed_data, 'Annual Income', 'Credit Score')
        transformed_data = predict_missing_values(transformed_data, 'Credit Score')

    return transformed_data

In [155]:
changed_train_df = transform_data(train_df)
changed_test_df = transform_data(test_df, False)

R2 for Annual Income
r2 на train: 0.7419408073884077
r2 на test: 0.39718806683020713
R2 for Credit Score
r2 на train: 0.5598597143250565
r2 на test: 0.14961004440323322


In [156]:
classes = changed_train_df[[target_feature]].value_counts().array
undersampling_count = classes[0] - classes[1]
changed_train_df.drop(index=changed_train_df[changed_train_df[target_feature] == 0].sample(undersampling_count, random_state=47).index, inplace=True)

In [144]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))

def evaluate_preds(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    get_classification_report(y_train, y_train_pred, y_test, y_test_pred)

In [157]:
def do_train_prediction(df):
    X = df.drop(columns=[target_feature])
    y = df[[target_feature]]
    categorical_features = X.select_dtypes(include=[object]).columns.tolist()

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        shuffle=True,
                                                        test_size=0.3,
                                                        random_state=47)
    model_catb = catb.CatBoostClassifier(silent=True,
                                         random_state=47,
                                         eval_metric='F1',
                                         cat_features=categorical_features,
                                         early_stopping_rounds=22,
                                         use_best_model=True,
                                         custom_metric=['Precision', 'Recall'])
    model_catb.fit(X_train, y_train, plot=False, eval_set=(X_test, y_test))
    evaluate_preds(model_catb, X_train, X_test, y_train, y_test)

    return model_catb

final_model = do_train_prediction(changed_train_df)

TRAIN

              precision    recall  f1-score   support

           0       0.72      0.92      0.81      1463
           1       0.89      0.64      0.75      1495

    accuracy                           0.78      2958
   macro avg       0.81      0.78      0.78      2958
weighted avg       0.81      0.78      0.78      2958

TEST

              precision    recall  f1-score   support

           0       0.71      0.91      0.80       650
           1       0.87      0.61      0.72       618

    accuracy                           0.77      1268
   macro avg       0.79      0.76      0.76      1268
weighted avg       0.79      0.77      0.76      1268



In [159]:
y_test_preds = final_model.predict(changed_test_df)

result = pd.DataFrame()

result['Id'] = changed_test_df.index
result[target_feature] = pd.Series(y_test_preds)
result.to_csv('result.csv', index=False)
result.values

array([[   0,    1],
       [   1,    1],
       [   2,    1],
       ...,
       [2497,    0],
       [2498,    0],
       [2499,    1]], dtype=int64)