# Bank credit scoring

You are given borrowers' personal data together with a flag indicating whether each borrower defaulted.

## Field descriptions:

- `client_id` - client identifier
- `education` - level of education
- `sex` - borrower's gender
- `age` - borrower's age
- `car` - flag indicating whether the borrower owns a car
- `car_type` - flag indicating whether the car is foreign-made
- `decline_app_cnt` - number of previously declined applications
- `good_work` - flag indicating whether the borrower has a "good" job
- `bki_request_cnt` - number of requests to the BKI (credit bureau)
- `home_address` - home address category
- `work_address` - work address category
- `income` - borrower's income
- `foreign_passport` - flag indicating whether the borrower holds a foreign passport
- `sna` - connection between the borrower and the bank's clients
- `first_time` - how long information about the borrower has been available
- `score_bki` - score based on data from the BKI (credit bureau)
- `region_rating` - region rating
- `app_date` - date of application submission
- `default` - credit default flag

In [1]:
def import_extra_package(package):
    """Import *package*, installing it with pip first if it is missing.

    Args:
        package: Name passed both to ``__import__`` and, when the first
            import fails, to ``pip install``. This assumes the pip
            distribution name matches the import name.

    Returns:
        The module object returned by ``__import__(package)`` (for a dotted
        name this is the top-level package).

    Raises:
        ImportError: If the package still cannot be imported after the
            installation attempt.
    """
    import importlib
    import subprocess
    import sys

    try:
        return __import__(package)
    except ImportError:
        # The previous ``!pip install package`` asked pip for a distribution
        # literally called "package" because the argument was never
        # interpolated. Run pip through the current interpreter so the
        # install lands in the environment this kernel is actually using.
        subprocess.run(
            [sys.executable, '-m', 'pip', 'install', package],
            check=False,  # a failed install surfaces as ImportError below
        )
        # Let the import system notice the freshly installed files.
        importlib.invalidate_caches()
        return __import__(package)

In [2]:
import pandas as pd
import_extra_package('pandas_profiling')
from pandas_profiling import ProfileReport
import numpy as np

import datetime

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from scipy.stats import randint
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

import_extra_package('mlxtend')
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from math import log as log
import os

## Helpers

Functions to combine all operations.

## Load the train and test datasets:

In [3]:
# Directory holding the input data; stays empty until the directory walk finds it.
path = ""

In [4]:
# Walk the local 'kaggle' tree, list every file found, and remember the
# directory of the last file seen as the data path.
for dirname, _, filenames in os.walk('kaggle'):
    if not filenames:
        # Directories without files neither print anything nor update `path`.
        continue
    for filename in filenames:
        print(os.path.join(dirname, filename))
    path = dirname

kaggle\input\sf-dst-scoring\sample_submission.csv
kaggle\input\sf-dst-scoring\test.csv.zip
kaggle\input\sf-dst-scoring\train.csv.zip


In [5]:
# Stop early when the directory walk found no input files; otherwise show
# the directory that will be used for loading.
if path == '':
    raise Exception('The input path is empty!')
print(path)

kaggle\input\sf-dst-scoring


In [6]:
# Read the zipped train and test CSVs (in that order) from the data directory.
train_df, test_df = (
    pd.read_csv(os.path.join(path, archive_name))
    for archive_name in ('train.csv.zip', 'test.csv.zip')
)

## General data inspection: 

In [7]:
# Build the profiling report for the training data with most optional
# sections switched off.
profile = train_df.profile_report(
    title='Bank credit scoring',
    dark_mode=True,
    progress_bar=False,
    # Only the Pearson correlation is calculated; every other measure is off.
    correlations={
        measure: {'calculate': measure == 'pearson'}
        for measure in ('pearson', 'spearman', 'kendall', 'phi_k', 'cramers')
    },
    # No continuous interaction plots and no interaction targets.
    interactions={'continuous': False, 'targets': []},
    # Of the missing-value diagrams, only the heatmap is enabled.
    missing_diagrams={'heatmap': True, 'dendrogram': False, 'matrix': False},
    # Categorical variables: character and word analysis off, n_obs set to 10.
    vars={'cat': {'characters': False, 'words': False, 'n_obs': 10}},
)

In [8]:
# Show the profiling report as notebook widgets (the cell output is a VBox).
profile.to_widgets()

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…