In [28]:
# numpy and pandas for data manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import OneHotEncoder

# modeling
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import *
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import fbeta_score, make_scorer


#explainability
import shap

#serialization
import joblib

#remove warnings
import warnings 
warnings.filterwarnings("ignore")

In [29]:
#read the data 
app_train = pd.read_csv("../raw_data/application_train.csv").astype("object")

In [30]:
X_train, X_test, y_train, y_test = train_test_split(app_train.drop(['TARGET'], axis=1), #x
                                                    app_train['TARGET'], #y
                                                    test_size=0.2,
                                                    stratify=app_train['TARGET'], #stratified sample
                                                    random_state=123)

# Features Types

We need to convert variables that are not correctly cast as of float in the raw data : 

In [31]:
def convert_dtypes_func(df):
    #List the columns to be converted to float
    list_columns_to_convert_to_float = ("CNT_CHILDREN" 
                                ,"AMT_INCOME_TOTAL" 
                                ,"AMT_CREDIT" 
                                ,"AMT_ANNUITY" 
                                ,"AMT_GOODS_PRICE" 
                                ,"DAYS_BIRTH"
                                ,"DAYS_EMPLOYED" 
                                ,"DAYS_REGISTRATION"
                                ,"REGION_RATING_CLIENT"
                                ,"REGION_POPULATION_RELATIVE"
                                ,"HOUR_APPR_PROCESS_START"
                                ,"CNT_FAM_MEMBERS" 
                                ,"DAYS_ID_PUBLISH"
                                ,"OWN_CAR_AGE"
                                ,"EXT_SOURCE_1               "
                                ,"EXT_SOURCE_2               "
                                ,"EXT_SOURCE_3               "
                                ,"APARTMENTS_AVG             "
                                ,"BASEMENTAREA_AVG           "
                                ,"YEARS_BEGINEXPLUATATION_AVG"
                                ,"YEARS_BUILD_AVG            "
                                ,"COMMONAREA_AVG             "
                                ,"ELEVATORS_AVG              "
                                ,"ENTRANCES_AVG              "
                                ,"FLOORSMAX_AVG              "
                                ,"FLOORSMIN_AVG              "
                                ,"LANDAREA_AVG               "
                                ,"LIVINGAPARTMENTS_AVG       "
                                ,"LIVINGAREA_AVG             "
                                ,"NONLIVINGAPARTMENTS_AVG    "
                                ,"NONLIVINGAREA_AVG          "
                                ,"APARTMENTS_MODE            "
                                ,"BASEMENTAREA_MODE          "
                                ,"YEARS_BEGINEXPLUATATION_MODE"
                                ,"YEARS_BUILD_MODE           "
                                ,"COMMONAREA_MODE            "
                                ,"ELEVATORS_MODE             "
                                ,"ENTRANCES_MODE             "
                                ,"FLOORSMAX_MODE             "
                                ,"FLOORSMIN_MODE             "
                                ,"LANDAREA_MODE              "
                                ,"LIVINGAPARTMENTS_MODE      "
                                ,"LIVINGAREA_MODE            "
                                ,"NONLIVINGAPARTMENTS_MODE   "
                                ,"NONLIVINGAREA_MODE         "
                                ,"APARTMENTS_MEDI            "
                                ,"BASEMENTAREA_MEDI          "
                                ,"YEARS_BEGINEXPLUATATION_MEDI"
                                ,"YEARS_BUILD_MEDI           "
                                ,"COMMONAREA_MEDI            "
                                ,"ELEVATORS_MEDI             "
                                ,"ENTRANCES_MEDI             "
                                ,"FLOORSMAX_MEDI             "
                                ,"FLOORSMIN_MEDI             "
                                ,"LANDAREA_MEDI              "
                                ,"LIVINGAPARTMENTS_MEDI      "
                                ,"LIVINGAREA_MEDI            "
                                ,"NONLIVINGAPARTMENTS_MEDI   "
                                ,"NONLIVINGAREA_MEDI         "
                                ,"TOTALAREA_MODE             "
                                ,"OBS_30_CNT_SOCIAL_CIRCLE   "
                                ,"DEF_30_CNT_SOCIAL_CIRCLE   "
                                ,"OBS_60_CNT_SOCIAL_CIRCLE   "
                                ,"DEF_60_CNT_SOCIAL_CIRCLE   "
                                ,"DAYS_LAST_PHONE_CHANGE     "
                                ,"AMT_REQ_CREDIT_BUREAU_HOUR"
                                ,"AMT_REQ_CREDIT_BUREAU_DAY "
                                ,"AMT_REQ_CREDIT_BUREAU_WEEK"
                                ,"AMT_REQ_CREDIT_BUREAU_MON "
                                ,"AMT_REQ_CREDIT_BUREAU_QRT "
                                ,"AMT_REQ_CREDIT_BUREAU_YEAR")       

    #remove spaces in the list created
    list_columns_to_convert_to_float = [s.strip() for s in list_columns_to_convert_to_float]       
    
    convert_count = 0

    #convert object columns to float
    for col in list_columns_to_convert_to_float:
        df[col] = df[col].astype(float)
        
        # Keep track of how many columns were label encoded
        convert_count += 1

    print('%d object columns were converted to int.' % convert_count)
    
    return df

In [32]:
X_train = convert_dtypes_func(X_train)

71 object columns were converted to int.


# Feature Engineering

In [33]:
def feature_eng(df):

    # Create an anomalous flag column
    df['DAYS_EMPLOYED_ANOM'] = df["DAYS_EMPLOYED"] == 365243

    # Replace the anomalous values with nan
    df['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

    #correct sign of Days Birth
    df["DAYS_BIRTH"] = abs(df["DAYS_BIRTH"])
    df['AGE_INT'] = round((df['DAYS_BIRTH'] / 365).astype(float),2)

    #footing financial ratios
    df['annuity_income_ratio'] = round((df['AMT_INCOME_TOTAL'] / df['AMT_ANNUITY']).astype(float),2)
    df['credit_annuity_ratio'] = round((df['AMT_CREDIT'] / df['AMT_ANNUITY']).astype(float),2)
    df['credit_goods_price_ratio'] = round((df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']).astype(float),2)
    df['credit_downpayment'] = round((df['AMT_GOODS_PRICE'] - df['AMT_CREDIT']).astype(float),2)
 
    print('Feature engineering success')
   
    return df

In [34]:
X_train = feature_eng(X_train)

Feature engineering success


# Feature Selection

In [35]:
def df_feature_selection(df):
    columns_list = df.columns.to_list()

    columns_to_drop_intersec = (['DAYS_BIRTH',
    'REGION_RATING_CLIENT_W_CITY',
    'BASEMENTAREA_MODE',
    'YEARS_BUILD_MODE',
    'COMMONAREA_MODE',
    'ELEVATORS_MODE',
    'ENTRANCES_MODE',
    'FLOORSMAX_MODE',
    'LANDAREA_MODE',
    'LIVINGAPARTMENTS_MODE',
    'NONLIVINGAPARTMENTS_MODE',
    'NONLIVINGAREA_MODE',
    'APARTMENTS_MEDI',
    'BASEMENTAREA_MEDI',
    'YEARS_BEGINEXPLUATATION_MEDI',
    'YEARS_BUILD_MEDI',
    'COMMONAREA_MEDI',
    'ELEVATORS_MEDI',
    'ENTRANCES_MEDI',
    'FLOORSMAX_MEDI',
    'FLOORSMIN_MEDI',
    'LIVINGAPARTMENTS_MEDI',
    'LIVINGAREA_MEDI',
    'NONLIVINGAPARTMENTS_MEDI',
    'NONLIVINGAREA_MEDI',
    'OBS_60_CNT_SOCIAL_CIRCLE',
    "DAYS_EMPLOYED_ANOM"])

    #Drop columns
    df_selec_col = pd.DataFrame(data=df, columns=columns_list).drop(columns_to_drop_intersec, axis=1)
    
    print('Feature selection success')

    return df_selec_col

In [36]:
X_train = df_feature_selection(X_train)

Feature selection success


# Imputation - Encoding - Standardization - Imputation

In [37]:
#keep id into a separate serie
user_id = X_train[['SK_ID_CURR']]

#create df of features by type
cat_features = X_train.select_dtypes(include=['object']).drop(['SK_ID_CURR'], axis=1)
num_features = X_train.select_dtypes(exclude=['object'])

#imputation
categorical_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
cat_features = categorical_imputer.fit_transform(cat_features)

simple_imputer = SimpleImputer(missing_values=np.nan, strategy="median")
num_features = simple_imputer.fit_transform(num_features)

#One hot encoding categorical variables
ohe = OneHotEncoder(handle_unknown='ignore')
cat_array = ohe.fit_transform(cat_features).todense()
cat_array = np.asarray(cat_array)

#Standard Scaling numerical variables
scaler = StandardScaler()
num_array = scaler.fit_transform(num_features)

#concatenate
X_train = np.concatenate([cat_array, num_array], axis=1)
# y = app_train['TARGET'].astype(int)

In [38]:
X_test = convert_dtypes_func(X_test)
X_test = feature_eng(X_test)
X_test = df_feature_selection(X_test)

#keep id into a separate serie
user_id = X_test[['SK_ID_CURR']]

#create df of features by type
cat_features = X_test.select_dtypes(include=['object']).drop(['SK_ID_CURR'], axis=1)
num_features = X_test.select_dtypes(exclude=['object'])

#imputation
cat_features = categorical_imputer.transform(cat_features)

num_features = simple_imputer.transform(num_features)

#One hot encoding categorical variables
cat_array = ohe.transform(cat_features).todense()
cat_array = np.asarray(cat_array)

#Standard Scaling numerical variables
num_array = scaler.transform(num_features)

#concatenate
X_test = np.concatenate([cat_array, num_array], axis=1)
# y = app_train['TARGET'].astype(int)

71 object columns were converted to int.
Feature engineering success
Feature selection success


In [None]:
best_threshold_LogReg = 0.48

model_log = LogisticRegression(class_weight = "balanced")

tuned_parameters_ex = {"C": [0.005]}

df_results_Log_fin_ = train_test_fin(model_log,
                                    2,
                                    best_threshold_LogReg,
                                    tuned_parameters_ex,
                                    X_train,
                                    X_test,
                                    y_train,
                                    y_test)

In [50]:
scorer = make_scorer(fbeta_score, beta=2)

clf = GridSearchCV(model_log, 
                param_grid=tuned_parameters_ex, 
                scoring=scorer, 
                refit=True,
                cv=5,
                return_train_score=True
                )

In [51]:
clf

GridSearchCV(cv=5, estimator=LogisticRegression(class_weight='balanced'),
             param_grid={'C': [0.005]}, return_train_score=True,
             scoring=make_scorer(fbeta_score, beta=2))

In [39]:
def train_test_fin (model, beta, threshold, tuned_parameters, x_train, x_test, y_train, y_test, beta_search=None):
  
  #https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
  #https://openclassrooms.com/fr/courses/4297211-evaluez-les-performances-dun-modele-de-machine-learning/4308256-evaluez-un-algorithme-de-classification-qui-retourne-des-valeurs-binaires
  print(tuned_parameters)

  scorer = make_scorer(fbeta_score, beta=beta)

  clf = GridSearchCV(model, 
                    param_grid=tuned_parameters, 
                    scoring=scorer, 
                    refit=True,
                    cv=5,
                    return_train_score=True
                    )
  
  clf.fit(x_train, y_train)
  
  y_pred_train_proba = clf.best_estimator_.predict_proba(x_train) 
  y_pred_train_proba = y_pred_train_proba[:, 1]

  y_pred_proba = clf.best_estimator_.predict_proba(x_test) 
  y_pred_proba = y_pred_proba[:, 1]
  y_pred = y_pred_proba > threshold
  y_pred = y_pred.astype(int)

  #confusion matrix from predictions
  print(f'Confusion Matrix from predictions for {model} :')
  ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
  plt.grid(False)
  plt.show()
  print()

  #performance metrics on test set
  recall_score = round(sklearn.metrics.recall_score(y_test, y_pred),4)
  f1_score = round(sklearn.metrics.f1_score(y_test, y_pred),4)
  roc_auc_score_train = round(sklearn.metrics.roc_auc_score(y_train, y_pred_train_proba),4)
  roc_auc_score = round(sklearn.metrics.roc_auc_score(y_test, y_pred_proba),4)
  fbeta_score_ = round(sklearn.metrics.fbeta_score(y_test, y_pred, beta=beta),4)

  #execution time
  mean_fit_time = clf.cv_results_['mean_fit_time']
  mean_score_time = clf.cv_results_['mean_score_time']
  n_splits = clf.n_splits_ #number of splits of training data
  n_iter = pd.DataFrame(clf.cv_results_).shape[0] #Iterations it takes per split for the solver to converge 
  execution_time = round(np.mean(mean_fit_time + mean_score_time) * n_splits * n_iter, 0)

  #return (i) perf metrics (ii) best hyperparams
  print(f'Best Threshold : {threshold} ')
  print(f'Performance Metrics for {model} :')
  print(f'Fbeta Score : {fbeta_score_}')
  print(f'Recall/sensitivity [true positive rate = TP / (TP + FN)]: {recall_score}')
  print(f'F1 Score : {f1_score}')
  print(f'AUROC : {roc_auc_score}')
  print(f'Execution time (sec): {execution_time}')
  print()

  print(f'Best Hyperparam(s) on training set :{clf.best_params_}')

  # store results in a dataframe
  df_results = pd.DataFrame(columns=['Model',
                                     'best_params_', 
                                     'roc_auc_score_train', 
                                     'roc_auc_score_test', 
                                     'recall_score_test',
                                     'fbeta_score_',
                                     'f1_score_test',
                                     'execution_time (sec)'
                                     ])

  data = {'Model': [model], 
          'best_params_' : [clf.best_params_], 
          'roc_auc_score_train' : [roc_auc_score_train],
          'roc_auc_score_test' : [roc_auc_score],
          'recall_score_test' : [recall_score], 
          'fbeta_score_' : [fbeta_score_], 
          'f1_score_test': [f1_score], 
          'execution_time (sec)': [execution_time]
          }

  results = pd.DataFrame.from_dict(data)

  df_results = df_results.append(results)
  
  if beta_search == True:

    beta_score_list = []
    for i in range(1,101):
        fbeta_results = fbeta_score(y_test,y_pred,beta=i)
        beta_score_list.append(fbeta_results)

    fig = plt.figure(figsize=(10,6))
    sns.lineplot(x=range(1,101), y=beta_score_list)
    plt.title('Fbeta Score with beta 0 to 100')

  return df_results

In [49]:
best_threshold_LogReg = 0.48

model_log = LogisticRegression(class_weight = "balanced")

tuned_parameters_ex = {"C": 0.005}

df_results_Log_fin_ = train_test_fin(model_log,
                                    2,
                                    best_threshold_LogReg,
                                    tuned_parameters_ex,
                                    X_train,
                                    X_test,
                                    y_train,
                                    y_test)

{'C': [0.005]}


NotFittedError: All estimators failed to fit

In [510]:
#Model with best params
model = LogisticRegression(class_weight = 'balanced', C=0.005, max_iter=1000)

#fit
model.fit(X_train,y_train)

LogisticRegression(C=0.005, class_weight='balanced', max_iter=1000)

In [511]:
#predict_proba returns the probability of the sample
#for each class in the model
#where classes are ordered as they are in self.classes_.
model.classes_

array([0, 1])

# SHAP

## Footing Shapley Explainer on Training Set

In [357]:
#cat columns list after ohe
cat_features_list = list(app_train.select_dtypes(include=['object']).drop(['SK_ID_CURR','TARGET'], axis=1).columns)
#https://stackoverflow.com/questions/54570947/feature-names-from-onehotencoder
cat_features_list_after_ohe = ohe.get_feature_names(cat_features_list).tolist()

#numerical columns list
num_features_list_after_preproc = list(app_train.select_dtypes(exclude=['object']).columns)

#concatenate list of features
features_list_after_prepr = cat_features_list_after_ohe + num_features_list_after_preproc

#transform X into a dataframe with column names
ohe_dataframe = pd.DataFrame(X, columns=features_list_after_prepr)

In [358]:
sub_sampled_train_data = shap.sample(ohe_dataframe, 50)
log_reg_explainer = shap.KernelExplainer(model.predict_proba, sub_sampled_train_data)

In [359]:
print(f'Shap Expected Value : {log_reg_explainer.expected_value}')

result_proba_training = model.predict_proba(X).mean(axis=0)
print(f'Model Mean Value (Theoretical Expected Value of default) : {result_proba_training[1]}')

Shap Expected Value : [0.51462926 0.48537074]
Model Mean Value (Theoretical Expected Value of default) : 0.4227130435028335


## Prediction on Test Data and Shap Values

In [360]:
# df_test = pd.read_csv("./dashboard_data/df_test.csv").astype(object)

# list_cat_features = ["NAME_CONTRACT_TYPE",
#     "CODE_GENDER",
#     "FLAG_OWN_CAR",
#     "FLAG_OWN_REALTY",
#     "NAME_TYPE_SUITE",
#     "NAME_INCOME_TYPE",
#     "NAME_EDUCATION_TYPE",
#     "NAME_FAMILY_STATUS",
#     "NAME_HOUSING_TYPE",
#     "FLAG_MOBIL",
#     "FLAG_EMP_PHONE",
#     "FLAG_WORK_PHONE",
#     "FLAG_CONT_MOBILE",
#     "FLAG_PHONE",
#     "FLAG_EMAIL",
#     "OCCUPATION_TYPE",
#     "WEEKDAY_APPR_PROCESS_START",
#     "REG_REGION_NOT_LIVE_REGION",
#     "REG_REGION_NOT_WORK_REGION",
#     "LIVE_REGION_NOT_WORK_REGION",
#     "REG_CITY_NOT_LIVE_CITY",
#     "REG_CITY_NOT_WORK_CITY",
#     "LIVE_CITY_NOT_WORK_CITY",
#     "ORGANIZATION_TYPE",
#     "FONDKAPREMONT_MODE",
#     "HOUSETYPE_MODE",
#     "WALLSMATERIAL_MODE",
#     "EMERGENCYSTATE_MODE",
#     "FLAG_DOCUMENT_2",
#     "FLAG_DOCUMENT_3",
#     "FLAG_DOCUMENT_4",
#     "FLAG_DOCUMENT_5",
#     "FLAG_DOCUMENT_6",
#     "FLAG_DOCUMENT_7",
#     "FLAG_DOCUMENT_8",
#     "FLAG_DOCUMENT_9",
#     "FLAG_DOCUMENT_10",
#     "FLAG_DOCUMENT_11",
#     "FLAG_DOCUMENT_12",
#     "FLAG_DOCUMENT_13",
#     "FLAG_DOCUMENT_14",
#     "FLAG_DOCUMENT_15",
#     "FLAG_DOCUMENT_16",
#     "FLAG_DOCUMENT_17",
#     "FLAG_DOCUMENT_18",
#     "FLAG_DOCUMENT_19",
#     "FLAG_DOCUMENT_20",
#     "FLAG_DOCUMENT_21"
# ]
# list_num_features = [
#     "CNT_CHILDREN",
#     "AMT_INCOME_TOTAL",
#     "AMT_CREDIT",
#     "AMT_ANNUITY",
#     "AMT_GOODS_PRICE",
#     "DAYS_EMPLOYED",
#     "DAYS_REGISTRATION",
#     "DAYS_ID_PUBLISH",
#     "REGION_RATING_CLIENT",
#     "REGION_POPULATION_RELATIVE",
#     "CNT_FAM_MEMBERS",
#     "HOUR_APPR_PROCESS_START",
#     "OWN_CAR_AGE",
#     "EXT_SOURCE_1",
#     "EXT_SOURCE_2",
#     "EXT_SOURCE_3",
#     "APARTMENTS_AVG",
#     "BASEMENTAREA_AVG",
#     "YEARS_BEGINEXPLUATATION_AVG",
#     "YEARS_BUILD_AVG",
#     "COMMONAREA_AVG",
#     "ELEVATORS_AVG",
#     "ENTRANCES_AVG",
#     "FLOORSMAX_AVG",
#     "FLOORSMIN_AVG",
#     "LANDAREA_AVG",
#     "LIVINGAPARTMENTS_AVG",
#     "LIVINGAREA_AVG",
#     "NONLIVINGAPARTMENTS_AVG",
#     "NONLIVINGAREA_AVG",
#     "APARTMENTS_MODE",
#     "YEARS_BEGINEXPLUATATION_MODE",
#     "FLOORSMIN_MODE",
#     "LIVINGAREA_MODE",
#     "LANDAREA_MEDI",
#     "TOTALAREA_MODE",
#     "OBS_30_CNT_SOCIAL_CIRCLE",
#     "DEF_30_CNT_SOCIAL_CIRCLE",
#     "DEF_60_CNT_SOCIAL_CIRCLE",
#     "DAYS_LAST_PHONE_CHANGE",
#     "AMT_REQ_CREDIT_BUREAU_HOUR",
#     "AMT_REQ_CREDIT_BUREAU_DAY",
#     "AMT_REQ_CREDIT_BUREAU_WEEK",
#     "AMT_REQ_CREDIT_BUREAU_MON",
#     "AMT_REQ_CREDIT_BUREAU_QRT",
#     "AMT_REQ_CREDIT_BUREAU_YEAR",
#     "AGE_INT",
#     "annuity_income_ratio",
#     "credit_annuity_ratio",
#     "credit_goods_price_ratio",
#     "credit_downpayment"
# ]


In [361]:
# data_dict = joblib.load("./bin/data_dict.joblib")
# ohe = joblib.load("./bin/ohe.joblib")
# categorical_imputer = joblib.load("./bin/categorical_imputer.joblib")
# simple_imputer = joblib.load("./bin/simple_imputer.joblib")
# scaler = joblib.load("./bin/scaler.joblib")
# model = joblib.load("./bin/model.joblib")

# #SimpleImputing (most frequent) and ohe of categorical features
# cat_array = categorical_imputer.transform(df_test[list_cat_features])
# cat_array = ohe.transform(cat_array).todense()

# #SimpleImputing (median) and StandardScaling of numerical features
# num_array = simple_imputer.transform(df_test[list_num_features])
# num_array = scaler.transform(num_array)

# #concatenate
# X = np.concatenate([cat_array, num_array], axis=1)
# X = np.asarray(X)

# #predict
# result_proba = model.predict_proba(X)
# y_pred_proba = result_proba[:,1]

# #cat columns list after ohe
# cat_features_list_after_ohe = ohe.get_feature_names(list_cat_features).tolist()

# #concatenate list of features
# features_list_after_prepr_test = cat_features_list_after_ohe + list_num_features

# #transform X into a dataframe with column names
# ohe_dataframe_test = pd.DataFrame(X, columns=features_list_after_prepr_test)



In [362]:
# sample_idx = 0
# sub_sampled_test_data = ohe_dataframe.iloc[sample_idx,:].values.reshape(1,-1)

In [363]:
# shap_vals = log_reg_explainer.shap_values(sub_sampled_test_data)

In [364]:
# print("Model prediction for test data", model.predict_proba(sub_sampled_test_data))

# shap.initjs()
# #https://shap-lrjball.readthedocs.io/en/latest/generated/shap.force_plot.html
# shap.force_plot(base_value=log_reg_explainer.expected_value[1],
#                 shap_values=shap_vals[1][0],
#                 features=sub_sampled_test_data[0],
#                 feature_names=features_list_after_prepr_test
# )

In [365]:
# #https://shap-lrjball.readthedocs.io/en/latest/generated/shap.summary_plot.html
# shap.summary_plot(shap_values=shap_vals, 
#                   features=sub_sampled_test_data, 
#                   feature_names=features_list_after_prepr_test, 
#                   max_display=10)

In [366]:
# shap.plots._waterfall.waterfall_legacy(log_reg_explainer.expected_value[1],#expected_value,
#                                        shap_vals[1][0],
#                                        sub_sampled_test_data[0],
#                                        feature_names=features_list_after_prepr_test,
#                                        max_display=10)

# Export DataSet

Export pre-processed dataset to be used for Dashboard Streamlit (on which heroku deployed model will predicting scoring) : 
- Sample 5% of the labelled dataset (app_train)
- Sample 5% of the unlabelled dataset (app_test)

In [367]:
#Labelled dataset

#recover Target label
app_train_init = pd.read_csv("../raw_data/application_train.csv").astype("object")
app_train["TARGET"] = app_train_init["TARGET"].astype(str)

#sample data
df_train = app_train.sample(frac=0.05, random_state=0)
df_train = df_train.astype('object')

#export
df_train.to_csv("./dashboard_data/df_train.csv", index=False)

print('df_train exported to dashboard_data folder.')

df_train exported to dashboard_data folder.


In [479]:
#Unlabelled dataset
app_test = pd.read_csv("../raw_data/application_test.csv").astype("object")

app_test_converted = convert_dtypes_func(app_test)
app_test_eng = feature_eng(app_test_converted)
app_test_selec_col = df_feature_selection(app_test_eng)

df_test = app_test_selec_col.sample(frac=0.05, random_state=0)
df_test = df_test.astype('object')

df_test.to_csv("./dashboard_data/df_test.csv", index=False)

print('df_test exported to dashboard_data folder.')


71 object columns were converted to int.
Feature engineering success
Feature selection success
df_test exported to dashboard_data folder.


# Serialization

In [393]:
joblib.dump(ohe, 'bin/ohe.joblib') #into a folder bin (for binary)
joblib.dump(categorical_imputer, 'bin/categorical_imputer.joblib')
joblib.dump(simple_imputer, 'bin/simple_imputer.joblib')
joblib.dump(scaler, 'bin/scaler.joblib')
joblib.dump(model, 'bin/model.joblib')

['bin/model.joblib']

# Pydantic Documentation

In [None]:
# from pydantic import BaseModel, create_model, Field, ValidationError

# model = {}
# for i in range(len(app_train.dtypes)):
#     name = app_train.dtypes.index[i]
#     var_type = type(app_train.iloc[0, i])
#     model.update({name: (var_type, Field(...))})    

In [None]:
# del model["SK_ID_CURR"]
# del model["TARGET"]
# # del model["DAYS_EMPLOYED_ANOM"]

In [None]:
# data_dict = joblib.dump(model, 'bin/data_dict.joblib')

# DEBUG (TO DELETE)

In [480]:
df_test = pd.read_csv("./dashboard_data/df_test.csv")

list_cat_features = ["NAME_CONTRACT_TYPE",
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_TYPE_SUITE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "FLAG_MOBIL",
    "FLAG_EMP_PHONE",
    "FLAG_WORK_PHONE",
    "FLAG_CONT_MOBILE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
    "OCCUPATION_TYPE",
    "WEEKDAY_APPR_PROCESS_START",
    "REG_REGION_NOT_LIVE_REGION",
    "REG_REGION_NOT_WORK_REGION",
    "LIVE_REGION_NOT_WORK_REGION",
    "REG_CITY_NOT_LIVE_CITY",
    "REG_CITY_NOT_WORK_CITY",
    "LIVE_CITY_NOT_WORK_CITY",
    "ORGANIZATION_TYPE",
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
    "FLAG_DOCUMENT_2",
    "FLAG_DOCUMENT_3",
    "FLAG_DOCUMENT_4",
    "FLAG_DOCUMENT_5",
    "FLAG_DOCUMENT_6",
    "FLAG_DOCUMENT_7",
    "FLAG_DOCUMENT_8",
    "FLAG_DOCUMENT_9",
    "FLAG_DOCUMENT_10",
    "FLAG_DOCUMENT_11",
    "FLAG_DOCUMENT_12",
    "FLAG_DOCUMENT_13",
    "FLAG_DOCUMENT_14",
    "FLAG_DOCUMENT_15",
    "FLAG_DOCUMENT_16",
    "FLAG_DOCUMENT_17",
    "FLAG_DOCUMENT_18",
    "FLAG_DOCUMENT_19",
    "FLAG_DOCUMENT_20",
    "FLAG_DOCUMENT_21"
]
list_num_features = [
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "AMT_CREDIT",
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "DAYS_EMPLOYED",
    "DAYS_REGISTRATION",
    "DAYS_ID_PUBLISH",
    "REGION_RATING_CLIENT",
    "REGION_POPULATION_RELATIVE",
    "CNT_FAM_MEMBERS",
    "HOUR_APPR_PROCESS_START",
    "OWN_CAR_AGE",
    "EXT_SOURCE_1",
    "EXT_SOURCE_2",
    "EXT_SOURCE_3",
    "APARTMENTS_AVG",
    "BASEMENTAREA_AVG",
    "YEARS_BEGINEXPLUATATION_AVG",
    "YEARS_BUILD_AVG",
    "COMMONAREA_AVG",
    "ELEVATORS_AVG",
    "ENTRANCES_AVG",
    "FLOORSMAX_AVG",
    "FLOORSMIN_AVG",
    "LANDAREA_AVG",
    "LIVINGAPARTMENTS_AVG",
    "LIVINGAREA_AVG",
    "NONLIVINGAPARTMENTS_AVG",
    "NONLIVINGAREA_AVG",
    "APARTMENTS_MODE",
    "YEARS_BEGINEXPLUATATION_MODE",
    "FLOORSMIN_MODE",
    "LIVINGAREA_MODE",
    "LANDAREA_MEDI",
    "TOTALAREA_MODE",
    "OBS_30_CNT_SOCIAL_CIRCLE",
    "DEF_30_CNT_SOCIAL_CIRCLE",
    "DEF_60_CNT_SOCIAL_CIRCLE",
    "DAYS_LAST_PHONE_CHANGE",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
    "AGE_INT",
    "annuity_income_ratio",
    "credit_annuity_ratio",
    "credit_goods_price_ratio",
    "credit_downpayment"
]


In [481]:
data_dict = joblib.load("./bin/data_dict.joblib")
ohe = joblib.load("./bin/ohe.joblib")
categorical_imputer = joblib.load("./bin/categorical_imputer.joblib")
simple_imputer = joblib.load("./bin/simple_imputer.joblib")
scaler = joblib.load("./bin/scaler.joblib")
model = joblib.load("./bin/model.joblib")

#SimpleImputing (most frequent) and ohe of categorical features
cat_array = categorical_imputer.transform(df_test[list_cat_features])
cat_array = ohe.transform(cat_array).todense()

#SimpleImputing (median) and StandardScaling of numerical features
num_array = simple_imputer.transform(df_test[list_num_features])
num_array = scaler.transform(num_array)

#concatenate
X = np.concatenate([cat_array, num_array], axis=1)
X = np.asarray(X)

#predict
result_proba = model.predict_proba(X)
y_pred_proba = result_proba[:,1]

df_test["pred"] = y_pred_proba
df_test["pred"] = round(df_test["pred"].astype(np.float64),4)




In [485]:
#cat columns list after ohe
cat_features_list_after_ohe = ohe.get_feature_names(list_cat_features).tolist()

#concatenate list of features
features_list_after_prepr_test = cat_features_list_after_ohe + list_num_features

#transform X into a dataframe with column names
ohe_dataframe_test = pd.DataFrame(X, columns=features_list_after_prepr_test)
ohe_dataframe_test["pred"] = y_pred_proba


In [482]:
df_test["pred"].value_counts()

0.00000    1992
0.64590       2
0.78840       2
0.48870       2
0.80480       2
           ... 
0.74710       1
0.51580       1
0.58120       1
0.59100       1
0.56160       1
Name: pred, Length: 429, dtype: int64

In [489]:
app_train.describe()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,HOUR_APPR_PROCESS_START,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,FLOORSMIN_MODE,LIVINGAREA_MODE,LANDAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AGE_INT,annuity_income_ratio,credit_annuity_ratio,credit_goods_price_ratio,credit_downpayment,pred,pred_loaded
count,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,252137.0,307511.0,307511.0,104582.0,307509.0,307511.0,307511.0,134133.0,306851.0,246546.0,151450.0,127568.0,157504.0,103023.0,92646.0,143620.0,152683.0,154491.0,98869.0,124921.0,97312.0,153161.0,93997.0,137829.0,151450.0,157504.0,98869.0,153161.0,124921.0,159080.0,306490.0,306490.0,306490.0,307510.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0,307511.0,307499.0,307499.0,307233.0,307233.0,307511.0,307511.0
mean,0.41705,168797.9193,599025.99971,27108.57391,538396.20743,0.02087,-2384.16932,-4986.12033,-2994.20237,12.06109,2.15267,2.05246,12.06342,0.50213,0.51439,0.51085,0.11744,0.08844,0.97773,0.75247,0.04462,0.07894,0.14972,0.22628,0.23189,0.06633,0.10077,0.1074,0.00881,0.02836,0.11423,0.97707,0.22806,0.10598,0.06717,0.10255,1.42225,0.14342,0.10005,-962.85879,0.0064,0.007,0.03436,0.2674,0.26547,1.89997,43.93698,7.35287,21.61225,1.12386,-60918.7968,0.57729,0.42271
std,0.72212,237123.14628,402490.777,14493.73732,369446.46054,0.01383,2338.36016,3522.88632,1509.45042,11.94481,0.91068,0.50903,3.26583,0.21106,0.19106,0.19484,0.10824,0.08244,0.05922,0.11328,0.07604,0.13458,0.10005,0.14464,0.16138,0.08118,0.09258,0.11056,0.04773,0.06952,0.10794,0.06458,0.16116,0.11185,0.08217,0.10746,2.40099,0.4467,0.36229,826.80849,0.08385,0.11076,0.20468,0.916,0.79406,1.86929,11.95613,9.44194,7.82362,0.12451,70495.11795,0.19817,0.19817
min,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-17912.0,-24672.0,-7197.0,0.0,1.0,1.0,0.0,0.01457,0.0,0.00053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4292.0,0.0,0.0,0.0,0.0,0.0,0.0,20.52,0.53,8.04,0.15,-540000.0,0.00134,0.0
25%,0.0,112500.0,270000.0,16524.0,238500.0,0.01001,-3175.0,-7479.5,-4299.0,5.0,2.0,2.0,10.0,0.33401,0.39246,0.37065,0.0577,0.0442,0.9767,0.6872,0.0078,0.0,0.069,0.1667,0.0833,0.0187,0.0504,0.0453,0.0,0.0,0.0525,0.9767,0.0833,0.0427,0.0187,0.0412,0.0,0.0,0.0,-1570.0,0.0,0.0,0.0,0.0,0.0,0.0,34.01,4.37,15.61,1.0,-99972.0,0.43492,0.26473
50%,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-1648.0,-4504.0,-3254.0,9.0,2.0,2.0,12.0,0.506,0.56596,0.53528,0.0876,0.0763,0.9816,0.7552,0.0211,0.0,0.1379,0.1667,0.2083,0.0481,0.0756,0.0745,0.0,0.0036,0.084,0.9816,0.2083,0.0731,0.0487,0.0688,0.0,0.0,0.0,-757.0,0.0,0.0,0.0,0.0,0.0,1.0,43.15,6.14,20.0,1.12,-39204.0,0.60033,0.39967
75%,1.0,202500.0,808650.0,34596.0,679500.0,0.02866,-767.0,-2010.0,-1720.0,15.0,3.0,2.0,14.0,0.67505,0.66362,0.66906,0.1485,0.1122,0.9866,0.8232,0.0515,0.12,0.2069,0.3333,0.375,0.0856,0.121,0.1299,0.0039,0.0277,0.1439,0.9866,0.375,0.1252,0.0868,0.1276,2.0,0.0,0.0,-274.0,0.0,0.0,0.0,0.0,0.0,3.0,53.92,8.71,27.1,1.2,0.0,0.73527,0.56508
max,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.07251,0.0,0.0,0.0,91.0,20.0,3.0,23.0,0.96269,0.855,0.89601,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,348.0,34.0,24.0,0.0,4.0,9.0,8.0,27.0,261.0,25.0,69.12,4466.59,45.31,6.0,765000.0,1.0,0.99866


In [490]:
app_train.columns

Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY',
       ...
       'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AGE_INT', 'annuity_income_ratio',
       'credit_annuity_ratio', 'credit_goods_price_ratio',
       'credit_downpayment', 'pred', 'pred_loaded'],
      dtype='object', length=103)

In [483]:
df_test.describe()

Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,FLOORSMIN_MODE,LIVINGAREA_MODE,LANDAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AGE_INT,annuity_income_ratio,credit_annuity_ratio,credit_goods_price_ratio,credit_downpayment,pred
count,2437.0,2437.0,2437.0,2437.0,2435.0,2437.0,2437.0,1992.0,2437.0,2437.0,838.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,1424.0,2437.0,1986.0,1266.0,1072.0,1323.0,858.0,771.0,1194.0,1288.0,1303.0,818.0,1055.0,809.0,1291.0,776.0,1150.0,1266.0,1323.0,818.0,1291.0,1055.0,1332.0,2435.0,2435.0,2435.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2125.0,2125.0,2125.0,2125.0,2125.0,2125.0,2437.0,2435.0,2435.0,2437.0,2437.0,2437.0
mean,280379.60854,0.37464,180298.98872,516991.74928,29741.67351,464163.87464,0.02121,-2553.78815,-4924.96758,-3028.98892,11.35322,1.0,0.8174,0.21297,0.99754,0.24949,0.15388,2.12556,2.04144,12.00369,0.02134,0.0636,0.0476,0.07961,0.22651,0.17357,0.49763,0.52213,0.49733,0.12055,0.09077,0.98021,0.74414,0.04764,0.08972,0.14739,0.23007,0.23748,0.06551,0.1068,0.11069,0.01198,0.02887,0.11677,0.97999,0.23171,0.10907,0.06623,0.10617,1.45092,0.1499,0.10719,-1070.69922,0.0,0.79565,0.0,0.01067,0.08289,0.0,0.0911,0.00451,0.0,0.00123,0.0,0.0,0.0,0.0,0.0,0.0,0.00123,0.0,0.0,0.0,0.00235,0.00188,0.00282,0.008,0.56094,1.99718,44.21022,7.13181,17.07612,1.13146,-52827.87464,0.10902
std,103418.47277,0.70038,111900.67572,364875.22338,16546.033,338666.92254,0.0145,2460.82434,3610.51022,1562.40508,9.93019,0.0,0.38642,0.40949,0.04957,0.4328,0.36091,0.87606,0.52554,3.29034,0.14454,0.24409,0.21296,0.27074,0.41866,0.37882,0.20253,0.17789,0.18921,0.11642,0.09337,0.02851,0.12694,0.08451,0.14687,0.10101,0.14563,0.16334,0.08168,0.10972,0.11595,0.06068,0.06424,0.11647,0.02856,0.16265,0.11881,0.08306,0.11404,2.30544,0.44514,0.36767,888.13304,0.0,0.40331,0.0,0.10276,0.27577,0.0,0.2878,0.06705,0.0,0.03507,0.0,0.0,0.0,0.0,0.0,0.0,0.03507,0.0,0.0,0.0,0.04846,0.05312,0.05307,0.09424,0.70489,1.8988,11.64006,4.69407,6.32433,0.12442,60584.62511,0.24371
min,100169.0,0.0,30150.0,45000.0,2965.5,45000.0,0.00128,-16295.0,-23722.0,-6187.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02891,1e-05,0.00053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0008,0.0,0.0,0.0,-3785.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21.95,1.17,8.04,1.0,-356400.0,0.0
25%,191464.0,0.0,112500.0,260640.0,18558.0,225000.0,0.01001,-3359.25,-7438.0,-4442.0,5.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.34095,0.41597,0.35729,0.0546,0.044,0.9762,0.6804,0.00825,0.0,0.069,0.1667,0.0833,0.0186,0.0504,0.0445,0.0,0.0,0.0504,0.9762,0.0833,0.0408,0.0186,0.0422,0.0,0.0,0.0,-1776.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.77,4.38,12.59,1.0,-83160.0,0.0
50%,277327.0,0.0,157500.0,450000.0,26118.0,369000.0,0.01885,-1766.5,-4376.0,-3206.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.50681,0.5601,0.51369,0.08635,0.07645,0.9816,0.7552,0.0238,0.0,0.1379,0.1667,0.2083,0.0455,0.0756,0.0751,0.0,0.00315,0.084,0.9811,0.2083,0.0738,0.0459,0.06895,0.0,0.0,0.0,-822.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,43.52,5.98,16.27,1.13,-35640.0,0.0
75%,370711.0,1.0,225000.0,675000.0,37730.25,630000.0,0.02866,-871.25,-1800.0,-1707.0,15.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.66058,0.66267,0.6529,0.1485,0.1117,0.9866,0.8164,0.0537,0.16,0.2069,0.3333,0.375,0.08455,0.1305,0.13955,0.0077,0.0295,0.1502,0.98635,0.375,0.13095,0.08605,0.13353,2.0,0.0,0.0,-340.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,53.83,8.375,20.65,1.21,0.0,0.0
max,456120.0,8.0,2340000.0,2156400.0,173704.5,1800000.0,0.07251,-32.0,0.0,-2.0,65.0,1.0,1.0,1.0,1.0,1.0,1.0,10.0,3.0,22.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9214,0.806,0.88253,1.0,1.0,0.9995,0.9932,0.8876,1.0,1.0,0.9583,1.0,1.0,1.0,1.0,0.8687,0.7419,1.0,0.9995,1.0,1.0,1.0,1.0,19.0,4.0,4.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,1.0,2.0,6.0,10.0,68.84,69.1,32.22,1.78,0.0,0.9451


In [486]:
ohe_dataframe_test.describe()

Unnamed: 0,NAME_CONTRACT_TYPE_Cash loans,NAME_CONTRACT_TYPE_Revolving loans,CODE_GENDER_F,CODE_GENDER_M,CODE_GENDER_XNA,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_TYPE_SUITE_Children,NAME_TYPE_SUITE_Family,NAME_TYPE_SUITE_Group of people,NAME_TYPE_SUITE_Other_A,NAME_TYPE_SUITE_Other_B,"NAME_TYPE_SUITE_Spouse, partner",NAME_TYPE_SUITE_Unaccompanied,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,NAME_EDUCATION_TYPE_Incomplete higher,NAME_EDUCATION_TYPE_Lower secondary,NAME_EDUCATION_TYPE_Secondary / secondary special,NAME_FAMILY_STATUS_Civil marriage,NAME_FAMILY_STATUS_Married,NAME_FAMILY_STATUS_Separated,NAME_FAMILY_STATUS_Single / not married,NAME_FAMILY_STATUS_Unknown,NAME_FAMILY_STATUS_Widow,NAME_HOUSING_TYPE_Co-op apartment,NAME_HOUSING_TYPE_House / apartment,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Office apartment,NAME_HOUSING_TYPE_Rented apartment,NAME_HOUSING_TYPE_With parents,FLAG_MOBIL_0,FLAG_MOBIL_1,FLAG_EMP_PHONE_0,FLAG_EMP_PHONE_1,FLAG_WORK_PHONE_0,FLAG_WORK_PHONE_1,FLAG_CONT_MOBILE_0,FLAG_CONT_MOBILE_1,FLAG_PHONE_0,FLAG_PHONE_1,FLAG_EMAIL_0,FLAG_EMAIL_1,OCCUPATION_TYPE_Accountants,OCCUPATION_TYPE_Cleaning staff,OCCUPATION_TYPE_Cooking staff,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Drivers,OCCUPATION_TYPE_HR staff,OCCUPATION_TYPE_High skill tech staff,OCCUPATION_TYPE_IT staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,REG_REGION_NOT_LIVE_REGION_0,REG_REGION_NOT_LIVE_REGION_1,REG_REGION_NOT_WORK_REGION_0,REG_REGION_NOT_WORK_REGION_1,LIVE_REGION_NOT_WORK_REGION_0,LIVE_REGION_NOT_WORK_REGION_1,REG_CITY_NOT_LIVE_CITY_0,REG_CITY_NOT_LIVE_CITY_1,REG_CITY_NOT_WORK_CITY_0,REG_CITY_NOT_WORK_CITY_1,LIVE_CITY_NOT_WORK_CITY_0,LIVE_CITY_NOT_WORK_CITY_1,ORGANIZATION_TYPE_Advertising,ORGANIZATION_TYPE_Agriculture,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Business Entity Type 1,ORGANIZATION_TYPE_Business Entity Type 2,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Cleaning,ORGANIZATION_TYPE_Construction,ORGANIZATION_TYPE_Culture,ORGANIZATION_TYPE_Electricity,ORGANIZATION_TYPE_Emergency,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Hotel,ORGANIZATION_TYPE_Housing,ORGANIZATION_TYPE_Industry: type 1,ORGANIZATION_TYPE_Industry: type 10,ORGANIZATION_TYPE_Industry: type 11,ORGANIZATION_TYPE_Industry: type 12,ORGANIZATION_TYPE_Industry: type 13,ORGANIZATION_TYPE_Industry: type 2,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Industry: type 4,ORGANIZATION_TYPE_Industry: type 5,ORGANIZATION_TYPE_Industry: type 6,ORGANIZATION_TYPE_Industry: type 7,ORGANIZATION_TYPE_Industry: type 8,ORGANIZATION_TYPE_Industry: type 9,ORGANIZATION_TYPE_Insurance,ORGANIZATION_TYPE_Kindergarten,ORGANIZATION_TYPE_Legal Services,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Mobile,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Postal,ORGANIZATION_TYPE_Realtor,ORGANIZATION_TYPE_Religion,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_School,ORGANIZATION_TYPE_Security,ORGANIZATION_TYPE_Security Ministries,ORGANIZATION_TYPE_Self-employed,ORGANIZATION_TYPE_Services,ORGANIZATION_TYPE_Telecom,ORGANIZATION_TYPE_Trade: type 1,ORGANIZATION_TYPE_Trade: type 2,ORGANIZATION_TYPE_Trade: type 3,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA,FONDKAPREMONT_MODE_not specified,FONDKAPREMONT_MODE_org spec account,FONDKAPREMONT_MODE_reg oper account,FONDKAPREMONT_MODE_reg oper spec account,HOUSETYPE_MODE_block of flats,HOUSETYPE_MODE_specific housing,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,FLAG_DOCUMENT_2_0,FLAG_DOCUMENT_2_1,FLAG_DOCUMENT_3_0,FLAG_DOCUMENT_3_1,FLAG_DOCUMENT_4_0,FLAG_DOCUMENT_4_1,FLAG_DOCUMENT_5_0,FLAG_DOCUMENT_5_1,FLAG_DOCUMENT_6_0,FLAG_DOCUMENT_6_1,FLAG_DOCUMENT_7_0,FLAG_DOCUMENT_7_1,FLAG_DOCUMENT_8_0,FLAG_DOCUMENT_8_1,FLAG_DOCUMENT_9_0,FLAG_DOCUMENT_9_1,FLAG_DOCUMENT_10_0,FLAG_DOCUMENT_10_1,FLAG_DOCUMENT_11_0,FLAG_DOCUMENT_11_1,FLAG_DOCUMENT_12_0,FLAG_DOCUMENT_12_1,FLAG_DOCUMENT_13_0,FLAG_DOCUMENT_13_1,FLAG_DOCUMENT_14_0,FLAG_DOCUMENT_14_1,FLAG_DOCUMENT_15_0,FLAG_DOCUMENT_15_1,FLAG_DOCUMENT_16_0,FLAG_DOCUMENT_16_1,FLAG_DOCUMENT_17_0,FLAG_DOCUMENT_17_1,FLAG_DOCUMENT_18_0,FLAG_DOCUMENT_18_1,FLAG_DOCUMENT_19_0,FLAG_DOCUMENT_19_1,FLAG_DOCUMENT_20_0,FLAG_DOCUMENT_20_1,FLAG_DOCUMENT_21_0,FLAG_DOCUMENT_21_1,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,REGION_RATING_CLIENT,REGION_POPULATION_RELATIVE,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,OWN_CAR_AGE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,FLOORSMIN_MODE,LIVINGAREA_MODE,LANDAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AGE_INT,annuity_income_ratio,credit_annuity_ratio,credit_goods_price_ratio,credit_downpayment,pred
count,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0,2437.0
mean,0.99138,0.00862,0.66188,0.33812,0.0,0.65613,0.34387,0.30529,0.69471,0.01026,0.12515,0.00123,0.00287,0.00287,0.0279,0.82971,0.0,0.23389,0.0,0.1826,0.07468,0.0,0.0,0.50882,0.00082,0.26631,0.03734,0.00903,0.6865,0.07386,0.67706,0.06401,0.1387,0.0,0.04637,0.00205,0.89249,0.03242,0.00862,0.01847,0.04596,0.0,1.0,0.1826,0.8174,0.78703,0.21297,0.00246,0.99754,0.75051,0.24949,0.84612,0.15388,0.03734,0.01559,0.02257,0.09643,0.06196,0.0,0.03611,0.00205,0.49692,0.00492,0.07386,0.02134,0.00492,0.00246,0.0952,0.00451,0.02175,0.00205,0.14485,0.18178,0.08699,0.04268,0.17481,0.1986,0.17029,0.97866,0.02134,0.9364,0.0636,0.9524,0.0476,0.92039,0.07961,0.77349,0.22651,0.82643,0.17357,0.00123,0.00533,0.00657,0.01847,0.03201,0.22323,0.00041,0.02421,0.0,0.00246,0.00041,0.0279,0.00287,0.00903,0.00287,0.0,0.00862,0.00205,0.00041,0.00164,0.01477,0.00205,0.00246,0.00041,0.00451,0.0,0.01108,0.00082,0.02708,0.00205,0.03406,0.0119,0.00082,0.06073,0.00903,0.00451,0.00164,0.0,0.00985,0.02749,0.0078,0.00821,0.12064,0.00287,0.00246,0.00082,0.00492,0.0119,0.00041,0.0,0.00287,0.0238,0.00041,0.01026,0.00492,0.01805,0.00205,0.1826,0.02093,0.01847,0.9167,0.04391,0.9881,0.00616,0.00574,0.02872,0.00492,0.00533,0.00903,0.71522,0.22076,0.016,0.98974,0.01026,1.0,0.0,0.20435,0.79565,1.0,0.0,0.98933,0.01067,0.91711,0.08289,1.0,0.0,0.9089,0.0911,0.99549,0.00451,1.0,0.0,0.99877,0.00123,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.99877,0.00123,1.0,0.0,1.0,0.0,1.0,0.0,-0.05873,0.0485,-0.20382,0.18141,-0.2008,-150924.80635,-1.25146,0.55555,1.98499,-1.40823,-0.02976,19.54928,-0.08752,-0.02296,0.03994,-0.06493,0.03128,0.02484,0.02882,-0.04542,0.03042,0.07085,-0.0121,0.037,0.02398,0.00247,0.04482,0.03493,0.04207,0.01719,0.02765,0.03136,0.01639,0.03328,0.00164,0.03748,0.01343,0.01531,0.02041,-0.13043,-0.04469,-0.04284,-0.14293,-0.2618,0.34876,0.05157,0.02285,-0.02349,-0.5795,0.06113,0.11454,0.10902
std,0.09245,0.09245,0.47317,0.47317,0.0,0.47509,0.47509,0.46063,0.46063,0.10078,0.33096,0.03507,0.05353,0.05353,0.16473,0.37597,0.0,0.42339,0.0,0.38642,0.26293,0.0,0.0,0.50002,0.02864,0.44212,0.18963,0.0946,0.46401,0.2616,0.4677,0.24483,0.3457,0.0,0.21032,0.04526,0.30982,0.17714,0.09245,0.13465,0.20944,0.0,0.0,0.38642,0.38642,0.40949,0.40949,0.04957,0.04957,0.4328,0.4328,0.36091,0.36091,0.18963,0.12392,0.14855,0.29524,0.24113,0.0,0.1866,0.04526,0.50009,0.07001,0.2616,0.14454,0.07001,0.04957,0.29355,0.06705,0.14589,0.04526,0.35202,0.38574,0.28188,0.20217,0.37988,0.39903,0.37597,0.14454,0.14454,0.24409,0.24409,0.21296,0.21296,0.27074,0.27074,0.41866,0.41866,0.37882,0.37882,0.03507,0.07286,0.08078,0.13465,0.17605,0.41649,0.02026,0.15373,0.0,0.04957,0.02026,0.16473,0.05353,0.0946,0.05353,0.0,0.09245,0.04526,0.02026,0.04049,0.12066,0.04526,0.04957,0.02026,0.06705,0.0,0.10469,0.02864,0.16236,0.04526,0.18142,0.10846,0.02864,0.23888,0.0946,0.06705,0.04049,0.0,0.09877,0.16355,0.08797,0.09024,0.32578,0.05353,0.04957,0.02864,0.07001,0.10846,0.02026,0.0,0.05353,0.15246,0.02026,0.10078,0.07001,0.13318,0.04526,0.38642,0.14317,0.13465,0.27639,0.20493,0.10846,0.07823,0.07559,0.16706,0.07001,0.07286,0.0946,0.4514,0.41485,0.12551,0.10078,0.10078,0.0,0.0,0.40331,0.40331,0.0,0.0,0.10276,0.10276,0.27577,0.27577,0.0,0.0,0.2878,0.2878,0.06705,0.06705,0.0,0.0,0.03507,0.03507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03507,0.03507,0.0,0.0,0.0,0.0,0.0,0.0,0.9699,0.47191,0.90654,1.14119,0.91708,175962.25449,1.69016,0.4435,0.00035,0.00204,0.96199,6.4639,1.78481,1.11076,0.93202,0.9811,1.10441,1.16645,0.49537,1.1509,1.13877,1.12088,1.03993,1.04029,1.03717,1.03624,1.21548,1.08205,1.29883,0.95316,1.10715,0.45504,1.0326,1.09519,1.04081,1.09123,0.96101,0.99763,1.01602,1.07417,0.58013,0.4814,0.25989,0.10275,0.91979,1.02184,0.97356,0.49697,0.80812,0.99978,0.85977,0.24371
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.57754,-0.58471,-1.3765,-1.66579,-1.33586,-1178130.09487,-10.05079,-0.34088,1.9843,-1.41103,-1.26572,-0.10306,-3.69383,-3.41014,-2.69553,-2.94827,-1.32145,-1.52224,-23.08874,-11.50177,-0.65386,-0.36851,-2.03223,-1.84181,-2.34234,-1.05702,-1.56558,-1.13972,-0.10085,-0.30526,-1.28027,-21.16424,-2.33711,-1.10973,-1.05744,-1.08024,-0.59103,-0.32048,-0.27566,-3.41331,-0.07099,-0.05877,-0.15584,-0.26995,-0.30862,-1.00733,-1.83897,-0.65484,-1.73481,-0.99521,-4.19351,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.57754,-0.23742,-0.84073,-0.58996,-0.84843,-214587.92445,-2.42787,0.15445,1.98497,-1.4098,-0.16764,15.61301,-0.01942,-0.26792,-0.51625,-0.66397,-0.25573,-0.09427,0.02309,0.01394,-0.16439,-0.36851,-0.08299,-0.28038,-0.0823,-0.14105,-0.14925,-0.25564,-0.10085,-0.23048,-0.19276,0.01778,-0.06917,-0.21797,-0.14116,-0.27756,-0.59103,-0.32048,-0.27566,-0.98347,-0.07099,-0.05877,-0.15584,-0.26995,-0.30862,-0.44093,-0.76672,-0.31486,-1.15322,-0.99521,-0.31591,0.0
50%,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,-0.57754,-0.04765,-0.37026,-0.06834,-0.45849,-95726.71576,-0.99448,0.5053,1.98497,-1.40856,-0.16764,19.54202,-0.01942,0.0121,0.23887,0.11206,-0.18985,-0.09427,0.04666,0.01394,-0.16439,-0.36851,-0.08299,-0.28038,-0.0823,-0.14105,-0.14925,-0.20548,-0.10085,-0.23048,-0.19276,0.0502,-0.06917,-0.20308,-0.14116,-0.22068,-0.59103,-0.32048,-0.27566,0.17036,-0.07099,-0.05877,-0.15584,-0.26995,-0.30862,-0.44093,-0.03488,-0.1454,-0.68156,0.04939,0.35846,0.0
75%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.80727,0.23702,0.18876,0.73178,0.24827,-23571.31049,0.21141,0.93081,1.98497,-1.40718,-0.16764,23.47104,-0.01942,0.33561,0.77627,0.59423,-0.1498,-0.09427,0.05844,0.01394,-0.16439,-0.36851,-0.08299,-0.28038,-0.0823,-0.14105,-0.14925,-0.15908,-0.10085,-0.23048,-0.19276,0.061,-0.06917,-0.14107,-0.14116,-0.15242,0.24286,-0.32048,-0.27566,0.75333,-0.07099,-0.05877,-0.15584,-0.26995,1.03536,0.69188,0.82744,0.10773,-0.12299,0.69222,0.86423,0.0
max,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.50095,9.15645,3.86935,10.11465,3.41653,-0.14591,1.05403,1.41479,1.98563,-1.40102,8.61703,39.18711,16.20924,2.99184,1.52718,2.09938,11.59639,17.19293,0.46855,3.64309,19.93654,9.62672,12.10295,7.13431,8.50756,17.98607,17.16893,11.40035,32.43513,15.10432,11.66626,0.43705,8.55075,11.29312,17.75732,11.55031,7.33095,8.64749,10.78218,1.16455,12.74722,19.35175,5.08724,2.0643,7.75527,4.65672,2.08287,6.53981,1.3559,5.27238,0.86423,0.94515


In [419]:
df_test.shape[0] - 1992

445

In [408]:
df_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,FLOORSMIN_MODE,LIVINGAREA_MODE,LANDAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AGE_INT,annuity_income_ratio,credit_annuity_ratio,credit_goods_price_ratio,credit_downpayment,pred
0,223197,Cash loans,F,N,Y,0.0,117000.0,630000.0,32296.5,630000.0,"Spouse, partner",Working,Higher education,Married,With parents,0.028663,-3692.0,-8012.0,-4939.0,,1,1,0,1,0,1,Core staff,2.0,2.0,WEDNESDAY,9.0,0,0,0,0,0,0,Kindergarten,,0.466105,0.411849,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0,38.14,3.62,19.51,1.0,0.0,0.0
1,135591,Cash loans,F,N,Y,0.0,202500.0,675000.0,36747.0,675000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.01452,-1951.0,-6206.0,-1607.0,,1,1,0,1,1,1,Sales staff,2.0,2.0,SATURDAY,9.0,0,0,0,0,1,1,Business Entity Type 3,,0.260932,0.533482,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,-306.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0,50.61,5.51,18.37,1.0,0.0,0.0
2,334756,Cash loans,F,Y,Y,1.0,121500.0,450000.0,23107.5,450000.0,,Commercial associate,Higher education,Married,House / apartment,0.018209,-3939.0,-4367.0,-4367.0,23.0,1,1,0,1,0,1,Accountants,3.0,3.0,MONDAY,7.0,0,0,0,0,0,0,Business Entity Type 3,0.514785,0.354315,0.251239,,,,,,,,,,,,,,,,,,,,,,,,,3.0,1.0,1.0,-2581.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,45.02,5.26,19.47,1.0,0.0,0.0
3,212448,Cash loans,M,Y,Y,0.0,225000.0,765000.0,39060.0,765000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.00702,-1460.0,-461.0,-4777.0,9.0,1,1,1,1,0,0,Core staff,2.0,2.0,TUESDAY,9.0,0,0,0,0,0,0,Self-employed,,0.575978,0.683269,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,-1660.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,6.0,44.96,5.76,19.59,1.0,0.0,0.0
4,157306,Cash loans,F,N,N,0.0,76500.0,315000.0,17086.5,315000.0,Unaccompanied,Working,Incomplete higher,Single / not married,House / apartment,0.008625,-789.0,-3529.0,-1682.0,,1,1,1,1,0,0,,1.0,2.0,SATURDAY,13.0,0,0,0,0,0,0,Business Entity Type 2,0.368835,0.311492,0.228883,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,-871.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,24.68,4.48,18.44,1.0,0.0,0.0


In [372]:
df_test[df_test["SK_ID_CURR"] == 334756]

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,YEARS_BEGINEXPLUATATION_MODE,FLOORSMIN_MODE,LIVINGAREA_MODE,LANDAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,AGE_INT,annuity_income_ratio,credit_annuity_ratio,credit_goods_price_ratio,credit_downpayment
2,334756,Cash loans,F,Y,Y,1.0,121500.0,450000.0,23107.5,450000.0,,Commercial associate,Higher education,Married,House / apartment,0.018209,-3939.0,-4367.0,-4367.0,23.0,1,1,0,1,0,1,Accountants,3.0,3.0,MONDAY,7.0,0,0,0,0,0,0,Business Entity Type 3,0.514785,0.354315,0.251239,,,,,,,,,,,,,,,,,,,,,,,,,3.0,1.0,1.0,-2581.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,45.02,5.26,19.47,1.0,0.0


In [375]:
model.predict_proba(X).shape

(2437, 2)

In [463]:
ohe_dataframe_test["DAYS_EMPLOYED"].describe()

count       2437.00000
mean     -150924.80635
std       175962.25449
min     -1178130.09487
25%      -214587.92445
50%       -95726.71576
75%       -23571.31049
max           -0.14591
Name: DAYS_EMPLOYED, dtype: float64