# Data Scientist - P7 - Laurent Trichet

## Implémentez un modèle de scoring

## TESTS

### Import

In [1]:
# Import default libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Import Garbage Collector (empty dataFrame memory)
import gc

# Remove some warnings
import warnings
warnings.filterwarnings('ignore')
import logging
logging.disable(logging.WARNING)


# Import Imbalanced-learn necessary tools
import imblearn
from collections import Counter

# Import for classification GradientBoostingClassifier & SVC
from sklearn import ensemble
from sklearn import svm
# Import for classification xgboost
from xgboost import XGBClassifier

# Import evaluation tool for classification optimisations
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

# Import tools for model interpretation: AUC, ROC, permutations
from sklearn import metrics

# tools for execution time estimates
from datetime import datetime

import requests
# HTTP headers sent with every JSON request to the APIs
headers = {"Content-Type": "application/json"}

# Pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 120)
pd.set_option('display.max_info_rows', 2000)

# Seaborn theme first, then the Matplotlib font sizes that override it
sns.set_palette("Set1")
sns.set_style('whitegrid')
sns.set_context('paper')
mpl.rcParams.update({
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
})

# Directories and sample size
DIRSOURCE = '../Sources/'
DIRDATASET = '../credithome_datasets/'
NUMROWS = 15000    # 1000000 to get complete dataset
# Input files: reduced dataset of NUMROWS lines (NaN filled with zeros),
# feature table and filter table
FILESTD_FNAN0_REDUCED = f'{DIRDATASET}Credit_Home_Junction_Std_Fnan0_Reduced_{NUMROWS}.csv'
FILEFEAT_IN = f'{DIRDATASET}Credit_Home_Features.csv'
FILEFILT_IN = f'{DIRDATASET}Credit_Home_Filters.csv'


### Load data sets & list features

In [2]:
# Load the reduced dataset (tab-separated, Latin-1 encoded) and the filter table
df = pd.read_csv(FILESTD_FNAN0_REDUCED, encoding='Latin-1', sep='\t')
df_filter = pd.read_csv(FILEFILT_IN, sep='\t')

In [3]:
df_feat = pd.read_csv(FILEFEAT_IN, sep='\t')
# Keep only the first 12 rows of the feature table (purely positional slice).
# NOTE(review): the previous comment said this drops zero-importance features;
# that only holds if the file is sorted by decreasing importance - confirm.
df_feat = df_feat.iloc[0:12,:]

### Test of web API

In [9]:
# Base URL of the web API under test; alternative hosts are kept below,
# commented out, to switch targets quickly
api_url = 'http://oc.14eight.com:5001/api/'
#api_url = 'http://ec2-18-218-21-153.us-east-2.compute.amazonaws.com:5001/api/'
# api_url = 'http://localhost:5001/api/'


In [15]:
# Call the 'client_list/' endpoint with an empty payload and print every
# entry of the returned 'data' list, or the HTTP status on failure.
api_test = 'client_list/'

data = []
data_json = {'data': [data]}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

{'data': [[]]}
V
http://oc.14eight.com:5001/api/client_list/
V
100983 - Woman - 46 years old - Amount Loan: USD $625536
100984 - Man - 30 years old - Amount Loan: USD $675000
100987 - Woman - 33 years old - Amount Loan: USD $500490
100997 - Man - 29 years old - Amount Loan: USD $199152
100998 - Woman - 31 years old - Amount Loan: USD $601470
101002 - Woman - 54 years old - Amount Loan: USD $257391
101005 - Woman - 30 years old - Amount Loan: USD $900000
101008 - Woman - 36 years old - Amount Loan: USD $497520
101020 - Man - 46 years old - Amount Loan: USD $332946
101021 - Man - 48 years old - Amount Loan: USD $225000
101024 - Woman - 34 years old - Amount Loan: USD $582804
101025 - Woman - 31 years old - Amount Loan: USD $375408
101026 - Man - 60 years old - Amount Loan: USD $450000
101031 - Woman - 58 years old - Amount Loan: USD $174132
101041 - Man - 49 years old - Amount Loan: USD $454500
101051 - Man - 45 years old - Amount Loan: USD $400392
101055 - Man - 40 years old - Amount Lo

In [16]:
# Call the 'client_score/' endpoint for one client id and print the
# returned 'score', or the HTTP status on failure.
api_test = 'client_score/'
data_id_client = 100983
data_json = {'SK_ID_CURR': data_id_client}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    score = data_json['score']
    print(score)
else:
    print(f'HTTP error: {response.status_code}')

{'SK_ID_CURR': 100983}
V
http://oc.14eight.com:5001/api/client_score/
V
HTTP error: 500


In [17]:
# Call the 'feature_list/' endpoint with an empty payload and print every
# entry of the returned 'data' list, or the HTTP status on failure.
api_test = 'feature_list/'

data = []
data_json = {'data': [data]}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

{'data': [[]]}
V
http://oc.14eight.com:5001/api/feature_list/
V
['EXT_SOURCE_3', 0.1, '(Application) Normalized score from external data source']
['EXT_SOURCE_2', 0.07, '(Application) Normalized score from external data source']
['AMT_ANNUITY', 0.02, '(Application) Loan annuity']
['EXT_SOURCE_1', 0.02, '(Application) Normalized score from external data source']
['DAYS_BIRTH', 0.02, "(Application) Client's age at the time of application"]
['CODE_GENDER', 0.01, '(Application) Gender of the client']
['AMT_GOODS_PRICE', 0.01, '(Application) For consumer loans it is the price of the goods for which the loan is given']
['BURO_DAYS_CREDIT_MAX', 0.01, '(Bureau) MAX, How many days before current application did client apply for Credit Bureau credit']
['INSTAL_DAYS_ENTRY_PAYMENT_MAX', 0.01, '(Installments Payments) MAX, When was the installments of previous credit paid actually (relative to application date of current loan)']
['DAYS_ID_PUBLISH', 0.01, '(Application) How many days before the appl

In [None]:
# Call the 'fnr_list/' endpoint with an empty payload and print every
# entry of the returned 'data' list, or the HTTP status on failure.
api_test = 'fnr_list/'

data = []
data_json = {'data': [data]}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

##### false positives

In [None]:
# Call the 'fpr_list/' endpoint with an empty payload and print every
# entry of the returned 'data' list, or the HTTP status on failure.
# `data` is left holding the returned rows for the cells that follow.
api_test = 'fpr_list/'

data = []
data_json = {'data': [data]}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

In [None]:
def filter_list(inlist):
    """Return the rows of `inlist` whose first element is a reference rate.

    A row is kept when its element at index 0 is exactly equal to one of
    0.05, 0.1, 0.2, ..., 0.9. Rows are returned in their original order,
    in a new list.
    """
    kept_rates = (0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    return [row for row in inlist if row[0] in kept_rates]


# Keep only the reference rates among the rows returned by the last API call
risk_fpr_list = filter_list(data)

# Map each threshold (column 1, scaled to 0-100 and truncated) to a
# 'max <rate>%' label (column 0), between the fixed end marks 0 and 100
risk_marks = {
    0: '100%',
    **{int(100*row[1]): f'max {int(100*row[0])}%' for row in risk_fpr_list},
    100: '0%',
}


In [None]:
# Display the rows kept by filter_list
risk_fpr_list

In [None]:
# Display the threshold -> label marks built from risk_fpr_list
risk_marks

In [None]:
import plotly.graph_objects as go

In [None]:


# Draw a gauge for an example score: one coloured band per threshold of
# `risk_fpr_list` (column 1, scaled to 0-100) and a green marker on the
# first threshold that the score exceeds.
score = 0.78
n = len(risk_fpr_list)
if n == 0:
    raise ValueError('risk_fpr_list is empty: nothing to draw on the gauge')

# Index of the first threshold strictly below the score. When the score
# exceeds none of them, fall back to the last row instead of indexing past
# the end of the list (the original loop raised IndexError in that case).
i = next((k for k, row in enumerate(risk_fpr_list) if score > row[1]), n - 1)
librisk = int(100 * risk_fpr_list[i][1])
score = int(100 * score)

# Band edges from 100 down to 0, with one edge per threshold.
# NOTE(review): assumes the thresholds are in decreasing order - confirm
# against what the 'fpr_list/' endpoint returns.
edges = [100] + [100*row[1] for row in risk_fpr_list] + [0]
# Colours from the lightest (top band) to the darkest; when there are more
# bands than colours the darkest one is reused.
band_colors = ["#FFDFDF", "#FFBFBF", "#FF9F9F", "#FF7F7F",
               "#FF5F5F", "#FF3F3F", "#FF1F1F", "#FF0F0F"]
steps = [
    {'range': [edges[k + 1], edges[k]],
     'color': band_colors[min(k, len(band_colors) - 1)]}
    for k in range(n + 1)
]

fig = go.Figure(
        go.Indicator(
            domain={'x': [0, 1], 'y': [0, 1]},
            value=score,
            mode="gauge+number+delta",
            title={'text': "Score"},
            # delta={'reference': 70},
            gauge={'axis': {'range': [None, 100]},
                   'steps': steps,
                   'threshold': {'line': {'color': "green", 'width': 4},
                                 'thickness': 0.75, 'value': librisk}
                   }
        )
    )
fig.show()

In [8]:
# Call the 'feature_values/' endpoint for one client with the filter flag
# set to 'Y' and six value ranges, then print the client values, the
# population and every entry of the returned 'data' list.
api_test = 'feature_values/'
data_id_client = 100001
data_filter = 'Y'
data = [[0.0, 1], [0, 1], [0, 999999], [0, 1], [-99, -60], [0, 1]]
data_json = {'SK_ID_CURR': data_id_client,
             'FILTER_Y_N': data_filter,
             'RANGES': data}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data_client = data_json['data_client']
    print(data_client)
    pop = data_json['population']
    print(pop)
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

{'SK_ID_CURR': 100001, 'FILTER_Y_N': 'Y', 'RANGES': [[0.6, 1], [0, 1], [0, 999999], [0, 1], [-99, -60], [0, 1]]}
V
http://localhost:5001/api/feature_values/
V
[[0.16, 0.79, 20560.5, 0.75, -53.0, 1.0, 450000.0, -49.0, -1628.0, -812.0, -735.0, -1740.0]]
[22, 803]
['EXT_SOURCE_3', 0.16, 0.64, 0.72, 0.83, 0.6, 0.71, 0.88]
['EXT_SOURCE_2', 0.79, 0.01, 0.38, 0.76, 0.0, 0.53, 0.8]
['AMT_ANNUITY', 20560.5, 9265.5, 23404.7, 58203.0, 2596.5, 23891.45, 110488.5]
['EXT_SOURCE_1', 0.75, 0.0, 0.15, 0.82, 0.0, 0.15, 0.93]
['DAYS_BIRTH', -53.0, -69.0, -63.59, -60.0, -69.0, -63.53, -60.0]
['CODE_GENDER', 1.0, 0.0, 0.68, 1.0, 0.0, 0.79, 1.0]


In [6]:
# Call the 'feature_values/' endpoint for one client with the filter flag
# set to 'N' and five value ranges, then print the client values, the
# population and every entry of the returned 'data' list.
api_test = 'feature_values/'
data_id_client = 100001
data_filter = 'N'
data = [[0, 1], [0, 1], [0, 999999], [0, 1], [-99, 0]]
data_json = {'SK_ID_CURR': data_id_client,
             'FILTER_Y_N': data_filter,
             'RANGES': data}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data_client = data_json['data_client']
    print(data_client)
    pop = data_json['population']
    print(pop)
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

{'SK_ID_CURR': 100001, 'FILTER_Y_N': 'N', 'RANGES': [[0, 1], [0, 1], [0, 999999], [0, 1], [-99, 0]]}
V
http://oc.14eight.com:5001/api/feature_values/
V
[[0.16, 0.79, 20560.5, 0.75, -53.0, 1.0, 450000.0, -49.0, -1628.0, -812.0, -735.0, -1740.0]]
[1174, 13826]
['EXT_SOURCE_3', 0.16, 0.0, 0.29, 0.86, 0.0, 0.42, 0.89]
['EXT_SOURCE_2', 0.79, 0.0, 0.42, 0.8, 0.0, 0.52, 0.85]
['AMT_ANNUITY', 20560.5, 2844.0, 26742.22, 105511.5, 2596.5, 27133.93, 225000.0]
['EXT_SOURCE_1', 0.75, 0.0, 0.16, 0.93, 0.0, 0.22, 0.93]
['DAYS_BIRTH', -53.0, -69.0, -40.68, -22.0, -69.0, -44.63, -22.0]
['CODE_GENDER', 1.0, 0.0, 0.57, 1.0, 0.0, 0.66, 1.0]
['AMT_GOODS_PRICE', 450000.0, 0.0, 488310.2, 2961000.0, 0.0, 544903.73, 4050000.0]
['BURO_DAYS_CREDIT_MAX', -49.0, -2922.0, -308.86, 0.0, -2922.0, -433.87, 0.0]
['INSTAL_DAYS_ENTRY_PAYMENT_MAX', -1628.0, -2931.0, -344.77, 0.0, -2920.0, -308.18, 0.0]
['DAYS_ID_PUBLISH', -812.0, -5888.0, -2684.38, -10.0, -6228.0, -3000.77, 0.0]
['BURO_DAYS_CREDIT_MEAN', -735.0, -2922.0, 

In [None]:
# Call the 'population_count/' endpoint with the filter flag set to 'Y'
# and six value ranges, then print the returned 'population'.
api_test = 'population_count/'
data_filter = 'Y'
data = [[0.6, 1], [0, 1], [0, 999999], [0, 1], [-99, -60], [0, 1]]
data_json = {'FILTER_Y_N': data_filter, 'RANGES': data}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    pop = data_json['population']
    print(pop)
else:
    print(f'HTTP error: {response.status_code}')


In [None]:
# Call the 'filters/' endpoint with an empty payload and print every
# entry of the returned 'data' list, or the HTTP status on failure.
api_test = 'filters/'
data = []
data_json = {'data': [data]}
print(data_json, 'V', api_url+api_test, 'V', sep='\n')
response = requests.post(api_url+api_test, headers=headers, json=data_json)
if response.status_code == 200:
    data_json = response.json()
    data = data_json['data']
    for entry in data:
        print(entry)
else:
    print(f'HTTP error: {response.status_code}')

### Test of Model API

In [None]:
# Rows whose TARGET holds the 999 marker, and the columns sent to the model
# (everything except the index, the target and the client id)
df_test = df.loc[df['TARGET'] == 999]
c_features = [col for col in df.columns
              if col not in ('index', 'TARGET', 'SK_ID_CURR')]

In [None]:
# Test API AWS OR LAPTOP
# Send a random window of consecutive test rows to the model endpoint, one
# request per row, and collect the first element of each JSON response.
model_url = 'http://oc.14eight.com:5000/invocations'
# model_url = 'http://ec2-18-218-21-153.us-east-2.compute.amazonaws.com:5000/invocations'
# model_url = 'http://localhost:5000/invocations'

# Never ask for more rows than the test set holds (randint would raise)
max_val = min(20, df_test.shape[0])
# NaN marks rows whose request failed; a 0 would look like a real prediction
tab_rep = np.full(max_val, np.nan)
tab_id = []
# Any window of max_val rows is eligible, including the first and last rows
first_pos = np.random.randint(0, df_test.shape[0] - max_val + 1)
# Select the feature columns once instead of on every iteration
df_test_features = df_test[c_features]
for i in np.arange(0, max_val):
    data = df_test_features.iloc[first_pos+i,:].to_list()
    data_json = {'data': [data]}
    response = requests.request(method='POST',
                                headers=headers,
                                url=model_url,
                                json=data_json
                            )
    if response.status_code != 200:
        print(f'HTTP error: {response.status_code}')
    else:
        # Parse the body once and reuse it for storage and display
        prediction = response.json()
        tab_id.append([first_pos+i, df_test.iloc[first_pos+i,:]['SK_ID_CURR']])
        tab_rep[i] = prediction[0]
        print(f'{i:3} / {max_val-1:3} {prediction}        ', end='\r')
print('\n')

In [None]:
# Display the values collected from the model API responses
tab_rep

In [None]:
# Display the [row position, SK_ID_CURR] pairs of the rows that were scored
tab_id

### Other tests

In [None]:

# Remove from the dataset every feature listed with a zero mean importance
zero_importance = df_feat['mean importance'] == 0
col_names = df_feat.loc[zero_importance, 'col name'].values
df.drop(columns=col_names, inplace=True)

# Split on the TARGET marker: 999 identifies the test rows
is_test = df['TARGET'] == 999
df_train = df[~is_test]
df_test = df[is_test]
# Target column and feature columns used by the classifiers below
c_class = 'TARGET'
c_features = [col for col in df.columns
              if col not in ('index', 'TARGET', 'SK_ID_CURR')]

# Release the full frames now that the split is done
del df, df_feat
gc.collect()

#### Fix imbalanced data with Prototype selection (under sample of positive class included in original sample)

In [None]:
# Class distribution of the training set before resampling
counter1 = Counter(df_train[c_class])
print(counter1)

In [None]:
# Resample the training set with a seeded RandomUnderSampler, then print
# the class distribution of the resampled target
sampler = imblearn.under_sampling.RandomUnderSampler(random_state=0)
X, y = sampler.fit_resample(df_train[c_features], df_train[c_class])

counter2 = Counter(y)
print(counter2)

In [None]:
# Side-by-side scatter plots of two arbitrary features (positions 8 and 9)
# before and after under-sampling, one colour per class.
fig, axes = plt.subplots(nrows=1, ncols=2,
                        sharex=False, sharey=False,
                        figsize=(16,5))
fig.subplots_adjust(hspace=0.5)

# Left panel: original training data.
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally, so the positional form raises a TypeError there.
for label, _ in counter1.items():
    row_ix = np.where(df_train[c_class].values == label)[0]
    sns.scatterplot(x=df_train[c_features].iloc[row_ix, 8],
                    y=df_train[c_features].iloc[row_ix, 9],
                    label=str(label),
                    ax=axes[0]
                    )
axes[0].set_title('Imbalanced data')

# Right panel: resampled data
for label, _ in counter2.items():
    row_ix = np.where(y.values == label)[0]
    sns.scatterplot(x=X.iloc[row_ix, 8],
                    y=X.iloc[row_ix, 9],
                    label=str(label),
                    ax=axes[1]
                    )
axes[1].set_title('Random Under Sample')
print('\n\tArbitrary selection of 2 variables to see effect of under sampling ...')
plt.show()

### 3.2 Search for classification method & hyperparameters. WE VERIFY THAT WE SAVE 30% OF PROCESSING TIME WITH THE REDUCED SET OF FEATURES

#### LinearSVC, XGBClassifier, GradientBoostingClassifier best scores

In [None]:
# Candidate classifiers: [display name, estimator, hyperparameter grid]
iname, itype, iparam = 0, 1, 2
# NOTE(review): per the LinearSVC documentation, loss='hinge' is not
# supported together with dual=False, so those candidates presumably fail
# and are scored NaN; the grid is left as is so that timings stay
# comparable with the recorded baseline.
models = [
    ['LinearSVC ', svm.LinearSVC(),
     {
         'C': np.logspace(-4, 4, 9),
         'penalty': ['l1', 'l2'],
         'loss': ['hinge', 'squared_hinge'],
         'dual': [False],
     }],
    ['XGBClassifier', XGBClassifier(),
     {
         'max_depth': [3, 5],
         'min_child_weight': [1, 5, 10],
         'gamma': [0.5, 1, 1.5, 2, 5],
         'subsample': [0.6, 0.8, 1.0],
         'colsample_bytree': [0.6, 0.8, 1.0],
         'verbosity': [0],
     }],
    ['GradBoostC', ensemble.GradientBoostingClassifier(),
     {
         'n_estimators': [200],
         'max_depth': [3, 5],
         'criterion': ['friedman_mse', 'squared_error'],
         'min_samples_split': [2, 3, 4],
         'min_weight_fraction_leaf': [0.0, 0.2, 0.4],
     }],
]

# 5-fold grid search on ROC AUC for each candidate, timed individually
for name, estimator, param_grid in models:
    mdl = GridSearchCV(estimator, param_grid, cv=5, scoring='roc_auc')
    datedeb = datetime.now()
    mdl.fit(X, y)
    duree = datetime.now() - datedeb
    print(f'{name} \tduree: {duree.seconds}s \tbest_score: {mdl.best_score_:4.3} \tbest_params: {mdl.best_params_}')


> For a 1174 '1 and 0 balanced classes' sample WITH ALL ORIGINAL FEATURES:  
  
>> LinearSVC  	duree: 112s 	best_score: 0.696 	best_params: {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}  
>>  
>>  XGBClassifier 	duree: 1663s 	best_score: 0.723 	best_params: {'colsample_bytree': 1.0, 'gamma': 1.5, 'max_depth': 5, 'min_child_weight': 10, 'subsample': 1.0, 'verbosity': 0}  
>>  
>>  GradBoostC 	duree: 1631s 	best_score: 0.732 	best_params: {'criterion': 'friedman_mse', 'max_depth': 3, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.2, 'n_estimators': 200}

In [None]:
# Drop the train/test frames (the cells below work on the resampled X, y)
# and run the garbage collector
del df_train, df_test
gc.collect()

### 3.3 Kfold Roc Curve and Feature Importances, WE VERIFY THAT WE HAVE THE SAME RESULTS AS WITH ALL ORIGINAL FEATURES

In [None]:
# Stratified K-fold evaluation of a GradientBoostingClassifier: collects the
# out-of-fold targets and probabilities, the AUC of each fold and the
# feature importances of each fitted model.
n_splits = 8
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

tot_valid_y = np.zeros(y.shape[0])
tot_valid_prob = np.zeros(y.shape[0])
tot_score = []
tot_feature_importances = []

# Hyperparameters shared by the model fitted on every fold
gbc_params = {
    'n_estimators': 200,
    'criterion': 'friedman_mse',
    'max_depth': 3,
    'min_samples_split': 2,
    'min_weight_fraction_leaf': 0.2,
}

for train_idx, valid_idx in skf.split(X, y):
    # Fit on the training part of the fold
    gbc = ensemble.GradientBoostingClassifier(**gbc_params)
    gbc.fit(X.iloc[train_idx], y.iloc[train_idx])

    # Store targets and class-1 probabilities at their original positions
    valid_y = y.iloc[valid_idx]
    valid_prob = gbc.predict_proba(X.iloc[valid_idx])[:, 1]
    tot_valid_y[valid_idx] = valid_y
    tot_valid_prob[valid_idx] = valid_prob

    tot_score.append(metrics.roc_auc_score(valid_y.values, valid_prob))
    tot_feature_importances.append(gbc.feature_importances_)

# Per-fold AUC rounded to 3 decimals, then averaged
tot_score = [round(1000*s)/1000 for s in tot_score]
mean_score = sum(tot_score)/len(tot_score)
print(f'tot_score   = {tot_score}')
print(f'mean scores = {mean_score:5.3}')

In [None]:

# ROC curve computed on the pooled out-of-fold predictions.
# fpr, tpr and thr are reused by the threshold tables below.
fig, axe = plt.subplots(figsize=(10,7))
[fpr, tpr, thr] = metrics.roc_curve(tot_valid_y,
                                    tot_valid_prob,
                                    pos_label=1)
axe.plot(fpr, tpr, color='orange', lw=2)
axe.set_title(f'Roc curve ({n_splits} splits) mean AUC = {mean_score:5.3}')
# The x axis is the false positive rate, i.e. 1 - specificity: the previous
# 'Specificity' label described the axis backwards.
axe.set_xlabel('False Positive Rate (1 - Specificity)')
axe.set_ylabel('Sensitivity')
axe.grid(visible=True, color='#eeeeee')

plt.show()


##### Threshold 1: MINIMIZE THE RISK: we want to minimize the rate of False Positives, i.e. minimize the percentage of loans attributed to wrong clients.

In [None]:
# Display thresholds for different False Positive Rate (FPR) tolerances
for max_fpr in np.arange(0.05, 0.65, 0.05):
    idx = np.max(np.where(fpr<max_fpr))
    str1 = f'False Pos Rate max {max_fpr:.2f} '
    str2 = f'Sensitivity: {tpr[idx]:.2f},  Specificity: {1-fpr[idx]:.2f} Threshold: {thr[idx]:.2f}'
    print(str1+str2)


##### Threshold 2: OPTIMIZE THE TURNOVER: we want to minimize the rate of False Negatives, i.e. minimize the percentage of loans refused to good clients.

In [None]:
# Display thresholds for different False Negative Rate (FNR) tolerances
# FNR = 1 - TPR
# For each minimum sensitivity (TPR), report the first ROC point whose TPR
# exceeds it; the tolerated False Negative Rate is 1 - TPR
for min_tpr in np.arange(0.40, 1, 0.05):
    idx = np.flatnonzero(tpr > min_tpr).min()
    print(f'False Neg Rate max {1-min_tpr:.2f} '
          f'Sensitivity: {tpr[idx]:.2f},  Specificity: {1-fpr[idx]:.2f} Threshold: {thr[idx]:.2f}')

#### Shape features and importances to find features with main role in classification

In [None]:
# Mean and standard deviation of each feature importance across the folds
# (one row per fold, one column per feature)
df_importances = pd.DataFrame(tot_feature_importances)
importance_mean = df_importances.mean().to_list()
importance_std = df_importances.std().to_list()
# Build the table column by column so the numeric columns stay float64.
# The original stacked names and floats in one numpy array, which turned
# every value into a string that then had to be parsed back.
df_features = pd.DataFrame({
    'col name': list(X.columns),
    'mean def': importance_mean,
    'std def': importance_std,
})
df_features['mean def'] = df_features['mean def'].astype('float64')
df_features['std def'] = df_features['std def'].astype('float64')
df_features.describe()

In [None]:

# Horizontal bar chart of the (at most) 80 largest mean importances, with
# the per-fold standard deviation as error bars
df_draw = df_features.sort_values('mean def').iloc[-80:, :]
n_bars = df_draw.shape[0]
fig, axes = plt.subplots(figsize=(14, int(n_bars//3.5)))
axes.barh(np.arange(n_bars),
          df_draw['mean def'].values,
          xerr=df_draw['std def'].values,
          color='#33aa33',
          tick_label=df_draw['col name'].values)
axes.set_title(f'Features Importance KFOLD - {n_bars} first features')
axes.grid(visible=True)
plt.show()


### 3.4 Save Credit Home Junction File with reduced set of features 

In [None]:
# Switch to the complete dataset and rebuild the file names that embed the
# row count (NaN filled with zeros): full input file and reduced output file
NUMROWS = 1000000    # 1000000 to get complete dataset
FILESTD_FNAN0 = f'{DIRDATASET}Credit_Home_Junction_Std_Fnan0_{NUMROWS}.csv'
FILESTD_FNAN0_REDUCED = f'{DIRDATASET}Credit_Home_Junction_Std_Fnan0_Reduced_{NUMROWS}.csv'


In [None]:
# Load the complete dataset and the feature table
df = pd.read_csv(FILESTD_FNAN0, sep='\t', encoding='Latin-1')
df_feat = pd.read_csv(FILEFEAT_IN, sep='\t')

# Drop every feature listed with a zero mean importance, then summarise
zero_importance = df_feat['mean importance'] == 0
col_names = df_feat.loc[zero_importance, 'col name'].values
df.drop(columns=col_names, inplace=True)
df.info()

In [None]:
# Write the reduced dataset as a tab-separated file, without the index column
df.to_csv(FILESTD_FNAN0_REDUCED, sep='\t', index=False)