In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore', UserWarning)

import gc
gc.enable()

import os
import mlflow
import mlflow.sklearn
import mlflow.lightgbm
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC
import shap
import pickle
import requests
import json
from flask import Flask



In [2]:
mlflow.set_tracking_uri('http://localhost:5000')

In [3]:
mlflow.get_tracking_uri()

'http://localhost:5000'

In [None]:
mlflow.set_experiment('home-credit-default-risk')

In [None]:
print(os.listdir('Projet+Mise+en+prod+-+home-credit-default-risk'))

# lecture data

In [None]:

df_train = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/application_train.csv")

df_test = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/application_test.csv")


df_bureau = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/bureau.csv")
df_previous_application = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/previous_application.csv")

df_coldesc = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/HomeCredit_columns_description.csv",encoding='unicode_escape')


In [None]:
print(f" The shape of the Train data = {df_train.shape}")
print(f" The shape of the bureau data = {df_bureau.shape}")
print(f" The shape of the previous_application data = {df_previous_application.shape}")

In [None]:
df_train["SK_ID_CURR"].nunique()

In [None]:
print(f"uniques dans Train data = {df_train.SK_ID_CURR.nunique()}")
print(f"uniques dans bureau data = {df_bureau.SK_ID_CURR.nunique()}")
print(f"uniques dans previous_application data = {df_previous_application.SK_ID_CURR.nunique()}")

In [None]:
df_coldesc

In [None]:
df_coldesc.loc[(df_coldesc["Table"] == 'application_{train|test}.csv')]

In [None]:
df_train.head()

In [None]:
df_bureau.head()

In [None]:
df_previous_application.head()

In [None]:
df_train.columns.values

In [None]:
df_previous_application.columns.values

In [None]:
df_train[df_train["SK_ID_CURR"] == 271877]

In [None]:
df_bureau[df_bureau["SK_ID_CURR"] == 271877]

In [None]:
df_previous_application[df_previous_application["SK_ID_CURR"] == 271877]

# nettoyage data

In [None]:
# Summarise, per column, how many values are missing and what share of the rows that is.
def missing_values_table(df):
    """Build a per-column missing-value summary for ``df``.

    Prints the number of columns in ``df`` and how many of them appear in the
    summary, then returns a DataFrame indexed by column name with two columns:
    'Missing Values' (count of nulls) and '% of Total Values' (share of rows,
    rounded to one decimal). Only columns whose percentage is non-zero are
    kept, sorted by percentage in descending order.
    """
    null_counts = df.isnull().sum()
    null_share = 100 * df.isnull().sum() / len(df)

    # Side-by-side table: position 0 holds the counts, position 1 the percentages.
    summary = pd.concat([null_counts, null_share], axis=1, ignore_index=True)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})

    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    n_columns = str(df.shape[1])
    n_incomplete = str(summary.shape[0])
    print("The dataframe has " + n_columns + " columns.\n"
          "There are " + n_incomplete + " columns that have missing values.")

    return summary

In [None]:
mis_val_table = missing_values_table(df_train)
mis_val_table.tail(20)

In [None]:
sns.set_theme(style="whitegrid")


f, ax = plt.subplots(figsize=(8,20))
sns.set_color_codes("pastel")

sns.barplot(y=mis_val_table.index.values, x=mis_val_table['% of Total Values'].values, label="Percentage", color="b")

ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(0, 100), ylabel="Column name",
       xlabel="Percentage of null Values in each Column")
sns.despine(left=True, bottom=True)

## Missing Data

In [None]:
import missingno as msno

msno.matrix(df_train.iloc[0:100, 40:94], sparkline=True, figsize=(20,10), sort='ascending', fontsize=12, labels=True, color=(0.25, 0.45, 0.6))

In [None]:
df_train['incomplete'] = 1
df_train.loc[df_train.isnull().sum(axis=1) < 35, 'incomplete'] = 0

mean_c = np.mean(df_train.loc[df_train['incomplete'] == 0, 'TARGET'].values)
mean_i = np.mean(df_train.loc[df_train['incomplete'] == 1, 'TARGET'].values)
print('default ratio for more complete: {:.1%} \ndefault ratio for less complete: {:.1%}'.format(mean_c, mean_i))

In [None]:
from scipy.stats import chi2_contingency

props = pd.crosstab(df_train.incomplete, df_train.TARGET)
c = chi2_contingency(props, lambda_="log-likelihood")
print(props, "\n p-value= ", c[1])

## **Split to Numerical and Categorical Features**

In [None]:
def feature_type_split(data):
    """Partition the column names of ``data`` into three lists.

    Returns ``(cat_list, dis_num_list, num_list)``: columns whose dtype is
    ``object``, non-object columns with fewer than 25 distinct values, and
    every remaining column. Each list keeps the original column order.
    """
    cat_list, dis_num_list, num_list = [], [], []
    for name in data.columns.tolist():
        column = data[name]
        if column.dtype == 'object':
            bucket = cat_list
        elif column.nunique() < 25:
            bucket = dis_num_list
        else:
            bucket = num_list
        bucket.append(name)
    return cat_list, dis_num_list, num_list

cat_list, dis_num_list, num_list = feature_type_split(df_train) 

# num_list= features numeriques continues

# cat_list= features categoriques

# dis_num_list=features numeriques discretes

In [None]:
df_c_numeric = df_train[num_list]
df_categorical = df_train[cat_list]
df_d_numeric = df_train[dis_num_list]

print("num_list",len(num_list))
print("cat_list",len(cat_list))
print("dis_num_list",len(dis_num_list))



# df_c_numeric= df features numeriques continues

In [None]:
df_c_numeric.head()

# df_categorical= df features categoriques

In [None]:
df_categorical.head()

# df_d_numeric= df features numeriques discretes

In [None]:
df_d_numeric.head()

# valeurs manquantes imputees par median (continues) ou most frequent (discretes ou categorielles)

In [None]:
from sklearn.impute import SimpleImputer
import time

# Categorical features: fill gaps with the most frequent category.
df_train[cat_list] = SimpleImputer(strategy='most_frequent').fit_transform(df_train[cat_list])

# Discrete numeric features: fill gaps with the most frequent value.
df_train[dis_num_list] = SimpleImputer(strategy='most_frequent').fit_transform(df_train[dis_num_list])

# Continuous numeric features: fill gaps with the median.
df_train[num_list] = SimpleImputer(strategy='median').fit_transform(df_train[num_list])

# The per-type frames were sliced out of df_train before imputation and are
# independent copies, so they still contain the missing values. Re-slice them
# here so that the outlier capping, the statistical tests and the one-hot
# encoding further down all work on the imputed data.
df_c_numeric = df_train[num_list]
df_categorical = df_train[cat_list]
df_d_numeric = df_train[dis_num_list]


In [None]:
df_train

In [None]:
df_train.dropna()

#  Outliers pour features numeriques continues

In [None]:
sns.set_theme(style="whitegrid")
gs = gridspec.GridSpec(20, 3)

# A bare figure is enough: every panel is created from the GridSpec below, so
# no extra full-size axes is left underneath the grid.
f = plt.figure(figsize=(18, 85))

# Slice the continuous features once instead of re-copying them per subplot.
continuous = df_train[num_list]
columns = continuous.columns.values[:60]

# Iterate over the columns that actually exist (at most 60 = 20 x 3 panels) so
# the grid never indexes past the last continuous feature.
for c, col in enumerate(columns):
    ax = plt.subplot(gs[c // 3, c % 3])
    sns.boxplot(data=continuous.iloc[:, c], orient="h")
    ax.set_title(col)


In [None]:
import warnings
warnings.filterwarnings('ignore')



sns.set_theme(style="whitegrid")
gs = gridspec.GridSpec(20, 3)

# A bare figure is enough: every panel is created from the GridSpec below, so
# no extra full-size axes is left underneath the grid.
f = plt.figure(figsize=(18, 120))

# Slice the continuous features once instead of re-copying them per subplot.
continuous = df_train[num_list]
columns = continuous.columns.values[:60]

# Iterate over the columns that actually exist (at most 60 = 20 x 3 panels) so
# the grid never indexes past the last continuous feature.
for c, col in enumerate(columns):
    ax = plt.subplot(gs[c // 3, c % 3])
    sns.distplot(continuous.iloc[:, c])

In [None]:
# Worked example on 'DAYS_REGISTRATION': handling outliers with the interquartile range (IQR)
df = df_train[num_list]

# Quartiles and interquartile range
percentile25 = df['DAYS_REGISTRATION'].quantile(0.25)
percentile75 = df['DAYS_REGISTRATION'].quantile(0.75)

IQR = percentile75 - percentile25

# Upper and lower fences (1.5 * IQR beyond the quartiles)
upper_limit = percentile75 + 1.5 * IQR
lower_limit = percentile25 - 1.5 * IQR

# Outliers: rows beyond either fence. These two expressions are evaluated but
# their results are neither stored nor displayed (they are not the cell's last line).

df[df['DAYS_REGISTRATION'] > upper_limit]
df[df['DAYS_REGISTRATION'] < lower_limit]

# Trimming: keep only the rows strictly inside the fences

new_df = df.loc[(df['DAYS_REGISTRATION'] < upper_limit) & (df['DAYS_REGISTRATION'] > lower_limit)]
new_df.shape

# Capping: values beyond a fence are replaced by that fence, all rows are kept

new_df_cap = df_train[num_list].copy()

new_df_cap["DAYS_REGISTRATION"] = np.where(new_df_cap['DAYS_REGISTRATION'] < lower_limit,lower_limit
                                          ,np.where(new_df_cap['DAYS_REGISTRATION'] > upper_limit,upper_limit,
                                                   new_df_cap["DAYS_REGISTRATION"] )
                                          )

# Shape of the capped frame (the cell's displayed output)
new_df_cap.shape

#### Avant et après le traitement des outliers : exemples

In [None]:
plt.figure(figsize=(16,8))
plt.subplot(2,2,1)
sns.distplot(df_train[num_list]['DAYS_REGISTRATION'])
plt.subplot(2,2,2)
sns.boxplot(df_train[num_list]['DAYS_REGISTRATION'])
plt.subplot(2,2,3)
sns.distplot(new_df['DAYS_REGISTRATION'])
plt.subplot(2,2,4)
sns.boxplot(new_df['DAYS_REGISTRATION'])
plt.show()

# remove outliers pour features numeriques continues

In [None]:
# Remove outliers
def remov_outliers(df, feature):
    """Return a copy of ``df`` with ``feature`` capped to its IQR fences.

    Despite the name, no rows are removed: values of ``feature`` below
    ``Q1 - 1.5 * IQR`` are replaced by that lower fence and values above
    ``Q3 + 1.5 * IQR`` by that upper fence. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to cap.
    feature : hashable
        Label of the column to cap. It is used as given, so non-string
        labels work too.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which only ``feature`` has been changed.
    """
    values = df[feature]

    # Tukey fences derived from the interquartile range.
    percentile25 = values.quantile(0.25)
    percentile75 = values.quantile(0.75)
    iqr = percentile75 - percentile25
    upper_limit = percentile75 + 1.5 * iqr
    lower_limit = percentile25 - 1.5 * iqr

    # Only the capped copy is returned, so no outlier subsets or trimmed frame
    # are materialised along the way.
    new_df_cap = df.copy()
    new_df_cap[feature] = np.where(
        values < lower_limit,
        lower_limit,
        np.where(values > upper_limit, upper_limit, values),
    )
    return new_df_cap

In [None]:
# Remove outliers for all features
df_imputed_new = df_c_numeric.copy()
# df_imputed_new = df_imputed_new.drop(columns=['SK_ID_CURR', 'TARGET'],axis = 1)

columns = df_imputed_new.columns.values

for i in columns:
    df_imputed_new = remov_outliers(df_imputed_new, i)




# visualize the feature distribution after capping outliers

In [None]:
df_imputed_new

In [None]:
df_c_numeric.shape

In [None]:
#visualize the feature distribution after capping outliers
import warnings
warnings.filterwarnings('ignore')



sns.set_theme(style="whitegrid")
gs = gridspec.GridSpec(20, 3)

# A bare figure is enough: every panel is created from the GridSpec below, so
# no extra full-size axes is left underneath the grid.
f = plt.figure(figsize=(18, 120))

columns = df_c_numeric.columns.values[:60]

# Iterate over the columns that actually exist (at most 60 = 20 x 3 panels) so
# the grid never indexes past the last continuous feature.
for c, col in enumerate(columns):
    ax = plt.subplot(gs[c // 3, c % 3])
    sns.distplot(df_imputed_new.iloc[:, c], kde=False)

In [None]:
sns.distplot(df_imputed_new["DAYS_LAST_PHONE_CHANGE"])
plt.show()

# df_imputed_new= df numeriques continues et discretes

In [None]:
#Concat again the Dis Numerical Features
df_imputed_new = pd.concat([df_imputed_new,df_d_numeric ],axis = 1)

# Data Transformations: Robust scaler applied to continuous numerical features


In [None]:
len(num_list)

In [None]:
from sklearn.preprocessing import RobustScaler

transformer = RobustScaler().fit(df_imputed_new[num_list])
df_imputed_new[num_list] = transformer.transform(df_imputed_new[num_list])

# **statistical analysis for numerical features**

In [None]:
df_imputed_new.head()

In [None]:
import scipy.stats


In [None]:
plt.figure(figsize=(14,70))
y_corr = df_imputed_new.corr().loc["TARGET"].sort_values(ascending=False).drop("TARGET",axis = 0).dropna()

ax = sns.barplot(y=y_corr.index.values, x=y_corr.values,
            label="Correlation", color="r")


# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(-0.3, 0.3), ylabel="Column name",
       xlabel="Correlation")
sns.despine(left=True, bottom=True)

**['OWN_CAR_AGE', 'DAYS_BIRTH', 'DAYS_LAST_PHONE_CHANGE', 'DAYS_ID_PUBLISH', 'FLAG_DOCUMENT_3', 'DAYS_REGISTRATION', 'NONLIVINGAPARTMENTS_MODE'] & ['REGION_POPULATION_RELATIVE', 'AMT_GOODS_PRICE', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'EXT_SOURCE_1']**

In [None]:
y_corr = df_imputed_new.corr().loc["TARGET"].sort_values(ascending=False).drop("TARGET",axis = 0).dropna()


In [None]:
lst = list(df_imputed_new.columns.values[:80])

### Testing the assumptions

In [None]:
# import the data
from scipy import stats

def test(df):
    columns = df.columns.values

    test_df = pd.DataFrame(columns, columns= ['Column name'])
    test_df["normality"] = 0
    test_df["homogeneity"] = 0

    Repay = df[df['TARGET'] == 0]
    No_Repay = df[df['TARGET'] == 1]

    for col in columns:

        # homogeneity
        s, p_value = stats.levene(Repay[str(col)].values.flatten(), No_Repay[str(col)].values.flatten())
        if p_value < 0.05:
            test_df.loc[test_df["Column name"] == str(col), "homogeneity"] = 1


        # Shapiro-Wilk test normalite
        s, p_value = stats.shapiro(Repay[str(col)].values.flatten())
        s2, p_value2 = stats.shapiro(No_Repay[str(col)].values.flatten())

        if p_value < 0.05 and p_value2 <0.05:
            test_df.loc[test_df["Column name"] == str(col), "normality"] = 1
        
    return test_df



In [None]:
df_imputed_new.shape

In [None]:
test_df = test(df_imputed_new)

In [None]:
test_df

In [None]:
#Features satisfaisant l'hypothese
acc_features = test_df.loc[(test_df["normality"] == 1) &(test_df["homogeneity"] == 1)]

In [None]:
acc_features

In [None]:
test_df.loc[(test_df["normality"] == 0) | (test_df["homogeneity"] == 0)]

## **T-test**

In [None]:
import scipy
def t_test(df,test_df):
    """Independent two-sample t-test of each flagged column between target classes.

    Only the columns whose row in ``test_df`` has both ``normality == 1`` and
    ``homogeneity == 1`` are tested. For each of them the values with
    ``TARGET == 0`` are compared with the values with ``TARGET == 1``.

    Returns ``(accept_null_hypo, reject_null_hypo)``: the column names whose
    p-value is not below 0.05, and those whose p-value is below 0.05.
    """
    flagged = (test_df["normality"] == 1) & (test_df["homogeneity"] == 1)
    columns = test_df.loc[flagged]["Column name"].values

    Repay = df[(df['TARGET'] == 0)]
    No_Repay = df[(df['TARGET'] == 1)]

    accept_null_hypo, reject_null_hypo = [], []
    for col in columns:
        _, p_value = stats.ttest_ind(Repay[str(col)], No_Repay[str(col)])
        destination = reject_null_hypo if p_value < 0.05 else accept_null_hypo
        destination.append(col)

    return accept_null_hypo, reject_null_hypo

In [None]:
accept_null_hypo, reject_null_hypo = t_test(df_imputed_new,test_df)

In [None]:
accept_null_hypo

In [None]:
reject_null_hypo

### on prends les 50 features les plus correlees

In [None]:
X = df_imputed_new.drop(['SK_ID_CURR', 'TARGET'], axis=1)
y = df_imputed_new.TARGET
feature_name = X.columns.tolist()

In [None]:
def cor_selector(X, y, num_feats=50):
    """Keep the features with the largest absolute Pearson correlation with ``y``.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Target values, one per row of ``X``.
    num_feats : int, default 50
        Maximum number of features to keep.

    Returns
    -------
    cor_support : list of bool
        One entry per column of ``X`` (in column order), True when the column
        was selected.
    cor_feature : list
        Selected column names ordered by increasing absolute correlation.
    """
    # Use X's own columns rather than a notebook-level variable so the
    # function gives the right mask for whatever frame it is given.
    columns = X.columns.tolist()

    cor_list = []
    for col in columns:
        cor = np.corrcoef(X[col].values.flatten(), y)[0, 1]
        # A NaN correlation is treated as no correlation.
        cor_list.append(0 if np.isnan(cor) else cor)

    ranked = np.argsort(np.abs(cor_list))
    # ranked[-0:] would select everything, so zero is handled explicitly.
    top = ranked[-num_feats:] if num_feats > 0 else ranked[:0]
    cor_feature = X.iloc[:, top].columns.tolist()

    selected = set(cor_feature)
    cor_support = [col in selected for col in columns]
    return cor_support, cor_feature

In [None]:
cor_support, cor_feature =cor_selector(X, y)

In [None]:
# feature not matched between Correlation analysis and T-test
not_matched = [i for i in cor_feature if  i not in reject_null_hypo ]
not_matched

In [None]:
# feature  matched between Correlation analysis and T-test
matched = [i for i in cor_feature if  i  in reject_null_hypo ]
matched

In [None]:
# Generate the Dataframe of selected features

df_numerical = df_imputed_new[matched]

# df_numerical= df des features de df_imputed_new corellees

In [None]:
df_numerical

## Visualize The Relationship between selected features and the target 

In [None]:
# Plots the disribution of a variable colored by value of the target
def kde_target(var_name, df):
    """Plot the density of ``var_name`` for each TARGET class and print summary figures.

    Draws one KDE curve for the rows with ``TARGET == 0`` and one for the rows
    with ``TARGET == 1`` on a new 12x6 figure, then prints the correlation
    between ``var_name`` and ``TARGET`` and the median of ``var_name`` in
    each class.
    """
    corr = df['TARGET'].corr(df[var_name])

    # Values of the variable for each class (0 = repaid, 1 = not repaid).
    repaid_values = df.loc[(df['TARGET'] == 0)][var_name]
    not_repaid_values = df.loc[(df['TARGET'] == 1)][var_name]

    avg_repaid = repaid_values.median()
    avg_not_repaid = not_repaid_values.median()

    plt.figure(figsize = (12, 6))

    sns.kdeplot(repaid_values, label = 'TARGET == 0')
    sns.kdeplot(not_repaid_values, label = 'TARGET == 1')

    plt.xlabel(var_name)
    plt.ylabel('Density')
    plt.title('%s Distribution' % var_name)
    plt.legend()

    print('The correlation between %s and the TARGET is %0.4f' % (var_name, corr))
    print('Median value for loan that was not repaid = %0.4f' % avg_not_repaid)
    print('Median value for loan that was repaid =     %0.4f' % avg_repaid)

In [None]:
import warnings
warnings.filterwarnings('ignore')

sns.set_theme(style="whitegrid")
n_rows, n_cols = 12, 4
gs = gridspec.GridSpec(n_rows, n_cols)

# A bare figure is enough: every panel is created from the GridSpec below.
f = plt.figure(figsize=(18, 120))

columns = df_numerical.columns.values

# Split by class once instead of re-filtering the whole frame for every panel.
repaid = df_imputed_new.loc[df_imputed_new['TARGET'] == 0]
not_repaid = df_imputed_new.loc[df_imputed_new['TARGET'] == 1]

# One panel per selected feature, filled row by row. Iterating over the
# columns themselves stops after the last feature (the former `break` only
# left the inner loop, so the last feature was drawn again in every remaining
# row) and copes with an empty selection. At most n_rows * n_cols features fit.
for c, col in enumerate(columns[:n_rows * n_cols]):
    ax = plt.subplot(gs[c // n_cols, c % n_cols])

    # Density of the feature for TARGET == 0 and TARGET == 1
    sns.kdeplot(repaid[col], label = 'TARGET == 0')
    sns.kdeplot(not_repaid[col], label = 'TARGET == 1')

    plt.xlabel(col); plt.ylabel('Density'); plt.title('%s Distribution' % col)
    plt.legend();

# **Categorical variables**

### ***Chi-square test for categorical variables (the null hypothesis of independence from the target is rejected when p-value < .05)***



In [None]:
import scipy

columns =df_categorical.columns.values

for col in columns:
    chi_square_args = pd.crosstab(df_categorical[col], df_train['TARGET']).values

    _, p_value, _, _ = scipy.stats.chi2_contingency(chi_square_args)
    print(f"The P-value for {col} = " ,p_value)

# toutes les p-value sont inferieures à .05 

# get dummies

In [None]:

df_categorical_new = pd.get_dummies(df_categorical)

print('Features shape: ', df_categorical.shape)
print('new Features shape: ', df_categorical_new.shape)

In [None]:
df_categorical_new

In [None]:
id = df_imputed_new["SK_ID_CURR"]
id

# à present df_train = concat get dummies et df_numerical

In [None]:
df_train = pd.concat([df_categorical_new,df_numerical],axis = 1)

In [None]:
df_train.shape

# Reading test data

In [None]:
#Read Test dataframe
df_test = pd.read_csv('Projet+Mise+en+prod+-+home-credit-default-risk/application_test.csv')

In [None]:
df_test.head()

In [None]:
# Testing data features
df_test.head()
print('Testing data shape: ', df_test.shape)


In [None]:
# One-hot encode the raw test set (get_dummies returns a new frame, df_test is left untouched).
df_test_new = pd.get_dummies(df_test)
print('new Features shape before dropping : ', df_test_new.shape)

# not_matched is derived from the training frame and can name columns that do
# not exist in the test set (for instance the 'incomplete' flag added to
# df_train), so absent labels are skipped instead of raising KeyError.
df_test_new = df_test_new.drop(columns=not_matched, errors='ignore')
print('new Features shape: ', df_test_new.shape)

# NOTE(review): the training features were capped and RobustScaler-transformed
# (df_imputed_new), whereas this frame comes straight from the raw test CSV.
# Confirm whether the same transformations should be applied here.

In [None]:
df_test_new

In [None]:
x = [ i for i in df_train.columns.values if i not in df_test_new.columns.values]
x

In [None]:
x = [ i for i in  df_test_new.columns.values  if i not in df_train.columns.values]
x

In [None]:
train_labels = df_imputed_new['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
df_train, df_test_new = df_train.align(df_test_new, join = 'inner', axis = 1)

# Add the target back in

print('Training Features shape: ', df_train.shape)
print('Testing Features shape: ', df_test_new.shape)

In [None]:
df_train

In [None]:
df_test_new

In [None]:
train_labels

# Reading other files

In [None]:
# run functions and pre_settings
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every ``object``-dtype column of ``df``.

    ``nan_as_category`` is passed to ``pandas.get_dummies`` as ``dummy_na``.

    Returns ``(encoded_df, new_columns)`` where ``new_columns`` lists, in
    encoded-frame order, the column names that were not present in the input.
    """
    columns_before = list(df.columns)

    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)

    created = []
    for name in encoded.columns:
        if name not in columns_before:
            created.append(name)

    return encoded, created

In [None]:
def bureau_bb(data_dir='Projet+Mise+en+prod+-+home-credit-default-risk'):
    """Build per-applicant features from ``bureau.csv`` and ``bureau_balance.csv``.

    Steps:
    1. read both CSV files from ``data_dir``;
    2. add derived columns to the bureau table (durations, differences,
       ratios, days-past-due flags);
    3. one-hot encode the object columns of both tables (NaN gets its own
       category);
    4. aggregate bureau_balance per ``SK_ID_BUREAU`` and join it onto bureau;
    5. aggregate bureau per ``SK_ID_CURR`` (columns prefixed ``BURO_``), then
       again for active credits only (``ACTIVE_``) and closed credits only
       (``CLOSED_``), left-joined onto the first aggregate.

    Parameters
    ----------
    data_dir : str, default 'Projet+Mise+en+prod+-+home-credit-default-risk'
        Directory containing ``bureau.csv`` and ``bureau_balance.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per ``SK_ID_CURR`` found in bureau.csv (``SK_ID_CURR`` is the
        index). The final shape is also printed.

    Notes
    -----
    The ratio columns divide by raw amounts without guarding against zero, so
    the result may contain infinite values - callers should be prepared for
    that.
    """
    bureau = pd.read_csv(os.path.join(data_dir, 'bureau.csv'))
    bb = pd.read_csv(os.path.join(data_dir, 'bureau_balance.csv'))

    # Credit duration and credit/account end date difference
    # (CREDIT_DURATION is not listed in num_aggregations below)
    bureau['CREDIT_DURATION'] = -bureau['DAYS_CREDIT'] + bureau['DAYS_CREDIT_ENDDATE']
    bureau['ENDDATE_DIF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']

    # Credit to debt ratio and difference
    # (DEBT_PERCENTAGE and CREDIT_TO_ANNUITY_RATIO are not listed in num_aggregations below)
    bureau['DEBT_PERCENTAGE'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_CREDIT_SUM_DEBT']
    bureau['DEBT_CREDIT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
    bureau['CREDIT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_ANNUITY']
    bureau['BUREAU_CREDIT_FACT_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_ENDDATE_FACT']
    bureau['BUREAU_CREDIT_ENDDATE_DIFF'] = bureau['DAYS_CREDIT'] - bureau['DAYS_CREDIT_ENDDATE']
    bureau['BUREAU_CREDIT_DEBT_RATIO'] = bureau['AMT_CREDIT_SUM_DEBT'] / bureau['AMT_CREDIT_SUM']

    # Days-past-due flags from CREDIT_DAY_OVERDUE: any overdue, and more than
    # 120 days overdue. The vectorised comparison yields 0 for NaN, like the
    # row-wise test it replaces.
    bureau['BUREAU_IS_DPD'] = (bureau['CREDIT_DAY_OVERDUE'] > 0).astype('int64')
    bureau['BUREAU_IS_DPD_OVER120'] = (bureau['CREDIT_DAY_OVERDUE'] > 120).astype('int64')

    nan_as_category = True

    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size', 'mean']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']

    # One row per bureau credit; column names become "<column>_<AGG>"
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean', 'min'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean', 'max'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean', 'max', 'sum'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean', 'sum'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
        'SK_ID_BUREAU': ['count'],
        'DAYS_ENDDATE_FACT': ['min', 'max', 'mean'],
        'ENDDATE_DIF': ['min', 'max', 'mean'],
        'BUREAU_CREDIT_FACT_DIFF': ['min', 'max', 'mean'],
        'BUREAU_CREDIT_ENDDATE_DIFF': ['min', 'max', 'mean'],
        'BUREAU_CREDIT_DEBT_RATIO': ['min', 'max', 'mean'],
        'DEBT_CREDIT_DIFF': ['min', 'max', 'mean'],
        'BUREAU_IS_DPD': ['mean', 'sum'],
        'BUREAU_IS_DPD_OVER120': ['mean', 'sum']
        }

    # Bureau and bureau_balance categorical features: mean of each dummy column
    cat_aggregations = {}
    for cat in bureau_cat: cat_aggregations[cat] = ['mean']
    for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])

    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index(['ACTIVE_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')

    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index(['CLOSED_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')

    print('"Bureau/Bureau Balance" final shape:', bureau_agg.shape)
    return bureau_agg


## View new Features

In [None]:

New_df = bureau_bb()

In [None]:
New_df.head()

## Analysis of the New Features and Whether They Are More Correlated with the Target Variable

In [None]:
df_trainx = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/application_train.csv")

# New_df already holds the output of bureau_bb(); reuse it instead of reading
# and aggregating the bureau CSV files a second time.
df_train_new = df_trainx[["SK_ID_CURR","TARGET"]].merge(New_df, how='left', on='SK_ID_CURR')


In [None]:
df_train_new.shape

### **Visualize The Correlation between the new Features and the target**

In [None]:
plt.figure(figsize=(14,80))
# sns.set_color_codes("dark")
y_corr = df_train_new.corr().loc["TARGET"].sort_values(ascending=False).drop("TARGET",axis = 0).dropna()

ax = sns.barplot(y=y_corr.index.values, x=y_corr.values,
            label="Correlation", color="r")


# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(xlim=(-0.3, 0.3), ylabel="Column name",
       xlabel="Correlation")
sns.despine(left=True, bottom=True)

In [None]:
df_train_new

In [None]:
df_train_new.isnull().sum()

In [None]:
df_train_new = df_train_new.fillna(df_train_new.median())

In [None]:
test_df2 = test(df_train_new)

In [None]:
test_df2

In [None]:
test_df2.loc[(test_df2["normality"] == 1) &(test_df2["homogeneity"] == 1)]

## Make t-test for the new Variables to see if they were independent from the TARGET

In [None]:
acc, rej  = t_test(df_train_new,test_df2)

In [None]:
rej

## Merge The New Data and Features with the main Dataframes

In [None]:
# Re-attach the applicant id so the bureau aggregates can be merged on it.
# NOTE(review): SK_ID_CURR stays in the merged frames and therefore ends up
# among the columns given to the model - confirm this is intended.
df_test_new = pd.concat([df_test_new,df_test["SK_ID_CURR"]],axis = 1)
df_train_new = pd.concat([df_train,df_trainx["SK_ID_CURR"]],axis = 1)

# New_df already holds the output of bureau_bb(); reuse it for both merges
# instead of reading and aggregating the bureau CSV files two more times.
df_train_2 = df_train_new.merge(New_df, how='left', on='SK_ID_CURR')
df_test_2 = df_test_new.merge(New_df, how='left', on='SK_ID_CURR')

print('--=> df after merge with bureau:', df_train.shape)
print('--=> df_train_2 after merge with bureau:', df_train_2.shape)

print('--=> df after merge with bureau:', df_test.shape)
print('--=> df_test_2 after merge with bureau:', df_test_2.shape)

In [None]:
df_train_2

In [None]:
df_train_2.dtypes.value_counts()

In [None]:
df_train_3= df_train_2.astype(float)


In [None]:
df_train_3.replace([np.inf, -np.inf], 0, inplace=True)

df_train_3

In [None]:
train_3 = SimpleImputer(strategy='median').fit_transform(df_train_3)

In [None]:
train_3

In [None]:
df_train_3=pd.DataFrame(train_3)
df_train_3.columns=df_train_2.columns
df_train_3

In [None]:
df_test_2

### light GBM

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

In [None]:
mlflow.lightgbm.autolog()  # Enable auto logging.


In [None]:
import re

# Keep only letters, digits and underscores in the column names
# (presumably because LightGBM does not accept other characters in feature names - TODO confirm).
df_train = df_train_3.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df_test = df_test_2.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

folds = KFold(n_splits=5, shuffle=True, random_state=2020)

# Allocate the prediction buffers once, before the loop. Re-creating them
# inside the loop wiped the previous folds, so the out-of-fold vector and the
# averaged test prediction only reflected the last fold.
train_preds = np.zeros(df_train.shape[0])
test_preds = np.zeros(df_test.shape[0])

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train, train_labels)):
    train_x, train_y = df_train.iloc[train_idx], train_labels.iloc[train_idx]
    valid_x, valid_y = df_train.iloc[valid_idx], train_labels.iloc[valid_idx]

    clf = LGBMClassifier(nthread=-1,
                         n_estimators=5000,
                         learning_rate=0.01,
                         max_depth=11,
                         num_leaves=58,
                         colsample_bytree=0.613,
                         subsample=0.708,
                         max_bin=407,
                         reg_alpha=3.564,
                         reg_lambda=4.930,
                         min_child_weight=6,
                         min_child_samples=165,
                         silent=-1,
                         verbose=-1,)

    # One MLflow run per fold
    with mlflow.start_run():
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=500, early_stopping_rounds=500)

    # Out-of-fold predictions for this fold's validation rows; each fold
    # contributes 1/n_splits of the averaged test prediction.
    train_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    test_preds += clf.predict_proba(df_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, train_preds[valid_idx])))


# Every row has been in exactly one validation fold, so train_preds is now a
# complete out-of-fold prediction vector.
print('Full AUC score %.6f' % roc_auc_score(df_imputed_new['TARGET'], train_preds))


df_test['TARGET'] = test_preds


In [None]:
train_preds

In [None]:
df_test = pd.read_csv("Projet+Mise+en+prod+-+home-credit-default-risk/application_test.csv")

In [None]:
df_test['TARGET'] = test_preds
df_test[['SK_ID_CURR', 'TARGET']].to_csv('submission.csv', index=False)
df_test

In [None]:
df_train

In [None]:
mlflow.end_run()

In [None]:
train_x, train_y = df_train_3.iloc[train_idx], train_labels.iloc[train_idx]
valid_x, valid_y = df_train_3.iloc[valid_idx], train_labels.iloc[valid_idx]

In [None]:
train_x

In [None]:
train_labels

In [None]:
train_preds_2=np.rint (train_preds)
train_preds_2

In [None]:
# Keep the rounded predictions in the same encoding as TARGET (1 = loan not
# repaid). Swapping 0 and 1 here would make the F1 score below measure the
# opposite of the model's predictions, and would flip back on every re-run of
# the cell; recomputing from train_preds keeps the cell idempotent.
train_preds_2 = np.rint(train_preds)
  

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order.
f1_score(train_labels, train_preds_2)


#  Oversampling


In [None]:
sm=SMOTE(random_state=42)
X_resampled, y_resampled=sm.fit_resample(train_x, train_y)

In [None]:
X_resampled

In [None]:
y_resampled

In [None]:
# Keep only letters, digits and underscores in the column names
# (presumably because LightGBM does not accept other characters in feature names - TODO confirm).
df_train = X_resampled.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
df_test = df_test_2.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

folds = KFold(n_splits=5, shuffle=True, random_state=2020)

# NOTE(review): the folds are drawn from data that was already oversampled
# with SMOTE, so validation folds contain synthetic rows generated from
# training rows. The AUC printed below is likely optimistic - consider
# resampling only the training part of each fold.

# Allocate the prediction buffers once, before the loop. Re-creating them
# inside the loop wiped the previous folds, so the out-of-fold vector and the
# averaged test prediction only reflected the last fold.
train_preds = np.zeros(df_train.shape[0])
test_preds = np.zeros(df_test.shape[0])

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(df_train, y_resampled)):
    train_x, train_y = df_train.iloc[train_idx], y_resampled.iloc[train_idx]
    valid_x, valid_y = df_train.iloc[valid_idx], y_resampled.iloc[valid_idx]

    clf = LGBMClassifier(nthread=-1,
                         n_estimators=5000,
                         learning_rate=0.01,
                         max_depth=11,
                         num_leaves=58,
                         colsample_bytree=0.613,
                         subsample=0.708,
                         max_bin=407,
                         reg_alpha=3.564,
                         reg_lambda=4.930,
                         min_child_weight=6,
                         min_child_samples=165,
                         silent=-1,
                         verbose=-1,)

    # One MLflow run per fold
    with mlflow.start_run():
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric='auc', verbose=500, early_stopping_rounds=500)

    # Out-of-fold predictions for this fold's validation rows; each fold
    # contributes 1/n_splits of the averaged test prediction.
    train_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    test_preds += clf.predict_proba(df_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, train_preds[valid_idx])))


# Every row has been in exactly one validation fold, so train_preds is now a
# complete out-of-fold prediction vector.
print('Full AUC score %.6f' % roc_auc_score(y_resampled, train_preds))


df_test['TARGET'] = test_preds


In [None]:
shap_values = shap.TreeExplainer(clf).shap_values(valid_x)

In [None]:
shap.summary_plot(shap_values, valid_x)

In [None]:
# Persist the classifier fitted on the last fold; the context manager makes
# sure the file is flushed and closed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
mlflow.end_run()

In [None]:
X_resampled

In [None]:
X_resampled['FLAG_OWN_CAR_N'].value_counts().idxmax()

In [None]:
most_common=[]
for i in X_resampled.columns:
    most_common.append(X_resampled[i].value_counts().idxmax())
    

In [None]:
most_common_df=pd.DataFrame(most_common).T
most_common_df.columns=X_resampled.columns
most_common_df

In [None]:
train_preds

In [None]:
y_resampled

In [None]:
train_preds_2=np.rint (train_preds)
train_preds_2

In [None]:
# scikit-learn metrics take (y_true, y_pred) in that order.
f1_score(y_resampled, train_preds_2)