## Information About Dataset


The objective of this competition is to predict the probability that a customer will not pay back their credit card balance in the future, based on their monthly customer profile. The binary target variable is calculated by observing an 18-month performance window after the latest credit card statement; if the customer does not pay the amount due within 120 days of their latest statement date, it is considered a default event.

The dataset contains aggregated profile features for each customer at each statement date. Features are anonymized and normalized, and fall into the following general categories:

D_* = Delinquency variables
S_* = Spend variables
P_* = Payment variables
B_* = Balance variables
R_* = Risk variables
with the following features being categorical:

['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
Your task is to predict, for each customer_ID, the probability of a future payment default (target = 1).

#### Source link: https://www.kaggle.com/competitions/amex-default-prediction/data 

NOTE: Train_Data.csv and Train_Label.csv were used for this analysis.

## Import Libraries 

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os 

# Raise the pandas display limits so wide/long frames are shown in full
# (up to 500 columns and 1000 rows) instead of being truncated.
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 1000)

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below, so API removals only surface as hard errors after an upgrade.
import warnings 
warnings.filterwarnings('ignore')

In [2]:
#Read Data
df_inputs = pd.read_csv("C:/Users/yemre/Masaüstü/Data_Science_Projects/Default_Prediction/data/data.csv")
df_output = pd.read_csv("C:/Users/yemre/Masaüstü/Data_Science_Projects/Default_Prediction/data/labels.csv")

# Attach the target to the feature rows.
# The features hold one row per customer *per statement date*, while the
# target is defined per customer, so the two tables must be joined on
# customer_ID. A positional concat (axis=1) pairs row i of the features with
# row i of the labels, which mislabels rows and leaves NaN targets whenever
# the two tables do not have exactly the same row order and length.
if "customer_ID" in df_inputs.columns and "customer_ID" in df_output.columns:
    # Keep a single label per customer so the left join cannot multiply rows.
    labels = df_output[["customer_ID", "target"]].drop_duplicates(subset="customer_ID")
    df = df_inputs.merge(labels, on="customer_ID", how="left")
else:
    # No join key available: fall back to the original positional alignment.
    # NOTE(review): only correct if both files are row-aligned - confirm.
    df = pd.concat([df_inputs, df_output['target']], axis=1)

#Display First 3 Rows of the Dataframe
display(df.head(3))

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938469,0.001733,0.008724,1.006838,0.009228,0.124035,0.008771,0.004709,,,0.00063,0.080986,0.708906,0.1706,0.006204,0.358587,0.525351,0.255736,,0.063902,0.059416,0.006466,0.148698,1.335856,0.008207,0.001423,0.207334,0.736463,0.096219,,0.023381,0.002768,0.008322,1.001519,0.008298,0.161345,0.148266,0.922998,0.354596,0.152025,0.118075,0.001882,0.158612,0.065728,0.018385,0.063646,0.199617,0.308233,0.016361,0.401619,0.091071,CR,O,0.007126,0.007665,,0.652984,0.00852,,0.00473,6.0,0.272008,0.008363,0.515222,0.002644,0.009013,0.004808,0.008342,0.119403,0.004802,0.108271,0.050882,,0.007554,0.080422,0.069067,,0.004327,0.007562,,0.007729,0.000272,0.001576,0.004239,0.001434,,0.002271,0.004061,0.007121,0.002456,0.00231,0.003532,0.506612,0.008033,1.009825,0.084683,0.00382,0.007043,0.000438,0.006452,0.00083,0.005055,,0.0,0.00572,0.007084,,0.000198,0.008907,,1,0.002537,0.005177,0.006626,0.009705,0.007782,0.00245,1.001101,0.002665,0.007479,0.006893,1.503673,1.006133,0.003569,0.008871,0.00395,0.003647,0.00495,0.89409,0.135561,0.911191,0.974539,0.001243,0.766688,1.008691,1.004587,0.893734,,0.670041,0.009968,0.004572,,1.008949,2.0,,0.004326,,,,1.007336,0.21006,0.676922,0.007871,1.0,0.23825,0.0,4.0,0.23212,0.236266,0.0,0.70228,0.434345,0.003057,0.686516,0.00874,1.0,1.003319,1.007819,1.00008,0.006805,,0.002052,0.005972,,0.004345,0.001535,,,,,,0.002427,0.003706,0.003818,,0.000569,0.00061,0.002674,0.0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936665,0.005775,0.004923,1.000653,0.006151,0.12675,0.000798,0.002714,,,0.002526,0.069419,0.712795,0.113239,0.006206,0.35363,0.521311,0.223329,,0.065261,0.057744,0.001614,0.149723,1.339794,0.008373,0.001984,0.202778,0.720886,0.099804,,0.030599,0.002749,0.002482,1.009033,0.005136,0.140951,0.14353,0.919414,0.326757,0.156201,0.118737,0.00161,0.148459,0.093935,0.013035,0.065501,0.151387,0.265026,0.017688,0.406326,0.086805,CR,O,0.002413,0.007148,,0.647093,0.002238,,0.003879,6.0,0.18897,0.00403,0.509048,0.004193,0.007842,0.001283,0.006524,0.140611,9.4e-05,0.101018,0.040469,,0.004832,0.081413,0.074166,,0.004203,0.005304,,0.001864,0.000979,0.009896,0.007597,0.000509,,0.00981,0.000127,0.005966,0.000395,0.001327,0.007773,0.500855,0.00076,1.009461,0.081843,0.000347,0.007789,0.004311,0.002332,0.009469,0.003753,,0.0,0.007584,0.006677,,0.001142,0.005907,,1,0.008427,0.008979,0.001854,0.009924,0.005987,0.002247,1.006779,0.002508,0.006827,0.002837,1.503577,1.005791,0.000571,0.000391,0.008351,0.00885,0.00318,0.902135,0.136333,0.919876,0.975624,0.004561,0.786007,1.000084,1.004118,0.906841,,0.668647,0.003921,0.004654,,1.003205,2.0,,0.008707,,,,1.007653,0.184093,0.822281,0.003444,1.0,0.247217,0.0,4.0,0.243532,0.241885,0.0,0.707017,0.430501,0.001306,0.686414,0.000755,1.0,1.008394,1.004333,1.008344,0.004407,,0.001034,0.004838,,0.007495,0.004931,,,,,,0.003954,0.003167,0.005032,,0.009576,0.005492,0.009217,0.0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.95418,0.091505,0.021655,1.009672,0.006815,0.123977,0.007598,0.009423,,,0.007605,0.068839,0.720884,0.060492,0.003259,0.33465,0.524568,0.189424,,0.066982,0.056647,0.005126,0.151955,1.337179,0.009355,0.007426,0.206629,0.738044,0.134073,,0.048367,0.010077,0.00053,1.009184,0.006961,0.112229,0.137014,1.001977,0.304124,0.153795,0.114534,0.006328,0.139504,0.084757,0.056653,0.070607,0.305883,0.212165,0.063955,0.406768,0.094001,CR,O,0.001878,0.003636,,0.645819,0.000408,,0.004578,6.0,0.495308,0.006838,0.679257,0.001337,0.006025,0.009393,0.002615,0.075868,0.007152,0.103239,0.047454,,0.006561,0.078891,0.07651,,0.001782,0.001422,,0.005419,0.006149,0.009629,0.003094,0.008295,,0.009362,0.000954,0.005447,0.007345,0.007624,0.008811,0.504606,0.004056,1.004291,0.081954,0.002709,0.004093,0.007139,0.008358,0.002325,0.007381,,0.0,0.005901,0.001185,,0.008013,0.008882,,1,0.007327,0.002016,0.008686,0.008446,0.007291,0.007794,1.001014,0.009634,0.00982,0.00508,1.503359,1.005801,0.007425,0.009234,0.002471,0.009769,0.005433,0.939654,0.134938,0.958699,0.974067,0.011736,0.80684,1.003014,1.009285,0.928719,,0.670901,0.001264,0.019176,,1.000754,2.0,,0.004092,,,,1.004312,0.154837,0.853498,0.003269,1.0,0.239867,0.0,4.0,0.240768,0.23971,0.0,0.704843,0.434409,0.003954,0.690101,0.009617,1.0,1.009307,1.007831,1.006878,0.003221,,0.005681,0.005497,,0.009227,0.009123,,,,,,0.003269,0.007329,0.000427,,0.003429,0.006986,0.002603,0.0


## Exploratory Data Analysis 

In [3]:
# Categorize features according to their main data types
def categorize_columns(data):
    """Split the columns of ``data`` into numeric, categorical and date groups.

    Classification rules, applied per column in order:

    1. numeric dtype -> numeric;
    2. string/object dtype -> date if every value parses with one of the
       formats ``%Y%m%d`` or ``%Y-%m-%d``, otherwise categorical;
    3. any other dtype (e.g. ``datetime64``) -> date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are to be classified.

    Returns
    -------
    tuple of (list, list, list)
        ``(numeric_cols, categoric_cols, date_cols)`` holding column labels
        in their original order.
    """
    date_formats = ('%Y%m%d', '%Y-%m-%d')
    numeric_cols = []
    categoric_cols = []
    date_cols = []

    for column in data.columns:
        # Always inspect the frame that was passed in, never a global one.
        series = data[column]

        if pd.api.types.is_numeric_dtype(series):
            numeric_cols.append(column)
        elif (pd.api.types.is_string_dtype(series)
              or pd.api.types.is_object_dtype(series)):
            # Object columns are included explicitly: newer pandas versions
            # no longer report an object column that contains missing values
            # as "string", which would otherwise push it into the date group.
            if _parses_as_date(series, date_formats):
                date_cols.append(column)
            else:
                categoric_cols.append(column)
        else:
            date_cols.append(column)

    return numeric_cols, categoric_cols, date_cols


def _parses_as_date(series, date_formats):
    """Return True if ``series`` parses completely with any of ``date_formats``."""
    for date_format in date_formats:
        try:
            pd.to_datetime(series, format=date_format)
        except (ValueError, TypeError):
            # Not this format; try the next one. Only parsing failures are
            # swallowed so unrelated errors are not hidden.
            continue
        return True
    return False

In [4]:
# Classify every column of the combined frame into numeric / categorical / date.
numeric_cols, categoric_cols, date_cols = categorize_columns(df)

In [5]:
# Show the columns that were classified as numeric.
print(numeric_cols)

['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'D_66', 'B_20', 'D_68', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'B_30', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18', 'D_88', 'B_31', 'S_19', 'R_19', 'B_32', 'S_20', 'R_20', 'R_21', 'B_33', 'D_89', 'R_22', 'R_23', 'D_91', 'D_92', 'D_93', 'D_94', 'R_24', 'R_25', 'D_96', 'S_22', 'S_23', 'S

In [6]:
# Show the columns that were classified as categorical.
print(categoric_cols)

['customer_ID', 'D_63', 'D_64']


In [7]:
# Show the columns that were classified as dates.
print(date_cols)

['S_2']


In [8]:
def descriptive_stats_for_numeric_cols(data):
    """Build a per-column summary table for a frame of numeric columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing only numeric columns.

    Returns
    -------
    pandas.DataFrame
        One row per input column with the columns ``Feature_Name``,
        ``Count`` (non-missing values), ``Missing_Count``,
        ``Missing_Percentage`` (0-100), ``Min``, ``Max``, ``Std`` and the
        25th/50th/75th/95th percentiles. An input without columns yields an
        empty table that still carries these column names.
    """
    result_columns = ['Feature_Name', 'Count', 'Missing_Count', 'Missing_Percentage', 'Min', 'Max',
                      'Std', '25P', '50P', '75P', '95P']

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame row by
    # row copies it on every iteration.
    rows = []
    for column in data.columns:
        series = data[column]
        missing_count = series.isna().sum()
        # One quantile call for all four percentiles.
        p25, p50, p75, p95 = series.quantile([0.25, 0.50, 0.75, 0.95])

        rows.append({
            'Feature_Name': column,
            'Count': series.count(),
            'Missing_Count': missing_count,
            'Missing_Percentage': (missing_count / len(series)) * 100,
            'Min': series.min(),
            'Max': series.max(),
            'Std': series.std(),
            '25P': p25,
            '50P': p50,
            '75P': p75,
            '95P': p95,
        })

    return pd.DataFrame(rows, columns=result_columns)

In [9]:
# Summarise only the columns that were classified as numeric.
descriptive_stats_numeric_cols = descriptive_stats_for_numeric_cols(df.loc[:, numeric_cols])

In [10]:
# Render the numeric summary table in the notebook.
display(descriptive_stats_numeric_cols)

Unnamed: 0,Feature_Name,Count,Missing_Count,Missing_Percentage,Min,Max,Std,25P,50P,75P,95P
0,P_2,5485466,45985,0.831337,-0.458955,1.01,0.244649,0.480331,0.694295,0.864816,0.976297
1,D_39,5531451,0,0.0,0.0,5.389619,0.270071,0.004528,0.009057,0.236641,0.653379
2,B_1,5531451,0,0.0,-7.588799,1.32406,0.211987,0.008864,0.03133,0.125902,0.603555
3,B_2,5529435,2016,0.036446,0.0,1.01,0.401488,0.105331,0.814333,1.002403,1.008479
4,R_1,5531451,0,0.0,0.0,3.256284,0.226397,0.002896,0.005782,0.008661,0.507362
5,S_3,4510907,1020544,18.449843,-0.627132,5.482888,0.193347,0.127259,0.163908,0.258102,0.606294
6,D_41,5529435,2016,0.036446,0.0,8.988807,0.202544,0.002873,0.005747,0.008616,0.381905
7,B_3,5529435,2016,0.036446,0.0,1.625262,0.234993,0.005228,0.009777,0.155051,0.707351
8,D_42,791314,4740137,85.694278,-0.000454,4.191119,0.228185,0.037516,0.120519,0.250869,0.575941
9,D_43,3873055,1658396,29.981211,0.0,10.111619,0.213398,0.042275,0.088512,0.184321,0.513366


In [11]:
def description_for_categoric_cols(data):
    """Build a per-column summary table for a frame of categorical columns.

    The ``customer_ID`` column, if present, is skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns.

    Returns
    -------
    pandas.DataFrame
        One row per summarised column with the columns ``Feature_Name``,
        ``Count`` (non-missing values), ``Missing_Count``,
        ``Missing_Percentage`` (0-100), ``Value_Counts`` (the Series returned
        by ``value_counts()``), ``Number_Of_Unique`` and ``Mode``.

        Despite its name, ``Number_Of_Unique`` holds the distinct values
        themselves (the result of ``unique()``, missing value included), not
        their count; this is kept for backward compatibility.
        ``Mode`` is ``None`` when the column has no non-missing value.
    """
    result_columns = ['Feature_Name', 'Count', 'Missing_Count', 'Missing_Percentage', 'Value_Counts',
                      'Number_Of_Unique', 'Mode']

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0.
    rows = []
    for column in data.columns:
        if column == 'customer_ID':
            # Identifier column: not a feature.
            continue

        series = data[column]
        missing_count = series.isna().sum()
        # mode() is empty for an all-missing column; indexing it would raise.
        modes = series.mode()
        mode_ = modes.iloc[0] if not modes.empty else None

        rows.append({
            'Feature_Name': column,
            'Count': series.count(),
            'Missing_Count': missing_count,
            'Missing_Percentage': (missing_count / len(series)) * 100,
            'Value_Counts': series.value_counts(),
            'Number_Of_Unique': series.unique(),
            'Mode': mode_,
        })

    return pd.DataFrame(rows, columns=result_columns)

In [12]:
# Summarise only the columns that were classified as categorical.
description_categoric_cols = description_for_categoric_cols(df.loc[:, categoric_cols])

In [13]:
# Render at most the first 10 rows of the categorical summary table.
display(description_categoric_cols.head(10))

Unnamed: 0,Feature_Name,Count,Missing_Count,Missing_Percentage,Value_Counts,Number_Of_Unique,Mode
0,D_63,5531451,0,0.0,CO 4119621 CR 930133 CL 438390 XZ ...,"[CR, CO, CL, XZ, XM, XL]",CO
1,D_64,5314009,217442,3.931012,O 2913244 U 1523448 R 840112 -1 ...,"[O, R, nan, U, -1]",O


In [None]:
def check_duplicate_cols(data):
    """Return the labels of columns whose values repeat an earlier column.

    The frame is transposed so that pandas' row-wise ``duplicated`` check
    compares whole columns; the first occurrence is not reported.
    """
    is_repeat = data.transpose().duplicated()
    return data.columns[is_repeat.to_numpy()].tolist()

def check_duplicate_rows(data):
    """Return the rows of ``data`` that repeat an earlier row.

    The first occurrence of each repeated row is not included.
    """
    return data.loc[data.duplicated()]

In [None]:
# Look for columns and rows of the combined frame that repeat an earlier one.
# NOTE(review): check_duplicate_cols transposes the whole frame, which is
# likely expensive on a dataset of this size - confirm it is feasible.
duplicate_cols = check_duplicate_cols(df)
duplicate_rows= check_duplicate_rows(df)

# Print both results, separated by a line of asterisks.
print(f"duplicate cols: \n{duplicate_cols}")
print("*"*20)
print(f"duplicate rows: \n{duplicate_rows}")

In [None]:
def plot_histogram(data, feature_name, bins=20):
    """Draw and show a histogram of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column.
    feature_name : str
        Column to plot.
    bins : int, default 20
        Passed to ``plt.hist`` as ``bins``.
    """
    values = data[feature_name]
    plt.hist(values, bins=bins)
    plt.title(f'Histogram of {feature_name}')
    plt.xlabel('Values')
    plt.ylabel('Frequency')
    plt.show()


def plot_kde(data, feature_name):
    """Draw and show a filled kernel-density estimate of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column.
    feature_name : str
        Column to plot.
    """
    # `fill` is the supported name for the area-shading option; the old
    # `shade` keyword is deprecated in seaborn and scheduled for removal.
    sns.kdeplot(data[feature_name], fill=True)
    plt.xlabel('Values')
    plt.ylabel('Density')
    plt.title(f'KDE Plot of {feature_name}')
    plt.show()


def plot_boxplot(data, feature_name):
    """Draw and show a horizontal box plot of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column.
    feature_name : str
        Column to plot; also used as the x-axis label.
    """
    values = data[feature_name]
    sns.boxplot(x=values)
    plt.title(f'Box Plot of {feature_name}')
    plt.xlabel(f'{feature_name}')
    plt.show()

def plot_scatter(data, feature_name1, feature_name2):
    """Draw and show a scatter plot of one column against another.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    feature_name1 : str
        Column plotted on the x-axis.
    feature_name2 : str
        Column plotted on the y-axis.
    """
    x_values = data[feature_name1]
    y_values = data[feature_name2]
    plt.scatter(x_values, y_values)
    plt.title(f'Scatter Plot of {feature_name1} vs {feature_name2}')
    plt.xlabel(feature_name1)
    plt.ylabel(feature_name2)
    plt.show()


In [None]:
# IterativeImputer is experimental: scikit-learn only exposes it after this
# enabling import has run, so it must come before the sklearn.impute import.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import SimpleImputer, IterativeImputer

def simple_imputer(data, strategy):
    """Fill missing values with scikit-learn's ``SimpleImputer``.

    Parameters
    ----------
    data : array-like
        Data to impute; fitted and transformed in one step.
    strategy : str
        Passed to ``SimpleImputer`` as ``strategy``.

    Returns
    -------
    The result of ``fit_transform`` on ``data``.
    """
    return SimpleImputer(strategy=strategy).fit_transform(data)

def iterative_imputer(data, max_iter=10):
    """Fill missing values with scikit-learn's ``IterativeImputer``.

    Parameters
    ----------
    data : array-like
        Data to impute; fitted and transformed in one step.
    max_iter : int, default 10
        Passed to ``IterativeImputer`` as ``max_iter``.

    Returns
    -------
    The result of ``fit_transform`` on ``data``. ``random_state`` is fixed
    to 0 for the imputer.
    """
    return IterativeImputer(max_iter=max_iter, random_state=0).fit_transform(data)

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encoder(data, column_name):
    """Add an integer-encoded copy of a column using ``LabelEncoder``.

    The encoded values are written to a new column named
    ``<column_name>_encoded``. ``data`` is modified in place and also
    returned.
    """
    encoded_name = column_name + '_encoded'
    encoder = LabelEncoder()
    data[encoded_name] = encoder.fit_transform(data[column_name])
    return data

def one_hot_encoder(data, column_name):
    """Return ``data`` with one-hot indicator columns appended for a column.

    The indicator columns come from ``pd.get_dummies`` and are prefixed with
    ``column_name``. The source column is kept and the input frame is not
    modified; a new frame is returned.
    """
    indicators = pd.get_dummies(data[column_name], prefix=column_name)
    return pd.concat([data, indicators], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb 
from sklearn.metrics import roc_auc_score

def split_data(X, y):
    """Split features and target into stratified train and test parts.

    Uses ``train_test_split`` with a 20% test share, ``random_state=0`` and
    stratification on ``y``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    train_X, test_X, train_y, test_y = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=0,
        stratify=y,
    )
    return train_X, test_X, train_y, test_y

def calculate_univariate_gini(data, target_name, X_test, y_test):
    """Compute a single-feature Gini coefficient for every feature.

    For each column of ``data`` other than ``target_name`` an
    ``XGBClassifier`` with default settings is fitted on that column alone
    and scored on the same column of ``X_test``. Gini is ``2 * AUC - 1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame containing the features and the target column.
    target_name : str
        Name of the target column in ``data``.
    X_test : pandas.DataFrame
        Evaluation features; must contain every feature column of ``data``.
    y_test : array-like
        True labels matching the rows of ``X_test``.

    Returns
    -------
    dict
        Maps each feature name to its Gini coefficient.

    Notes
    -----
    NOTE(review): the model is fitted on all of ``data``; if ``X_test`` is a
    subset of ``data`` the scores are optimistic - confirm with the caller.
    """
    univariate_gini = {}

    for feature in data.columns:
        if feature == target_name:
            continue

        X = data[[feature]]
        y = data[target_name]

        model = xgb.XGBClassifier()
        model.fit(X, y)

        # Score on a single-column frame (double brackets) so the input has
        # the same 2-D shape and column name the model was fitted on; a bare
        # Series is 1-D.
        y_pred = model.predict_proba(X_test[[feature]])[:, 1]
        roc_auc = roc_auc_score(y_test, y_pred)

        univariate_gini[feature] = 2 * roc_auc - 1

    return univariate_gini