# Predicting Default Risk Of Credit Card Customers Using Gradient Descent

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

## Load the dataset into a Pandas DataFrame

In [2]:
df = pd.read_csv('Credit_card_data.csv')
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


## Data Processing Starts Here

In [3]:
df = df.rename(columns={df.columns[-2]: 'attrition1', df.columns[-1]: 'attrition2'})
df.duplicated().sum()

0

In [4]:
df_processed = pd.DataFrame()

In [5]:
df['Attrition_Flag'].unique()

df_processed['attrition_flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})

df_processed['age'] = df['Customer_Age']

df_processed['gender'] = df['Gender'].map({'F': 0, 'M': 1})

df_processed['dependent_count'] = df['Dependent_count']


In [6]:
education_unknown_indexes = df[df['Education_Level'] == 'Unknown'].index

df = df[~df.index.isin(education_unknown_indexes)]
df_processed = df_processed[~df_processed.index.isin(education_unknown_indexes)]

education_level_dict = {
    'Uneducated': 0,
    'High School': 1,
    'College': 2,
    'Graduate': 3,
    'Post-Graduate': 4,
    'Doctorate': 5
}

df_processed['education'] = df['Education_Level'].map(education_level_dict)

In [7]:
df['Marital_Status'].unique()

unknown_marital_indexes = df[df['Marital_Status'] == 'Unknown'].index

df = df[~df.index.isin(unknown_marital_indexes)]
df_processed = df_processed[~df_processed.index.isin(unknown_marital_indexes)]

marital_dummies = pd.get_dummies(df['Marital_Status'], prefix='status', drop_first=True)

df_processed = pd.concat([df_processed, marital_dummies], axis='columns')

In [8]:
df['Income_Category'].unique()

unknown_income_indexes = df[df['Income_Category'] == 'Unknown'].index

df = df[~df.index.isin(unknown_income_indexes)]
df_processed = df_processed[~df_processed.index.isin(unknown_income_indexes)]

income_dict = {
    'Less than $40K': 0,
    '$40K - $60K': 1,
    '$60K - $80K': 2,
    '$80K - $120K': 3,
    '$120K +': 4
}

df_processed['income_category'] = df['Income_Category'].map(income_dict)

In [9]:
df['Card_Category'].unique()

card_dummies = pd.get_dummies(df['Card_Category'], prefix='card', drop_first=True)

df_processed = pd.concat([df_processed, card_dummies], axis='columns')

In [10]:
rest_of_data = df.iloc[:,9:21]
rest_of_data.head()
df_processed = pd.concat([df_processed, rest_of_data], axis='columns')

## Prepare Data for training 

In [11]:
y = df_processed['attrition_flag']
y.value_counts()

0    5968
1    1113
Name: attrition_flag, dtype: int64

In [12]:
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights

array([0.59324732, 3.18104223])

In [13]:
class_weight_dict = {
    0: class_weights[0],
    1: class_weights[1]
}
class_weight_dict


{0: 0.5932473190348525, 1: 3.1810422282120396}

In [14]:
scaler = MinMaxScaler()

In [15]:
X = scaler.fit_transform(df_processed.drop('attrition_flag', axis='columns'))

In [16]:
model_dict = {
    'Random Forest': RandomForestClassifier(class_weight=class_weight_dict),
    'Gradient Boosting': GradientBoostingClassifier(),
}
model_dict

{'Random Forest': RandomForestClassifier(class_weight={0: 0.5932473190348525,
                                      1: 3.1810422282120396}),
 'Gradient Boosting': GradientBoostingClassifier()}

In [17]:
for name, model in model_dict.items():
    scores = cross_val_score(model, X, y)
    print(f'{name} scores: ', scores)
    print(f'{name} mean: ', scores.mean())

Random Forest scores:  [0.88073394 0.95056497 0.96045198 0.96257062 0.84533898]
Random Forest mean:  0.919932099725289
Gradient Boosting scores:  [0.90190543 0.94279661 0.96892655 0.96398305 0.85381356]
Gradient Boosting mean:  0.9262850416053651


In [18]:
GB_optimizer = GridSearchCV(GradientBoostingClassifier(), {
    'loss': ['log_loss', 'exponential'],
    'n_estimators': [50, 100, 150, 200],
    'criterion': ['friedman_mse', 'squared_error'],
    'max_features': [1]
})

In [19]:
GB_optimizer.fit(X, y)

In [20]:
print(GB_optimizer.best_params_, GB_optimizer.best_score_)

{'criterion': 'squared_error', 'loss': 'log_loss', 'max_features': 1, 'n_estimators': 200} 0.9161158491122727


## Train the Model

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2022)

In [22]:
gb_model = GradientBoostingClassifier(criterion='friedman_mse', loss='exponential', max_features=1, n_estimators=200)

In [23]:
gb_model.fit(X_train, y_train)

## Validating Results

In [24]:
gb_model.score(X_test, y_test)

0.9350741002117149

In [25]:
y_predicted = gb_model.predict(X_test)
y_predicted

array([1, 0, 0, ..., 0, 0, 0])

In [26]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96      1194
           1       0.90      0.66      0.76       223

    accuracy                           0.94      1417
   macro avg       0.92      0.82      0.86      1417
weighted avg       0.93      0.94      0.93      1417

