# 1. Load and Preprocess Data


Here is where we are going to prepare the dataset for training our machine learning models.

Let's load the data into a Pandas DataFrame. Each row represents a sample (client), and each column represents a feature (such as number of children, annual income, income category) or the label (Outcome).

We will also check and remove duplicates. Duplicate rows can bias the model, so it's important we remove them.

Check and clean rows with NULL entries. Missing values can prevent the model from training.

In [48]:
import kagglehub
import pandas as pd
import os

# Download latest version
path = kagglehub.dataset_download("samuelcortinhas/credit-card-approval-clean-data")

print("Path to dataset files:", path)

df_application = pd.read_csv(os.path.join(path, 'clean_dataset.csv'))

Path to dataset files: /Users/henrywei/.cache/kagglehub/datasets/samuelcortinhas/credit-card-approval-clean-data/versions/2


Investigate the dataset

In [49]:
df_application.head(20)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1
5,1,32.08,4.0,1,1,CommunicationServices,White,2.5,1,0,0,1,ByBirth,360,0,1
6,1,33.17,1.04,1,1,Transport,Black,6.5,1,0,0,1,ByBirth,164,31285,1
7,0,22.92,11.585,1,1,InformationTechnology,White,0.04,1,0,0,0,ByBirth,80,1349,1
8,1,54.42,0.5,0,0,Financials,Black,3.96,1,0,0,0,ByBirth,180,314,1
9,1,42.5,4.915,0,0,Industrials,White,3.165,1,0,0,1,ByBirth,52,1442,1


In [50]:
df_application.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


In [51]:
print(df_application['Industry'].unique())
print(df_application['Ethnicity'].unique())
print(df_application['Citizen'].unique())

['Industrials' 'Materials' 'CommunicationServices' 'Transport'
 'InformationTechnology' 'Financials' 'Energy' 'Real Estate' 'Utilities'
 'ConsumerDiscretionary' 'Education' 'ConsumerStaples' 'Healthcare'
 'Research']
['White' 'Black' 'Asian' 'Latino' 'Other']
['ByBirth' 'ByOtherMeans' 'Temporary']


#  Feature Engineering
We convert the categorical variables into numbers for machine learning by using one-hot encoding and ordinal encoding based on the nature of the data type.



In [60]:
def features(df):
    X = df[['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Income']].copy()
    X = pd.concat([X, pd.get_dummies(df['Industry'], prefix='Industry')], axis=1)
    X = pd.concat([X, pd.get_dummies(df['Ethnicity'], prefix='Ethnicity')], axis=1)
    citizen_groups = {
        'ByBirth': 0,
        'ByOtherMeans': 1,
        'Temporary': 2         
    }

    X["Citizen_number"] = [citizen_groups[x] for x in df["Citizen"]]
    return X
clean_data = features(df_application)
clean_data.head(20)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,...,Industry_Real Estate,Industry_Research,Industry_Transport,Industry_Utilities,Ethnicity_Asian,Ethnicity_Black,Ethnicity_Latino,Ethnicity_Other,Ethnicity_White,Citizen_number
0,1,30.83,0.0,1,1,1.25,1,1,1,0,...,False,False,False,False,False,False,False,False,True,0
1,0,58.67,4.46,1,1,3.04,1,1,6,0,...,False,False,False,False,False,True,False,False,False,0
2,0,24.5,0.5,1,1,1.5,1,0,0,0,...,False,False,False,False,False,True,False,False,False,0
3,1,27.83,1.54,1,1,3.75,1,1,5,1,...,False,False,False,False,False,False,False,False,True,0
4,1,20.17,5.625,1,1,1.71,1,0,0,0,...,False,False,False,False,False,False,False,False,True,1
5,1,32.08,4.0,1,1,2.5,1,0,0,1,...,False,False,False,False,False,False,False,False,True,0
6,1,33.17,1.04,1,1,6.5,1,0,0,1,...,False,False,True,False,False,True,False,False,False,0
7,0,22.92,11.585,1,1,0.04,1,0,0,0,...,False,False,False,False,False,False,False,False,True,0
8,1,54.42,0.5,0,0,3.96,1,0,0,0,...,False,False,False,False,False,True,False,False,False,0
9,1,42.5,4.915,0,0,3.165,1,0,0,1,...,False,False,False,False,False,False,False,False,True,0


In [61]:
print(clean_data.dtypes)

Gender                              int64
Age                               float64
Debt                              float64
Married                             int64
BankCustomer                        int64
YearsEmployed                     float64
PriorDefault                        int64
Employed                            int64
CreditScore                         int64
DriversLicense                      int64
Income                              int64
Industry_CommunicationServices       bool
Industry_ConsumerDiscretionary       bool
Industry_ConsumerStaples             bool
Industry_Education                   bool
Industry_Energy                      bool
Industry_Financials                  bool
Industry_Healthcare                  bool
Industry_Industrials                 bool
Industry_InformationTechnology       bool
Industry_Materials                   bool
Industry_Real Estate                 bool
Industry_Research                    bool
Industry_Transport                

# Split the dataset into testing data and training data

In [62]:
y = df_application['Approved'].copy()

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(clean_data, y, test_size = 0.2, random_state = 42)

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [65]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the dataset

###  Logistic Regression

In [66]:
# Logistic Regression
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
print(f"Logistic Regression Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Logistic Regression Accuracy: 0.8260869565217391
              precision    recall  f1-score   support

           0       0.81      0.84      0.83        68
           1       0.84      0.81      0.83        70

    accuracy                           0.83       138
   macro avg       0.83      0.83      0.83       138
weighted avg       0.83      0.83      0.83       138



### K-Neighbors Classifier

In [43]:
from sklearn.neighbors import KNeighborsClassifier
model_K = KNeighborsClassifier(n_neighbors=5)
model_K.fit(X_train_scaled, y_train)
y_pred =model_K.predict(X_test_scaled)
print(f"KNeighbors Classifier Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

KNeighbors Classifier Accuracy: 0.7463768115942029
              precision    recall  f1-score   support

           0       0.71      0.82      0.76        68
           1       0.80      0.67      0.73        70

    accuracy                           0.75       138
   macro avg       0.75      0.75      0.75       138
weighted avg       0.75      0.75      0.75       138



###  MLP Classifier

In [47]:
from sklearn.neural_network import MLPClassifier
mlp_classifier = MLPClassifier(solver='lbfgs', alpha=1e-5,
                               hidden_layer_sizes=(8, 2), random_state=11,max_iter=10000)
mlp_classifier.fit(X_train_scaled, y_train)
y_pred =model_K.predict(X_test_scaled)
print(f"MLP Classifier Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


MLP Classifier Accuracy: 0.7463768115942029
              precision    recall  f1-score   support

           0       0.71      0.82      0.76        68
           1       0.80      0.67      0.73        70

    accuracy                           0.75       138
   macro avg       0.75      0.75      0.75       138
weighted avg       0.75      0.75      0.75       138



# Evaluate the dataset