In [84]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [85]:
df=pd.read_csv('synthetic_dataset_10000x20.csv')
df.head()

Unnamed: 0,customer_id,age,income,savings,monthly_expenses,num_dependents,credit_score,loan_amount,loan_term_months,employment_years,...,education,marital_status,region,recent_default,has_credit_card,signup_date,signup_dayofweek,debt_to_income,sin_age,target_default_risk
0,CUST006253,30,66737.0,11155.0,2272.0,2,605.076204,26965.0,48,3.9,...,HS,Single,West,1,1,2020-07-05,6,0.404,0.14112,1
1,CUST004685,22,70740.0,997.0,1934.0,1,683.291967,4681.0,36,0.7,...,Bachelors,Married,East,0,0,2018-10-03,2,0.066,0.808496,1
2,CUST001732,68,38890.0,1929.0,1696.0,0,658.00336,12633.0,72,2.2,...,Bachelors,Single,East,0,1,2018-05-30,2,0.325,0.494113,0
3,CUST004743,49,29049.0,6284.0,2485.0,1,707.477864,20881.0,36,2.7,...,HS,Married,South,0,1,2018-04-22,6,0.719,-0.982453,0
4,CUST004522,74,60063.0,924.0,3179.0,2,564.768511,19438.0,36,10.3,...,Masters,Single,West,0,0,2019-12-03,1,0.324,0.898708,1


In [86]:
df.isnull().sum()

customer_id              0
age                      0
income                 318
savings                311
monthly_expenses       325
num_dependents           0
credit_score           326
loan_amount              0
loan_term_months         0
employment_years         0
home_ownership           0
education                0
marital_status           0
region                   0
recent_default           0
has_credit_card          0
signup_date              0
signup_dayofweek         0
debt_to_income           0
sin_age                  0
target_default_risk      0
dtype: int64

In [87]:
df1=df.copy()

In [88]:
df1.dtypes

customer_id             object
age                      int64
income                 float64
savings                float64
monthly_expenses       float64
num_dependents           int64
credit_score           float64
loan_amount            float64
loan_term_months         int64
employment_years       float64
home_ownership          object
education               object
marital_status          object
region                  object
recent_default           int64
has_credit_card          int64
signup_date             object
signup_dayofweek         int64
debt_to_income         float64
sin_age                float64
target_default_risk      int64
dtype: object

In [89]:
df1=df.copy()

In [90]:
df1['education'] = df1['education'].replace({
    'Bachlors': 'Bachelors',
    'HS': 'High School'
})

In [91]:
X=df1.drop(columns=['customer_id','signup_date'],axis=1)
y=df1[['target_default_risk']]

In [92]:
X

Unnamed: 0,age,income,savings,monthly_expenses,num_dependents,credit_score,loan_amount,loan_term_months,employment_years,home_ownership,education,marital_status,region,recent_default,has_credit_card,signup_dayofweek,debt_to_income,sin_age,target_default_risk
0,30,66737.0,11155.0,2272.0,2,605.076204,26965.0,48,3.9,RENT,High School,Single,West,1,1,6,0.404,0.141120,1
1,22,70740.0,997.0,1934.0,1,683.291967,4681.0,36,0.7,RENT,Bachelors,Married,East,0,0,2,0.066,0.808496,1
2,68,38890.0,1929.0,1696.0,0,658.003360,12633.0,72,2.2,OWN,Bachelors,Single,East,0,1,2,0.325,0.494113,0
3,49,29049.0,6284.0,2485.0,1,707.477864,20881.0,36,2.7,OWN,High School,Married,South,0,1,6,0.719,-0.982453,0
4,74,60063.0,924.0,3179.0,2,564.768511,19438.0,36,10.3,MORTGAGE,Masters,Single,West,0,0,1,0.324,0.898708,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,54,44507.0,5975.0,2520.0,1,699.633352,31089.0,48,5.3,RENT,High School,Single,East,0,1,3,0.699,-0.772764,1
9996,50,20651.0,10203.0,1020.0,3,680.774066,8977.0,60,9.6,RENT,PhD,Divorced,North,0,0,3,0.435,-0.958924,0
9997,43,33827.0,3848.0,2562.0,1,655.562748,24319.0,60,4.3,OTHER,High School,Married,West,0,0,4,0.719,-0.916166,0
9998,44,38273.0,18880.0,1060.0,2,653.277645,1000.0,24,11.4,MORTGAGE,Other,Single,North,0,1,6,0.026,-0.951602,0


In [93]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [94]:
other_num_cols=['age', 'num_dependents', 'credit_score','loan_term_months', 
                  'employment_years','debt_to_income']

cat_columns=['marital_status','region','home_ownership']

num_columns=['income','savings','monthly_expenses',	'credit_score']

cat_ordinal_columns= ['education']


# Categorical pipeline
cat_pipe = Pipeline([
    ('imputer1',SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'))
])

cat_ordinal_pipe=Pipeline([ 
    ('imputer2',SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder())
])
# Numerical pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Column transformer
preprocessor = ColumnTransformer([
    ('cat_ord',cat_ordinal_pipe,cat_ordinal_columns),
    ('cat', cat_pipe, cat_columns),
     ('other_num',num_pipe,other_num_cols),
    ('num', num_pipe,num_columns)
], remainder='drop')


gb_clf = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
pipe_gb = Pipeline([
    ('preprocess', preprocessor),
    ('model_gb', gb_clf)
])
pipe_gb.fit(X_train,y_train)
X_train_trans = pipe_gb.named_steps['preprocess'].transform(X_train)
X_test_trans  = pipe_gb.named_steps['preprocess'].transform(X_test)


y_pred_gb = pipe_gb.named_steps['model_gb'].predict(X_test_trans)
print('accuracy of gb:',accuracy_score(y_test,y_pred_gb))
print('Precision:',precision_score(y_test,y_pred_gb))
print("Recall:", recall_score(y_test, y_pred_gb))
print("F1-score:", f1_score(y_test, y_pred_gb))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))

  y = column_or_1d(y, warn=True)


accuracy of gb: 0.9415
Precision: 0.9480392156862745
Recall: 0.9379243452958292
F1-score: 0.9429546562652364
Confusion Matrix:
 [[916  53]
 [ 64 967]]


In [95]:
model_log=LogisticRegression(max_iter=1000,
            class_weight='balanced')
pipe_log= Pipeline([
    ('preprocess_log', preprocessor),
    ('model_log', model_log)
])
pipe_log.fit(X_train,y_train)
y_pred_log = pipe_log.predict(X_test)
print('accuracy of logistic:',accuracy_score(y_test,y_pred_log))
print('Precision:',precision_score(y_test,y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1-score:", f1_score(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


accuracy of logistic: 0.9155
Precision: 0.9624463519313304
Recall: 0.8700290979631425
F1-score: 0.9139072847682119
Confusion Matrix:
 [[934  35]
 [134 897]]


  y = column_or_1d(y, warn=True)


In [120]:
Decision_Tree= DecisionTreeClassifier(
            max_depth= 6,
            min_samples_split=20,
            random_state=42)
pipe_DT= Pipeline([
    ('preprocess_DT', preprocessor),
    ('model_DT',Decision_Tree )
])
pipe_DT.fit(X_train,y_train)
y_pred_DT= pipe_DT.predict(X_test)
print('accuracy of DT:',accuracy_score(y_test,y_pred_DT))
print('Precision:',precision_score(y_test,y_pred_DT))
print("Recall:", recall_score(y_test, y_pred_DT))
print("F1-score:", f1_score(y_test,y_pred_DT))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_DT))




accuracy of DT: 0.936
Precision: 0.923943661971831
Recall: 0.9544131910766246
F1-score: 0.9389312977099237
Confusion Matrix:
 [[888  81]
 [ 47 984]]


In [97]:
# Column transformer
preprocessor1 = ColumnTransformer([
    ('cat_ord',cat_ordinal_pipe,cat_ordinal_columns),
    ('cat', cat_pipe, cat_columns),
    ('other_num',num_pipe,other_num_cols),
    ('num', num_pipe,num_columns)
], remainder='drop')

Random_Forest= RandomForestClassifier(  n_estimators=400,
            max_depth=8,
            min_samples_split=15,
            n_jobs=-1,
            random_state=42)

pipe_randomforest= Pipeline([
    ('preprocess_r', preprocessor1),
    ('model_randomforest', Random_Forest)
])
pipe_randomforest.fit(X_train,y_train)
X_train_trans = pipe_randomforest.named_steps['preprocess_r'].transform(X_train)
X_test_trans  = pipe_randomforest.named_steps['preprocess_r'].transform(X_test)

y_pred_RF= pipe_randomforest.named_steps['model_randomforest'].predict(X_test_trans)
print('accuracy of RF:',accuracy_score(y_test,y_pred_RF))
print('Precision:',precision_score(y_test,y_pred_RF))
print("Recall:", recall_score(y_test, y_pred_RF))
print("F1-score:", f1_score(y_test, y_pred_RF))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_RF))


  return fit_method(estimator, *args, **kwargs)


accuracy of RF: 0.933
Precision: 0.9544072948328267
Recall: 0.9136760426770126
F1-score: 0.933597621407334
Confusion Matrix:
 [[924  45]
 [ 89 942]]


In [115]:
preprocessor2= ColumnTransformer([ 
    ('other_num',num_pipe,other_num_cols),
    ('num', num_pipe,num_columns)
], remainder='drop')
svm=SVC(C=2.0,kernel='rbf',degree=3,gamma=2.0)
pipe_svm= Pipeline([
    ('preprocess_svm', preprocessor2),
    ('model_svm',svm)
])
pipe_svm.fit(X_train,y_train)
y_pred_svm=pipe_svm.predict(X_test)
print('accuracy of svm:',accuracy_score(y_test,y_pred_svm))
print('Precision:',precision_score(y_test,y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1-score:", f1_score(y_test, y_pred_svm))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


  y = column_or_1d(y, warn=True)


accuracy of svm: 0.9245
Precision: 0.9536082474226805
Recall: 0.8971871968962173
F1-score: 0.9245377311344328
Confusion Matrix:
 [[924  45]
 [106 925]]


In [118]:
XGBoost=XGBClassifier(
            n_estimators=500,
            max_depth=3,
            learning_rate=0.03,
            subsample=0.95,
            colsample_bytree=0.95,
            reg_alpha=0.5,
            reg_lambda=1.5,
            gamma=0.1,
            random_state=42,
            eval_metric='logloss'
        )
pipe_xgboost= Pipeline([
    ('preprocess_xg', preprocessor),
    ('model_log', XGBoost)
])
pipe_xgboost.fit(X_train,y_train)
y_pred_xg= pipe_xgboost.predict(X_test)
print('accuracy of xgboost:',accuracy_score(y_test,y_pred_xg))
print('Precision:',precision_score(y_test,y_pred_xg))
print("Recall:", recall_score(y_test, y_pred_xg))
print("F1-score:", f1_score(y_test,y_pred_xg))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xg))



accuracy of xgboost: 0.9365
Precision: 0.9493041749502982
Recall: 0.9262851600387972
F1-score: 0.937653411880216
Confusion Matrix:
 [[918  51]
 [ 76 955]]


In [123]:
# Column transformer
preprocessor = ColumnTransformer([
    ('cat_ord',cat_ordinal_pipe,cat_ordinal_columns),
    ('cat', cat_pipe, cat_columns),
     ('other_num',num_pipe,other_num_cols),
    ('num', num_pipe,num_columns)
], remainder='drop')


gb_clf = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)
pipe_gb = Pipeline([
    ('preprocess', preprocessor),
    ('model', gb_clf)
])
def evaluate_stage2_models(X, y):

    models = {
        "Logistic Regression": Pipeline([
            ('preprocess', preprocessor),
            ('model', LogisticRegression(max_iter=1000))
        ]),

        "Decision Tree": Pipeline([
            ('preprocess', preprocessor),
            ('model', DecisionTreeClassifier(
                max_depth=8,
                min_samples_split=15,
                
            ))
        ]),

        "Random Forest": Pipeline([
            ('preprocess', preprocessor),
            ('model', RandomForestClassifier(
                n_estimators=400,
                max_depth=8,
                min_samples_split=15,
                n_jobs=-1,
                random_state=42
            ))
        ]),

        "SVC": Pipeline([
            ('preprocess', preprocessor2),
            ('model', SVC(
                kernel='rbf',
                C=1.5
                
            ))
        ]),

        "XGBoost": Pipeline([
            ('preprocess', preprocessor),
            ('model', XGBClassifier(
                n_estimators=900,
                max_depth=3,
                learning_rate=0.04,
                subsample=0.85,
                colsample_bytree=0.85,
                reg_alpha=0.3,
                reg_lambda=1.2,
                gamma=0.1,
                random_state=42,
                eval_metric='logloss'
            ))
        ])
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    for name, model in models.items():
        scores = cross_val_score(
            model,
            X,
            y,
            cv=cv,
            scoring='accuracy',
            n_jobs=-1
        )
        print(f"{name} Accuracy: {scores.mean():.4f}")

evaluate_stage2_models(X_train, y_train)


Logistic Regression Accuracy: 0.9109
Decision Tree Accuracy: 0.9223
Random Forest Accuracy: 0.9235
SVC Accuracy: 0.9181
XGBoost Accuracy: 0.9282


In [121]:
print('accuracy of logistic:',accuracy_score(y_test,y_pred_log))
print('accuracy of DT:',accuracy_score(y_test,y_pred_DT))
print('accuracy of svm:',accuracy_score(y_test,y_pred_svm))
print('accuracy of RF:',accuracy_score(y_test,y_pred_RF))
print('accuracy of xgboost:',accuracy_score(y_test,y_pred_xg))

accuracy of logistic: 0.9155
accuracy of DT: 0.936
accuracy of svm: 0.9245
accuracy of RF: 0.933
accuracy of xgboost: 0.9365


In [None]:
preprocessor.get_feature_names_out()

In [None]:
df1['education'].value_counts()

In [None]:
model_svm=SVC(C=2.0,kernel='rbf',degree=3)
model_svm.fit(X_train,y_train)
y_pred_svm=model_svm.predict(x_test)
y_pred_svm

In [108]:
def run_stage2_gridsearch_jupyter(X_train, y_train, preprocessor):
    results = []

    # Model definitions + smaller test grids for Jupyter
    model_dict = {
        "Decision Tree": (
            DecisionTreeClassifier(random_state=42),
            {
                'model__max_depth': [6, 8],          # smaller grid for testing
                'model__min_samples_split': [10, 15],
                'model__min_samples_leaf': [1, 2]
            }
        ),
        "SVC": (
            SVC(probability=True, class_weight='balanced'),
            {
                'model__C': [1.0, 1.5],
                'model__gamma': ['scale', 0.05],
                'model__kernel': ['rbf']
            }
        ),
        "Random Forest": (
            RandomForestClassifier(random_state=42),
            {
                'model__n_estimators': [400, 600],
                'model__max_depth': [8, 10],
                'model__min_samples_split': [10, 15],
                'model__min_samples_leaf': [1, 2]
            }
        ),
        "XGBoost": (
            XGBClassifier(random_state=42, eval_metric='logloss'),
            {
                'model__n_estimators': [500, 700],
                'model__max_depth': [3, 4],
                'model__learning_rate': [0.04, 0.05],
                'model__subsample': [0.85],
                'model__colsample_bytree': [0.85],
                'model__gamma': [0, 0.1],
                'model__reg_alpha': [0, 0.3],
                'model__reg_lambda': [1, 1.2]
            }
        )
    }

    # Loop through models
    for name, (estimator, param_grid) in model_dict.items():
        print(f"Running GridSearchCV for {name}...")
        pipe = Pipeline([
            ('preprocess', preprocessor),
            ('model', estimator)
        ])
        grid = GridSearchCV(pipe, param_grid, cv=3, scoring='accuracy', n_jobs=-1)  # cv=3 for faster run
        grid.fit(X_train, y_train)
        
        print(f"Done with {name}: Best CV Accuracy = {grid.best_score_:.4f}\n")
        
        results.append({
            'Model': name,
            'Best CV Accuracy': grid.best_score_,
            'Best Parameters': grid.best_params_
        })

    # Convert to DataFrame and sort
    results_df = pd.DataFrame(results).sort_values(by='Best CV Accuracy', ascending=False).reset_index(drop=True)
    return results_df