In [1]:
import pandas as pd

# Read the raw credit-risk table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="credit_risk_dataset.csv")
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [2]:
# (rows, columns) of the loaded frame.
df.shape

(32581, 12)

In [3]:
# List the column names available for modelling.
df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [4]:
# Class balance of the target; the two classes are far from evenly represented,
# which is why later steps stratify the split and weight the classes.
df["loan_status"].value_counts()

loan_status
0    25473
1     7108
Name: count, dtype: int64

In [5]:
# Column dtypes; object-typed columns are later treated as categorical features.
df.dtypes

person_age                      int64
person_income                   int64
person_home_ownership          object
person_emp_length             float64
loan_intent                    object
loan_grade                     object
loan_amnt                       int64
loan_int_rate                 float64
loan_status                     int64
loan_percent_income           float64
cb_person_default_on_file      object
cb_person_cred_hist_length      int64
dtype: object

In [6]:
# Separate the binary target from the predictor columns.
y = df["loan_status"]
X = df.drop(columns=["loan_status"])

X.head(), y.head()

(   person_age  person_income person_home_ownership  person_emp_length  \
 0          22          59000                  RENT              123.0   
 1          21           9600                   OWN                5.0   
 2          25           9600              MORTGAGE                1.0   
 3          23          65500                  RENT                4.0   
 4          24          54400                  RENT                8.0   
 
   loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
 0    PERSONAL          D      35000          16.02                 0.59   
 1   EDUCATION          B       1000          11.14                 0.10   
 2     MEDICAL          C       5500          12.87                 0.57   
 3     MEDICAL          C      35000          15.23                 0.53   
 4     MEDICAL          C      35000          14.27                 0.55   
 
   cb_person_default_on_file  cb_person_cred_hist_length  
 0                         Y         

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. stratify=y keeps the share of each
# loan_status class the same in both partitions, and the fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Sanity check: row counts of the two partitions and the number of features.
X_train.shape, X_test.shape

((26064, 11), (6517, 11))

In [13]:
# Partition the feature names by dtype: object-typed columns are handled as
# categorical, every other column is handled as numeric.
num_cols = X.select_dtypes(exclude=["object"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

cat_cols, num_cols

(Index(['person_home_ownership', 'loan_intent', 'loan_grade',
        'cb_person_default_on_file'],
       dtype='object'),
 Index(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
        'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'],
       dtype='object'))

In [9]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [19]:
from sklearn.impute import SimpleImputer
# Numeric features: fill gaps with the column median, then standardise.
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent level, then one-hot
# encode; levels unseen during fitting are ignored instead of raising.
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

In [24]:
# Route each column group through its own sub-pipeline and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols),
])

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights inversely proportional to class
# frequency, and a raised iteration cap for the solver.
model = LogisticRegression(class_weight="balanced", max_iter=1000)

# End-to-end estimator: preprocessing followed by the classifier, so the same
# transformations are applied at fit time and at prediction time.
clf = Pipeline([
    ("preprocess", preprocessor),
    ("model", model),
])

In [26]:
# Fit the preprocessing steps and the classifier on the training partition only.
clf.fit(X_train, y_train)

In [28]:
# Count missing values per column.
# NOTE(review): this check runs after the model was already fitted; the gaps it
# reports in numeric columns are filled by the median imputer in num_pipeline.
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [29]:
from sklearn.metrics import classification_report, roc_auc_score

# Probability of the positive class (loan_status == 1) for each held-out row,
# plus the hard labels produced by the estimator's default decision rule.
y_prob = clf.predict_proba(X_test)[:, 1]
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1, followed by the threshold-independent ROC AUC.
print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.93      0.82      0.87      5095
           1       0.55      0.78      0.65      1422

    accuracy                           0.81      6517
   macro avg       0.74      0.80      0.76      6517
weighted avg       0.85      0.81      0.82      6517

ROC AUC: 0.8711793780339513


In [30]:
# Coefficients of the fitted logistic regression (single row for the binary
# problem) and the matching post-preprocessing feature names.
coefficients = clf.named_steps["model"].coef_[0]
feature_names = clf.named_steps["preprocess"].get_feature_names_out()

# One row per transformed feature, ordered from the most positive coefficient
# (pushes towards default) to the most negative.
coef_df = pd.DataFrame({"feature": feature_names, "coef": coefficients})
coef_df = coef_df.sort_values(by="coef", ascending=False)

coef_df.head(n=10), coef_df.tail(n=10)

(                               feature      coef
 23                   cat__loan_grade_G  2.602140
 5             num__loan_percent_income  1.367689
 22                   cat__loan_grade_F  1.217945
 10     cat__person_home_ownership_RENT  0.904510
 21                   cat__loan_grade_E  0.872200
 13    cat__loan_intent_HOMEIMPROVEMENT  0.603599
 20                   cat__loan_grade_D  0.576376
 8     cat__person_home_ownership_OTHER  0.429990
 11  cat__loan_intent_DEBTCONSOLIDATION  0.346778
 14            cat__loan_intent_MEDICAL  0.266355,
                            feature      coef
 2           num__person_emp_length -0.001810
 0                  num__person_age -0.020314
 15       cat__loan_intent_PERSONAL -0.052278
 12      cat__loan_intent_EDUCATION -0.268647
 3                   num__loan_amnt -0.650635
 16        cat__loan_intent_VENTURE -0.691132
 9   cat__person_home_ownership_OWN -1.365161
 19               cat__loan_grade_C -1.471971
 18               cat__loan_grade_B

In [31]:
# Score every row with the fitted pipeline; column 1 of predict_proba is the
# probability of loan_status == 1.
# NOTE(review): X includes the rows the model was trained on, so these scores
# are not out-of-sample estimates.
# NOTE(review): this adds a column to df in place; re-running the cell that
# derives X from df afterwards would pick the new column up as a feature.
df["default_probability"] = clf.predict_proba(X)[:, 1]
df[["default_probability"]].head()

Unnamed: 0,default_probability
0,0.992052
1,0.046437
2,0.993134
3,0.907576
4,0.924378


In [32]:
# Bucket each predicted default probability into a named risk band:
#   [0, 0.3]   -> "Low Risk"
#   (0.3, 0.6] -> "Medium Risk"
#   (0.6, 1.0] -> "High Risk"
# include_lowest=True closes the first interval on the left. Without it a
# probability of exactly 0.0 falls outside every bin and becomes NaN, silently
# dropping that row from the segmentation.
df["risk_segment"] = pd.cut(
    df["default_probability"],
    bins=[0, 0.3, 0.6, 1.0],
    labels=["Low Risk", "Medium Risk", "High Risk"],
    include_lowest=True,
)

df["risk_segment"].value_counts()

risk_segment
Low Risk       16744
High Risk       8144
Medium Risk     7693
Name: count, dtype: int64

In [33]:
# Positive-class probabilities on the held-out set, used by the threshold sweep.
# NOTE(review): same expression as in the earlier evaluation cell; kept so this
# section can be run on its own once clf and X_test exist.
y_prob = clf.predict_proba(X_test)[:, 1]

In [34]:
import numpy as np
from sklearn.metrics import confusion_matrix

# Candidate decision thresholds: 99 evenly spaced values from 0.01 to 0.99.
thresholds = np.linspace(0.01, 0.99, 99)

C_FN = 10   # cost of a false negative: missing a borrower who defaults
C_FP = 1    # cost of a false positive: rejecting a sound borrower

# Receives one total cost per candidate threshold.
costs = []

In [35]:
# Sweep every candidate threshold and record the total misclassification cost.
# The list is rebuilt from scratch here so the cell is idempotent: re-running it
# never appends to results left over from a previous run, which would make
# len(costs) exceed len(thresholds) and let argmin point past the threshold grid.
costs = []

for t in thresholds:
    # Predict "default" whenever the estimated probability reaches the threshold.
    y_pred_t = (y_prob >= t).astype(int)
    # labels=[0, 1] pins the matrix to 2x2 so the four-way unpacking stays valid
    # even if only one class shows up among the labels and predictions.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_t, labels=[0, 1]).ravel()
    # Missed defaults are weighted by C_FN, wrongly rejected applicants by C_FP.
    cost = C_FN * fn + C_FP * fp
    costs.append(cost)

In [36]:
# Threshold with the lowest total cost. np.argmin returns the first minimum, so
# ties resolve to the smallest threshold.
best_t_cost = thresholds[np.argmin(costs)]
best_t_cost

0.37

In [37]:
from sklearn.metrics import classification_report

# Hard labels at the cost-minimising threshold (1 = predicted default).
y_pred_best = (y_prob >= best_t_cost).astype(int)

# Confusion matrix rows are true classes, columns are predicted classes.
print("Best threshold:", best_t_cost)
print(confusion_matrix(y_test, y_pred_best))
print(classification_report(y_test, y_pred_best))

Best threshold: 0.37
[[3647 1448]
 [ 209 1213]]
              precision    recall  f1-score   support

           0       0.95      0.72      0.81      5095
           1       0.46      0.85      0.59      1422

    accuracy                           0.75      6517
   macro avg       0.70      0.78      0.70      6517
weighted avg       0.84      0.75      0.77      6517



In [38]:
# NOTE(review): overwrites the computed threshold with a literal copied from the
# output above; it will go stale if the data, model, or cost weights change.
best_t_cost = 0.37

In [40]:
from sklearn.metrics import confusion_matrix

# Confusion matrix at the chosen threshold (same values as printed earlier).
confusion_matrix(y_test, y_pred_best)

array([[3647, 1448],
       [ 209, 1213]])

In [41]:
# Positive-class precision and recall at the chosen threshold.
# NOTE(review): these are rounded literals transcribed from the classification
# report above, not computed values; they will not track changes to the model.
precision = 0.46
recall    = 0.85

In [42]:
# Default rejection cut-off; matches the cost-minimising threshold (0.37)
# reported by the threshold sweep earlier in this notebook.
THRESHOLD = 0.37

def credit_decision(applicant_dict, clf, threshold=None):
    """Score a single applicant and turn the score into an approve/reject decision.

    Parameters
    ----------
    applicant_dict : dict
        Mapping of feature name to value for one applicant. It is wrapped in a
        one-row DataFrame and passed to ``clf`` unchanged.
    clf : estimator
        Fitted object exposing ``predict_proba``; column 1 of its output is
        read as the probability of default.
    threshold : float, optional
        Probability at or above which the applicant is rejected. When omitted
        (``None``) the module-level ``THRESHOLD`` is used, which is the
        behaviour of the original two-argument call.

    Returns
    -------
    dict
        ``default_probability`` (rounded to 3 decimals), the ``threshold`` that
        was applied, and ``decision`` (``"REJECT"`` or ``"APPROVE"``).

    Raises
    ------
    ValueError
        If an explicit ``threshold`` lies outside ``[0, 1]``.
    """
    import pandas as pd

    if threshold is None:
        # Read the global at call time so later changes to THRESHOLD still apply.
        cutoff = THRESHOLD
    else:
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"threshold must be within [0, 1], got {threshold!r}")
        cutoff = threshold

    X_new = pd.DataFrame([applicant_dict])
    # Row 0 is the single applicant; column 1 is the positive (default) class.
    p_default = float(clf.predict_proba(X_new)[0, 1])

    # The comparison uses the unrounded probability; rounding is display-only.
    decision = "REJECT" if p_default >= cutoff else "APPROVE"

    return {
        "default_probability": round(p_default, 3),
        "threshold": cutoff,
        "decision": decision
    }

In [43]:
import joblib
# Persist the fitted pipeline (preprocessing + model) to disk.
# NOTE(review): only clf is stored; the decision threshold is not part of this
# file. joblib files are pickle-based, so load them only from trusted sources.
joblib.dump(clf, "credit_risk_pipeline.joblib")

['credit_risk_pipeline.joblib']