In [None]:
# Upload the file "hmeq.csv" to Google Colab
from google.colab import files
uploaded = files.upload()  # the dataset can be found and downloaded from Kaggle

Saving hmeq.csv to hmeq.csv


Bắt đầu xử lý dữ liệu và lập các Mô hình chấm điểm / Starting with EDA, then building the scoring models:

- EDA, xử lý missing / EDA, handling missing values

- WoE/IV + feature selection

- Logistic, XGBoost, LightGBM với/with regularization

- 10-Fold CV, Gini, Bootstrap AUC CI

- Phát hiện overfit / Overfit detection

- Báo cáo/report Markdown + PDF

==> XGBoost & LightGBM cho độ phân loại Good/Bad với Gini cao hơn hẳn Logistic Regression; nhưng Tree based models rất khó giải thích / XGBoost and LightGBM separate Good/Bad with a markedly higher Gini than Logistic Regression, but tree-based models are much harder to interpret.

In [None]:
# I implemented this code and these models by learning from relevant posts on Kaggle. Much appreciated!
# ==============================================================
# HMEQ data (hmeq.csv) is downloaded from Kaggle, incl. ~ 5.9K rows & 13 columns
# HMEQ Credit Scoring on Google Colab (NGUYEN DUY THAI)
# Logistic Regression + XGBoost + LightGBM (the two tree-based models use regularization and shallow depth to reduce overfitting; note that no early stopping is actually configured in the code below)
# Full EDA → WoE/IV → 10-Fold CV → ROC AUC GINI → Markdown + PDF Report
# ==============================================================

# Install dependencies: system libraries through apt-get, Python packages through pip.
# NOTE(review): wkhtmltopdf and pdfkit are installed here, but the code visible in this
# notebook renders the PDF with weasyprint only - confirm whether they are still needed.
# NOTE(review): no package versions are pinned, so a fresh run may pull different releases.
!apt-get -y install wkhtmltopdf libffi-dev libcairo2 libpango-1.0-0 \
                    libpangocairo-1.0-0 libgdk-pixbuf2.0-0 > /dev/null
!pip install pandas numpy scikit-learn statsmodels xgboost lightgbm \
             seaborn matplotlib markdown weasyprint pdfkit --quiet

# Imports libraries
# ==============================================================

import os, io, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
import xgboost as xgb, lightgbm as lgb
from markdown import markdown as md_to_html
from weasyprint import HTML
from google.colab import files

# ---- Helper functions
# In-memory buffer holding the Markdown report, one fragment per entry.
report_lines = []


def md(txt):
    """Append one raw Markdown fragment to the report buffer."""
    report_lines.append(txt)


def section(en, vi):
    """Emit a level-2 heading that shows the English and Vietnamese titles side by side."""
    heading = "\n## {} / {}\n".format(en, vi)
    md(heading)


def note(en, vi):
    """Emit a bilingual note: the English text first, then the Vietnamese text."""
    body = "**Note:** {}\n\n**Ghi chú:** {}\n".format(en, vi)
    md(body)


def save_fig(p):
    """Tighten the layout of the current matplotlib figure, write it to path `p`, then close it."""
    plt.tight_layout()
    plt.savefig(p, dpi=150, bbox_inches="tight")
    plt.close()

# ==============================================================
# Load dataset
# ==============================================================
DATA_PATH = "hmeq.csv"

# Stop early with an actionable message when the CSV has not been uploaded yet.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError("Upload hmeq.csv to /content first.")

df = pd.read_csv(DATA_PATH)
target = "BAD"

# Report title followed by the dimensions of the loaded frame and the target column name.
md(f"# Credit Scoring Report (HMEQ)\nRows = {df.shape[0]}, Cols = {df.shape[1]}, Target = {target}\n")

# ==============================================================
# EDA & Missing Values
# ==============================================================
section("Exploratory Data Analysis","Phân tích dữ liệu")

# First rows of the raw frame, rendered as a fixed-width text block.
head_text = df.head().to_string()
md("```text\n"+head_text+"\n```")

# Fraction of missing values per column, largest first.
miss = df.isna().mean().sort_values(ascending=False)
md("### Missing Values / Giá trị khuyết")
md("```text\n"+miss.to_string()+"\n```")

# ==============================================================
# Correlation & VIF
# ==============================================================
# Numeric predictors only: the target column is dropped from the list of numeric columns.
num = df.select_dtypes(include=[np.number]).columns.drop(target)

# Pairwise correlation of the numeric predictors plus the target, saved as a heatmap.
corr = df[num.tolist() + [target]].corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlation")
save_fig("corr.png")
md("![corr](corr.png)")

# Median-impute the numeric columns so the VIF regressions see no NaN.
num_imp = df[num].fillna(df[num].median())

# variance_inflation_factor regresses one column on the remaining ones with OLS and does
# not add an intercept by itself; without a constant column the R^2 is uncentred and the
# VIFs come out inflated. Prepend an explicit constant and skip it (position 0) when
# collecting the per-variable values.
vif_exog = sm.add_constant(num_imp, has_constant="add")
vif = pd.DataFrame({
    "Var": num,
    "VIF": [variance_inflation_factor(vif_exog.values, i + 1) for i in range(len(num))],
})
md("```text\n"+vif.to_string(index=False)+"\n```")

# ==============================================================
# WoE & IV + Feature Selection
# ==============================================================
# section(en, vi): the English title goes first and the Vietnamese title second; the
# original call had the two languages mixed up across the arguments.
section("WOE & IV Feature Selection","Chọn biến theo WOE & IV")
def woe_iv(_df,feat,y,bins=10):
    """Build the Weight-of-Evidence table and Information Value of one feature.

    Missing values are placed in their own bin labelled "_MISS_". Numeric features
    are quantile-binned with ``pd.qcut`` and fall back to equal-width ``pd.cut``
    when that fails; non-numeric features use their raw values as bin labels.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame containing the columns ``feat`` and ``y``. It is not modified.
    feat : str
        Name of the feature column.
    y : str
        Name of the target column. Its per-bin sum is used as the "bad" count and
        ``count - sum`` as the "good" count.
    bins : int, default 10
        Number of bins requested for numeric features.

    Returns
    -------
    tuple
        ``(table, iv)`` where ``table`` has one row per bin with the columns
        ``bin, count, sum, good, %bad, %good, WoE, IV`` and ``iv`` is the sum of
        the per-bin ``IV`` column.
    """
    tmp = _df[[feat, y]].copy()
    s = tmp[feat].astype(object)
    missing = s.isna()
    s[missing] = "_MISS_"

    if pd.api.types.is_numeric_dtype(_df[feat]):
        sub = s[~missing].astype(float)
        try:
            b = pd.qcut(sub, q=bins, duplicates="drop").astype(str)
        except Exception:
            # Best-effort fallback to equal-width bins. `Exception` (not a bare
            # except) so KeyboardInterrupt / SystemExit are never swallowed here.
            b = pd.cut(sub, bins, duplicates="drop").astype(str)
        s[~missing] = b

    tmp["bin"] = s
    g = tmp.groupby("bin")[y].agg(["count", "sum"])
    g["good"] = g["count"] - g["sum"]
    g["%bad"] = g["sum"] / g["sum"].sum()
    g["%good"] = g["good"] / g["good"].sum()
    # The 1e-6 offsets keep the log finite when a bin holds no goods or no bads.
    g["WoE"] = np.log((g["%good"] + 1e-6) / (g["%bad"] + 1e-6))
    g["IV"] = (g["%good"] - g["%bad"]) * g["WoE"]
    return g.reset_index(), g["IV"].sum()
# Information Value of every predictor. A column whose WoE table cannot be built is
# skipped so the scan still completes, but it is recorded and printed instead of being
# dropped silently.
ivmap = {}
iv_skipped = {}
for c in df.columns:
    if c == target:
        continue
    try:
        _, iv = woe_iv(df, c, target)
    except Exception as exc:
        iv_skipped[c] = repr(exc)
        continue
    ivmap[c] = iv
if iv_skipped:
    print("IV could not be computed for:", iv_skipped)

ivdf = pd.DataFrame(list(ivmap.items()), columns=["Var", "IV"]).sort_values("IV", ascending=False)

# Keep the variables with IV >= 0.1; when none qualifies, fall back to the 8 highest-IV ones.
sel = ivdf.query("IV>=0.1")["Var"].tolist() or ivdf.head(8)["Var"].tolist()
md("```text\n"+ivdf.to_string(index=False)+"\n```")
md(f"**Selected variables ({len(sel)}):** {', '.join(sel)}")

# ==============================================================
# Logistic Regression (WoE input)
# ==============================================================
def woe_transform(_df,vars,bins=10):
    """Replace each listed variable by the WoE value of the bin it falls into.

    For every variable the WoE table is obtained from ``woe_iv`` on ``_df`` itself,
    the column is re-binned with the same rules (missing -> "_MISS_", numeric ->
    ``pd.qcut`` with a ``pd.cut`` fallback), and each bin label is mapped to its WoE.
    Labels that are absent from the table map to 0.

    Note: the target column name is read from the module-level ``target`` variable.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame holding the variables and the target column.
    vars : iterable of str
        Names of the columns to encode.
    bins : int, default 10
        Number of bins requested for numeric variables.

    Returns
    -------
    pandas.DataFrame
        Frame indexed like ``_df`` with one ``<name>_WOE`` column per variable.
    """
    X = pd.DataFrame(index=_df.index)
    for v in vars:
        tab, _ = woe_iv(_df, v, target, bins)
        m = dict(zip(tab["bin"].astype(str), tab["WoE"]))

        s = _df[v].astype(object)
        missing = s.isna()
        s[missing] = "_MISS_"
        if pd.api.types.is_numeric_dtype(_df[v]):
            sub = s[~missing].astype(float)
            try:
                b = pd.qcut(sub, q=bins, duplicates="drop").astype(str)
            except Exception:
                # Best-effort fallback to equal-width bins. `Exception` (not a bare
                # except) so KeyboardInterrupt / SystemExit are never swallowed here.
                b = pd.cut(sub, bins, duplicates="drop").astype(str)
            s[~missing] = b

        X[v + "_WOE"] = s.map(lambda x: m.get(str(x), 0))
    return X
# WoE-encode the selected variables and take the target as a plain array.
Xwoe = woe_transform(df, sel)
y = df[target].values

# Add the constant column, fit the statsmodels Logit quietly and keep the coefficient table.
Xsm = sm.add_constant(Xwoe)
logit = sm.Logit(y, Xsm, missing="drop").fit(disp=False)
coef = logit.summary2().tables[1]
md("```text\n"+coef.to_string()+"\n```")

# ==============================================================
# Tree Models + Regularization (to reduce overfitting; early stopping is not configured here)
# ==============================================================
section("Model Training","Huấn luyện mô hình")

# Predictors and target; from here on `y` is a pandas Series.
X = df.drop(columns=[target])
y = df[target]
numcols = X.select_dtypes(include=[np.number]).columns.tolist()
catcols = X.select_dtypes(exclude=[np.number]).columns.tolist()

# Numeric columns: median imputation, then standardisation.
numtf = Pipeline([
    ("imp", SimpleImputer(strategy="median")),
    ("sc", StandardScaler()),
])
# Non-numeric columns: constant "MISSING" imputation, then dense one-hot encoding.
cattf = Pipeline([
    ("imp", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("oh", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])
pre = ColumnTransformer([
    ("num", numtf, numcols),
    ("cat", cattf, catcols),
])

# Stratified 75/25 hold-out split and the 10-fold splitter used for cross-validation.
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
cv10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# (name, pipeline) pairs. All three pipelines are given the same `pre` object.
models = [
    ("Logistic", Pipeline([
        ("pre", pre),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ])),
    ("XGBoost", Pipeline([
        ("pre", pre),
        ("xgb", xgb.XGBClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=4,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=5,
            reg_alpha=0.5,
            eval_metric="auc",
            use_label_encoder=False,
            random_state=42,
        )),
    ])),
    ("LightGBM", Pipeline([
        ("pre", pre),
        ("lgb", lgb.LGBMClassifier(
            n_estimators=400,
            learning_rate=0.05,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=5,
            reg_alpha=0.5,
            objective="binary",
            random_state=42,
        )),
    ])),
]
# One entry per model: [name, train AUC, test AUC, CV mean, CV std, KS, test probabilities].
results = []
for n, m in models:
    # Fit on the training split, then score both splits with the positive-class probability.
    m.fit(Xtr, ytr)
    p_tr = m.predict_proba(Xtr)[:, 1]
    p_te = m.predict_proba(Xte)[:, 1]
    auc_tr = roc_auc_score(ytr, p_tr)
    auc_te = roc_auc_score(yte, p_te)

    # 10-fold cross-validated AUC, computed on the complete X / y.
    cv_scores = cross_val_score(m, X, y, cv=cv10, scoring="roc_auc")

    # KS: the largest gap between TPR and FPR along the test ROC curve.
    fpr, tpr, _ = roc_curve(yte, p_te)
    ks = (tpr - fpr).max()

    results.append([n, auc_tr, auc_te, cv_scores.mean(), cv_scores.std(), ks, p_te])

# ==============================================================
# Gini + Overfit Diagnostic
# ==============================================================
def gini(a):
    """Return the Gini coefficient for an AUC value `a`, i.e. 2*a - 1."""
    doubled = 2 * a
    return doubled - 1
# Per-model table: AUC and Gini on both splits, the CV mean, and KS.
rows = [
    [n, auc_tr, auc_te, gini(auc_tr), gini(auc_te), cv_m, ks]
    for n, auc_tr, auc_te, cv_m, cv_s, ks, pte in results
]
ginidf = pd.DataFrame(
    rows,
    columns=["Model", "AUC_tr", "AUC_te", "Gini_tr", "Gini_te", "CV_mean", "KS"],
)
md("## Gini and Overfitting diagnostics")
md("```text\n"+ginidf.to_string(index=False)+"\n```")

md("### Overfit check")
for n, auc_tr, auc_te, cv_m, cv_s, ks, _ in results:
    # A train-minus-test AUC gap above 0.05 is reported as possible overfitting.
    d = auc_tr - auc_te
    md(f"- {n}: ΔAUC={d:.3f} → Possible overfit" if d > 0.05 else f"- {n}: ΔAUC={d:.3f} → OK")

# ==============================================================
# Bootstrap AUC CI
# ==============================================================
from sklearn.metrics import roc_auc_score
def boot_auc(y,p,n=1000,seed=42):
    """Bootstrap the ROC AUC of scores `p` against labels `y`.

    Draws `n` resamples (with replacement, same size as the input) from a generator
    seeded with `seed`, computes the AUC of each, and summarises the distribution.

    Parameters
    ----------
    y : array-like
        True labels. Converted with ``np.asarray`` so that a pandas Series is
        indexed by position rather than by label.
    p : array-like
        Predicted scores, aligned with `y` by position.
    n : int, default 1000
        Number of bootstrap resamples.
    seed : int, default 42
        Seed of the random generator.

    Returns
    -------
    tuple
        ``(mean_auc, ci)`` where ``ci`` is a length-2 array holding the 2.5th and
        97.5th percentiles. Both are NaN when no resample produced a usable AUC
        (for example on empty input).
    """
    y = np.asarray(y)
    p = np.asarray(p)
    n_ = len(y)
    undefined = (np.nan, np.array([np.nan, np.nan]))
    if n_ == 0:
        return undefined

    rng = np.random.default_rng(seed)
    arr = []
    for _ in range(n):
        idx = rng.integers(0, n_, n_)
        try:
            score = roc_auc_score(y[idx], p[idx])
        except ValueError:
            # AUC is undefined for this resample (e.g. only one class was drawn);
            # skip it instead of hiding every possible error behind a bare except.
            continue
        # Defensive: also drop a non-finite score so it cannot poison the summary.
        if np.isfinite(score):
            arr.append(score)

    if not arr:
        return undefined
    arr = np.array(arr)
    return np.mean(arr), np.percentile(arr, [2.5, 97.5])
# Bootstrap the test-set AUC of every model and tabulate the mean, CI bounds and CI width.
boots = []
for name, auc_train, auc_test, _cv_mean, _cv_std, _ks, proba in results:
    boot_mean, ci = boot_auc(yte.values, proba)
    ci_low, ci_high = ci
    boots.append([name, auc_train, auc_test, boot_mean, ci_low, ci_high, ci_high - ci_low])

bootdf = pd.DataFrame(
    boots,
    columns=["Model", "AUC_tr", "AUC_te", "Boot_mean", "CI_low", "CI_high", "CI_width"],
)
md("## Bootstrap AUC Confidence Intervals")
md("```text\n"+bootdf.to_string(index=False)+"\n```")

# ==============================================================
# Export Markdown & PDF
# ==============================================================
report_path = "credit_scoring_report_colab.md"
pdf_path = "credit_scoring_report_colab.pdf"

# 1) Dump the accumulated Markdown fragments to disk.
with open(report_path, "w", encoding="utf-8") as f:
    f.write("\n".join(report_lines))
print("Markdown ready:",report_path)

# 2) Stylesheet: written to style.css and also embedded inline in the HTML below.
css="""
body{font-family:'DejaVu Sans',sans-serif;margin:20mm;font-size:11pt;line-height:1.5;color:#222;}
h1,h2,h3{color:#0f3b7a;}
table{border-collapse:collapse;width:100%;font-size:10pt;table-layout:fixed;word-wrap:break-word;}
th,td{border:1px solid #aaa;padding:6px;}th{background:#f0f0f0;}
img{display:block;margin:12px auto;max-width:95%;}
code,pre{font-family:'DejaVu Sans Mono',monospace;font-size:9pt;}
"""
with open("style.css", "w", encoding="utf-8") as f:
    f.write(css)

# 3) Markdown -> HTML body, wrapped in a minimal page that carries the CSS.
with open(report_path, "r", encoding="utf-8") as f:
    markdown_text = f.read()
html_body = md_to_html(markdown_text, extensions=["tables", "fenced_code"])
html=f"""<!DOCTYPE html><html lang="vi"><head><meta charset="UTF-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<style>{css}</style></head><body>{html_body}</body></html>"""

# 4) HTML -> PDF; the working directory is the base for relative URLs such as corr.png.
HTML(string=html, base_url=os.getcwd()).write_pdf(pdf_path)

print("PDF created:",pdf_path)

# 5) Download both artefacts through the Colab files helper.
files.download(report_path)
files.download(pdf_path)

[LightGBM] [Info] Number of positive: 892, number of negative: 3578
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000659 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1453
[LightGBM] [Info] Number of data points in the train set: 4470, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199553 -> initscore=-1.389093
[LightGBM] [Info] Start training from score -1.389093
[LightGBM] [Info] Number of positive: 1070, number of negative: 4294
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000509 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1455
[LightGBM] [Info] Number of data points in the train set: 5364, number of used features: 20
[LightGBM] [Info] [binary

DEBUG:fontTools.ttLib.ttFont:Reading 'maxp' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'maxp' table
DEBUG:fontTools.subset.timer:Took 0.002s to load 'maxp'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'maxp'
INFO:fontTools.subset:maxp pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'cmap' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'cmap' table
DEBUG:fontTools.ttLib.ttFont:Reading 'post' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'post' table
DEBUG:fontTools.subset.timer:Took 0.008s to load 'cmap'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'cmap'
INFO:fontTools.subset:cmap pruned
INFO:fontTools.subset:fpgm dropped
INFO:fontTools.subset:prep dropped
INFO:fontTools.subset:cvt  dropped
DEBUG:fontTools.subset.timer:Took 0.000s to load 'post'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'post'
INFO:fontTools.subset:post pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'glyf' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'glyf' tabl

PDF created: credit_scoring_report_colab.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>