In [1]:
# Upload file "hmeq.csv" to Google Collab
from google.colab import files
uploaded = files.upload()

Saving hmeq.csv to hmeq.csv


Bắt đầu xử lý dữ liệu và lập các Mô hình chấm điểm / Starting to EDA and then creating Scoring Models

In [None]:
# I implement this code and models through learning from relevent posts on Kaggle. Dataset is download from Kaggle, too.
# ==============================================================
# HMEQ Credit Scoring on Google Colab
# Logistic Regression + XGBoost + LightGBM
# Full EDA → WoE/IV → 10-Fold CV → Markdown + PDF Report
# ==============================================================

# Install packages
!apt-get -y install wkhtmltopdf
!pip install pandas numpy scikit-learn xgboost lightgbm statsmodels seaborn matplotlib markdown pdfkit --quiet

# ==============================================================
# Imports
# ==============================================================

import os, io, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import xgboost as xgb
import lightgbm as lgb

import markdown, pdfkit
from google.colab import files

# ==============================================================
# Load dataset
# ==============================================================

DATA_PATH = "hmeq.csv"
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError("Please upload 'hmeq.csv' to Colab first.")

df = pd.read_csv(DATA_PATH)
target = "BAD"

report_lines = []
def md(text): report_lines.append(text)
def section(en, vi): md(f"\n## {en} / {vi}\n")
def note(en, vi): md(f"**Note:** {en}\n\n**Ghi chú:** {vi}\n")
def save_fig(path): plt.tight_layout(); plt.savefig(path, dpi=150, bbox_inches="tight"); plt.close()

md("# Credit Scoring Report (HMEQ)\n")
md(f"**Rows:** {df.shape[0]:,}, **Columns:** {df.shape[1]}, **Target:** `{target}` (1 = bad, 0 = good)\n")

# ==============================================================
# EDA
# ==============================================================

section("Exploratory Data Analysis", "Phân tích khám phá dữ liệu")
md("### Dataset head")
md("```text\n" + df.head(10).to_string(index=False) + "\n```")
buf = io.StringIO(); df.info(buf=buf); md("```text\n" + buf.getvalue() + "\n```")

section("Descriptive Statistics", "Thống kê mô tả")
desc = df.describe(include="all").transpose()
md("```text\n" + desc.to_string() + "\n```")
md("### Target distribution")
md("```text\n" + df[target].value_counts(dropna=False).to_string() + "\n```")

# ==============================================================
# Missing values
# ==============================================================

section("Missing Values", "Giá trị khuyết")
miss = df.isna().mean().sort_values(ascending=False)
md("```text\n" + miss.to_string() + "\n```")

# ==============================================================
# Correlation & Multicollinearity
# ==============================================================

section("Correlation & Multicollinearity", "Tương quan & Đa cộng tuyến")
num_cols_all = df.select_dtypes(include=[np.number]).columns.tolist()
if target in num_cols_all: num_cols_all.remove(target)

corr = df[num_cols_all + [target]].corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlation Heatmap")
save_fig("corr_heatmap.png")
md("![Correlation](corr_heatmap.png)")

num_imp = df[num_cols_all].copy().fillna(df[num_cols_all].median())
vif = [(c, variance_inflation_factor(num_imp.values, i)) for i,c in enumerate(num_imp.columns)]
vif_df = pd.DataFrame(vif, columns=["Variable","VIF"]).sort_values("VIF", ascending=False)
md("```text\n" + vif_df.to_string(index=False) + "\n```")

# ==============================================================
# WoE / IV + Feature selection
# ==============================================================

section("WoE & IV + Feature Selection", "WoE & IV + Chọn biến")

def compute_woe_iv(_df, feature, y_col, bins=10):
    temp = _df[[feature, y_col]].copy()
    s = temp[feature].astype(object)
    s[s.isna()] = "__MISSING__"
    if pd.api.types.is_numeric_dtype(_df[feature]):
        sub = s[s!="__MISSING__"].astype(float)
        try: b = pd.qcut(sub, q=bins, duplicates="drop").astype(str)
        except: b = pd.cut(sub, bins, duplicates="drop").astype(str)
        s[:] = "__MISSING__"; s.loc[b.index]=b
    temp["bin"] = s
    g = temp.groupby("bin")[y_col].agg(["count","sum"]).rename(columns={"sum":"bad"})
    g["good"] = g["count"] - g["bad"]
    g["%bad"] = g["bad"]/g["bad"].sum(); g["%good"] = g["good"]/g["good"].sum()
    g["WoE"] = np.log((g["%good"]+1e-6)/(g["%bad"]+1e-6))
    g["IV"] = (g["%good"]-g["%bad"])*g["WoE"]
    return g.reset_index(), g["IV"].sum()

iv_map={}
for col in df.columns:
    if col==target: continue
    try: _,iv=compute_woe_iv(df,col,target); iv_map[col]=iv
    except: pass

iv_df = pd.DataFrame({"Variable":iv_map.keys(),"IV":iv_map.values()}).sort_values("IV",ascending=False)
IV_THRESH=0.1
selected = iv_df.loc[iv_df["IV"]>=IV_THRESH,"Variable"].tolist() or iv_df.head(8)["Variable"].tolist()
md("```text\n" + iv_df.to_string(index=False) + "\n```")
md(f"**Selected features:** {', '.join(selected)}")

# ==============================================================
# Logistic (WoE) analysis
# ==============================================================

def woe_transform(_df, y_col, feats, bins=10):
    Xw=pd.DataFrame(index=_df.index)
    for col in feats:
        tab,_=compute_woe_iv(_df,col,y_col,bins)
        wmap=dict(zip(tab["bin"].astype(str),tab["WoE"]))
        s=_df[col].astype(object); s[s.isna()]="__MISSING__"
        if pd.api.types.is_numeric_dtype(_df[col]):
            sub=s[s!="__MISSING__"].astype(float)
            try:b=pd.qcut(sub,q=bins,duplicates="drop").astype(str)
            except:b=pd.cut(sub,bins,duplicates="drop").astype(str)
            s[:]="__MISSING__"; s.loc[b.index]=b
        Xw[col+"_WOE"]=s.map(lambda x:wmap.get(str(x),0.0))
    return Xw

X_woe = woe_transform(df, target, selected)
y = df[target].values
Xsm = sm.add_constant(X_woe)
logit = sm.Logit(y, Xsm, missing="drop").fit(disp=False)
coef = logit.summary2().tables[1]
ci95 = logit.conf_int(alpha=0.05); ci95.columns=["CI95_low","CI95_high"]
coef_tbl = coef.join(ci95)
md("```text\n" + coef_tbl.to_string() + "\n```")

# ==============================================================
# Model training + 10-Fold CV
# ==============================================================

section("Model Training + Cross Validation", "Huấn luyện mô hình + Cross Validation")

X = df.drop(columns=[target]); y = df[target]
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=["object","category"]).columns.tolist()

num_tf = Pipeline([("imp",SimpleImputer(strategy="median")),("sc",StandardScaler())])
cat_tf = Pipeline([("imp",SimpleImputer(strategy="constant",fill_value="MISSING")),
                   ("oh",OneHotEncoder(handle_unknown="ignore",sparse_output=False))])
pre = ColumnTransformer([("num",num_tf,num_cols),("cat",cat_tf,cat_cols)])
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,test_size=0.25,random_state=42)
cv10 = StratifiedKFold(n_splits=10,shuffle=True,random_state=42)

models=[
 ("Logistic",Pipeline([("pre",pre),("clf",LogisticRegression(max_iter=1000,class_weight="balanced"))])),
 ("XGBoost",Pipeline([("pre",pre),("xgb",xgb.XGBClassifier(
   n_estimators=300,learning_rate=0.05,max_depth=4,subsample=0.8,colsample_bytree=0.8,
   eval_metric="auc",random_state=42,use_label_encoder=False))])),
 ("LightGBM",Pipeline([("pre",pre),("lgb",lgb.LGBMClassifier(
   n_estimators=500,learning_rate=0.05,max_depth=5,subsample=0.8,colsample_bytree=0.8,objective="binary"))]))
]

results=[]
for name,model in models:
    model.fit(X_train,y_train)
    ptrain=model.predict_proba(X_train)[:,1]; ptest=model.predict_proba(X_test)[:,1]
    auc_tr,auc_te=roc_auc_score(y_train,ptrain),roc_auc_score(y_test,ptest)
    cv_scores=cross_val_score(model,X,y,cv=cv10,scoring="roc_auc")
    cv_mean,cv_std=cv_scores.mean(),cv_scores.std()
    fpr,tpr,_=roc_curve(y_test,ptest); ks=np.max(tpr-fpr)
    results.append([name,auc_tr,auc_te,cv_mean,cv_std,ks,ptest])

# ==============================================================
# Visualization & Comparison
# ==============================================================

plt.figure(figsize=(7,5))
for name,_,_,_,_,_,ptest in results:
    fpr,tpr,_=roc_curve(y_test,ptest)
    plt.plot(fpr,tpr,label=f"{name} (AUC={roc_auc_score(y_test,ptest):.3f})")
plt.plot([0,1],[0,1],'--',color='gray')
plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title("ROC Curves")
plt.legend(); plt.grid(True)
save_fig("roc_curves.png")
md("![ROC Curves](roc_curves.png)")

labels=[r[0] for r in results]
auc_trs=[r[1] for r in results]; auc_tes=[r[2] for r in results]
cv_means=[r[3] for r in results]; ks_vals=[r[5] for r in results]
comp=pd.DataFrame({"Model":labels,"AUC_Train":auc_trs,"AUC_Test":auc_tes,"CV_Mean":cv_means,"KS":ks_vals})
md("```text\n" + comp.to_string(index=False) + "\n```")

# ==============================================================
# Export Markdown → PDF
# ==============================================================

from markdown import markdown as md_to_html
from weasyprint import HTML

report_path = "credit_scoring_report_colab.md"
pdf_path = "credit_scoring_report_colab.pdf"

# Dùng cùng CSS
with open("style.css", "r", encoding="utf-8") as f:
    css_text = f.read()

with open(report_path, "r", encoding="utf-8") as f:
    html_content = md_to_html(f.read(), extensions=["tables", "fenced_code"])

# Gói HTML chuẩn UTF-8
html = f"""
<!DOCTYPE html>
<html lang="vi">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>{css_text}</style>
</head>
<body>
{html_content}
</body>
</html>
"""

HTML(string=html, base_url=os.getcwd()).write_pdf(pdf_path)
print(f"PDF created successfully → {pdf_path}")

from google.colab import files
files.download(pdf_path)
files.download(report_path)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Number of positive: 1070, number of negative: 4294
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1456
[LightGBM] [Info] Number of data points in the train set: 5364, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.199478 -> initscore=-1.389560
[LightGBM] [Info] Start training from score -1.389560
[LightGBM] [Info] Number of positive: 1070, number of negative: 4294
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000595 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1454
[LightGBM] [Info] Number of data points in the trai

DEBUG:fontTools.ttLib.ttFont:Reading 'maxp' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'maxp' table
DEBUG:fontTools.subset.timer:Took 0.002s to load 'maxp'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'maxp'
INFO:fontTools.subset:maxp pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'cmap' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'cmap' table
DEBUG:fontTools.ttLib.ttFont:Reading 'post' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'post' table
DEBUG:fontTools.subset.timer:Took 0.008s to load 'cmap'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'cmap'
INFO:fontTools.subset:cmap pruned
INFO:fontTools.subset:fpgm dropped
INFO:fontTools.subset:prep dropped
INFO:fontTools.subset:cvt  dropped
DEBUG:fontTools.subset.timer:Took 0.000s to load 'post'
DEBUG:fontTools.subset.timer:Took 0.000s to prune 'post'
INFO:fontTools.subset:post pruned
DEBUG:fontTools.ttLib.ttFont:Reading 'glyf' table from disk
DEBUG:fontTools.ttLib.ttFont:Decompiling 'glyf' tabl

PDF created successfully → credit_scoring_report_colab.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>