# LightGBM for 血栓 Reduction Prediction

## Task
- Binary classification
- Target: 血栓 reduction (0 = 无, 1 = 有)

## Model
- LightGBM Classifier (lightgbm)

## Metrics
- Accuracy
- Precision
- Recall
- F1-score
- ROC-AUC
- PR-AUC

## Notes
This notebook serves as the comparison model.

In [10]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict, StratifiedKFold, LeaveOneOut
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    roc_curve,
    precision_recall_curve
)

plt.rcParams["figure.dpi"] = 120
sns.set_style("whitegrid")

In [11]:
RANDOM_STATE = 42
TEST_SIZE = 0.2

TARGET_COL = "动脉血栓（有1，无0）"
DATA_PATH = "../data/PLTdataset.xlsx"
SHEET_NAME = "血栓形成预测"

In [12]:
df = pd.read_excel(DATA_PATH, sheet_name=SHEET_NAME)

print("Data shape:", df.shape)
df.head()

Data shape: (35, 33)


Unnamed: 0.1,Unnamed: 0,动脉血栓（有1，无0）,"FHb(1,>40mg/L;0,<40mg/L)","FHb(1,>40mg/L;0,<40mg/L).1","TM(>5450,1;≤5450，0，正常范围5450±2140)pg/mL)","TM(>5450,1;≤5450，0，正常范围5450±2140)pg/mL).1","脾大（1,是，0否）","PLA2（0<659IU/L,1≥659IU/L）","PLA2（0<659IU/L,1≥659IU/L）.1","HGB（1<100mg/L,0≥100mg/L）",...,STO,CRP,FIB,D dimer,NO,LDH,CHO,LDL,Pla(脂蛋白a),SOD
0,N1,0,19.974734,0,1260.02129,0,0,697,1,84,...,25.2,30.08,2.13,8.26,28.5,237,4.28,1.88,0.55,72
1,N10,1,20.367691,0,3922.861245,0,0,838,1,131,...,18.0,10.02,2.3,0.39,49.0,273,3.17,1.39,0.1,45
2,N12,0,14.744621,0,728.80882,0,0,811,1,145,...,5.0,26.0,2.3,0.66,30.0,216,7.82,3.94,0.31,90
3,N13,0,15.565815,0,947.03733,0,0,730,1,136,...,5.0,16.31,2.42,0.73,35.5,125,3.48,1.66,0.15,133
4,N14,0,14.744621,0,629.04086,0,0,504,0,128,...,3.0,62.74,5.58,2.0,32.9,152,4.08,2.2,0.08,139


In [13]:
df.info()

df[TARGET_COL].value_counts(normalize=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 33 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Unnamed: 0                                 35 non-null     object 
 1   动脉血栓（有1，无0）                                35 non-null     int64  
 2   FHb(1,>40mg/L;0,<40mg/L)                   35 non-null     float64
 3   FHb(1,>40mg/L;0,<40mg/L).1                 35 non-null     int64  
 4   TM(>5450,1;≤5450，0，正常范围5450±2140)pg/mL)    35 non-null     float64
 5   TM(>5450,1;≤5450，0，正常范围5450±2140)pg/mL).1  35 non-null     int64  
 6   脾大（1,是，0否）                                 35 non-null     int64  
 7   PLA2（0<659IU/L,1≥659IU/L）                  35 non-null     int64  
 8   PLA2（0<659IU/L,1≥659IU/L）.1                35 non-null     int64  
 9    HGB（1<100mg/L,0≥100mg/L）                  35 non-null     int64  
 10   HGB（1<100mg/L,2≥100mg/L）   

动脉血栓（有1，无0）
0    0.8
1    0.2
Name: proportion, dtype: float64

In [14]:
id_col_name = df.columns[0]
DROP_COLS = [id_col_name, TARGET_COL, "FHb(1,>40mg/L;0,<40mg/L).1", "TM(>5450,1;≤5450，0，正常范围5450±2140)pg/mL).1", "PLA2（0<659IU/L,1≥659IU/L）.1", " HGB（1<100mg/L,2≥100mg/L）", "MCV(0<100fL,1≥100mfL).1", "RDW1(0<15%,1≥15%.1", "HDL（0≥0.8mmol/L,1<0.8mmol/L）.1"]
X = df.drop(columns=DROP_COLS)
y = df[TARGET_COL]

In [15]:
print("Missing values:")
print(X.isna().sum().sort_values(ascending=False))

X = X.dropna()
y = y.loc[X.index]

X = pd.get_dummies(X, drop_first=True)
print("Feature shape after encoding:", X.shape)

Missing values:
FHb(1,>40mg/L;0,<40mg/L)                   0
TM(>5450,1;≤5450，0，正常范围5450±2140)pg/mL)    0
Pla(脂蛋白a)                                  0
LDL                                        0
CHO                                        0
LDH                                        0
NO                                         0
D dimer                                    0
FIB                                        0
CRP                                        0
STO                                        0
NC                                         0
PLT count                                  0
WBC                                        0
Age                                        0
FVIII(0.<120;1,>120%).1                    0
FVIII(0.<120;1,>120%)                      0
HDL（0≥0.8mmol/L,1<0.8mmol/L）               0
RDW1(0<15%,1≥15%                           0
MCV(0<100fL,1≥100mfL)                      0
 HGB（1<100mg/L,0≥100mg/L）                  0
PLA2（0<659IU/L,1≥659IU/L）              

In [16]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X,
#     y,
#     test_size=TEST_SIZE,
#     random_state=RANDOM_STATE,
#     stratify=y
# )

# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# print("Train:", X_train.shape)
# print("Test:", X_test.shape)

In [17]:
# model = LGBMClassifier(
#     n_estimators=200,
#     learning_rate=0.05,
#     class_weight="balanced",
#     random_state=RANDOM_STATE
# )

In [18]:
# model.fit(X_train, y_train)

loo = LeaveOneOut()

lgbm_pipe = Pipeline([
    # ('selector', SelectKBest(f_classif, k=5)), # 强制只看最强的5个特征
    ('scaler', StandardScaler()),              # 必须标准化！
    ('lgbm', LGBMClassifier(class_weight='balanced', random_state=42))
])

param_grid_lgbm = {
    # 极度限制叶子数，试试 3 到 7
    'num_leaves': [3, 5, 7],
    # 树深也要限制
    'max_depth': [1, 2, 3],
    # 学习率
    'learning_rate': [0.01, 0.05, 0.1],
    # 非常重要！一个叶子上至少要有比如 10 个样本，防止它把个例当规律
    'min_child_samples': [10, 15] 
}
grid_search = GridSearchCV(
    estimator=LGBMClassifier(random_state=42),
    param_grid=param_grid_lgbm,
    cv=loo,            # 五折交叉验证
    scoring='roc_auc',    # 以 F1 分数作为评价标准
    n_jobs=-1
)
grid_search.fit(X, y)
model = grid_search.best_estimator_
print("经过LOOCV选出的最佳参数:", grid_search.best_params_)

[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not support special JSON characters in feature name.
[LightGBM] [Fatal] Do not

ValueError: 
All the 1890 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1890 fits failed with the following error:
Traceback (most recent call last):
  File "/root/miniconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 1560, in fit
    super().fit(
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/sklearn.py", line 1049, in fit
    self._Booster = train(
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 3656, in __init__
    train_set.construct()
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 2590, in construct
    self._lazy_init(
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 2227, in _lazy_init
    return self.set_feature_name(feature_name)
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 3046, in set_feature_name
    _safe_call(
  File "/root/miniconda3/lib/python3.8/site-packages/lightgbm/basic.py", line 313, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: Do not support special JSON characters in feature name.


In [None]:
# y_pred = model.predict(X_test)
# y_prob = model.predict_proba(X_test)[:, 1]

y_probas_loo = cross_val_predict(model, X, y, cv=loo, method='predict_proba')[:, 1]
y_preds_loo = (y_probas_loo > 0.5).astype(int)

In [None]:
metrics = {
    "accuracy": accuracy_score(y, y_preds_loo),
    "precision": precision_score(y, y_preds_loo),
    "recall": recall_score(y, y_preds_loo),
    "f1": f1_score(y, y_preds_loo),
    "roc_auc": roc_auc_score(y, y_probas_loo),
    "pr_auc": average_precision_score(y, y_probas_loo)
}

metrics_df = pd.DataFrame(metrics, index=["LightGBM"])
metrics_df

In [None]:
fpr, tpr, _ = roc_curve(y, y_probas_loo)

plt.figure()
plt.plot(fpr, tpr, label=f"AUC = {metrics['roc_auc']:.3f}")
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - LightGBM")
plt.legend()
plt.show()

In [None]:
precision, recall, _ = precision_recall_curve(y, y_probas_loo)

plt.figure()
plt.plot(recall, precision, label=f"AP = {metrics['pr_auc']:.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve - LightGBM")
plt.legend()
plt.show()

In [None]:
selector = model.named_steps['selector']  # 获取筛选器
lgbm_model = model.named_steps['lgbm']        # 获取LR模型

# A. 获取被选中的特征的名称
# get_support() 返回一个布尔掩码，告诉我们需要保留 X 中的哪些列
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]

# B. 获取系数 (绝对值代表重要性)
coefficients = np.abs(lgbm_model.coef_[0])

# C. 创建 Series 并排序
feat_imp = pd.Series(
    coefficients,
    index=selected_features  # 只使用被选中的特征名
).sort_values(ascending=False)

# D. 绘图
plt.figure(figsize=(6, 8))
# 只需要画出存在的特征（因为经过筛选，可能只有 5-10 个了）
sns.barplot(x=feat_imp, y=feat_imp.index)
plt.title(f"Feature Importances (Top {len(feat_imp)} Selected by LOOCV)")
plt.xlabel("Absolute Coefficient (Importance)")
plt.tight_layout()
plt.show()

# feat_imp = pd.Series(
#     model.feature_importances_,
#     index=X.columns
# ).sort_values(ascending=False)

# plt.figure(figsize=(6, 8))
# sns.barplot(x=feat_imp.head(20), y=feat_imp.head(20).index)
# plt.title("Top 20 Feature Importances (LightGBM)")
# plt.xlabel("Importance")
# plt.tight_layout()
# plt.show()

In [None]:
metrics_df.to_csv("../outputs/lgbm_metrics_03.csv")
feat_imp.to_csv("../outputs/lgbm_feature_importance_03.csv")

plt.savefig("../outputs/lgbm_feature_importance_03.png", dpi=300)

## Summary

- Random Forest baseline completed
- PR-AUC suggests model performance under class imbalance
- Top features identified for further analysis

Next steps:
- XGBoost
- LightGBM
- Threshold tuning