In [2]:
import lightgbm as lgbm
import pandas as pd


In [19]:
red_win_date=pd.read_csv(r'wine+quality/winequality-red.csv',sep=';')
white_win_date=pd.read_csv(r'wine+quality/winequality-white.csv',sep=';')

In [20]:
feature=red_win_date.columns

In [14]:
red_win_date['target']=1
white_win_date['target']=0

In [15]:
win_date=pd.concat([red_win_date,white_win_date])

In [28]:
win_date.reset_index(drop=True,inplace=True)

In [21]:
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.datasets import make_classification

# 1. 生成示例数据集
# X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# 2. 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(win_date[feature], win_date['target'], test_size=0.2, random_state=42)

# 3. 创建 LightGBM 数据集
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# 4. 设置模型参数
params = {
    'objective': 'binary',  # 二分类任务
    'metric': 'binary_logloss',  # 评估指标
    'boosting_type': 'gbdt',  # 梯度提升决策树
    'num_leaves': 31,  # 叶子节点数
    'learning_rate': 0.05,  # 学习率
    'feature_fraction': 0.9,  # 特征采样比例
    'bagging_fraction': 0.8,  # 数据采样比例
    'bagging_freq': 5,  # 每 5 次迭代进行一次 bagging
    'verbose': 0  # 不输出日志
}

# 5. 训练模型
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,  # 迭代次数
    valid_sets=[test_data],  # 验证集
    callbacks=[lgb.early_stopping(stopping_rounds=10)]  # 早停法
)

# 6. 预测
y_pred = model.predict(X_test, num_iteration=model.best_iteration)  # 预测概率
y_pred_class = np.round(y_pred)  # 将概率转换为类别（0 或 1）

# 7. 评估模型
accuracy = accuracy_score(y_test, y_pred_class)
auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"AUC: {auc:.4f}")

# 8. 保存模型
model.save_model('lightgbm_model.txt')

# 9. 加载模型（可选）
# loaded_model = lgb.Booster(model_file='lightgbm_model.txt')

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.0261059
Accuracy: 0.9946
AUC: 0.9980


<lightgbm.basic.Booster at 0x1c78de01060>

In [23]:
def _parse_tree(node, prefix):
    if 'split_index' in node:  # 非叶子节点
        feature = node['split_feature']
        threshold = node['threshold']
        print(f"{prefix} -> Split on feature {feature} (threshold: {threshold:.4f})")
        _parse_tree(node['left_child'], prefix + " -> Left")
        _parse_tree(node['right_child'], prefix + " -> Right")
    else:  # 叶子节点
        print(f"{prefix} -> Leaf (prediction: {node['leaf_value']:.4f})")

# 6. 获取单一样本的预测路径（可选）
def get_prediction_path(model, sample):
    pred_path = []
    for tree in trees:
        node = tree['tree_structure']
        while 'split_index' in node:
            feature = node['split_feature']
            threshold = node['threshold']
            if sample[feature] <= threshold:
                node = node['left_child']
                pred_path.append(f"Feature {feature} <= {threshold:.4f}")
            else:
                node = node['right_child']
                pred_path.append(f"Feature {feature} > {threshold:.4f}")
        pred_path.append(f"Leaf (prediction: {node['leaf_value']:.4f})")
    return pred_path


In [31]:
# 3. 获取模型的所有树
trees = model.dump_model()['tree_info']

# 4. 遍历每棵树，提取特征组合和预测值
for tree_id, tree in enumerate(trees):
    print(f"\nTree {tree_id + 1}:")
    tree_structure = tree['tree_structure']
    _parse_tree(tree_structure, prefix="Root")

# 5. 定义递归函数解析树结构

# 7. 测试单一样本的预测路径



Tree 1:
Root -> Split on feature 6 (threshold: 65.5000)
Root -> Left -> Split on feature 4 (threshold: 0.0465)
Root -> Left -> Left -> Split on feature 8 (threshold: 3.2350)
Root -> Left -> Left -> Left -> Split on feature 0 (threshold: 7.4500)
Root -> Left -> Left -> Left -> Left -> Leaf (prediction: -1.2074)
Root -> Left -> Left -> Left -> Right -> Leaf (prediction: -1.2074)
Root -> Left -> Left -> Right -> Leaf (prediction: -1.1130)
Root -> Left -> Right -> Split on feature 7 (threshold: 0.9932)
Root -> Left -> Right -> Left -> Leaf (prediction: -1.0127)
Root -> Left -> Right -> Right -> Split on feature 4 (threshold: 0.0525)
Root -> Left -> Right -> Right -> Left -> Leaf (prediction: -0.9868)
Root -> Left -> Right -> Right -> Right -> Split on feature 9 (threshold: 0.4550)
Root -> Left -> Right -> Right -> Right -> Left -> Leaf (prediction: -0.9449)
Root -> Left -> Right -> Right -> Right -> Right -> Split on feature 1 (threshold: 0.7825)
Root -> Left -> Right -> Right -> Right ->

In [30]:
sample = win_date.loc[0,]
pred_path = get_prediction_path(model, sample)
print("\nPrediction path for a single sample:")
for step in pred_path:
    print(step)


Prediction path for a single sample:
Feature 6 <= 65.5000
Feature 4 > 0.0465
Feature 7 > 0.9932
Feature 4 > 0.0525
Feature 9 > 0.4550
Feature 1 <= 0.7825
Leaf (prediction: -0.9348)
Feature 6 <= 65.5000
Feature 7 > 0.9932
Feature 9 > 0.4350
Feature 7 > 0.9937
Feature 6 <= 54.5000
Leaf (prediction: 0.1777)
Feature 4 > 0.0615
Feature 6 <= 113.5000
Feature 7 > 0.9933
Feature 3 <= 6.9500
Feature 7 > 0.9938
Feature 1 > 0.2675
Feature 6 <= 100.5000
Feature 3 > 1.5250
Feature 6 <= 66.5000
Feature 9 > 0.4650
Leaf (prediction: 0.1567)
Feature 4 > 0.0615
Feature 6 <= 113.5000
Feature 7 > 0.9934
Feature 3 <= 6.9500
Feature 1 > 0.2675
Feature 3 > 1.5250
Feature 6 <= 81.5000
Feature 9 > 0.4550
Feature 6 <= 66.5000
Leaf (prediction: 0.1413)
Feature 4 > 0.0615
Feature 6 <= 113.5000
Feature 7 > 0.9929
Feature 3 <= 6.9500
Feature 7 > 0.9936
Feature 3 <= 5.4250
Feature 3 > 1.6250
Feature 5 <= 34.5000
Feature 6 <= 66.5000
Leaf (prediction: 0.1294)
Feature 6 <= 70.5000
Feature 4 > 0.0465
Feature 1 > 0.227

  if sample[feature] <= threshold:
