## 原生形式lightgbm(import lightgbm as lgb)

In [1]:
# pip install lightgbm

In [2]:
# 从sklearn 调入所需要的包
from sklearn import datasets
from sklearn.model_selection import train_test_split #数据分隔出训练集和验证集
import lightgbm as lgb
import numpy as np
import pandas as pd
#导入精度和召回
from sklearn.metrics import precision_score, recall_score
#导入鸢尾花数据
iris = datasets.load_iris()
data = iris.data
label = iris.target
print(pd.DataFrame(data).head())
print(pd.DataFrame(label).head())
data1 = pd.DataFrame(data)
## 花萼长宽花瓣长宽
data1.columns = ['sepal_l','sepal_w','petal_l','petal_w']
print(data1.head())
label1 =pd.DataFrame(label)
label1.columns=['label']
print(label1.head())
#注意这里data label顺序是一致的，千万别打乱

     0    1    2    3
0  5.1  3.5  1.4  0.2
1  4.9  3.0  1.4  0.2
2  4.7  3.2  1.3  0.2
3  4.6  3.1  1.5  0.2
4  5.0  3.6  1.4  0.2
   0
0  0
1  0
2  0
3  0
4  0
   sepal_l  sepal_w  petal_l  petal_w
0      5.1      3.5      1.4      0.2
1      4.9      3.0      1.4      0.2
2      4.7      3.2      1.3      0.2
3      4.6      3.1      1.5      0.2
4      5.0      3.6      1.4      0.2
   label
0      0
1      0
2      0
3      0
4      0


In [3]:
# 划分训练集和测试集
train_x, test_x, train_y, test_y = train_test_split(data1, label1, test_size=0.3, random_state=42)
print("训练集长度:", len(train_x))
print("测试集长度:", len(test_x))

训练集长度: 105
测试集长度: 45


## 1. 调参
        https://www.cnblogs.com/chenxiangzhen/p/10894306.html
        https://blog.csdn.net/CherDW/article/details/86517162
        https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api （官方文件）
        https://blog.csdn.net/u012735708/article/details/83749703
        https://blog.csdn.net/dzysunshine/article/details/92124011

In [4]:
# 转换为DMatrix数据格式
train_data = lgb.Dataset(train_x,train_y)
test_data = lgb.Dataset(test_x,test_y)
# 设置参数
lgb_params = {
   'boosting_type': 'gbdt',  
    'objective': 'multiclass',
    'metric': 'multi_error', 
    'verbose': 1 , # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
     'num_class':3 #lightgbm.basic.LightGBMError: b'Number of classes should be specified and greater than 1 for multiclass training'
    }

# 模型训练
clf = lgb.train(lgb_params,train_data,num_boost_round =10,
                valid_sets = [train_data,test_data], 
                verbose_eval = 10)   #10步
# 模型预测
test_pre = clf.predict(test_x, num_iteration=clf.best_iteration)
# print(test_pre)
print(test_pre[:5])

# 选择表示最高概率的列
test_pre_1 = np.asarray([np.argmax(row) for row in test_pre])
print("test的预测结果:",test_pre_1)

# 模型评估
print('验证集精准率：',precision_score(test_y, test_pre_1, average='macro')) 
print('验证集召回率：',recall_score(test_y, test_pre_1, average='macro'))  

[10]	training's multi_error: 0.0666667	valid_1's multi_error: 0
[[0.13683286 0.63500393 0.22816321]
 [0.69436834 0.15467706 0.15095461]
 [0.12934308 0.16125127 0.70940565]
 [0.14172417 0.62195656 0.23631927]
 [0.13683286 0.63500393 0.22816321]]
test的预测结果: [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
验证集精准率： 1.0
验证集召回率： 1.0


## Sklearn接口形式使用lightgbm

In [4]:
import lightgbm as lgb
lgb_params = {
    'learning_rate':0.1,
    'max_bin':150,     # 分箱
    'num_leaves':32,   # 叶子树 
    'max_depth':11,  
    'objective':'multiclass',
    'n_estimators':300    # 300步
}
model=lgb.LGBMClassifier(**lgb_params)

model.fit(train_x,train_y)
# 预测
#输出预测结果
test_pre2 = model.predict(test_x)
print(test_pre2)
# 模型评估
print('验证集精准率：',precision_score(test_y, test_pre2, average='macro')) 
print('验证集召回率：',recall_score(test_y, test_pre2, average='macro'))  

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0]
验证集精准率： 1.0
验证集召回率： 1.0


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


总结：
1. lgb.train中正则化参数为"lambda_l1", "lambda_l1"，sklearn中则为'reg_alpha', 'reg_lambda'。  ## 避免过拟合，加入的惩罚项
2. 多分类时lgb.train除了'objective':'multiclass',还要指定"num_class":5，而sklearn接口只需要指定'objective':'multiclass'。
3. 迭代次数在sklearn中是'n_estimators':20，在初始化模型时指定


## CSND上示例

In [1]:

# coding: utf-8
# pylint: disable = invalid-name, C0111
 
# 函数的更多使用方法参见LightGBM官方文档：http://lightgbm.readthedocs.io/en/latest/Python-Intro.html
 
import json
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.datasets import  make_classification
 
iris = load_iris()   # 载入鸢尾花数据集
data=iris.data
target = iris.target
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)
 
 
# 加载你的数据
# print('Load data...')
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
#
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values
 
# 创建成lgb特征的数据集格式
lgb_train = lgb.Dataset(X_train, y_train) # 将数据保存到LightGBM二进制文件将使加载更快
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)  # 创建验证数据
 
# 将参数写成字典下形式
params = {
    'task': 'train',
    'boosting_type': 'gbdt',  # 设置提升类型
    'objective': 'regression', # 目标函数
    'metric': {'l2', 'auc'},  # 评估函数
    'num_leaves': 31,   # 叶子节点数
    'learning_rate': 0.05,  # 学习速率
    'feature_fraction': 0.9, # 建树的特征选择比例
    'bagging_fraction': 0.8, # 建树的样本采样比例
    'bagging_freq': 5,  # k 意味着每 k 次迭代执行bagging
    'verbose': 1 # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
}
 
print('Start training...')
# 训练 cv and train
gbm = lgb.train(params,lgb_train,num_boost_round=20,valid_sets=lgb_eval,early_stopping_rounds=5) # 训练数据需要参数列表和数据集
 
print('Save model...') 
 
gbm.save_model('model.txt')   # 训练后保存模型到文件
 
print('Start predicting...')
# 预测数据集
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration) #如果在训练期间启用了早期停止，可以通过best_iteration方式从最佳迭代中获得预测
# 评估模型
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5) # 计算真实值和预测值之间的均方根误差


#原文链接：https://blog.csdn.net/huacha__/article/details/81057150

Start training...
[1]	valid_0's l2: 0.659605	valid_0's auc: 1
Training until validation scores don't improve for 5 rounds
[2]	valid_0's l2: 0.598339	valid_0's auc: 1
[3]	valid_0's l2: 0.540953	valid_0's auc: 1
[4]	valid_0's l2: 0.490961	valid_0's auc: 1
[5]	valid_0's l2: 0.444002	valid_0's auc: 1
[6]	valid_0's l2: 0.404297	valid_0's auc: 1
Early stopping, best iteration is:
[1]	valid_0's l2: 0.659605	valid_0's auc: 1
Save model...
Start predicting...
The rmse of prediction is: 0.8121608885821442
