In [1]:
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
def GetNewDataByPandas(csv_path="./wine.csv"):
    """Load the wine-quality CSV and build the feature matrix and target.

    Two engineered features are appended to the raw columns before the
    split into features and target:

    * ``alcohol**2`` -- the ``alcohol`` column squared.
    * ``volatileAcidity*alcohol`` -- ``alcohol`` times ``volatile acidity``.

    Parameters
    ----------
    csv_path : str or path-like, default "./wine.csv"
        CSV file to read. It must contain the columns ``alcohol``,
        ``volatile acidity`` and ``quality``.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix: every column except ``quality``, engineered
        features included.
    y : numpy.ndarray
        Values of the ``quality`` column.
    columns : numpy.ndarray
        Names of the columns of ``X``, in the same order as ``X``'s columns,
        so that ``columns[i]`` labels ``X[:, i]``.
    """
    wine = pd.read_csv(csv_path)
    wine['alcohol**2'] = wine["alcohol"] ** 2
    wine['volatileAcidity*alcohol'] = wine["alcohol"] * wine['volatile acidity']

    features = wine.drop(columns="quality")
    y = np.array(wine["quality"])
    X = np.array(features)

    # Take the names from the feature frame rather than from `wine`:
    # `wine.columns` still contains "quality", which would shift every name
    # after it by one position relative to the columns of X.
    columns = np.array(features.columns)

    return X, y, columns

In [3]:
# Read the wine-quality features, target and column names from file.
X, y, wineNames = GetNewDataByPandas()

# Hold out 10% of the rows for the final evaluation, then split the remaining
# 90% into 80% training / 20% validation. Both splits use a fixed random_state
# so the partition is reproducible.
x_train_all, x_predict, y_train_all, y_predict = train_test_split(X, y, test_size=0.10, random_state=100)

x_train, x_test, y_train, y_test = train_test_split(x_train_all, y_train_all, test_size=0.2, random_state=100)

train_data = lgb.Dataset(data=x_train, label=y_train)
# Tie the validation set to the training set explicitly, as the LightGBM docs
# recommend, so both are constructed against the same reference Dataset.
test_data = lgb.Dataset(data=x_test, label=y_test, reference=train_data)

# Cache the training Dataset in LightGBM's binary format.
train_data.save_binary("./wine_lightgbm_train.bin")




<lightgbm.basic.Dataset at 0x1a18332b910>

In [4]:
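# When loading data into LightGBM you can also declare which features are categorical;
# doing so can give a modest improvement in model accuracy.
# train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])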

In [5]:
# Training parameters for the regression model, with RMSE as the evaluation
# metric. Note that 'num_trees' is a LightGBM alias for the number of boosting
# iterations.
param = dict(
    num_leaves=31,
    num_trees=100,
    objective='regression',
    metric='rmse',
)

In [6]:
# Training a model requires a parameter dict and a Dataset.
# `param` already carries 'num_trees': 100, and LightGBM uses that value
# instead of the num_boost_round argument, so the argument is kept at the same
# value to state the number of rounds that is actually trained.
num_round = 100
bst = lgb.train(param, train_data, num_round, valid_sets=[test_data])
# After training, the model can be saved to a text file ...
bst.save_model('model.txt')
# ... and a saved model can be loaded back into a Booster.
bst = lgb.Booster(model_file='model.txt')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 1151, number of used features: 13
[LightGBM] [Info] Start training from score 5.608167




In [7]:
# 5-fold cross-validation on the training Dataset.
# `param` carries 'num_trees': 100, which LightGBM uses instead of the
# num_boost_round argument, so the argument is set to the same value.
num_round = 100
# Keep the per-iteration CV metrics in a named variable so later cells can
# reuse them; the last expression still displays them.
cv_results = lgb.cv(param, train_data, num_round, nfold=5)
cv_results



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 920, number of used features: 13
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 921, number of used features: 13
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 921, number of used features: 13
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of t

{'valid rmse-mean': [0.774987106820873,
  0.7491220863708051,
  0.7281922212930283,
  0.7097824692770105,
  0.6940759739607876,
  0.6797153946092306,
  0.6696840593771072,
  0.6609663890335338,
  0.6535890703363206,
  0.6473198288767565,
  0.6425806790148506,
  0.6373665111602508,
  0.6347244528484731,
  0.6308625770807975,
  0.628918594642353,
  0.6272230212085251,
  0.6253250770201633,
  0.6223593012391595,
  0.6212648873132847,
  0.6199514729892114,
  0.6203873284980943,
  0.6203033852456977,
  0.6189822624878787,
  0.6183410979829776,
  0.6181484102219923,
  0.6180162127636785,
  0.6177792986445303,
  0.6174518337809696,
  0.617555391334362,
  0.6162424295549256,
  0.6167122570640725,
  0.6163427083126598,
  0.6159321116214705,
  0.616212019117553,
  0.6165976887053624,
  0.6158503087885474,
  0.6149133667697786,
  0.6149134893419984,
  0.6145255939620513,
  0.6149597021436323,
  0.6153579614072315,
  0.6160740480612332,
  0.6168216684639328,
  0.6164941580283196,
  0.6170504164797

In [8]:
# Early stopping is configured through 'early_stopping_rounds': training halts
# once the validation metric has not improved for that many consecutive rounds.
param = {'num_leaves':31, 'objective':'regression', 'early_stopping_rounds': 10}
param['metric'] = 'rmse'
# Upper bound on boosting rounds, defined in this cell so it does not depend
# on a value left over from an earlier cell. It is passed explicitly instead
# of through a 'num_trees' key in `param`, which would override the argument.
num_round = 100
bst = lgb.train(param, train_data, num_round, valid_sets=[test_data])
# Save only the trees up to the best iteration found by early stopping.
bst.save_model('model.txt', num_iteration=bst.best_iteration)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1308
[LightGBM] [Info] Number of data points in the train set: 1151, number of used features: 13
[LightGBM] [Info] Start training from score 5.608167
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[67]	valid_0's rmse: 0.568264




<lightgbm.basic.Booster at 0x1a183359190>

In [11]:
# Predict on the held-out split (x_predict / y_predict), using only the trees
# up to the best iteration found by early stopping.
ypred = bst.predict(x_predict, num_iteration=bst.best_iteration)

# Root-mean-squared error between the held-out targets and the predictions.
RMSE = np.sqrt(mean_squared_error(y_predict, ypred))

print("RMSE of predict :",RMSE)

RMSE of predict : 0.5956830546692128
