In [82]:
import lightgbm as lgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
import numpy as np

In [83]:
# Fetch the Covertype dataset a single time and slice both arrays from the same
# Bunch; calling fetch_covtype() once per attribute loads the full dataset twice.
covtype = datasets.fetch_covtype()
X = covtype.data[:3000]
y = covtype.target[:3000]

# Pin the shuffle seed so the split (and everything derived from it, such as
# the per-sample weights) is identical after Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(np.unique(y_train))  # 7-class classification task

(2250, 54)
(2250,)
[1 2 3 4 5 6 7]


In [84]:
# Per-sample weight: the reciprocal of the square root of each training label,
# i.e. weight_i = 1 / sqrt(y_i).
weight = np.divide(1, np.sqrt(y_train))
weight

array([0.70710678, 0.5       , 0.70710678, ..., 1.        , 0.4472136 ,
       0.4472136 ])

In [85]:
# lgb.Dataset arguments used below (excerpts from the LightGBM documentation):
#
# weight (list, numpy 1-D array, pandas Series or None, optional (default=None))
#     - Weight for each instance.
#
# categorical_feature (list of strings or int, or 'auto', optional (default="auto"))
#     - Categorical features.
#     If list of int, interpreted as indices.
#     If list of strings, interpreted as feature names (need to specify feature_name as well).
#     If 'auto' and data is pandas DataFrame, pandas unordered categorical columns are used.
#     All negative values in categorical features will be treated as missing values
#
# Entries of the `params` dict:
#
# max_bin, default = 255, type = int, constraints: max_bin > 1
#     max number of bins that feature values will be bucketed in
#     small number of bins may reduce training accuracy but may increase general power (deal with over-fitting)

# Categorical columns are given explicitly as integer indices. Per the notes
# above, the 'auto' default only picks up categoricals from a pandas DataFrame.
# NOTE(review): in scikit-learn's covtype data, columns 0 and 1 appear to be
# continuous measurements (presumably Elevation and Aspect) - confirm whether
# they are marked categorical here purely to demonstrate the argument.
train_dataset = lgb.Dataset(
    X_train,
    y_train,
    params={"max_bin": 510},  # twice the documented default of 255 bins
    categorical_feature=[0, 1],
    weight=weight,
)
train_dataset

<lightgbm.basic.Dataset at 0x24fc4ddb4f0>

In [86]:
# Read back the label array held by the Dataset (the values supplied through
# the `label=` argument when it was created).
train_dataset.get_label()

array([2, 4, 2, ..., 1, 5, 5])

In [87]:
# Read back the per-instance weights held by the Dataset (the values supplied
# through the `weight=` argument when it was created).
train_dataset.get_weight()

array([0.70710678, 0.5       , 0.70710678, ..., 1.        , 0.4472136 ,
       0.4472136 ])

In [88]:
train_dataset.categorical_feature  # categorical feature specification held by the Dataset

[0, 1]

In [89]:
# Read back the parameters in use by the Dataset (here the dict supplied
# through the `params=` argument when it was created).
train_dataset.get_params()

{'max_bin': 510}