In [1]:
from sklearn import datasets as ds
import numpy as np
def load_data(feats, group):
    '''
    Load the training data: features, labels and query groups.

    :param feats: source handed to ``ds.load_svmlight_file``; it supplies
        both the feature matrix and the label vector.
    :param group: source handed to ``np.loadtxt``; it supplies the query
        grouping values, one number per entry.
    :return: tuple ``(x_train, y_train, q_train)`` -- the feature matrix and
        label vector returned by ``load_svmlight_file``, followed by the
        array of group values (always at least one-dimensional).
    '''

    x_train, y_train = ds.load_svmlight_file(feats)
    # ndmin=1 keeps the result one-dimensional even when the group file holds
    # a single value; without it np.loadtxt collapses that case to a 0-d
    # array, which cannot be used as a per-query group vector.
    q_train = np.loadtxt(group, ndmin=1)

    return x_train, y_train, q_train

In [2]:
!pwd


<function __main__.load_data(feats, group)>

In [4]:
# Locations of the training inputs (relative paths).
feats_path = 'data/train/feats.txt'
group_path = 'data/train/group.txt'

# Load features, labels and query groups in a single call.
x_train, y_train, q_train = load_data(feats=feats_path, group=group_path)

In [5]:
# Display the loaded training feature matrix (rendered as its sparse-matrix summary).
x_train

<7796x46 sparse matrix of type '<class 'numpy.float64'>'
	with 358616 stored elements in Compressed Sparse Row format>

In [6]:
import lightgbm as lgb

# Bundle the training features with their labels and query grouping.
train_data = lgb.Dataset(x_train, label=y_train, group=q_train)

params = {
    # Type of task to execute.
    'task': 'train',
    # Base learner.
    'boosting_type': 'gbrt',
    # Ranking task (objective function).
    'objective': 'lambdarank',
    # Metric to measure (evaluation function).
    'metric': 'ndcg',
    # NDCG position to optimise for.
    'max_position': 10,
    # Report the metric every this many iterations.
    'metric_freq': 1,
    # Also report the metric on the training data while training.
    'train_metric': True,
    'ndcg_at': [10],
    # Integer giving the maximum number of bins (default 255). LightGBM uses
    # it to compress memory automatically; e.g. with max_bin=255 every
    # feature value is represented as a uint8.
    'max_bin': 255,
    # Number of iterations, i.e. the number of trees that are built.
    'num_iterations': 200,
    # Learning rate.
    'learning_rate': 0.01,
    # Number of leaves.
    'num_leaves': 31,
    # 'max_depth':6,
    # Used for parallel learning; 'serial' is the single-machine tree learner.
    'tree_learner': 'serial',
    # Minimum number of samples a leaf node must contain.
    'min_data_in_leaf': 30,
    # Show information while training.
    'verbose': 2,
}

# The training set doubles as the validation set, so the reported metric is
# the training NDCG.
gbm = lgb.train(params=params, train_set=train_data, valid_sets=[train_data])





[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.743757
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.391412
[LightGBM] [Debug] init for col-wise cost 0.001598 seconds, init for row-wise cost 0.002496 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9171
[LightGBM] [Info] Number of data points in the train set: 7796, number of used features: 40
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 9
[1]	training's ndcg@10: 0.796443
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[2]	training's ndcg@10: 0.828216
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 10
[3]	training's ndcg@10: 0.842904
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 12
[4]	training's ndcg@10: 0.860481
[LightGBM] [Debug] Trained a tree with leaves = 31 and max_depth = 11
[5]	training's ndcg@10: 0.864379
[LightGBM] [Debug] Trained a tree with leaves

In [9]:
"""

Various utilities for converting data from/to Microsoft's LETOR format.

"""

import numpy as np
# import sklearn.externals.six

def iter_lines(lines, has_targets=True, one_indexed=True, missing=0.0):
    """Transforms an iterator of lines to an iterator of LETOR rows.

    Each row is represented by a (x, y, qid, comment) tuple. Lines that are
    blank, or that contain nothing but a ``#`` comment, are skipped.

    Parameters
    ----------
    lines : iterable of str
        Lines to parse.
    has_targets : bool, optional
        Whether the file contains targets. If True, will expect the first token
        of every line to be a real representing the sample's target (i.e.
        score). If False, will use -1 as a placeholder for all targets.
    one_indexed : bool, optional
        Whether feature ids are one-indexed. If True, will subtract 1 from each
        feature id so that ids starting at 1 become ids starting at 0.
    missing : float, optional
        Placeholder to use if a feature value is not provided for a sample.

    Yields
    ------
    x : array of floats
        Feature vector of the sample. Its length is the largest feature index
        seen on the line plus one; it is always float64, whatever the type of
        `missing`.
    y : float
        Target value (score) of the sample, or -1 if no target was parsed.
    qid : object
        Query id of the sample. This is currently guaranteed to be a string.
    comment : str
        Comment accompanying the sample (the text after the first ``#``).

    Raises
    ------
    AssertionError
        If a feature id is negative after index adjustment, or if a data line
        carries no features at all.
    """
    index_offset = 1 if one_indexed else 0

    for line in lines:
        data, _, comment = line.rstrip().partition('#')
        toks = data.split()
        if not toks:
            # Nothing but whitespace and/or a comment: not a sample.
            continue

        y = -1.0
        if has_targets:
            y = float(toks[0])  # relevance label
            toks = toks[1:]
        # qid:1 => 1
        qid = _parse_qid_tok(toks[0])

        # Collect the (index, value) pairs first so the vector can be
        # allocated once at its final size instead of being grown in place.
        pairs = []
        num_features = 0  # width of the vector: highest feature index + 1
        for tok in toks[1:]:
            fid, val = tok.split(":")  # featureID:featureValue
            fid = int(fid) - index_offset
            val = float(val)
            assert fid >= 0
            pairs.append((fid, val))
            num_features = max(fid + 1, num_features)

        assert num_features > 0
        # Explicit float64 so that an integer `missing` cannot make the buffer
        # integral and truncate the feature values written below.
        x = np.full(num_features, missing, dtype=np.float64)
        for fid, val in pairs:
            x[fid] = val  # a repeated feature id keeps its last value

        yield (x, y, qid, comment)


def read_dataset(source, has_targets=True, one_indexed=True, missing=0.0):
    """Parses a LETOR dataset from `source`.

    Parameters
    ----------
    source : string or iterable of lines
        String, file, or other file-like object to parse. A ``str`` is taken
        to be the dataset text itself (not a path) and is split into lines.
    has_targets : bool, optional
        See `iter_lines`.
    one_indexed : bool, optional
        See `iter_lines`.
    missing : float, optional
        See `iter_lines`.

    Returns
    -------
    X : array of arrays of floats
        Feature matrix (see `iter_lines`). Its width is that of the longest
        row; shorter rows are padded with `missing`.
    y : array of floats
        Target vector (see `iter_lines`), or None when `has_targets` is False.
    qids : array of objects
        Query id vector (see `iter_lines`).
    comments : array of strs
        Comment vector (see `iter_lines`).

    Raises
    ------
    AssertionError
        If `source` yields no rows.
    """
    if isinstance(source, str):
        # Iterating a str yields single characters, not lines, so split it
        # up front (keeping the line endings, as a file would provide them).
        source = source.splitlines(True)

    xs, ys, qids, comments = [], [], [], []
    iter_content = iter_lines(source, has_targets=has_targets,
                              one_indexed=one_indexed, missing=missing)
    # x: feature vector; y: float relevance value; qid: query id string;
    # comment: the text following '#'.
    for x, y, qid, comment in iter_content:
        xs.append(x)
        ys.append(y)
        qids.append(qid)
        comments.append(comment)

    # Largest number of features found on any single row.
    max_width = max((len(x) for x in xs), default=0)
    assert max_width > 0

    # X.shape == (len(xs), max_width); rows shorter than max_width keep the
    # `missing` placeholder in their trailing columns.
    X = np.full((len(xs), max_width), missing, dtype=np.float64)
    for i, x in enumerate(xs):
        X[i, :len(x)] = x
    ys = np.array(ys) if has_targets else None
    qids = np.array(qids)
    comments = np.array(comments)

    return (X, ys, qids, comments)


def _parse_qid_tok(tok):
    assert tok.startswith('qid:')
    return tok[4:]

In [11]:
# Location of the test set (relative path).
test_path = 'data/test/test.txt'

# Parse the test file into a feature matrix, targets, query ids and comments.
with open(test_path, mode='r', encoding='utf-8') as testfile:
    test_X, test_y, test_qids, comments = read_dataset(testfile)

In [13]:
# Score every row of the parsed test feature matrix with the trained model.
pre_y=gbm.predict(test_X)

In [14]:
# Display the predicted scores for the test rows.
pre_y

array([ 8.28653643e-01, -1.84790338e+00,  6.28460133e-01,  3.88748943e-01,
       -1.82119377e-01, -5.97297350e-01, -1.87969974e+00, -1.85166921e+00,
       -1.75601130e+00, -1.12507393e+00, -2.04025694e-01,  6.13358850e-02,
        4.43838050e-01, -1.38145962e+00, -3.31403080e-01, -7.14954979e-01,
       -1.45957587e+00,  1.93886930e-01,  7.50698640e-02, -1.75601130e+00,
       -6.60452370e-01, -6.20252554e-01, -1.87550818e+00, -1.70558595e+00,
       -1.54403456e+00, -1.87338586e+00, -1.41958973e+00,  4.62068713e-01,
       -1.59909283e+00,  2.71303457e-01,  4.74996309e-01, -1.39975685e+00,
       -1.17611961e+00, -1.37615695e-01, -1.34400010e+00, -1.11060853e+00,
        4.77172281e-02, -1.87550818e+00, -8.47657388e-01, -1.87550818e+00,
        4.61301112e-01, -1.87550818e+00, -6.28599839e-01, -3.46319451e-01,
        2.55898846e-01,  1.75989083e-01,  1.10848328e-01, -5.51663064e-01,
        2.68132528e-01,  8.02965697e-03, -4.51652997e-01, -9.35367832e-02,
       -2.56032322e-01, -