In [4]:
import pandas as pd
import numpy as np
import itertools
from IPython.display import display
import math
import os
import gc
import lightgbm as lg

In [64]:
def augment(data, label):
    """Build 18 relabelled copies of every sample (6 block orders x 3 rotations).

    Tile indices follow the 34-entry layout used by ``get_map``: three
    blocks of nine (0-8, 9-17, 18-26) that are permuted as whole blocks,
    indices 27-30 that are left untouched, and indices 31-33 that are
    rotated cyclically.  Cells of ``data`` hold ``tile index + 1``; the
    value 0 always maps to itself.

    Args:
        data: DataFrame of integer tile codes in ``0..34``; any number of
            columns (the width is taken from the input).
        label: DataFrame with exactly 34 columns, one per tile index, and
            the same number of rows as ``data``.

    Returns:
        ``(augmented_data, augmented_label)`` as uint8 DataFrames with
        ``18 * len(data)`` rows and the original column names.  The first
        ``len(data)`` rows come from the identity mapping, i.e. they equal
        the input.

    Raises:
        AssertionError: if ``data`` and ``label`` differ in length.
        ValueError: if ``label`` does not have 34 columns.
    """
    assert len(data) == len(label)

    def get_map(num_p, wind_p):
        # conv: old tile index -> new tile index.  inv is its inverse and is
        # used to pull each label column into its new position.
        conv = [0]*34
        inv = [0]*34
        for i, p in enumerate(num_p):
            for k in range(9):
                conv[i*9+k] = p*9+k
                inv[p*9+k] = i*9+k
        for k in range(27, 31):
            conv[k] = k
            inv[k] = k
        for k in range(3):
            conv[31+(k+wind_p)%3] = 31+k
            inv[31+(k-wind_p)%3] = 31+k
        return np.array(conv, dtype=np.uint8), np.array(inv, dtype=np.uint8)

    data_columns = data.columns
    label_columns = label.columns

    data = data.to_numpy()
    label = label.to_numpy()

    if label.shape[1] != 34:
        raise ValueError(f'label must have 34 columns, got {label.shape[1]}')

    # Width comes from the input instead of a hard-coded 20, so narrower or
    # wider discard tables are handled as well.
    rows, width = data.shape
    variants = [(num_p, wind_p)
                for num_p in itertools.permutations(range(3))
                for wind_p in range(3)]

    augmented_data = np.empty((rows*len(variants), width), dtype=np.uint8)
    augmented_label = np.empty((rows*len(variants), 34), dtype=np.uint8)

    for block, (num_p, wind_p) in enumerate(variants):
        conv, inv = get_map(num_p, wind_p)
        # Shift the table by one so that code 0 stays 0 and code t+1 follows tile t.
        conv = np.concatenate(([0], conv+1))
        start = block*rows
        augmented_data[start:start+rows] = conv[data]
        augmented_label[start:start+rows] = label[:, inv]

    augmented_data = pd.DataFrame(augmented_data, columns=data_columns)
    augmented_label = pd.DataFrame(augmented_label, columns=label_columns)
    return augmented_data, augmented_label

def featurize(data):
    """Derive the 74-column uint8 feature table from a 20-column discard table.

    Output columns, in order:
      * ``捨牌1`` .. ``捨牌20``: the input copied unchanged.
      * ``リーチ前0`` .. ``リーチ前19``: each row reversed and shifted left so
        that its last non-zero entry lands in ``リーチ前0``.  Slots that run
        past the end are read from the row's final input column, which is 0
        whenever a shift happens.
      * ``<tile>捨てカウント`` (34 columns): how often each tile code 1..34
        occurs in the row.
    """
    tile_names = (
        [f'{rank}萬' for rank in range(1, 10)]
        + [f'{rank}筒' for rank in range(1, 10)]
        + [f'{rank}索' for rank in range(1, 10)]
        + list('東南西北白発中')
    )
    column_names = (
        [f'捨牌{slot}' for slot in range(1, 21)]
        + [f'リーチ前{slot}' for slot in range(20)]
        + [f'{name}捨てカウント' for name in tile_names]
    )

    discards = data.to_numpy()
    features = np.empty((len(discards), 20+20+34), np.uint8)
    features[:, :20] = discards

    newest_first = discards[:, ::-1]
    # argmax over the boolean mask is the position of the first non-zero
    # entry in the reversed row (0 for an all-zero row).
    lead = np.argmax(newest_first != 0, axis=1)
    gather = lead[:, np.newaxis] + np.arange(20)[np.newaxis, :]
    gather[gather >= 20] = 0
    row_ids = np.arange(discards.shape[0])[:, np.newaxis]
    features[:, 20:40] = newest_first[row_ids, gather]

    for offset, tile_code in enumerate(range(1, 35)):
        features[:, 40+offset] = np.sum(newest_first == tile_code, axis=1)
    return pd.DataFrame(features, columns=column_names)

def prepare_data(src_path='data/ver1.feather', dst_dir='data/', portion=1, augment_flag=True, featurize_flag=True):
    """Split the raw table into train/validation/test sets and write feather files.

    The table read from ``src_path`` is optionally truncated to its leading
    ``portion`` and then cut, in row order and without shuffling, into
    80 % / 10 % / 10 % train / validation / test slices.  In each slice the
    first 20 columns are treated as inputs and the remaining columns as
    labels.  Augmentation is applied to the train slice only; featurization
    (when enabled) is applied to all three slices.

    Args:
        src_path: feather file holding the raw table.
        dst_dir: output directory (created if missing).  Receives
            ``{train,validation,test}-{data,label}.feather``.
        portion: fraction of leading rows to keep; values >= 1 keep all rows.
        augment_flag: run ``augment`` on the train slice.
        featurize_flag: run ``featurize`` on every slice's inputs.
    """

    def helper(path, data, augment_flag):
        # reset_index returns a new frame, so the slice of ``raw`` passed in
        # by the caller is never modified in place.
        data = data.reset_index(drop=True)
        data, label = data[data.columns[:20]], data[data.columns[20:]]
        if augment_flag:
            data, label = augment(data, label)
        if featurize_flag:
            data = featurize(data)
        data.to_feather(f'{path}-data.feather')
        label.to_feather(f'{path}-label.feather')

    os.makedirs(dst_dir, exist_ok=True)
    raw = pd.read_feather(src_path)
    if portion < 1:
        raw = raw[:round(len(raw)*portion)]
    s1 = round(len(raw)*0.8)
    s2 = round(len(raw)*0.9)
    train = raw[:s1]
    validation = raw[s1:s2]
    test = raw[s2:]

    print('Train Data')
    helper(os.path.join(dst_dir, 'train'), train, augment_flag)
    # Validation and test rows are never augmented.
    print('Validation Data')
    helper(os.path.join(dst_dir, 'validation'), validation, False)
    print('Test Data')
    helper(os.path.join(dst_dir, 'test'), test, False)
    

In [85]:
# Dataset variants.  The four commented-out calls write the full-size
# directories (data/vanilla, data/augmented, data/featurized, data/both) that
# the training cells in this notebook read; re-enable them to regenerate
# those files.  The active call writes a 5 % sample with both augmentation
# and featurization to data/light/.
# prepare_data(dst_dir='data/vanilla/', augment_flag=False, featurize_flag=False)
# prepare_data(dst_dir='data/augmented/', augment_flag=True, featurize_flag=False)
# prepare_data(dst_dir='data/featurized/', augment_flag=False, featurize_flag=True)
# prepare_data(dst_dir='data/both/', augment_flag=True, featurize_flag=True)
prepare_data(dst_dir='data/light/', augment_flag=True, featurize_flag=True, portion=0.05)

Train Data
Validation Data
Test Data


In [75]:
import gc
# Tile names 1萬..9萬 followed by 東南西北白.
TILES = [f'{rank}萬' for rank in range(1, 10)] + ['東', '南', '西', '北', '白']
# Matching label-column names: tile name plus the 待 suffix (e.g. '5萬待').
WAITS = [tile_name + '待' for tile_name in TILES]

def train_and_test(data_dir, tiles=('5萬待',), **train_params):
    """Train one binary LightGBM model per label column and display its trees.

    Args:
        data_dir: directory holding ``train-data.feather``,
            ``train-label.feather``, ``validation-data.feather`` and
            ``validation-label.feather``.
        tiles: label column name(s) to train on, one model each.  Defaults
            to the single column ``'5萬待'``; a bare string is treated as one
            column name.
        **train_params: forwarded to ``lg.train``.  A ``params`` entry, if
            present, is merged over the built-in booster parameters rather
            than being forwarded as-is.

    Returns:
        None.  Each model's ``trees_to_dataframe()`` is shown via ``display``.
    """
    # Take the caller's booster overrides out of the kwargs so they are not
    # passed to lg.train twice.
    pp = train_params.pop('params', None) or {}
    if isinstance(tiles, str):
        tiles = [tiles]

    td = pd.read_feather(os.path.join(data_dir, 'train-data.feather'))
    vd = pd.read_feather(os.path.join(data_dir, 'validation-data.feather'))
    tl = pd.read_feather(os.path.join(data_dir, 'train-label.feather'))
    vl = pd.read_feather(os.path.join(data_dir, 'validation-label.feather'))
    gc.collect()

    # The リーチ前* columns only exist when the data went through featurize().
    if 'リーチ前0' in td:
        categoricals = [f'捨牌{i+1}' for i in range(20)] + [f'リーチ前{i}' for i in range(20)]
    else:
        categoricals = [f'捨牌{i+1}' for i in range(20)]

    for tile in tiles:
        train_dataset = lg.Dataset(td, label=tl[tile])
        valid_dataset = lg.Dataset(vd, label=vl[tile])
        gc.collect()
        params = {
            'two_round': True,
            'use_missing': False,
            'objective': 'binary',}
        params.update(pp)
        model = lg.train(train_set=train_dataset,
                         valid_sets=[valid_dataset],
                         categorical_feature=categoricals,
                         params=params,
                         **train_params)
        display(model.trees_to_dataframe())

In [76]:
steps = 1  # handed to verbose_eval in the training call that follows
def helper(n):
    """Stepwise learning-rate schedule: 0.1 / (1 + n // 50).

    Used as the ``learning_rates`` callable; the rate is also printed on
    every call.
    """
    stage = n // 50
    rate = 0.1 / (stage + 1)
    print(rate)
    return rate
# Run on data/both (the augment + featurize variant); every keyword argument
# after the directory is forwarded to lg.train by train_and_test.
# NOTE(review): LightGBM 4.x dropped early_stopping_rounds / learning_rates /
# verbose_eval from lg.train in favour of callbacks — confirm the installed version.
train_and_test('data/both', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=steps)

New categorical_feature is ['リーチ前0', 'リーチ前1', 'リーチ前10', 'リーチ前11', 'リーチ前12', 'リーチ前13', 'リーチ前14', 'リーチ前15', 'リーチ前16', 'リーチ前17', 'リーチ前18', 'リーチ前19', 'リーチ前2', 'リーチ前3', 'リーチ前4', 'リーチ前5', 'リーチ前6', 'リーチ前7', 'リーチ前8', 'リーチ前9', '捨牌1', '捨牌10', '捨牌11', '捨牌12', '捨牌13', '捨牌14', '捨牌15', '捨牌16', '捨牌17', '捨牌18', '捨牌19', '捨牌2', '捨牌20', '捨牌3', '捨牌4', '捨牌5', '捨牌6', '捨牌7', '捨牌8', '捨牌9']


[LightGBM] [Info] Number of positive: 2837988, number of negative: 31335804
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 34173792, number of used features: 74




0.1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083046 -> initscore=-2.401666
[LightGBM] [Info] Start training from score -2.401666
[1]	valid_0's binary_logloss: 0.283305
Training until validation scores don't improve for 20 rounds
0.1
[2]	valid_0's binary_logloss: 0.281501
0.1
[3]	valid_0's binary_logloss: 0.279934
0.1
[4]	valid_0's binary_logloss: 0.278588
0.1
[5]	valid_0's binary_logloss: 0.277451
0.1
[6]	valid_0's binary_logloss: 0.276449
0.1
[7]	valid_0's binary_logloss: 0.275586
0.1
[8]	valid_0's binary_logloss: 0.274828
0.1
[9]	valid_0's binary_logloss: 0.274166
0.1
[10]	valid_0's binary_logloss: 0.273578
0.1
[11]	valid_0's binary_logloss: 0.273067
0.1
[12]	valid_0's binary_logloss: 0.272592
0.1
[13]	valid_0's binary_logloss: 0.272187
0.1
[14]	valid_0's binary_logloss: 0.271824
0.1
[15]	valid_0's binary_logloss: 0.271504
0.1
[16]	valid_0's binary_logloss: 0.271204
0.1
[17]	valid_0's binary_logloss: 0.270943
0.1
[18]	valid_0's binary_logloss: 0.270709
0.1
[19]	valid_0's bin

Unnamed: 0,tree_index,node_depth,node_index,left_child,right_child,parent_index,split_feature,split_gain,threshold,decision_type,missing_direction,missing_type,value,weight,count
0,0,1,0-S0,0-S1,0-L1,,5萬捨てカウント,424496.000000,0.0,<=,left,,-2.401670,0.000000e+00,34173792
1,0,2,0-S1,0-S2,0-S4,0-S0,8萬捨てカウント,104382.000000,0.0,<=,left,,-2.386540,2.282100e+06,29968878
2,0,3,0-S2,0-S3,0-S7,0-S1,2萬捨てカウント,80941.898438,0.0,<=,left,,-2.375550,1.805790e+06,23713902
3,0,4,0-S3,0-S5,0-S6,0-S2,捨牌8,61235.601562,0||3||4||6||7,==,right,,-2.364710,1.430700e+06,18788112
4,0,5,0-S5,0-S16,0-S9,0-S3,捨牌5,15134.099609,0||3||4||6||7,==,right,,-2.383190,7.960810e+05,10454232
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7681,125,9,125-L27,,,125-S26,,,,,,,-0.000523,9.968617e+04,1229145
7682,125,8,125-L26,,,125-S25,,,,,,,0.001333,1.520954e+05,1727436
7683,125,7,125-L25,,,125-S24,,,,,,,-0.000254,2.120238e+05,2653752
7684,125,4,125-L10,,,125-S9,,,,,,,-0.001509,4.713061e+04,532662


In [77]:
steps = 1  # handed to verbose_eval in the training call that follows
def helper(n):
    """Return 0.1 / (1 + n // 50), printing the value on each call."""
    completed_stages = n // 50
    current_rate = 0.1 / (completed_stages + 1)
    print(current_rate)
    return current_rate
# Run on data/vanilla (the variant written with augment_flag=False and
# featurize_flag=False); extra keyword arguments are forwarded to lg.train.
train_and_test('data/vanilla', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=steps)

New categorical_feature is ['捨牌1', '捨牌10', '捨牌11', '捨牌12', '捨牌13', '捨牌14', '捨牌15', '捨牌16', '捨牌17', '捨牌18', '捨牌19', '捨牌2', '捨牌20', '捨牌3', '捨牌4', '捨牌5', '捨牌6', '捨牌7', '捨牌8', '捨牌9']


[LightGBM] [Info] Number of positive: 158454, number of negative: 1740090
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 1898544, number of used features: 19
0.1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083461 -> initscore=-2.396228
[LightGBM] [Info] Start training from score -2.396228
[1]	valid_0's binary_logloss: 0.283918
Training until validation scores don't improve for 20 rounds
0.1




[2]	valid_0's binary_logloss: 0.282543
0.1
[3]	valid_0's binary_logloss: 0.281405
0.1
[4]	valid_0's binary_logloss: 0.280422
0.1
[5]	valid_0's binary_logloss: 0.279554
0.1
[6]	valid_0's binary_logloss: 0.278791
0.1
[7]	valid_0's binary_logloss: 0.278095
0.1
[8]	valid_0's binary_logloss: 0.277484
0.1
[9]	valid_0's binary_logloss: 0.276919
0.1
[10]	valid_0's binary_logloss: 0.276417
0.1
[11]	valid_0's binary_logloss: 0.275935
0.1
[12]	valid_0's binary_logloss: 0.275499
0.1
[13]	valid_0's binary_logloss: 0.275098
0.1
[14]	valid_0's binary_logloss: 0.274731
0.1
[15]	valid_0's binary_logloss: 0.274426
0.1
[16]	valid_0's binary_logloss: 0.274142
0.1
[17]	valid_0's binary_logloss: 0.273853
0.1
[18]	valid_0's binary_logloss: 0.273577
0.1
[19]	valid_0's binary_logloss: 0.273356
0.1
[20]	valid_0's binary_logloss: 0.273155
0.1
[21]	valid_0's binary_logloss: 0.272957
0.1
[22]	valid_0's binary_logloss: 0.272774
0.1
[23]	valid_0's binary_logloss: 0.272629
0.1
[24]	valid_0's binary_logloss: 0.272461


Unnamed: 0,tree_index,node_depth,node_index,left_child,right_child,parent_index,split_feature,split_gain,threshold,decision_type,missing_direction,missing_type,value,weight,count
0,0,1,0-S0,0-S6,0-S1,,捨牌5,3620.010010,0||2||3||5||7||8,==,right,,-2.396230,0.000000,1898544
1,0,2,0-S6,0-L0,0-S18,0-S0,捨牌5,1445.369995,5,==,right,,-2.426860,30472.700000,398361
2,0,3,0-L0,,,0-S6,,,,,,,-2.503453,2271.751364,29698
3,0,3,0-S18,0-L7,0-S20,0-S6,捨牌4,445.558014,2||5||8,==,right,,-2.420650,28200.900000,368663
4,0,4,0-L7,,,0-S18,,,,,,,-2.467029,1924.234142,25155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16892,276,6,276-S21,276-L21,276-L22,276-S20,捨牌9,34.544998,0||3||4||9||14||16||17||19||20||21||22||26||27...,==,right,,-0.000018,7063.460000,90433
16893,276,7,276-L21,,,276-S21,,,,,,,-0.000421,6318.266824,81194
16894,276,7,276-L22,,,276-S21,,,,,,,0.003351,745.197600,9239
16895,276,4,276-L4,,,276-S3,,,,,,,-0.001340,3848.546771,48899


In [78]:
steps = 1  # handed to verbose_eval in the training call that follows
def helper(n):
    """Learning-rate schedule 0.1 / (1 + n // 50); echoes each rate to stdout."""
    divisor = n // 50 + 1
    lr = 0.1 / divisor
    print(lr)
    return lr
# Run on data/featurized (featurize only, no augmentation); extra keyword
# arguments are forwarded to lg.train.
train_and_test('data/featurized', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=steps)

New categorical_feature is ['リーチ前0', 'リーチ前1', 'リーチ前10', 'リーチ前11', 'リーチ前12', 'リーチ前13', 'リーチ前14', 'リーチ前15', 'リーチ前16', 'リーチ前17', 'リーチ前18', 'リーチ前19', 'リーチ前2', 'リーチ前3', 'リーチ前4', 'リーチ前5', 'リーチ前6', 'リーチ前7', 'リーチ前8', 'リーチ前9', '捨牌1', '捨牌10', '捨牌11', '捨牌12', '捨牌13', '捨牌14', '捨牌15', '捨牌16', '捨牌17', '捨牌18', '捨牌19', '捨牌2', '捨牌20', '捨牌3', '捨牌4', '捨牌5', '捨牌6', '捨牌7', '捨牌8', '捨牌9']


[LightGBM] [Info] Number of positive: 158454, number of negative: 1740090
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1394
[LightGBM] [Info] Number of data points in the train set: 1898544, number of used features: 72




0.1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083461 -> initscore=-2.396228
[LightGBM] [Info] Start training from score -2.396228
[1]	valid_0's binary_logloss: 0.283311
Training until validation scores don't improve for 20 rounds
0.1
[2]	valid_0's binary_logloss: 0.281484
0.1
[3]	valid_0's binary_logloss: 0.279944
0.1
[4]	valid_0's binary_logloss: 0.278617
0.1
[5]	valid_0's binary_logloss: 0.277474
0.1
[6]	valid_0's binary_logloss: 0.276483
0.1
[7]	valid_0's binary_logloss: 0.275626
0.1
[8]	valid_0's binary_logloss: 0.274875
0.1
[9]	valid_0's binary_logloss: 0.274202
0.1
[10]	valid_0's binary_logloss: 0.273619
0.1
[11]	valid_0's binary_logloss: 0.273108
0.1
[12]	valid_0's binary_logloss: 0.272665
0.1
[13]	valid_0's binary_logloss: 0.272251
0.1
[14]	valid_0's binary_logloss: 0.271893
0.1
[15]	valid_0's binary_logloss: 0.271573
0.1
[16]	valid_0's binary_logloss: 0.271298
0.1
[17]	valid_0's binary_logloss: 0.271042
0.1
[18]	valid_0's binary_logloss: 0.270808
0.1
[19]	valid_0's bin

Unnamed: 0,tree_index,node_depth,node_index,left_child,right_child,parent_index,split_feature,split_gain,threshold,decision_type,missing_direction,missing_type,value,weight,count
0,0,1,0-S0,0-S1,0-L1,,5萬捨てカウント,23658.000000,0.0,<=,left,,-2.396230,0.000000,1898544
1,0,2,0-S1,0-S2,0-S4,0-S0,8萬捨てカウント,5829.709961,0.0,<=,left,,-2.381140,127413.000000,1665631
2,0,3,0-S2,0-S3,0-S7,0-S1,2萬捨てカウント,4428.419922,0.0,<=,left,,-2.370170,100891.000000,1318918
3,0,4,0-S3,0-S6,0-S5,0-S2,捨牌8,3474.110107,0||3||7,==,right,,-2.359470,80017.800000,1046051
4,0,5,0-S6,0-S14,0-S11,0-S3,捨牌5,829.322998,0||3||4||6||7,==,right,,-2.379310,41993.100000,548964
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5546,90,4,90-S20,90-S21,90-L21,90-S18,リーチ前0,19.968700,1||3||6||9||10||16||19||24||31||32,==,right,,-0.018456,217.551000,191163
5547,90,5,90-S21,90-L19,90-L22,90-S20,捨牌4,26.772499,1||2||3||5||15||16||19||22||23||24||25||26||27...,==,right,,0.008491,51.430100,34154
5548,90,6,90-L19,,,90-S21,,,,,,,0.035370,29.192799,19304
5549,90,6,90-L22,,,90-S21,,,,,,,-0.026821,22.237314,14850


In [79]:
steps = 1  # handed to verbose_eval in the training call that follows
def helper(n):
    """Print and return the learning rate 0.1 / (1 + n // 50)."""
    block_index = n // 50
    value = 0.1 / (block_index + 1)
    print(value)
    return value
# Run on data/augmented (augmentation only, no featurization); extra keyword
# arguments are forwarded to lg.train.
train_and_test('data/augmented', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=steps)

New categorical_feature is ['捨牌1', '捨牌10', '捨牌11', '捨牌12', '捨牌13', '捨牌14', '捨牌15', '捨牌16', '捨牌17', '捨牌18', '捨牌19', '捨牌2', '捨牌20', '捨牌3', '捨牌4', '捨牌5', '捨牌6', '捨牌7', '捨牌8', '捨牌9']


[LightGBM] [Info] Number of positive: 2837988, number of negative: 31335804
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 626
[LightGBM] [Info] Number of data points in the train set: 34173792, number of used features: 20
0.1




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.083046 -> initscore=-2.401666
[LightGBM] [Info] Start training from score -2.401666
[1]	valid_0's binary_logloss: 0.283759
Training until validation scores don't improve for 20 rounds
0.1
[2]	valid_0's binary_logloss: 0.282294
0.1
[3]	valid_0's binary_logloss: 0.281051
0.1
[4]	valid_0's binary_logloss: 0.279942
0.1
[5]	valid_0's binary_logloss: 0.278982
0.1
[6]	valid_0's binary_logloss: 0.278111
0.1
[7]	valid_0's binary_logloss: 0.277339
0.1
[8]	valid_0's binary_logloss: 0.276637
0.1
[9]	valid_0's binary_logloss: 0.276002
0.1
[10]	valid_0's binary_logloss: 0.275446
0.1
[11]	valid_0's binary_logloss: 0.274924
0.1
[12]	valid_0's binary_logloss: 0.274468
0.1
[13]	valid_0's binary_logloss: 0.274051
0.1
[14]	valid_0's binary_logloss: 0.273672
0.1
[15]	valid_0's binary_logloss: 0.273289
0.1
[16]	valid_0's binary_logloss: 0.272976
0.1
[17]	valid_0's binary_logloss: 0.272705
0.1
[18]	valid_0's binary_logloss: 0.272436
0.1
[19]	valid_0's binary_

Unnamed: 0,tree_index,node_depth,node_index,left_child,right_child,parent_index,split_feature,split_gain,threshold,decision_type,missing_direction,missing_type,value,weight,count
0,0,1,0-S0,0-S12,0-S1,,捨牌5,62689.500000,2||5||8,==,right,,-2.401670,0.000000e+00,34173792
1,0,2,0-S12,0-L0,0-L13,0-S0,捨牌5,15244.599609,2||8,==,right,,-2.454720,2.051760e+05,2694396
2,0,3,0-L0,,,0-S12,,,,,,,-2.441068,1.640755e+05,2154660
3,0,3,0-L13,,,0-S12,,,,,,,-2.509178,4.110044e+04,539736
4,0,2,0-S1,0-S11,0-S2,0-S0,捨牌6,61214.199219,2||5||8,==,right,,-2.397130,2.397130e+06,31479396
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15306,250,5,250-S28,250-S29,250-L29,250-S15,捨牌5,100.550003,5||7||8,==,right,,-0.000062,9.588210e+05,12722733
15307,250,6,250-S29,250-L16,250-L30,250-S28,捨牌6,470.687988,2||3||4||5,==,right,,-0.000866,4.131170e+04,907242
15308,250,7,250-L16,,,250-S29,,,,,,,-0.007370,2.866549e+03,95988
15309,250,7,250-L30,,,250-S29,,,,,,,-0.000380,3.844512e+04,811254


In [10]:
# Tile names 1萬..9萬 followed by 東南西北白; train_and_save fits one model per entry.
TILES = [f'{rank}萬' for rank in range(1, 10)] + ['東', '南', '西', '北', '白']
# Label-column names: tile name plus the 待 suffix.
WAITS = [tile_name + '待' for tile_name in TILES]

def train_and_save(data_dir, model_dir='models', **train_params):
    """Train one binary LightGBM model per entry of ``WAITS`` and save each to disk.

    The train and validation ``lg.Dataset`` objects are created once and
    relabelled with ``set_label`` for every wait column, so the feature
    frames are only read once.

    Args:
        data_dir: directory holding ``train-data.feather``,
            ``train-label.feather``, ``validation-data.feather`` and
            ``validation-label.feather``.
        model_dir: directory that receives ``<index>.txt`` for each entry of
            ``WAITS``; created if it does not exist.
        **train_params: forwarded to ``lg.train``.  A ``params`` entry, if
            present, is merged over the built-in booster parameters rather
            than being forwarded as-is.
    """
    # Take the caller's booster overrides out of the kwargs so they are not
    # passed to lg.train twice.
    pp = train_params.pop('params', None) or {}

    # Create the output directory before the long training loop so that
    # save_model cannot fail on a missing folder afterwards.
    os.makedirs(model_dir, exist_ok=True)

    td = pd.read_feather(os.path.join(data_dir, 'train-data.feather'))
    vd = pd.read_feather(os.path.join(data_dir, 'validation-data.feather'))
    train_dataset = lg.Dataset(td)
    valid_dataset = lg.Dataset(vd)
    # The リーチ前* columns only exist when the data went through featurize().
    if 'リーチ前0' in td:
        categoricals = [f'捨牌{i+1}' for i in range(20)] + [f'リーチ前{i}' for i in range(20)]
    else:
        categoricals = [f'捨牌{i+1}' for i in range(20)]
    del td, vd
    gc.collect()

    tl = pd.read_feather(os.path.join(data_dir, 'train-label.feather'))
    vl = pd.read_feather(os.path.join(data_dir, 'validation-label.feather'))

    for i, tile in enumerate(WAITS):
        train_dataset.set_label(tl[tile])
        valid_dataset.set_label(vl[tile])
        params = {
            'objective': 'binary',}
        params.update(pp)
        model = lg.train(train_set=train_dataset,
                         valid_sets=[valid_dataset],
                         categorical_feature=categoricals,
                         params=params,
                         **train_params)
        model.save_model(os.path.join(model_dir, f'{i}.txt'))

In [6]:
def helper(n):
    """Stepwise learning-rate schedule: 0.1 / (1 + n // 50)."""
    stage = n // 50
    return 0.1 / (stage + 1)
# Train and save one model per WAITS entry on data/both; extra keyword
# arguments are forwarded to lg.train by train_and_save.
# NOTE(review): LightGBM 4.x dropped early_stopping_rounds / learning_rates /
# verbose_eval from lg.train in favour of callbacks — confirm the installed version.
train_and_save('data/both', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=20)

New categorical_feature is ['リーチ前0', 'リーチ前1', 'リーチ前10', 'リーチ前11', 'リーチ前12', 'リーチ前13', 'リーチ前14', 'リーチ前15', 'リーチ前16', 'リーチ前17', 'リーチ前18', 'リーチ前19', 'リーチ前2', 'リーチ前3', 'リーチ前4', 'リーチ前5', 'リーチ前6', 'リーチ前7', 'リーチ前8', 'リーチ前9', '捨牌1', '捨牌10', '捨牌11', '捨牌12', '捨牌13', '捨牌14', '捨牌15', '捨牌16', '捨牌17', '捨牌18', '捨牌19', '捨牌2', '捨牌20', '捨牌3', '捨牌4', '捨牌5', '捨牌6', '捨牌7', '捨牌8', '捨牌9']


[LightGBM] [Info] Number of positive: 1468152, number of negative: 32705640
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 34173792, number of used features: 74




[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042961 -> initscore=-3.103543
[LightGBM] [Info] Start training from score -3.103543
Training until validation scores don't improve for 20 rounds
[20]	valid_0's binary_logloss: 0.161273
[40]	valid_0's binary_logloss: 0.160097
[60]	valid_0's binary_logloss: 0.159976
[80]	valid_0's binary_logloss: 0.160117
Early stopping, best iteration is:
[67]	valid_0's binary_logloss: 0.159966
[LightGBM] [Info] Number of positive: 1901658, number of negative: 32272134
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 34173792, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.055647 -> initscore=-2.831478
[LightGBM] [Info] Start training from score -2.831478
Training until validation scores don't improve for 20 rounds
[20]	valid_0's binary_logloss: 0.201109
[40]	va

In [7]:
def helper(n):
    """Stepwise learning-rate schedule: 0.13 / (1 + n // 30)."""
    stage = n // 30
    return 0.13 / (stage + 1)
# Same training run on data/both, this time with the 0.13 / 30-round schedule
# defined in this cell; output files of train_and_save are overwritten.
train_and_save('data/both', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=20)

[LightGBM] [Info] Number of positive: 1468152, number of negative: 32705640
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM] [Info] Number of data points in the train set: 34173792, number of used features: 74
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.042961 -> initscore=-3.103543
[LightGBM] [Info] Start training from score -3.103543
Training until validation scores don't improve for 20 rounds
[20]	valid_0's binary_logloss: 0.160574
[40]	valid_0's binary_logloss: 0.160003
[60]	valid_0's binary_logloss: 0.159957
[80]	valid_0's binary_logloss: 0.159954
Early stopping, best iteration is:
[78]	valid_0's binary_logloss: 0.159946
[LightGBM] [Info] Number of positive: 1901658, number of negative: 32272134
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1390
[LightGBM]

In [None]:
def helper(n):
    """Stepwise learning-rate schedule: 0.075 / (1 + n // 40)."""
    stage = n // 40
    return 0.075 / (stage + 1)
# Same training run on data/both with the 0.075 / 40-round schedule defined
# in this cell; output files of train_and_save are overwritten.
train_and_save('data/both', early_stopping_rounds=20, num_boost_round=1000, learning_rates=helper, verbose_eval=20)