In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import sys

In [2]:
# Modulus of the label rule ``(x % P) % 2`` used by dataset(); the same value
# is also passed to np.random.seed() when the sample is drawn.
P = 2**15 + 3  # == 32771

In [3]:
# Seed the legacy global NumPy RNG so the sample, and therefore every split
# derived from it, is reproducible.
np.random.seed(P)

# Draw 4,000,000 values from [0, 2**32) and drop duplicates with set(), which
# is why slightly fewer than 4,000,000 values remain.
# dtype is spelled out because randint defaults to C long, which is 32-bit on
# Windows and rejects high=2**32 there. Where C long is already 64-bit this is
# the same dtype as the default, so the drawn values are unchanged.
random_sample = list(set(
    np.random.randint(0, 2**32, size=4000000, dtype=np.int64).tolist()
))

# In-place shuffle (driven by the same seeded RNG) so the slices taken later
# are random subsets rather than following the set's iteration order.
np.random.shuffle(random_sample)

# Sanity check shown as the cell output: number of unique values and their sum.
len(random_sample), np.array(random_sample).sum()

(3998155, 8583390742850828)

In [4]:
def dataset(xs, p=None):
    """Build the 32-bit feature matrix and the parity labels for some integers.

    Every value becomes one row of 32 binary features laid out like
    ``f'{x:032b}'``: column 0 holds the most significant bit and column 31 the
    least significant one. The label of a value is ``(x % p) % 2``.

    Parameters
    ----------
    xs : iterable of int
        Values in ``[0, 2**32)``. The iterable is materialised exactly once,
        so one-shot iterables (generators, ``map`` objects) yield features and
        labels of the same length.
    p : int, optional
        Modulus of the label rule. When omitted, the notebook-level constant
        ``P`` is used.

    Returns
    -------
    X : pandas.DataFrame
        int64 frame of shape ``(n, 32)`` with columns ``0..31``. An empty
        input gives a ``(0, 32)`` frame so the feature count never varies.
    y : pandas.Series
        int64 labels ``(x % p) % 2`` in the same row order as ``X``.

    Raises
    ------
    ValueError
        If the input is not a flat collection of integers, or if any value
        lies outside ``[0, 2**32)``. Wider values used to add extra columns
        silently, which made frames built from different inputs incompatible.
    """
    modulus = P if p is None else p

    # Materialise once: the input is needed for both the features and the labels.
    raw = np.asarray(list(xs))
    if raw.ndim != 1:
        raise ValueError('dataset() expects a flat collection of integers')
    if raw.size == 0:
        # np.asarray([]) is float64; force the integer dtype used below.
        values = np.zeros(0, dtype=np.int64)
    elif raw.dtype.kind in 'iu':
        # uint64 values >= 2**63 wrap to negatives here and are rejected below.
        values = raw.astype(np.int64)
    else:
        raise ValueError('dataset() expects integers, got dtype %s' % raw.dtype)

    if ((values < 0) | (values > 0xFFFFFFFF)).any():
        raise ValueError('dataset() expects values in [0, 2**32)')

    # Shift every value right by 31, 30, ..., 0 and keep the lowest bit: this
    # yields the bits MSB-first without formatting each value as a string.
    shifts = np.arange(31, -1, -1, dtype=np.int64)
    bits = values[:, None] >> shifts
    bits &= 1

    X = pd.DataFrame(bits)
    y = pd.Series((values % modulus) % 2)
    return X, y

In [5]:
# Three slices of the shuffled sample: the first 2,000,000 values are used for
# training, and the last 400,000 are split evenly into validation and test.
X_train, y_train = dataset(random_sample[:2_000_000])
X_val, y_val = dataset(random_sample[-400_000:-200_000])
X_test, y_test = dataset(random_sample[-200_000:])

In [6]:
# Report the feature-matrix and label shapes of every split, one line each.
for split_name, features, labels in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    print(split_name, features.shape, labels.shape)

train (2000000, 32) (2000000,)
val (200000, 32) (200000,)
test (200000, 32) (200000,)


In [7]:
# LightGBM settings: gradient-boosted trees on a binary objective, evaluated
# with the misclassification rate (binary_error).
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_error',
    'num_leaves': 31,
    'verbose': 1,
    'num_threads': 16,
}

train_data = lgb.Dataset(X_train, label=y_train)
# reference=train_data ties the validation set to the training Dataset.
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Log the metrics every 1000 rounds. The `verbose_eval` argument was removed
# from lgb.train() in LightGBM 4.0, so use the log_evaluation callback when
# the installed release provides it and fall back to `verbose_eval` otherwise.
if hasattr(lgb, 'log_evaluation'):
    logging_kwargs = {'callbacks': [lgb.log_evaluation(period=1000)]}
else:
    logging_kwargs = {'verbose_eval': 1000}

# No early stopping is configured: training runs for the full 30000 rounds
# unless interrupted. Both sets are listed so the log shows the training and
# the validation error side by side.
clf = lgb.train(params, train_data, num_boost_round=30000,
                valid_sets=[valid_data, train_data], **logging_kwargs)

[1000]	training's binary_error: 0.0183575	valid_0's binary_error: 0.02385
[2000]	training's binary_error: 0.010984	valid_0's binary_error: 0.020005
[3000]	training's binary_error: 0.007678	valid_0's binary_error: 0.01848
[4000]	training's binary_error: 0.0055555	valid_0's binary_error: 0.01779
[5000]	training's binary_error: 0.0037285	valid_0's binary_error: 0.017335
[6000]	training's binary_error: 0.0025345	valid_0's binary_error: 0.017295
[7000]	training's binary_error: 0.0015665	valid_0's binary_error: 0.01696
[8000]	training's binary_error: 0.00101	valid_0's binary_error: 0.01673
[9000]	training's binary_error: 0.0006275	valid_0's binary_error: 0.016535
[10000]	training's binary_error: 0.00036	valid_0's binary_error: 0.0166
[11000]	training's binary_error: 0.000198	valid_0's binary_error: 0.01646
[12000]	training's binary_error: 6.6e-05	valid_0's binary_error: 0.016465
[13000]	training's binary_error: 2.4e-05	valid_0's binary_error: 0.016485
[14000]	training's binary_error: 8.5e-06

In [10]:
# Decision threshold applied to the scores returned by clf.predict().
th = 0.5

# Accuracy per split: the fraction of rows whose thresholded score matches the
# label.
for split_name, features, labels in (
    ('train', X_train, y_train),
    ('val', X_val, y_val),
    ('test', X_test, y_test),
):
    predicted_positive = clf.predict(features) > th
    print(split_name, (predicted_positive == labels.values).mean())

train 1.0
val 0.98436
test 0.984025
