In [1]:
from time import perf_counter
import numpy as np
import pandas as pd
import random
from numba import jit
import string

from lightautoml_gpu.reader.gpu.seq_reader_gpu import DictToCudfSeqReader
from lightautoml_gpu.reader.gpu.seq_reader_gpu import DictToDaskCudfSeqReader
from lightautoml_gpu.reader.base import DictToPandasSeqReader
from lightautoml_gpu.tasks import Task

from lightautoml_gpu.transformers.seq import SeqNumCountsTransformer
from lightautoml_gpu.transformers.seq import SeqStatisticsTransformer
from lightautoml_gpu.transformers.seq import GetSeqTransformer
from lightautoml_gpu.transformers.gpu.seq_gpu import SeqNumCountsTransformerGPU
from lightautoml_gpu.transformers.gpu.seq_gpu import SeqStatisticsTransformerGPU
from lightautoml_gpu.transformers.gpu.seq_gpu import GetSeqTransformerGPU

RANDS_CHARS = np.array(list(string.ascii_letters + string.digits),
                       dtype=(np.str_, 1))

@jit(nopython=True)
def gen_cols(n_cols):
    cols = [""]*n_cols
    for i in range(n_cols):
        cols[i] = "col_" + str(i)
    return cols

def gen_string_data(n, n_str):
    string_db = ["algorithm", "analog", "app", "application", "array",
                 "backup", "bandwidth", "binary", "bit", "byte"]#,
                 #"bitmap", "blog", "bookmark", "boot", "broadband",
                 #"browser" , "buffer", "bug"]
    inds = np.random.randint(0, len(string_db), (n, n_str))
    output = np.empty(inds.shape, dtype=object)
    for i in range(inds.shape[0]):
        for j in range(inds.shape[1]):
            output[i][j] = string_db[inds[i][j]]

    return output

def gen_data_single_target(n: int, n_num: int, n_cat: int, n_date: int,
         n_str: str, max_n_cat: int, n_ids: int, max_ids: list = None,
         cols: list = None):
    n_cols = n_num+n_cat+n_str+n_date+n_ids
    cols = gen_cols(n_cols) if cols is None else cols
    data = np.random.random((n, n_num))*100-50

    category_data = np.random.randint(0, np.random.randint(1,max_n_cat),
                                      (n, n_cat))
    string_data = gen_string_data(n, n_str)

    string_data = np.reshape(string_data, (n, n_str))

    date_data = np.random.randint(0, 1000, (n, n_date))\
                               .astype(np.dtype("timedelta64[D]")) \
                              + np.datetime64("2018-01-01")

    if max_ids is None:
        id_data = np.arange(n, dtype=int)[:, np.newaxis]\
                  *np.ones(n_ids, dtype=int)[:, np.newaxis].T
        for elem in id_data.T:
            np.random.shuffle(elem)
    else:
        id_data = np.array(np.random.random((n, n_ids))*max_ids//1,
                           dtype=int)

    data = pd.DataFrame(data, columns = cols[:n_num]).astype('f')
    
    ix = [(row, col) for row in range(data.shape[0]) \
                     for col in range(data.shape[1])]
    #for row, col in random.sample(ix, int(round(.1*len(ix)))):
    #    data.iat[row, col] = np.nan
    
    nn = len(data.columns)
    for i in range(n_cat):
        data[cols[nn+i]] = pd.Series(category_data[:,i]).astype('f')
    nn = len(data.columns)
    for i in range(n_str):
        data[cols[nn+i]] = pd.Series(string_data[:,i]).astype(object)
    nn = len(data.columns)
    for i in range(n_date):
        data[cols[nn+i]] = pd.Series(date_data[:,i])
    nn = len(data.columns)
    for i in range(n_ids):
        data[cols[nn+i]] = pd.Series(id_data[:, i])

    data['TARGET'] = pd.Series(np.random.randint(0, 5, n)).astype('i')

    return 'TARGET', cols, data

## Data preparation
n, n_num, n_cat, n_date, n_str = 5000, 3, 2, 2, 1
max_n_cat, n_ids = 10, 1
cols_data1 = ["a","b","c","d","e","str1",
              "date1", "date2", "data1_id"]
_, _, data1 = gen_data_single_target(n, n_num, n_cat, 
              n_date, n_str, max_n_cat, n_ids, cols=cols_data1)

n, n_num, n_cat, n_date, n_str = 3500000, 2, 2, 0, 0
max_n_cat, n_ids = 5, 1
cols_data2 = ["h","i","j","k", "data2_id"]
_, _, data2 = gen_data_single_target(n, n_num, n_cat, 
              n_date, n_str, max_n_cat, n_ids, cols=cols_data2)

max_ids = [50, 100]
n, n_num, n_cat, n_date = 5000000, 4, 6, 2
n_str, max_n_cat, n_ids = 2, 15, 2
target, cols, train = gen_data_single_target(n, n_num, n_cat, 
                     n_date, n_str, max_n_cat, n_ids, max_ids)

n = 200
_, _, test = gen_data_single_target(n, n_num, n_cat, 
                     n_date, n_str, max_n_cat, n_ids, max_ids)
seq_params = {
         'data1':{'case': 'ids',
                  'params': {},
                  'scheme': {'to': 'plain', 
                             'from_id': 'data1_id',
                             'to_id': 'col_14'},
                 },
         'data2':{'case': 'ids',
                  'params': {},
                  'scheme': {'to': 'plain',
                             'from_id': 'data2_id',
                             'to_id': 'col_15'},
                      },
          }
seq_data = {'data1': data1[cols_data1],
            'data2': data2[cols_data2]              
           }
X_train = {'plain':train , 
           'seq': seq_data
          }
X_test = {'plain':test , 
           'seq': seq_data
          }
name = 'data2'

## Data preparation finished

task = Task('reg', metric='mae')
task_gpu = Task('reg', metric='mae', device='gpu')
task_mgpu = Task('reg', metric='mae', device='mgpu')
roles={'target': target}

reader = DictToPandasSeqReader(task=task, seq_params=seq_params)

In [2]:
res = reader.fit_read(X_train, roles=roles)

  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)


In [3]:
from lightautoml_gpu.reader.gpu.seq_gpu import IDSIndGPU, TopIndGPU
import cudf

In [4]:
name = 'data2'

ids_gpu = IDSIndGPU(
    scheme = reader.seq_params[name].get("scheme", None),
    **reader.seq_params[name]["params"])

In [5]:
cpu_data = res.seq_data[name].data
cudf_data = cudf.DataFrame.from_pandas(cpu_data, nan_as_null=False)
cudf_train = cudf.DataFrame.from_pandas(train, nan_as_null=False)

In [6]:
cpu_data.shape

(3500000, 3)

In [7]:
%%time

ids = cpu_data.reset_index().groupby(ids_gpu.scheme['from_id'])['index'].apply(list).to_dict()

CPU times: user 36.1 s, sys: 269 ms, total: 36.4 s
Wall time: 36.4 s


In [8]:
%%time

r = cudf_data.reset_index().groupby(ids_gpu.scheme['from_id'])['index'].agg('collect').to_pandas().to_dict()

CPU times: user 1.65 s, sys: 164 ms, total: 1.81 s
Wall time: 1.81 s


In [9]:
%%time

res1 = cudf_train[ids_gpu.scheme["to_id"]].to_pandas().map(ids).values


CPU times: user 1.02 s, sys: 55.8 ms, total: 1.08 s
Wall time: 1.08 s


In [10]:
%%time

res2 = cudf_train[ids_gpu.scheme["to_id"]].map(ids).to_pandas().values

CPU times: user 1.2 s, sys: 195 ms, total: 1.39 s
Wall time: 1.4 s


In [11]:
res1

array([list([3002905]), list([3174624]), list([1629428]), ...,
       list([1783702]), list([1885678]), list([1555375])], dtype=object)

In [12]:
res2

array([array([3002905]), array([3174624]), array([1629428]), ...,
       array([1783702]), array([1885678]), array([1555375])], dtype=object)

In [13]:
task = Task('reg', metric='mae')
task_gpu = Task('reg', metric='mae', device='gpu')
task_mgpu = Task('reg', metric='mae', device='mgpu')
roles={'target': target}

reader = DictToPandasSeqReader(task=task, seq_params=seq_params)    
res = reader.fit_read(X_train, roles=roles)
reader_gpu = DictToCudfSeqReader(task=task_gpu,
                                seq_params=seq_params, n_jobs=1)
res_gpu = reader_gpu.fit_read(X_train, roles=roles)
reader_mgpu = DictToDaskCudfSeqReader(task=task_mgpu, cv=3,
               n_jobs = 1, npartitions=2, seq_params=seq_params)
#res_mgpu = reader_mgpu.fit_read(X_train, roles=roles)

counts = SeqNumCountsTransformer()
counts_gpu = SeqNumCountsTransformerGPU()

stats = SeqStatisticsTransformer()
stats_gpu = SeqStatisticsTransformerGPU()

seq = GetSeqTransformer(name=name)
seq_gpu = GetSeqTransformerGPU(name=name)

  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)
  folds_prior = (f_sum.sum() - f_sum) / (f_count.sum() - f_count)


In [14]:
50+35      74 ms, 7.19 ms      116 ms, 45.4 ms          3.43 ms, 1.02 ms
500+350    710 63               1 s 494                 10 ms, 1.04 ms
5m+3.5m    7   490             10 5                     10 ms, 1.1

SyntaxError: invalid syntax (3307397290.py, line 1)

In [15]:
%%time

counts.fit(res.seq_data[name])
out_counts = counts.transform(res.seq_data[name])

CPU times: user 7.08 s, sys: 8.12 ms, total: 7.09 s
Wall time: 7.09 s


In [16]:
%%time

counts_gpu.fit(res_gpu.seq_data[name])
out_counts_gpu = counts_gpu.transform(res_gpu.seq_data[name])


CPU times: user 465 ms, sys: 23.7 ms, total: 489 ms
Wall time: 490 ms


In [17]:
%%time

#counts_gpu.fit(res_mgpu.seq_data[name])
#out_counts_mgpu = counts_gpu.transform(res_mgpu.seq_data[name])


CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs


In [18]:
%%time

stats.fit(res.seq_data[name])
out_stats = stats.transform(res.seq_data[name])

CPU times: user 9.96 s, sys: 449 ms, total: 10.4 s
Wall time: 10.3 s


In [19]:
%%time

stats_gpu.fit(res_gpu.seq_data[name])
out_stats_gpu = stats_gpu.transform(res_gpu.seq_data[name])

CPU times: user 4.74 s, sys: 289 ms, total: 5.03 s
Wall time: 5.03 s


In [20]:
%%time

#stats_gpu.fit(res_mgpu.seq_data[name])
#out_stats_mgpu = stats_gpu.transform(res_mgpu.seq_data[name])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 2.62 µs


In [21]:
np.allclose(out_stats_gpu.data.values_host, out_stats.data)

True

In [26]:
%%time

seq.fit(res)
out_seq = seq.transform(res)

CPU times: user 3.74 ms, sys: 7.36 ms, total: 11.1 ms
Wall time: 9.39 ms


In [27]:
%%time

seq_gpu.fit(res_gpu)
out_seq_gpu = seq_gpu.transform(res_gpu)


CPU times: user 377 µs, sys: 19 µs, total: 396 µs
Wall time: 401 µs


In [24]:
%%time

#seq_gpu.fit(res_mgpu)
#out_seq_mgpu = seq_gpu.transform(res_mgpu)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.68 µs
