In [None]:
#export
from local.torch_basics import *
from local.test import *
from local.core import *
from local.data.all import *
from local.tabular.core import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp tabular.rapids

# Tabular with rapids

> Basic functions to preprocess tabular data before assembling it in a `DataBunch` on the GPU.

In [None]:
#export
try: import cudf,nvcategory
except: print("This requires rapids, see https://rapids.ai/ for installation details")

In [None]:
#export
@patch
def __array__(self:cudf.DataFrame, dtype=None):
    "Return the frame as a NumPy array by converting it to pandas first; `dtype` is forwarded to pandas."
    # The array protocol expects an ndarray back, so both `to_pandas` and `__array__` must be *called*
    return self.to_pandas().__array__(dtype)

In [None]:
class TabularGPU(Tabular):
    "`Tabular` subclass that currently adds no behavior of its own."

## TabularProcessors

In [None]:
#export
def _to_str(c): return c if c.dtype == "object" else c.astype("str")
def _remove_none(c):
    if None in c: c.remove(None)
    return c

In [None]:
#export
class CategorifyGPU(TabularProc, CollBase):
    "Transform the categorical variables to that type."
    order = 1
    def setups(self, to):
        # One nvcategory key set per categorical column, built from the rows `to.loc[:to.split]`
        keys = {}
        for n in to.all_cat_names:
            strs = _to_str(to.loc[:to.split,n])
            keys[n] = nvcategory.from_strings(strs.data).keys()
        self.items = keys
        # Host-side copy of each key set (minus a `None` key, if any) wrapped in a `CategoryMap` with `add_na=True`
        vocab = {n: CategoryMap(_remove_none(k.to_host()), add_na=True) for n,k in self.items.items()}
        self.classes = to.classes = vocab

    def _apply_cats(self, c):
        # Re-key the column on the keys stored for `c.name`, then shift every resulting value up by one
        cat = nvcategory.from_strings(_to_str(c).data)
        codes = cat.set_keys(self[c.name]).values()
        return cudf.Series(codes).add(1)

    def encodes(self, to):
        for n in to.all_cat_names:
            to.set_col(n, self._apply_cats(to.items[n]))

    def _decode_cats(self, c):
        # Look each integer up by position in the class list recorded for this column
        idx2class = dict(enumerate(self.classes[c.name].items))
        return c.map(idx2class)

    def decodes(self, to): to.transform(to.all_cat_names, self._decode_cats)

In [None]:
show_doc(CategorifyGPU, title_level=3)

<h3 id="CategorifyGPU" class="doc_header"><code>class</code> <code>CategorifyGPU</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>CategorifyGPU</code>(**`enc`**=*`None`*, **`dec`**=*`None`*, **`filt`**=*`None`*, **`as_item`**=*`False`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Transform the categorical variables to that type.

In [None]:
# Set up on a frame whose single categorical column holds the values 0, 1 and 2
cat = CategorifyGPU()
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,0,2]}))
to = Tabular(df, 'a')

cat.setup(to)
# The stored keys are strings, and the values written back to `df` start at 1
test_eq(list(cat.items['a'].to_host()), ['0','1','2'])
test_eq(df['a'].to_array(), np.array([1,2,3,1,3]))
# Apply the already set-up proc to a second frame containing unseen values (3 and -1)
df1 = cudf.from_pandas(pd.DataFrame({'a':[1,0,3,-1,2]}))
to1 = Tabular(df1, 'a')
cat(to1)
#Values that weren't in the training df are sent to 0 (na)
test_eq(df1['a'].to_array(), np.array([2,1,0,0,3]))

#Test decode
# Decoding is checked on a pandas copy of the encoded frame
to2 = Tabular(df1.to_pandas(), 'a')
cat.decode(to2)
test_eq(to2.a, np.array(['1','0','#na#','#na#','2']))

In [None]:
# Same check with `split=3`: the expected keys exclude 3, and the row holding 3 is expected to be encoded as 0
cat = CategorifyGPU()
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,2]}))
to = Tabular(df, 'a', split=3)
cat.setup(to)
test_eq(list(cat.items['a'].to_host()), ["0","1","2"])
test_eq(df['a'].to_array(), np.array([1,2,3,0,3]))

In [None]:
#TODO Categorical
cat = CategorifyGPU()
df = cudf.from_pandas(pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)}))
to = Tabular(df, 'a')
cat.setup(to)
#Fails for now
#test_eq(cat['a'].to_host(), ['H','M','L'])
#test_eq(df["a"].to_array(), [2,1,3,2])

<cudf.DataFrame ncols=1 nrows=4 >

In [None]:
#export
class NormalizeGPU(TabularProc):
    "Normalize the continuous variables."
    order = 2
    def setups(self, to):
        # Record, per continuous column, the mean and std of the rows `to.loc[:to.split]`
        self.means,self.stds = {},{}
        for n in to.cont_names:
            # Read from `to` (like the other procs in this file), never from a notebook-level `df`
            col = to.loc[:to.split,n]
            # `ddof=0` gives the population std; the added 1e-7 keeps the later division away from zero
            self.means[n],self.stds[n] = col.mean(),col.std(ddof=0)+1e-7

    def encodes(self, to):
        # Standardize each continuous column with the statistics recorded in `setups`
        for n in to.cont_names: to.set_col(n, (to.items[n]-self.means[n])/self.stds[n])

    #def decodes(self, to):
    #    for n in to.cont_names: to.set_col(n, (to.items[n]*self.stds[n])+self.means[n])
    # Inverse of `encodes`, applied to all continuous columns at once through `to.conts`
    def decodes(self, to): to.conts = (to.conts*self.stds ) + self.means

In [None]:
show_doc(NormalizeGPU, title_level=3)

<h3 id="NormalizeGPU" class="doc_header"><code>class</code> <code>NormalizeGPU</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>NormalizeGPU</code>(**`enc`**=*`None`*, **`dec`**=*`None`*, **`filt`**=*`None`*, **`as_item`**=*`False`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Normalize the continuous variables.

In [None]:
# Set up on a single continuous column and compare against statistics computed with NumPy
norm = NormalizeGPU()
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,4]}))
to = Tabular(df, cont_names='a')
norm.setup(to)
x = np.array([0,1,2,3,4])
# `x.std()` is NumPy's default (ddof=0) std, the same convention `NormalizeGPU.setups` uses
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(df['a'].to_array(), (x-m)/s)
# A second frame is expected to be normalized with the statistics from the first one
df1 = cudf.from_pandas(pd.DataFrame({'a':[5,6,7]}))
to1 = Tabular(df1, cont_names='a')
norm(to1)
test_close(df1['a'].to_array(), (np.array([5,6,7])-m)/s)
# Decoding is checked on a pandas copy and should give back the original values
to2 = Tabular(df1.to_pandas(), cont_names='a')
to2 = norm.decode(to2)
test_close(to2.a, [5,6,7])

In [None]:
# With `split=3` the expected mean/std come from the first three values only, while all five rows get normalized
norm = NormalizeGPU()
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,4]}))
to = Tabular(df, cont_names='a', split=3)
norm.setup(to)
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means, {'a': m})
test_close(norm.stds['a'], s)
test_close(df['a'].to_array(), (np.array([0,1,2,3,4])-m)/s)

In [None]:
#export
def get_median(col):
    "Get the median of a cudf Series `col`"
    # Nulls are ignored. For an even number of values this returns the upper of the two
    # middle values (element `len//2` of the sorted data), not their average.
    col = col.dropna()
    # `sort_values` keeps the pre-sort index labels, so the index is reset *after* sorting:
    # that way `[len(col)//2]` picks the middle of the sorted data whether the lookup is
    # label-based or positional.
    ordered = col.sort_values().reset_index(drop=True)
    return ordered[len(ordered)//2]

In [None]:
#export
class FillStrategyGPU:
    "Namespace containing the various filling strategies."
    # Each strategy takes the column `c` and a constant `fill`, and returns the value to fill with.
    def median(c, fill):
        "Value given by `get_median(c)`; `fill` is ignored."
        return get_median(c)

    def constant(c, fill):
        "The constant `fill` itself; `c` is ignored."
        return fill

    def mode(c, fill):
        "First index entry of the value counts of the non-null part of `c`; `fill` is ignored."
        counts = c.dropna().value_counts()
        return counts.index[0]

In [None]:
#export
class FillMissingGPU(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategyGPU.median, add_col=True, fill_vals=None):
        # `fill_strategy(col, fill)` returns the fill value for a column; `add_col` adds a `<name>_na` flag column.
        # With no `fill_vals`, every column's constant defaults to 0 (this is what `FillStrategyGPU.constant` returns).
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')

    def setups(self, to):
        # Record a fill value for every continuous column that has a null in the rows `to.loc[:to.split]`
        self.na_dict = {}
        for n in to.cont_names:
            col = to.loc[:to.split, n]
            if col.isnull().any(): self.na_dict[n] = self.fill_strategy(col, self.fill_vals[n])

    def encodes(self, to):
        for n in to.cont_names:
            if n in self.na_dict:
                if self.add_col:
                    # Flag the missing rows before they are overwritten, and register the flag as a categorical column
                    to.items[n+'_na'] = to.items[n].isnull()
                    if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')
                to.set_col(n, to.items[n].fillna(self.na_dict[n]))
            # Inspect the frame being processed, not a notebook-level `df`
            elif to.items[n].isnull().any():
                raise Exception(f"nan values in `{n}` but not in setup training set")

In [None]:
show_doc(FillMissingGPU, title_level=3)

<h3 id="FillMissingGPU" class="doc_header"><code>class</code> <code>FillMissingGPU</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>FillMissingGPU</code>(**`fill_strategy`**=*`'median'`*, **`add_col`**=*`True`*, **`fill_vals`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Fill the missing values in continuous columns.

In [None]:
# One proc per strategy: median, constant and mode
fill1,fill2,fill3 = (FillMissingGPU(fill_strategy=s) 
                     for s in [FillStrategyGPU.median, FillStrategyGPU.constant, FillStrategyGPU.mode])
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]}))
df1 = df.copy(); df2 = df.copy()
to,to1,to2 = Tabular(df, cont_names='a'),Tabular(df1, cont_names='a'),Tabular(df2, cont_names='a')
fill1.setup(to); fill2.setup(to1); fill3.setup(to2)
# Expected fill values: 2 for median, 0 for constant (no `fill_vals` given), 1 for mode
test_eq(fill1.na_dict, {'a': 2.})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

# `add_col` defaults to True, so an `a_na` categorical column is expected on every object
for t in [to, to1, to2]: test_eq(t.cat_names, ['a_na'])

for to_,v in zip([to, to1, to2], [2., 0., 1.]):
    test_eq(to_.items['a'].to_array(), np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_.items['a_na'].to_array(), np.array([0, 0, 1, 0, 0, 0, 0]))
    
# Apply the set-up procs to a new frame: nulls should receive the fill values recorded above
dfa = cudf.from_pandas(pd.DataFrame({'a':[np.nan,0,np.nan]}))
dfa1 = dfa.copy(); dfa2 = dfa.copy()
to,to1,to2 = Tabular(dfa, cont_names='a'),Tabular(dfa1, cont_names='a'),Tabular(dfa2, cont_names='a')
fill1(to); fill2(to1); fill3(to2)
for to_,v in zip([to, to1, to2], [2., 0., 1.]):
    test_eq(to_.items['a'].to_array(), np.array([v, 0, v]))
    test_eq(to_.items['a_na'].to_array(), np.array([1, 0, 1]))

## Tabular Pipelines -

In [None]:
# The procs are deliberately listed out of order; `noop` stands in for a plain function
procs = [NormalizeGPU(), CategorifyGPU(), FillMissingGPU(), noop]
proc = Pipeline(procs)

#Test reordering and partialize
test_eq(L(proc.fs).mapped(type), [FillMissingGPU, Transform, CategorifyGPU, NormalizeGPU])

# `a` is categorical, `b` is continuous and has one missing value
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]}))
to = Tabular(df, 'a', 'b')

#Test setup and apply on df_trn
proc.setup(to)
test_eq(to.items['a'].to_array(), [1,2,3,2,2,3,1])
# The added `b_na` flag column is expected to be categorified as well (False -> 1, True -> 2)
test_eq(to.items['b_na'].to_array(), [1,1,2,1,1,1,1])
# Expected `b` after filling: the missing entry becomes 2, then the column is normalized
x = np.array([0,1,2,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.items['b'].to_array(), (x-m)/s)
test_eq(proc.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True']})

In [None]:
#Test apply on y_names
# Same pipeline as the previous test, with an extra string column `c` passed as `y_names`
procs = [NormalizeGPU(), CategorifyGPU(), FillMissingGPU(), noop]
proc = Pipeline(procs)

df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']}))
to = Tabular(df, 'a', 'b', y_names='c')

proc.setup(to)
# `c` is not expected in `cat_names`, yet it is expected to be encoded (a -> 1, b -> 2) and to appear in `classes`
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.items['a'].to_array(), [1,2,3,2,2,3,1])
test_eq(to.items['b_na'].to_array(), [1,1,2,1,1,1,1])
test_eq(to.items['c'].to_array(), [2,1,2,1,1,2,1])
x = np.array([0,1,2,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.items['b'].to_array(), (x-m)/s)
test_eq(proc.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True'], 'c': ['#na#','a','b']})

Pass the same `splits` as you will use for splitting the data, so that the setup is only done on the training set. `cat_names` are the names of the categorical variables, `cont_names` the continuous ones, `cat_y` are the names of the dependent variables that are categories. If `inplace=True`, processing is applied inplace, otherwise it creates a copy of `df`.

In [None]:
#export
from torch.utils.dlpack import from_dlpack

class ReadTabBatchGPU(ItemTransform):
    "Turn the `cats`/`conts`/`targ` of a tabular object into torch tensors through DLPack, and back into a `Tabular`."
    def __init__(self, proc): self.proc = proc

    def encodes(self, to):
        # Each part goes through `to_dlpack`/`from_dlpack`; cats and targets are cast to long, conts to float
        cats  = from_dlpack(to.cats.to_dlpack()).long()
        conts = from_dlpack(to.conts.to_dlpack()).float()
        targ  = from_dlpack(to.targ.to_dlpack()).long()
        return (cats,conts), targ

    def decodes(self, o):
        (cats,conts),targs = to_np(o)
        # Rebuild one pandas column per name: categorical columns first, then continuous, then the target
        cols = {}
        for i,c in enumerate(self.proc.cat_names ): cols[c] = cats [:,i]
        for i,c in enumerate(self.proc.cont_names): cols[c] = conts[:,i]
        cols[self.proc.y_names] = targs
        df = pd.DataFrame(cols)
        y_is_cat = self.proc.cat_y is not None
        to = Tabular(df, self.proc.cat_names, self.proc.cont_names, self.proc.y_names, is_y_cat=y_is_cat)
        # Let the processing pipeline undo its encodings on the rebuilt object
        return self.proc.decode(to)

In [None]:
#export
@delegates()
class TabDataLoaderGPU(TfmdDL):
    "`TfmdDL` that indexes whole batches out of `dataset.items` and adds `ReadTabBatchGPU(proc)` to `after_batch`."
    do_item = noops
    def __init__(self, dataset, proc, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # `ReadTabBatchGPU` is appended after any batch transforms supplied by the caller
        batch_tfms = L(after_batch)+ReadTabBatchGPU(proc)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=batch_tfms, num_workers=num_workers, **kwargs)

    def create_batch(self, b):
        # `b` is used directly as an index into `dataset.items`
        return self.dataset.items[b]

## Integration example

In [None]:
# Load `adult.csv` with pandas and move it to a cudf frame
path = untar_data(URLs.ADULT_SAMPLE)
df = cudf.from_pandas(pd.read_csv(path/'adult.csv'))
# The first 10000 rows form the training frame, the remaining rows the test frame
df_trn,df_tst = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_trn.head()

<cudf.DataFrame ncols=15 nrows=5 >

In [None]:
# Column roles and the GPU processors used for the adult sample
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [CategorifyGPU(), FillMissingGPU(), NormalizeGPU()]

# NOTE(review): no seed is passed here, so the split presumably differs between runs — confirm
splits = RandomSplitter()(range_of(df_trn))

In [None]:
# Time the processing of the training frame; `process_df` is not defined in this notebook
%time to,proc = process_df(df_trn, procs, splits=splits, cat_names=cat_names, cont_names=cont_names, y_names="salary")

CPU times: user 270 ms, sys: 8.39 ms, total: 278 ms
Wall time: 277 ms


In [None]:
# Two contiguous index ranges: the first `len(splits[0])` positions, then the rest up to 10000 (the size of `df_trn`)
# NOTE(review): this presumably relies on `process_df` having put the training rows first — confirm
filts = [list(range(len(splits[0]))), list(range(len(splits[0]), 10000))]
dsrc = DataSource(to, filts=filts, tfms=[None])
# Batches are drawn from the validation subset of `dsrc`
dl = TabDataLoaderGPU(dsrc.valid, proc, bs=64, num_workers=0)

In [None]:
# Display a batch drawn from `dl`
dl.show_batch()

  return cpp_dlpack.to_dlpack(gdf_cols)


Unnamed: 0,age,fnlwgt,education-num,workclass,education,marital-status,occupation,relationship,race,education-num_na,salary
0,35.0,153926.0,9.0,?,HS-grad,Married-civ-spouse,Adm-clerical,Wife,Black,False,<50k
1,30.0,75167.0,13.0,Private,Bachelors,Married-civ-spouse,Farming-fishing,Husband,White,False,<50k
2,37.0,254973.0,9.0,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Wife,White,False,>=50k
3,47.0,379118.0,4.0,Private,7th-8th,Married-civ-spouse,Exec-managerial,Husband,Black,False,>=50k
4,43.0,34007.0,13.0,Self-emp-not-inc,Bachelors,Married-civ-spouse,Handlers-cleaners,Husband,White,False,>=50k
5,19.0,301911.0,10.0,Private,Some-college,Never-married,Tech-support,Own-child,Asian-Pac-Islander,False,<50k
6,63.0,137192.0,13.0,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,False,<50k
7,20.0,197496.0,10.0,Private,Some-college,Never-married,Tech-support,Own-child,White,False,<50k
8,63.0,188914.0,9.0,Private,HS-grad,Widowed,Other-service,Other-relative,Black,False,<50k
9,40.0,196029.0,9.0,Private,HS-grad,Divorced,,Unmarried,White,False,<50k


In [None]:
#not working yet

In [None]:
#dsrc = DataSource(df1, filts=splits, tfms=[[ReadTabLine(proc)], [ReadTabTarget(proc)]])

In [None]:
#dbch = dsrc.databunch(bs=64)
#dbch.show_batch()

## Export -

In [None]:
#hide
# Run the notebook-to-script export with `all_fs=True`
from local.notebook.export import notebook2script
notebook2script(all_fs=True)