In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.all import *
from local.notebook.showdoc import show_doc

In [None]:
#export
# Make pandas raise an error (rather than only warn) whenever a chained assignment is detected.
pd.set_option('mode.chained_assignment','raise')

In [None]:
#default_exp tabular.core

# Tabular core

> Basic functions to preprocess tabular data before assembling it in a `DataBunch`.

## TabularProc -

We use this class to preprocess tabular data. `cat_names` should contain the names of the categorical variables in your dataframe, `cont_names` the names of the continuous variables. If you don't need any state, you can initialize a `TabularProc` with a `func` to be applied on the dataframes. Otherwise you should subclass and implement `setup` and `__call__`.

In [None]:
#export
class Tabular(CollBase):
    "A dataframe wrapper that remembers its categorical/continuous column names, its categorical target and its `splits`"
    def __init__(self, df, cat_names=None, cont_names=None, cat_y=None, splits=None):
        super().__init__(df)
        # Without `splits` we store a single `None`; the processors in this file read `splits[0]`
        # and fall back to every row when it is `None`.
        self.splits = L(ifnone(splits,[None]))
        self.cat_names = L(cat_names)
        self.cont_names = L(cont_names)
        self.cat_y = cat_y

    def __setitem__(self,k,v):
        "Assign `v` at key `k`, turning an `L` key into a plain list first"
        key = list(k) if isinstance(k,L) else k
        super().__setitem__(key, v)

    def transform(self, cols, f):
        "Replace `self[cols]` with the result of calling `.transform(f)` on it"
        self[cols] = self[cols].transform(f)

    @property
    def loc(self):
        "The `loc` indexer of the wrapped `items`"
        return self.items.loc

    @property
    def iloc(self):
        "The `iloc` indexer of the wrapped `items`"
        return self.items.iloc

    @property
    def all_cat_names(self):
        "Categorical column names followed by the categorical target `cat_y`"
        return self.cat_names + self.cat_y

    @property
    def all_col_names(self):
        "Continuous column names followed by `all_cat_names`"
        return self.cont_names + self.all_cat_names

In [None]:
#export
def _add_prop(cls, nm):
    prop = property(lambda o: o.items[list(getattr(o,nm+'_names'))])
    setattr(cls, nm+'s', prop)
    def _f(o,v): o.items[list(getattr(o,nm+'_names'))] = v
    setattr(cls, nm+'s', prop.setter(_f))

# Generate the `cats`, `all_cats`, `conts` and `all_cols` properties on `Tabular`
# (each one reads/writes the columns listed in the matching `*_names` attribute).
_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'all_cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'all_col')

In [None]:
#export
class TabularProc(Transform):
    "Base class to write a tabular processor for dataframes"
    # Placeholder: subclasses are expected to define `process(to)`. The default `NotImplemented`
    # is not callable, so `encodes` fails on a subclass that forgets to override it.
    process = NotImplemented
    def __init__(self): super().__init__()
    def encodes(self, to, **kwargs):
        # Run `process` for its effect on `to` (the subclasses in this file modify `to` in place),
        # then hand the same object back.
        self.process(to)
        return to

In [None]:
#export
class Categorify(TabularProc, CollBase):
    "Transform the categorical variables to that type."
    order = 1
    def setup(self, to):
        "Build one `CategoryMap` per categorical column from the rows of the first split (every row if it is `None`)"
        vocab = {}
        for n in to.all_cat_names:
            vocab[n] = CategoryMap(to.loc[ifnone(to.splits[0], slice(None)),n])
        # The same dict is shared between the processed object and this transform.
        to.classes = self.items = vocab

    def _apply_cats(self, c):
        "Encode column `c`: category codes shifted by one for a categorical dtype, otherwise a lookup in the stored `o2i`"
        if is_categorical_dtype(c): return c.cat.codes+1
        return c.map(self[c.name].o2i)

    def process(self, to):
        "Encode every categorical column of `to` (target included)"
        to.transform(to.all_cat_names, self._apply_cats)

    def decodes(self, to):
        # `to.items` is read as a (cats, conts) pair here; only the categorical part is mapped back.
        cats = []
        for v,c in zip(to.items[0], to.cat_names): cats.append(self[c][v])
        to.items = (cats, to.items[1])
        return to

In [None]:
show_doc(Categorify, title_level=3)

<h3 id="Categorify" class="doc_header"><code>class</code> <code>Categorify</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/40_tabular_core.ipynb#TabularProc--" class="source_link" style="float:right">[source]</a></h3>

> <code>Categorify</code>() :: [`TabularProc`](/tabular.core.html#TabularProc)

Transform the categorical variables to that type.

In [None]:
# Fit the category vocabulary on a small integer column.
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,0,2]})
to = Tabular(df, 'a')

cat.setup(to)
# Index 0 of the vocabulary is the '#na#' placeholder, followed by the observed values.
test_eq(cat['a'], ['#na#',0,1,2])
cat(to)
# Encoding is visible on `df` itself: every value is replaced by its index in the vocabulary.
test_eq(df['a'], [1,2,3,1,3])
# Apply the already fitted transform to a second dataframe.
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = Tabular(df1, 'a')
cat(to1)
#Values that weren't in the training df are sent to 0 (na)
test_eq(df1['a'], [2,1,0,0,3])

In [None]:
#test with splits
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
# Only the first three rows form the split used for setup.
to = Tabular(df, 'a', splits=[range(3)])
cat.setup(to)
# The value 3 only occurs outside that split, so it is absent from the vocabulary...
test_eq(cat['a'], ['#na#',0,1,2])
cat(to)
# ...and is therefore encoded as 0.
test_eq(df['a'], [1,2,3,0,3])

In [None]:
#test NaN
cat = Categorify()
df = pd.DataFrame({'a':['a', 'b', np.nan]})
to = Tabular(df, 'a')

cat.setup(to)
# The missing value gets no entry of its own: the vocabulary is '#na#' plus the observed strings.
test_eq(cat['a'], ['#na#','a','b'])

In [None]:
# A column that already has an ordered categorical dtype.
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
to = Tabular(df, 'a')
cat = Categorify()
cat.setup(to)
# The vocabulary follows the declared category order (H, M, L), not alphabetical order.
test_eq(cat['a'], ['#na#','H','M','L'])
cat(to)
# Encoded values are the categorical codes shifted by one.
test_eq(df.a, [2,1,3,2])

In [None]:
#export
class Normalize(TabularProc):
    "Normalize the continuous variables."
    order = 2
    def setup(self, to):
        "Store the mean and the `ddof=0` std of the continuous columns over the first split (every row if it is `None`)"
        train = to.loc[ifnone(to.splits[0],slice(None)), to.cont_names]
        self.means = train.mean()
        self.stds = train.std(ddof=0)

    def process(self, to):
        "Center and scale the continuous columns of `to` with the stored statistics"
        centered = to.conts - self.means
        # The small constant keeps the division finite for a column whose std is zero.
        to.conts = centered / (self.stds+1e-7)

    def decodes(self, to):
        # `to.items` is read as a (cats, conts) pair here; only the continuous part is mapped back.
        conts = []
        for v,c in zip(to.items[1], to.cont_names):
            conts.append((v*self.stds[c] + self.means[c]).item())
        to.items = (to.items[0], conts)
        return to

In [None]:
show_doc(Normalize, title_level=3)

<h3 id="Normalize" class="doc_header"><code>class</code> <code>Normalize</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/40_tabular_core.ipynb#TabularProc--" class="source_link" style="float:right">[source]</a></h3>

> <code>Normalize</code>() :: [`TabularProc`](/tabular.core.html#TabularProc)

Normalize the continuous variables.

In [None]:
# Fit the statistics on a single continuous column.
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = Tabular(df, cont_names='a')
norm.setup(to)
# Reference values: numpy's `std` uses ddof=0 by default, like the transform.
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
norm(to)
# The normalized values are visible on `df` itself.
test_close(df['a'].values, (x-m)/s)
# A second dataframe is normalized with the statistics of the first one.
df1 = pd.DataFrame({'a':[5,6,7]})
to1 = Tabular(df1, cont_names='a')
norm(to1)
test_close(df1['a'].values, (np.array([5,6,7])-m)/s)

In [None]:
# Same check with a split: the statistics must come from the first three rows only.
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = Tabular(df, cont_names='a', splits=[range(3)])
norm.setup(to)
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
norm(to)
# Every row (inside and outside the split) is normalized with those statistics.
test_close(df['a'].values, (np.array([0,1,2,3,4])-m)/s)

In [None]:
#export
class FillStrategy:
    "Namespace containing the various filling strategies."
    # Each strategy takes the column `c` and a user-supplied `fill` value and returns the replacement value.
    def median(c,fill):
        "Median of `c` (`fill` is ignored)"
        return c.median()

    def constant(c,fill):
        "The supplied `fill` value (`c` is ignored)"
        return fill

    def mode(c,fill):
        "Most frequent non-missing value of `c` (`fill` is ignored)"
        counts = c.dropna().value_counts()
        return counts.idxmax()

In [None]:
#export
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # `fill_vals` maps a column name to the value handed to the strategy; missing entries default to 0.
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr(self, 'fill_strategy,add_col,fill_vals')

    def setup(self, to):
        "Compute, for each continuous column that has missing values, the value used to fill it"
        # Fill values are computed on the first split only (every row if it is `None`).
        df = to.loc[ifnone(to.splits[0],slice(None)), to.cont_names]
        # `.any()` has one boolean per column; indexing it by itself keeps only the columns
        # that really contain a missing value (`.keys()` alone would list every column).
        has_na = pd.isnull(to.conts).any()
        self.na_dict = {n:self.fill_strategy(df[n], self.fill_vals[n])
                        for n in has_na[has_na].keys()}

    def process(self, to):
        "Fill the columns recorded in `na_dict` and, if `add_col`, add a `<name>_na` indicator column for each"
        missing = pd.isnull(to.conts)
        has_na = missing.any()
        for n in has_na[has_na].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        # Loop over `na_dict` (not over the columns missing in *this* data) so that every
        # dataframe processed gets the same set of indicator columns.
        for n in self.na_dict:
            # Assign the result back instead of `fillna(inplace=True)` on a column selection,
            # which is a chained assignment and may silently operate on a copy.
            to[n] = to[n].fillna(self.na_dict[n])
            if self.add_col:
                to[n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')

In [None]:
show_doc(FillMissing, title_level=3)

<h3 id="FillMissing" class="doc_header"><code>class</code> <code>FillMissing</code><a href="https://nbviewer.jupyter.org/github/fastai/fastai_docs/blob/master/dev/40_tabular_core.ipynb#TabularProc--" class="source_link" style="float:right">[source]</a></h3>

> <code>FillMissing</code>(**`fill_strategy`**=*`'median'`*, **`add_col`**=*`True`*, **`fill_vals`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Fill the missing values in continuous columns.

In [None]:
# One processor per strategy, each set up on its own copy of the same data.
fill1,fill2,fill3 = (FillMissing(fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
to,to1,to2 = Tabular(df, cont_names='a'),Tabular(df1, cont_names='a'),Tabular(df2, cont_names='a')
fill1.setup(to); fill2.setup(to1); fill3.setup(to2)
# Expected fill values: median, the default constant (0) and the most frequent value.
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

fill1(to); fill2(to1); fill3(to2)
# Processing registers the indicator column as a categorical variable.
for t in [to, to1, to2]: test_eq(t.cat_names, ['a_na'])

for df_,v in zip([to, to1, to2], [1.5, 0., 1.]):
    test_eq(df_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(df_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
    
# New data is filled with the values computed during setup, not with its own statistics.
dfa = pd.DataFrame({'a':[np.nan,0,np.nan]})
dfa1 = dfa.copy(); dfa2 = dfa.copy()
to,to1,to2 = Tabular(dfa, cont_names='a'),Tabular(dfa1, cont_names='a'),Tabular(dfa2, cont_names='a')
fill1(to); fill2(to1); fill3(to2)
for df_,v in zip([to, to1, to2], [1.5, 0., 1.]):
    test_eq(df_['a'].values, np.array([v, 0, v]))
    test_eq(df_['a_na'].values, np.array([1, 0, 1]))

## Tabular Pipelines -

In [None]:
# The processors are deliberately listed out of order, together with a plain function.
procs = [Normalize(), Categorify(), FillMissing(), noop]
proc = Pipeline(procs)

#Test reordering and partialize
test_eq(L(proc.fs).mapped(type), [FillMissing, Transform, Categorify, Normalize])

df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = Tabular(df, 'a', 'b')

#Test setup and apply on df_trn
proc.setup(to)
# `FillMissing` added the `b_na` indicator, which `Categorify` then encoded like any other categorical column.
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
# `b` is expected to be median-filled (1.5) and then normalized.
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(proc.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})

In [None]:
#Test apply on df_val
# Reuses `proc`, `m` and `s` from the previous cell: nothing is refitted here.
df = pd.DataFrame({'a':[2,1,3], 'b':[4,5,np.nan]})
to = Tabular(df, 'a', 'b')
proc(to)
# test_eq(proc.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
# The unseen category 3 is encoded as 0.
test_eq(to['a'], [3,2,0])
test_eq(to['b_na'], [1,1,2])
# The missing value is filled with the training median (1.5) before normalization.
x = np.array([4, 5, 1.5])
test_close(to['b'].values, (x-m)/s)

In [None]:
#Test apply on cat_y
procs = [Normalize(), Categorify(), FillMissing(), noop]
proc = Pipeline(procs)

df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = Tabular(df, 'a', 'b', cat_y='c')
proc.setup(to)
# The target is not listed in `cat_names`...
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
# ...but it is encoded and gets its own vocabulary all the same.
test_eq(to['c'], [2,1,2,1,1,2,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(proc.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True], 'c': ['#na#','a','b']})

In [None]:
#export
def process_df(df, procs, splits=None, cat_names=None, cont_names=None, cat_y=None, inplace=True):
    "Wrap `df` (a copy of it if `inplace=False`) in a `Tabular`, set up a `Pipeline` of `procs` on it and return both"
    data = df if inplace else df.copy()
    to = Tabular(data, cat_names=cat_names, cont_names=cont_names, cat_y=cat_y, splits=splits)
    proc = Pipeline(procs)
    # The tests in this notebook show that setting the pipeline up also leaves `to` processed.
    proc.setup(to)
    return to,proc

In [None]:
procs = [Normalize(), Categorify(), FillMissing(), noop]
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
# With `inplace=False` the processing happens on a copy...
to,proc = process_df(df, procs, cat_names='a', cont_names='b', cat_y='c', inplace=False)
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
# ...so the dtype of the original column is unchanged.
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [2,1,2,1,1,2,1])

In [None]:
# inplace
# Reuses `df` and `procs` from the previous cell; this time `df` itself is processed.
to,proc = process_df(df, procs, cat_names='a', cont_names='b', cat_y='c', inplace=True)
test_eq(type(df.a.dtype),np.dtype)

Pass the same `splits` as you will use for splitting the data, so that the setup is only done on the training set. `cat_names` are the names of the categorical variables, `cont_names` the continuous ones, `cat_y` are the names of the dependent variables that are categories. If `inplace=True`, processing is applied inplace, otherwise it creates a copy of `df`.

In [None]:
#export
class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when there is no `ctx`, otherwise the result of appending `self` to `ctx`"
        if ctx is None: return self
        # `Series.append` was removed in pandas 2.0. It was a thin wrapper around `pd.concat`,
        # so concatenating gives the same result on old and new pandas alike.
        if isinstance(ctx, pd.Series): return pd.concat([ctx, self])
        # Any other context keeps the original contract: it must provide its own `append`.
        return ctx.append(self)

In [None]:
#export
class TensorTabular(tuple):
    "A tuple (built from a cats/conts pair in this file) that can create and display the contexts used to show it"
    def get_ctxs(self, max_n=10, **kwargs):
        "One empty row per sample, for at most `max_n` samples; the count comes from the first element's `shape[0]`"
        n_samples = min(self[0].shape[0], max_n)
        blank = pd.DataFrame(index = range(n_samples))
        ctxs = []
        for i in range(n_samples): ctxs.append(blank.iloc[i])
        return ctxs

    def display(self, ctxs):
        "Gather `ctxs` in a dataframe and display it"
        frame = pd.DataFrame(ctxs)
        display_df(frame)

In [None]:
#export
class ReadTabLine(ItemTransform):
    "Turn a processed row into a `TensorTabular` of (cats, conts) and decode it back with `proc`"
    def __init__(self, proc):
        self.proc = proc

    def encodes(self, row):
        # Look every categorical, then every continuous, column name up in `row`.
        cats = self.proc.cat_names.mapped(row.__getitem__)
        conts = self.proc.cont_names.mapped(row.__getitem__)
        return TensorTabular((tensor(cats).long(),tensor(conts).float()))

    def decodes(self, o) -> TabularLine:
        # Wrap the encoded pair so the processors can undo their encoding on it.
        to = self.proc.decode(Tabular(o, self.proc.cat_names, self.proc.cont_names, self.proc.cat_y))
        vals = to.items[0]+to.items[1]
        names = self.proc.cat_names+self.proc.cont_names
        return pd.Series(dict(zip(names, vals)))

In [None]:
#export
class ReadTabTarget(ItemTransform):
    "Read the categorical target `proc.cat_y` from a row and decode it through `proc.classes`"
    def __init__(self, proc): self.proc = proc
    # Read the target column from `row` and cast it to int64.
    def encodes(self, row): return row[self.proc.cat_y].astype(np.int64)
    # Index the classes stored for the target column with `o`.
    def decodes(self, o) -> Category: return self.proc.classes[self.proc.cat_y][o]

In [None]:
# Uses `to` and `proc` from the `process_df` cells above.
tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
enc = tds[1]
# `enc[0]` is the (cats, conts) pair, `enc[1]` the encoded target.
test_eq(enc[0][0], tensor([2,1]))
test_close(enc[0][1], tensor([-0.628828]))
test_eq(enc[1], 1)

# Decoding must give back the original values and the declared return type.
dec = tds.decode(enc)
assert isinstance(dec[0], TabularLine)
test_eq(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
test_eq(dec[1], 'a')

test_stdout(lambda: print(tds.show_at(1)), """a               1
b_na        False
b               1
category        a
dtype: object""")

## Integration example

In [None]:
# Fetch the ADULT_SAMPLE data and read its csv file.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
# First 10000 rows for training, the rest for testing; copies so they can be modified independently of `df`.
df_trn,df_tst = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_trn.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
# Columns treated as categorical / continuous, and the processors to apply to them.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify(), FillMissing(), Normalize()]

In [None]:
# Split the training rows, then process `df_trn` (in place, the default) with those splits.
splits = RandomSplitter()(range_of(df_trn))
to,proc = process_df(df_trn, procs, splits=splits, cat_names=cat_names, cont_names=cont_names, cat_y="salary")

In [None]:
# Build the data source on the processed dataframe, reusing the same `splits` as for processing.
dsrc = DataSource(to.items, filts=splits, tfms=[[ReadTabLine(proc)], [ReadTabTarget(proc)]])
dbch = dsrc.databunch(bs=64)
dbch.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,age,fnlwgt,education-num,category
0,Private,Some-college,Never-married,Other-service,Not-in-family,White,False,False,False,23.999998,146706.0,10.0,<50k
1,Private,HS-grad,Never-married,Other-service,Not-in-family,White,False,False,False,40.0,170019.0,9.0,<50k
2,Self-emp-not-inc,5th-6th,Married-civ-spouse,Sales,Husband,White,False,False,False,47.0,121123.992188,3.0,>=50k
3,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,False,False,49.0,148549.0,13.0,>=50k
4,State-gov,5th-6th,Never-married,Transport-moving,Not-in-family,White,False,False,False,22.999996,61743.0,3.0,<50k
5,State-gov,Bachelors,Married-civ-spouse,Prof-specialty,Wife,White,False,False,False,34.0,216283.0,13.0,>=50k
6,?,5th-6th,Never-married,?,Unmarried,White,False,False,False,31.999998,251612.0,3.0,<50k
7,Self-emp-not-inc,Prof-school,Married-civ-spouse,Prof-specialty,Husband,Amer-Indian-Eskimo,False,False,False,40.0,284706.0,15.0,>=50k
8,Private,HS-grad,Never-married,Exec-managerial,Not-in-family,White,False,False,False,27.999998,173110.0,9.0,<50k
9,Private,Some-college,Divorced,Exec-managerial,Not-in-family,White,False,False,False,25.999998,141824.0,10.0,<50k


In [None]:
dbch = dsrc.databunch(bs=64, num_workers=0)
%time _ = L(dbch.valid_dl)

CPU times: user 2.34 s, sys: 3.55 ms, total: 2.34 s
Wall time: 2.34 s


In [None]:
#export
class ReadTabBatch(ItemTransform):
    "Batch counterpart of `ReadTabLine`: reads the categorical and continuous columns of a whole dataframe"
    def __init__(self, proc): self.proc = proc
    
    def encodes(self, df):
        cats,conts = (o.mapped(df.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        # NOTE(review): an unfinished `targ = ` statement used to sit here and made this exported
        # cell a syntax error. It is dropped because its value was never returned; reading the
        # target for a batch still has to be implemented.
        return TensorTabular((tensor(cats).long(),tensor(conts).float()))
    
    def decodes(self, o) -> TabularLine:
        # Wrap the encoded pair so the processors can undo their encoding on it.
        to = Tabular(o, self.proc.cat_names, self.proc.cont_names, self.proc.cat_y)
        to = self.proc.decode(to)
        return pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)})

In [None]:
class TabDataLoader(DataLoader):
    "A `DataLoader` whose batches are slices of the dataset's dataframe"
    # NOTE(review): presumably turns the per-item step into a no-op so that indices reach
    # `create_batch` untouched; confirm against `DataLoader`.
    do_item = noops
    # `b` is used as a positional indexer on the dataframe stored in `dataset.items`.
    def create_batch(self, b): return self.dataset.items.iloc[b]

In [None]:
dl = TabDataLoader(dsrc.valid, bs=8)

In [None]:
next(iter(dl))

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,age_na,fnlwgt_na,education-num_na
6368,1.696524,1,0.056821,2,-1.212011,1,1,2,3,Female,0,0,40,United-States,1,1,1,1
8444,1.258321,5,-0.100321,10,1.162444,3,14,1,5,Male,0,0,48,United-States,2,1,1,1
1670,-0.421461,5,-0.199791,13,1.558186,5,2,4,5,Male,0,0,40,United-States,1,1,1,1
5164,1.404388,5,-0.770183,16,-0.024783,1,2,5,5,Female,0,0,40,United-States,1,1,1,1
993,-1.443936,5,1.09101,4,-3.190723,5,0,3,5,Male,0,0,40,Mexico,1,1,1,1
2702,-0.056291,3,-0.249457,16,-0.024783,5,9,2,5,Male,0,0,12,United-States,1,1,1,1
4245,-1.443936,5,-1.213603,12,-0.420526,5,7,4,5,Male,0,0,52,United-States,1,1,1,1
8457,-0.932699,7,0.837625,8,0.766702,5,4,2,5,Male,0,0,40,United-States,1,1,1,1


In [None]:
# Apply the processors fitted on the training data to the held-out rows (no new setup).
to_tst = Tabular(df_tst, cat_names, cont_names, cat_y="salary")
proc(to_tst)
to_tst.all_cols.head()

Unnamed: 0,age,fnlwgt,education-num,workclass,education,marital-status,occupation,relationship,race,age_na,fnlwgt_na,education-num_na,salary
10000,0.462933,1.342,1.176657,5,10,3,2,1,2,1,1,1,1
10001,-0.9283,1.255035,-0.424781,5,12,3,15,1,4,1,1,1,1
10002,1.048715,0.153084,-1.225501,5,2,1,9,2,5,1,1,1,1
10003,0.536155,-0.279734,-0.424781,5,12,7,2,5,5,1,1,1,1
10004,0.755824,1.444467,0.375938,6,9,3,5,1,5,1,1,1,2


## Export -

In [None]:
#hide
from local.notebook.export import notebook2script
notebook2script(all_fs=True)

Converted 00_test.ipynb.
Converted 01_core.ipynb.
Converted 01a_dataloader.ipynb.
Converted 01a_script.ipynb.
Converted 02_data_transforms.ipynb.
Converted 03_data_pipeline.ipynb.
Converted 04_data_external.ipynb.
Converted 05_data_core.ipynb.
Converted 06_data_source.ipynb.
Converted 07_vision_core.ipynb.
Converted 08_pets_tutorial.ipynb.
Converted 09_vision_augment.ipynb.
Converted 11_layers.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_learner.ipynb.
Converted 14_callback_schedule.ipynb.
Converted 15_callback_hook.ipynb.
Converted 16_callback_progress.ipynb.
Converted 17_callback_tracker.ipynb.
Converted 18_callback_fp16.ipynb.
Converted 19_callback_mixup.ipynb.
Converted 20_metrics.ipynb.
Converted 21_tutorial_imagenette.ipynb.
Converted 30_text_core.ipynb.
Converted 31_text_data.ipynb.
Converted 32_text_models_awdlstm.ipynb.
Converted 33_test_models_core.ipynb.
Converted 34_callback_rnn.ipynb.
Converted 35_tutorial_wikitext.ipynb.
Converted 36_text_models_qrnn.ipynb.
Converted