In [1]:
# Colab-only setup: mount Google Drive and work from the course checkout stored on it.
import os
from google.colab import drive

# Locations on the mounted drive
root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir + 'fastai2_library/course-v4/'

# Mount the drive (a remount is forced on every run)
drive.mount('/content/gdrive', force_remount=True)

# Make the course directory the working directory for the cells that follow
os.chdir(base_dir)

Mounted at /content/gdrive


In [2]:
!pwd
# cd to base_dir if above os.chdir does not work using below command
# %cd "/content/gdrive/My Drive/fastai2_library/course-v4/"

/content/gdrive/My Drive/fastai2_library/course-v4


In [3]:
#hide
#skip
! [[ -e /content ]] && pip install -Uqq fastai  # upgrade fastai on colab

[K     |████████████████████████████████| 194kB 3.2MB/s 
[K     |████████████████████████████████| 51kB 4.4MB/s 
[K     |████████████████████████████████| 776.7MB 23kB/s 
[K     |████████████████████████████████| 12.8MB 41.3MB/s 
[?25h

In [4]:
%cd nbs

/content/gdrive/My Drive/fastai2_library/course-v4/nbs


In [None]:
#default_exp tabular.core

In [5]:
#export
from fastai.torch_basics import *
from fastai.data.all import *

In [None]:
#hide
from nbdev.showdoc import *

In [6]:
#export
pd.set_option('mode.chained_assignment','raise')

# Tabular core

> Basic functions to preprocess tabular data before assembling it in a `DataLoaders`.

## Initial preprocessing

In [7]:
#export
def make_date(df, date_field):
    "Make sure `df[date_field]` is of the right date type."
    # The column is converted in place; nothing is returned.
    # `is_datetime64_any_dtype` accepts both naive and tz-aware datetime columns and,
    # unlike `np.issubdtype`, does not raise on pandas extension dtypes (string, category, ...).
    if pd.api.types.is_datetime64_any_dtype(df[date_field].dtype): return
    # `infer_datetime_format` is not passed: it is deprecated in pandas 2.x, where
    # consistent format inference is already the default behaviour.
    df[date_field] = pd.to_datetime(df[date_field])

In [8]:
# A column of ISO date strings is converted in place to a datetime column
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24']})
make_date(df, 'date')
test_eq(df['date'].dtype, np.dtype('datetime64[ns]'))

In [9]:
#export
def add_datepart(df, field_name, prefix=None, drop=True, time=False):
    "Helper function that adds columns relevant to a date in the column `field_name` of `df`."
    # `df` is modified in place and is also the return value.
    make_date(df, field_name)
    field = df[field_name]
    # Default prefix: the field name with a trailing "date"/"Date" stripped
    prefix = ifnone(prefix, re.sub('[Dd]ate$', '', field_name))
    attr = ['Year', 'Month', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start',
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[prefix + n] = getattr(field.dt, n.lower())
    # `dt.week` was deprecated in pandas 1.1 (and later removed) in favour of `dt.isocalendar().week`
    week = field.dt.isocalendar().week if hasattr(field.dt, 'isocalendar') else field.dt.week
    # NOTE(review): the position is hard-coded. When the date is the frame's only original column
    # this puts `Week` right after `Month`; for wider frames it lands among the original columns.
    df.insert(3, prefix+'Week', week)
    mask = ~field.isna()
    # Convert to whole seconds before taking the integer view, so the result is seconds since the
    # epoch whatever the resolution of the column is (not only for nanosecond columns).
    epoch_secs = field.values.astype('datetime64[s]').astype(np.int64)
    # Missing dates give `None`, which makes this an object column
    df[prefix + 'Elapsed'] = np.where(mask, epoch_secs, None)
    if drop: df.drop(field_name, axis=1, inplace=True)
    return df

In [10]:
# The date column is replaced by its date parts; the row with a missing date
# is the only one whose `Elapsed` value is missing
df = pd.DataFrame({'date': ['2019-12-04', None, '2019-11-15', '2019-10-24']})
df = add_datepart(df, 'date')
test_eq(df.columns, ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Is_month_end', 'Is_month_start', 
            'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start', 'Elapsed'])
test_eq(df[df.Elapsed.isna()].shape,(1, 13))
df.head()

Unnamed: 0,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed
0,2019.0,12.0,49.0,4.0,2.0,338.0,False,False,False,False,False,False,1575417600.0
1,,,,,,,False,False,False,False,False,False,
2,2019.0,11.0,46.0,15.0,4.0,319.0,False,False,False,False,False,False,1573776000.0
3,2019.0,10.0,43.0,24.0,3.0,297.0,False,False,False,False,False,False,1571875200.0


In [11]:
#export
def _get_elapsed(df,field_names, date_field, base_field, prefix):
    for f in field_names:
        day1 = np.timedelta64(1, 'D')
        last_date,last_base,res = np.datetime64(),None,[]
        for b,v,d in zip(df[base_field].values, df[f].values, df[date_field].values):
            if last_base is None or b != last_base:
                last_date,last_base = np.datetime64(),b
            if v: last_date = d
            res.append(((d-last_date).astype('timedelta64[D]') / day1))
        df[prefix + f] = res
    return df

In [12]:
#export
def add_elapsed_times(df, field_names, date_field, base_field):
    "Add in `df` for each event in `field_names` the elapsed time according to `date_field` grouped by `base_field`"
    field_names = list(L(field_names))
    #Make sure the event fields are bools and date_field is a date (both conversions modify `df` in place)
    df[field_names] = df[field_names].astype('bool')
    make_date(df, date_field)

    work_df = df[field_names + [date_field, base_field]]
    # `After*`: walk each group in ascending date order
    work_df = work_df.sort_values([base_field, date_field])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'After')
    # `Before*`: same computation, walking each group in descending date order
    work_df = work_df.sort_values([base_field, date_field], ascending=[True, False])
    work_df = _get_elapsed(work_df, field_names, date_field, base_field, 'Before')

    # Rows with no event seen yet have a missing value; store 0 there and make the columns integer
    for a in ['After' + f for f in field_names] + ['Before' + f for f in field_names]:
        work_df[a] = work_df[a].fillna(0).astype(int)

    # Per-group rolling sums over 7 rows of every event flag, once with dates ascending
    # (suffix `_bw`) and once with dates descending (suffix `_fw`)
    for a,s in zip([True, False], ['_bw', '_fw']):
        work_df = work_df.set_index(date_field)
        tmp = (work_df[[base_field] + field_names].sort_index(ascending=a)
                      .groupby(base_field).rolling(7, min_periods=1).sum())
        # The grouping column is already in the index. Whether it is also kept as a rolled column
        # depends on the pandas version, so drop it only if present. `columns=` is used because
        # the positional `axis` argument of `drop` was removed in pandas 2.0.
        tmp = tmp.drop(columns=base_field, errors='ignore')
        tmp.reset_index(inplace=True)
        work_df.reset_index(inplace=True)
        work_df = work_df.merge(tmp, 'left', [date_field, base_field], suffixes=['', s])
    # The raw event columns are already in `df`; keep only the derived ones for the final merge
    work_df = work_df.drop(columns=field_names)
    return df.merge(work_df, 'left', [date_field, base_field])

In [13]:
# Two groups ('base' 1 and 2) with one event each; the result keeps the original columns and
# adds the `After*`/`Before*` day counts and the `_bw`/`_fw` rolling sums
df = pd.DataFrame({'date': ['2019-12-04', '2019-11-29', '2019-11-15', '2019-10-24'],
                   'event': [False, True, False, True], 'base': [1,1,2,2]})
df = add_elapsed_times(df, ['event'], 'date', 'base')
df

Unnamed: 0,date,event,base,Afterevent,Beforeevent,event_bw,event_fw
0,2019-12-04,False,1,5,0,1.0,0.0
1,2019-11-29,True,1,0,0,1.0,1.0
2,2019-11-15,False,2,22,0,1.0,0.0
3,2019-10-24,True,2,0,0,1.0,1.0


In [14]:
#export
def cont_cat_split(df, max_card=20, dep_var=None):
    "Helper function that returns column names of cont and cat variables from given `df`."
    cont_names, cat_names = [], []
    # `dep_var` may be None, one column name or a list of names; wrap it once rather than per column
    dep_vars = L(dep_var)
    for label in df:
        if label in dep_vars: continue
        dtype = df[label].dtype
        # Test the dtype family instead of comparing with `int`/`float`: those comparisons only
        # match the default widths, which sent e.g. int8 or float32 columns (such as the ones
        # `df_shrink` produces) to `cat_names` whatever their content.
        # Floats are always continuous; integers only when they have more than `max_card` distinct values.
        if (pd.api.types.is_float_dtype(dtype) or
            (pd.api.types.is_integer_dtype(dtype) and df[label].unique().shape[0] > max_card)):
            cont_names.append(label)
        else: cat_names.append(label)
    return cont_names, cat_names

In [15]:
# One small-cardinality int column, one float column, one string column and two int targets
df = pd.DataFrame({'cat1': [1, 2, 3, 4], 'cont1': [1., 2., 3., 2.], 'cat2': ['a', 'b', 'b', 'a'], 
                   'y1': [1, 0, 1, 0], 'y2': [1, 1, 1, 0]})

# Test all columns
cont, cat = cont_cat_split(df)
test_eq((cont, cat), (['cont1'], ['cat1', 'cat2', 'y1', 'y2']))

# Test exclusion of dependent variable
cont, cat = cont_cat_split(df, dep_var='y1')
test_eq((cont, cat), (['cont1'], ['cat1', 'cat2', 'y2']))

# Test exclusion of multi-label dependent variables
cont, cat = cont_cat_split(df, dep_var=['y1', 'y2'])
test_eq((cont, cat), (['cont1'], ['cat1', 'cat2']))

# Test maximal cardinality bound for int variable:
# an int column with more than `max_card` distinct values is treated as continuous
cont, cat = cont_cat_split(df, max_card=2, dep_var=['y1', 'y2'])
test_eq((cont, cat), (['cat1', 'cont1'], ['cat2']))

In [16]:
#export
def df_shrink_dtypes(df, skip=[], obj2cat=True, int2uint=False):
    "Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion."

    # 1: Build column filter and typemap
    excl_types, skip = {'category','datetime64[ns]','bool'}, set(skip)

    # Candidate dtypes per family, smallest first, as (dtype, min representable, max representable)
    typemap = {'int'   : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.int8, np.int16, np.int32, np.int64)],
               'uint'  : [(np.dtype(x), np.iinfo(x).min, np.iinfo(x).max) for x in (np.uint8, np.uint16, np.uint32, np.uint64)],
               'float' : [(np.dtype(x), np.finfo(x).min, np.finfo(x).max) for x in (np.float32, np.float64, np.longdouble)]
              }
    if obj2cat: typemap['object'] = 'category'  # User wants to categorify dtype('Object'), which may not always save space
    else:       excl_types.add('object')

    # 2: Pick a new dtype for every column that is neither skipped by name nor of an excluded type
    new_dtypes = {}
    for c, old_t in df.dtypes.items():
        if c in skip or old_t.name in excl_types: continue
        # The dtype family is recognised from the start of the dtype name
        t = next((v for k,v in typemap.items() if old_t.name.startswith(k)), None)

        if isinstance(t, list): # Find the smallest type that fits
            lo, hi = df[c].min(), df[c].max()  # computed once per column, not once per candidate
            if int2uint and t==typemap['int'] and lo >= 0: t=typemap['uint']
            new_t = next((r[0] for r in t if r[1]<=lo and r[2]>=hi), None)
            # Report nothing when the column would stay as it is or would grow: e.g. the first
            # candidate fitting a `float16` column is `float32`, which is larger, not smaller.
            old_size = getattr(old_t, 'itemsize', None)
            if new_t is not None and (new_t == old_t or (old_size is not None and new_t.itemsize > old_size)):
                new_t = None
        else: new_t = t if isinstance(t, str) else None

        # Explicit `None` test, so the outcome does not depend on the truth value of a dtype object
        if new_t is not None: new_dtypes[c] = new_t
    return new_dtypes

In [None]:
show_doc(df_shrink_dtypes, title_level=3)

<h3 id="df_shrink_dtypes" class="doc_header"><code>df_shrink_dtypes</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>df_shrink_dtypes</code>(**`df`**, **`skip`**=*`[]`*, **`obj2cat`**=*`True`*, **`int2uint`**=*`False`*)

Return any possible smaller data types for DataFrame columns. Allows `object`->`category`, `int`->`uint`, and exclusion.

In [18]:
# `df_shrink_dtypes` only reports the proposed dtypes; the columns of `df` keep their original ones
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'e': [True, False, True],
                   'date':['2019-12-04','2019-11-29','2019-11-15',]})
dt = df_shrink_dtypes(df)
test_eq(df['i'].dtype, 'int64')
test_eq(dt['i'], 'int8')

test_eq(df['f'].dtype, 'float64')
test_eq(dt['f'], 'float32')

# By default 'object' columns are mapped to 'category'
test_eq(df['date'].dtype, 'object')
test_eq(dt['date'], 'category')

# With `obj2cat=False` 'object' columns are left out of the result
dt2 = df_shrink_dtypes(df, obj2cat=False)
test_eq('date' not in dt2, True)

In [19]:
#export
def df_shrink(df, skip=[], obj2cat=True, int2uint=False):
    "Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`."
    # `astype` returns a new frame; only the columns listed in the mapping change dtype
    return df.astype(df_shrink_dtypes(df, skip, obj2cat=obj2cat, int2uint=int2uint))

In [None]:
show_doc(df_shrink, title_level=3)

<h3 id="df_shrink" class="doc_header"><code>df_shrink</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h3>

> <code>df_shrink</code>(**`df`**, **`skip`**=*`[]`*, **`obj2cat`**=*`True`*, **`int2uint`**=*`False`*)

Reduce DataFrame memory usage, by casting to smaller types returned by `df_shrink_dtypes()`.

`df_shrink(df)` attempts to make a DataFrame use less memory by fitting numeric columns into the smallest datatypes that can hold their values.  In addition:

 * `boolean`, `category`, `datetime64[ns]` dtype columns are ignored.
 * 'object' type columns are categorified, which can save a lot of memory in large dataset.  It can be turned off by `obj2cat=False`.
 * `int2uint=True`, to fit `int` types to `uint` types, if all data in the column is >= 0.
 * columns can be excluded by name using `skip=['col1','col2']`.

To get only new column data types without actually casting a DataFrame,
use `df_shrink_dtypes()` with all the same parameters for `df_shrink()`.

In [20]:
# `df_shrink` returns a re-typed frame; the original `df` keeps its dtypes
df = pd.DataFrame({'i': [-100, 0, 100], 'f': [-100.0, 0.0, 100.0], 'u':[0, 10,254],
                  'date':['2019-12-04','2019-11-29','2019-11-15']})
df2 = df_shrink(df, skip=['date'])

test_eq(df['i'].dtype=='int64' and df2['i'].dtype=='int8', True)
test_eq(df['f'].dtype=='float64' and df2['f'].dtype=='float32', True)
test_eq(df['u'].dtype=='int64' and df2['u'].dtype=='int16', True)
# The skipped column is left as it was
test_eq(df2['date'].dtype, 'object')

test_eq(df2.memory_usage().sum() < df.memory_usage().sum(), True)

# Test int => uint (when col.min() >= 0)
df3 = df_shrink(df, int2uint=True)
test_eq(df3['u'].dtype, 'uint8')  # int64 -> uint8 instead of int16

# Test excluding columns
df4 = df_shrink(df, skip=['i','u'])
test_eq(df['i'].dtype, df4['i'].dtype)
test_eq(df4['u'].dtype, 'int64')

Here's an example using the `ADULT_SAMPLE` dataset:

In [21]:
# Shrink a real dataset and compare the total memory reported before and after
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
new_df = df_shrink(df, int2uint=True)
print(f"Memory usage: {df.memory_usage().sum()} --> {new_df.memory_usage().sum()}")

Memory usage: 3907448 --> 818665


## Tabular -

In [22]:
#export
class _TabIloc:
    "Get/set rows by iloc and cols by name"
    def __init__(self,to): self.to = to
    def __getitem__(self, idxs):
        df = self.to.items
        # If you have row and col then cols is replaced with integer index of the column
        # so this way we can use col names and row numbers
        if isinstance(idxs,tuple):
            rows,cols = idxs
            cols = df.columns.isin(cols) if is_listy(cols) else df.columns.get_loc(cols)
        else: rows,cols = idxs,slice(None)
        # It wraps it back up into a tabular object as well so if you index into a tab object
        # with iloc you get back a tab object as well. 
        return self.to.new(df.iloc[rows, cols])

In [None]:
'''
Init signature: CollBase(*args, **kwargs)
Source:        
class CollBase:
    "Base class for composing a list of `items`"
    def __init__(self, items): self.items = items
    def __len__(self): return len(self.items)
    def __getitem__(self, k): return self.items[list(k) if isinstance(k,CollBase) else k]
    def __setitem__(self, k, v): self.items[list(k) if isinstance(k,CollBase) else k] = v
    def __delitem__(self, i): del(self.items[i])
    def __repr__(self): return self.items.__repr__()
    def __iter__(self): return self.items.__iter__()
File:           /usr/local/lib/python3.6/dist-packages/fastcore/foundation.py
Type:           type
'''
CollBase??

In [24]:
#export
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    '''
    A class that has all of the things in it that enables it to do all we want it to do
    Dataframe does NOT have all such info to build models. So categorical names, continuous names, processes 
    used to process data, what dependent variable is y_names - typically one but could be more,
    so that is why we pass in those four things. (Multiple y_names examples is a regression problem
    of predicting x and y values or multi-label classification)

    Make tabular object look a lot like dataframe - one way is to inherit from GetAttr - which means
    any unknown attributes passed down to _default property 
    
    In df not convenient to index by row number and col by name - can do .iloc to get row by number and col by number
    can do .loc to get row by number or index and col y number or index BUT most common use case is row by number and 
    col by name, SO redefined .iloc to use tabular iloc indexer 
    '''
    # `_default` names the attribute (`procs`) that, per the note above, unknown attribute lookups go to
    _default,with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
        # Working in place on a frame that is then re-indexed by `splits` is what the warning is about
        if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
        # Unless asked to work in place, never touch the caller's frame
        if not inplace: df = df.copy()
        # Cast columns to the smaller dtypes proposed by `df_shrink`
        if reduce_memory: df = df_shrink(df)
        # The line below is for RAPIDS: accessing individual items of a dataframe that lives on the GPU is
        # extraordinarily slow, so when there are splits they are concatenated and the frame is re-ordered
        # with that list. The rows of each split are then contiguous in the frame the dataloaders use.
        if splits is not None: df = df.iloc[sum(splits, [])]
        # NOTE(review): `_dl_type` is not defined in this part of the file - presumably set on the class elsewhere
        self.dataloaders = delegates(self._dl_type.__init__)(self.dataloaders)
        # `Tabular` inherits from `CollBase`, which implements the basic behaviour of a collection by
        # composition over `self.items` (length, indexing, iteration, ...). Passing the frame to its
        # `__init__` makes `self.items` that dataframe.
        super().__init__(df)

        self.y_names,self.device = L(y_names),device
        if y_block is None and self.y_names:
            # No block given: use a `CategoryBlock` unless every y column is numeric,
            # in which case a `RegressionBlock` is used
            ys = df[self.y_names]
            if len(ys.select_dtypes(include='number').columns)!=len(ys.columns): y_block = CategoryBlock()
            else: y_block = RegressionBlock()
        if y_block is not None and do_setup:
            # A block may be passed as a class/factory; its type transforms are appended to the procs
            if callable(y_block): y_block = y_block()
            procs = L(procs) + y_block.type_tfms
        # The procs are transforms and are gathered into a `Pipeline`.
        # Unlike in `TfmdLists`, `TfmdDL` etc. they are NOT applied lazily but ahead of time:
        # processing rows is cheap compared with e.g. opening an image, most tabular operations are
        # designed to work quickly on many rows at once, and most procs are one-off data cleaning
        # rather than data augmentation.
        self.cat_names,self.cont_names,self.procs = L(cat_names),L(cont_names),Pipeline(procs)
        # Number of rows in the first split (all rows when there are no splits); see `subset`
        self.split = len(df) if splits is None else len(splits[0])
        if do_setup: self.setup()

    # Wrap `df` in another object of the same type that shares this one's procs, column names and
    # device; setup and memory reduction are not run again
    def new(self, df):
        return type(self)(df, do_setup=False, reduce_memory=False, y_block=TransformBlock(),
                          **attrdict(self, 'procs','cat_names','cont_names','y_names', 'device'))

    # Subset 0 is the first `self.split` rows, any other index the remaining rows
    def subset(self, i): return self.new(self.items[slice(0,self.split) if i==0 else slice(self.split,len(self))])
    # Replaces `self.items` with a copy of the frame and returns `self` (not a new object)
    def copy(self): self.items = self.items.copy(); return self
    def decode(self): return self.procs.decode(self)
    # Decode a single row by wrapping it in a one-row frame
    def decode_row(self, row): return self.new(pd.DataFrame(row).T).decode().items.iloc[0]
    # Display the decoded form of the first `max_n` rows
    def show(self, max_n=10, **kwargs): display_df(self.new(self.all_cols[:max_n]).decode().items)
    def setup(self): self.procs.setup(self)
    def process(self): self.procs(self)
    # The methods from `loc` to `y` below are turned into properties by the `properties(...)` call after the class
    def loc(self): return self.items.loc
    def iloc(self): return _TabIloc(self)
    def targ(self): return self.items[self.y_names]
    def x_names (self): return self.cat_names + self.cont_names
    def n_subsets(self): return 2
    def y(self): return self[self.y_names[0]]
    # An object with the same columns and no rows
    def new_empty(self): return self.new(pd.DataFrame({}, columns=self.items.columns))
    # Only records the device on the object
    def to_device(self, d=None):
        self.device = d
        return self

    def all_col_names (self):
        ys = [n for n in self.y_names if n in self.items.columns]
        # The y names are included only when every one of them is a column of the frame.
        # `x_names` and `y_names` are both `L`s, so `+` concatenates them.
        return self.x_names + self.y_names if len(ys) == len(self.y_names) else self.x_names

# Alternate syntax for writing `@property` above each of these methods:
# every listed name is replaced on the class by a property.
properties(Tabular,'loc','iloc','targ','all_col_names','n_subsets','x_names','y')

* `df`: A `DataFrame` of your data
* `cat_names`: Your categorical `x` variables
* `cont_names`: Your continuous `x` variables
* `y_names`: Your dependent `y` variables
  * Note: Mixed y's (such as regression and classification together) are not currently supported; multiple regression outputs or multiple classification outputs are
* `y_block`: How to sub-categorize the type of `y_names` (`CategoryBlock` or `RegressionBlock`)
* `splits`: How to split your data
* `do_setup`: Whether `Tabular` runs the data through the `procs` upon initialization
* `device`: `cuda` or `cpu`
* `inplace`: If `True`, `Tabular` will not keep a separate copy of your original `DataFrame` in memory. You should ensure `pd.options.mode.chained_assignment` is `None` before setting this
* `reduce_memory`: `fastai` will attempt to reduce the overall memory usage by the inputted `DataFrame` with `df_shrink`

In [None]:
'''
Signature: properties(cls, *ps)
Source:   
def properties(cls, *ps):
    "Change attrs in `cls` with names in `ps` to properties"
    for p in ps: setattr(cls,p,property(getattr(cls,p)))
File:      /usr/local/lib/python3.6/dist-packages/fastcore/utils.py
Type:      function
'''
properties??

In [25]:
#export
class TabularPandas(Tabular):
    "A `Tabular` object with transforms"
    def transform(self, cols, f, all_col=True):
        # With `all_col=False`, columns that are missing from the frame are silently skipped
        if not all_col:
            cols = [c for c in cols if c in self.items.columns]
        if len(cols) == 0: return
        # pandas applies `f` to each selected column; the results are written back
        self[cols] = self[cols].transform(f)

In [26]:
#export
def _add_prop(cls, nm): 
    @property
    # read version of property which grabs 'nm'_names so 
    # cat_names, cont_names etc and accesses into that df 
    # with the list of columns
    def f(o): return o[list(getattr(o,nm+'_names'))]
    @f.setter
    # setter helps to set 'nm'_names to provided value v
    #
    def fset(o, v): o[getattr(o,nm+'_names')] = v
    setattr(cls, nm+'s', f)
    setattr(cls, nm+'s', fset)

# Adds the `cats`, `conts`, `ys`, `xs` and `all_cols` properties to `Tabular`,
# each backed by the matching `*_names` attribute
_add_prop(Tabular, 'cat')
_add_prop(Tabular, 'cont')
_add_prop(Tabular, 'y')
_add_prop(Tabular, 'x')
_add_prop(Tabular, 'all_col')

In [27]:
# A df with 2 cols
df = pd.DataFrame({'a':[0,1,2,0,2], 'b':[0,0,0,0,1]})
# Create a Tabular object passing in the df and saying cat_names = 'a'
to = TabularPandas(df, cat_names='a')
# test that this pickles ok
t = pickle.loads(pickle.dumps(to))
# check that t's items and tabular object items are same
test_eq(t.items,to.items)
# 
# Coz Tabular object 'to' has only one col 'a' mentioned even though
# df has 'a' and 'b', all_cols of 'to' only has 'a'
# all_cols means all columns cont & cat & dep vars
test_eq(to.all_cols,to[['a']])
# check whether to.show() works to show col 'a'

In [28]:
to.show()

Unnamed: 0,a
0,0
1,1
2,2
3,0
4,2


In [29]:
#export
class TabularProc(InplaceTransform):
    "Base class to write a non-lazy tabular processor for dataframes"
    # Tabular procs are transforms, specifically `InplaceTransform`s: the aim is to change the
    # stored data without creating copies of it, and calling one hands back what it was given.
    # On top of the normal setup, `setup` is overridden so that the proc is also *called* on the
    # data straight away (see the `Categorify` example further down).
    def setup(self, items=None, train_setup=False): #TODO: properly deal with train_setup
        # Fit on the training subset when `items` has one. The `train_setup` argument is
        # currently ignored: `False` is always forwarded.
        fit_on = getattr(items,'train',items)
        super().setup(fit_on, train_setup=False)
        # Procs are called as soon as data is available: after setup has built the state
        # (e.g. a vocab), the proc is applied to the data right away
        target = items.items if isinstance(items,Datasets) else items
        return self(target)

    @property
    def name(self):
        # Base name followed by the stored constructor arguments, if any were recorded
        base = super().name
        return f"{base} -- {getattr(self,'__stored_args__',{})}"

`map` is a pandas method. You can pass it a function, which is going to be super slow, but you can also pass it a dict, in which case each value is looked up as a key of the dict and replaced by the corresponding value.

In [30]:
#export
def _apply_cats (voc, add, c):
    if not is_categorical_dtype(c):
        return pd.Categorical(c, categories=voc[c.name][add:]).codes+add
    return c.cat.codes+add #if is_categorical_dtype(c) else c.map(voc[c.name].o2i)
def _decode_cats(voc, c): return c.map(dict(enumerate(voc[c.name].items)))

In [None]:
'''
Init signature: CategoryMap(*args, **kwargs)
Source:        
class CategoryMap(CollBase):
    "Collection of categories with the reverse mapping in `o2i`"
    def __init__(self, col, sort=True, add_na=False, strict=False):
        if is_categorical_dtype(col):
            items = L(col.cat.categories, use_list=True)
            #Remove non-used categories while keeping order
            if strict: items = L(o for o in items if o in col.unique())
        else:
            if not hasattr(col,'unique'): col = L(col, use_list=True)
            # `o==o` is the generalized definition of non-NaN used by Pandas
            items = L(o for o in col.unique() if o==o)
            if sort: items = items.sorted()
        self.items = '#na#' + items if add_na else items
        self.o2i = defaultdict(int, self.items.val2idx()) if add_na else dict(self.items.val2idx())

    def map_objs(self,objs):
        "Map `objs` to IDs"
        return L(self.o2i[o] for o in objs)

    def map_ids(self,ids):
        "Map `ids` to objects in vocab"
        return L(self.items[o] for o in ids)

    def __eq__(self,b): return all_equal(b,self)
File:           /usr/local/lib/python3.6/dist-packages/fastai/data/transforms.py
Type:           type
'''
CategoryMap??

In [31]:
#export
class Categorify(TabularProc):
    "Transform the categorical variables to something similar to `pd.Categorical`"
    # Plays, for the categorical columns of a `Tabular` object, the role that categorisation
    # plays for dependent variables
    order = 1

    def setups(self, to):
        # Setup only builds the metadata: one `CategoryMap` (the vocab) per categorical column,
        # read through `to.iloc`, stored in `self.classes` keyed by column name
        vocabs = {}
        for n in to.cat_names:
            vocabs[n] = CategoryMap(to.iloc[:,n].items, add_na=(n in to.cat_names))
        store_attr(classes=vocabs)

    # `encodes` is kept separate from `setups` because at inference time only the encoding must
    # run, against the vocab built during training; at training time both run.
    def encodes(self, to):
        # `to.transform` applies the function to each categorical column; codes are shifted by 1
        encoder = partial(_apply_cats, self.classes, 1)
        to.transform(to.cat_names, encoder)

    def decodes(self, to):
        # Turn the codes back into vocab entries, column by column
        decoder = partial(_decode_cats, self.classes)
        to.transform(to.cat_names, decoder)

    # Indexing the proc with a column name gives that column's vocab
    def __getitem__(self,k): return self.classes[k]

In [32]:
#export
# NOTE(review): decorating with `@Categorize` presumably registers each function below as the
# `Tabular`-typed implementation of the same-named `Categorize` method - confirm against fastcore's `Transform`.
@Categorize
def setups(self, to:Tabular):
    # Nothing to build when there is no dependent variable
    if len(to.y_names) > 0:
        if self.vocab is None:
            # No vocab supplied: derive it from the first y column, using `to.train` when `to` has one
            self.vocab = CategoryMap(getattr(to, 'train', to).iloc[:,to.y_names[0]].items, strict=True)
        else:
            # A vocab was supplied: wrap it in the given order (no sorting)
            self.vocab = CategoryMap(self.vocab, sort=False, add_na=self.add_na)
        self.c = len(self.vocab)
    # Apply the transform to `to` straight away
    return self(to)

@Categorize
def encodes(self, to:Tabular):
    # Every y column is encoded with the same vocab and no shift of the codes (`add` is 0);
    # `all_col=False` skips y columns that are missing from the frame
    to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0), all_col=False)
    return to

@Categorize
def decodes(self, to:Tabular):
    # Inverse of `encodes`: map the codes of each y column present back to vocab entries
    to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}), all_col=False)
    return to

In [None]:
show_doc(Categorify, title_level=3)

<h3 id="Categorify" class="doc_header"><code>class</code> <code>Categorify</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>Categorify</code>(**`enc`**=*`None`*, **`dec`**=*`None`*, **`split_idx`**=*`None`*, **`order`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Transform the categorical variables to something similar to `pd.Categorical`

In [33]:
df = pd.DataFrame({'a':[0,1,2,0,2]})
# Build a tabular object from a dataframe, the procs to run and the names of the
# categorical columns those procs should treat as categories.
# `Categorify` is passed as a class rather than an instance: the `Pipeline` that
# holds the procs instantiates any type it is given.
to = TabularPandas(df, Categorify, 'a')
# `to.procs` is a `Pipeline`; the `Categorify` instance is looked up on it under
# the snake-case form of its class name.
cat = to.procs.categorify
# Indexing the proc by column name (`Categorify.__getitem__`) returns that column's vocab.
# Every vocab starts with '#na#', so that a value outside the vocab can be encoded as 0.
test_eq(cat['a'], ['#na#',0,1,2])
# The column now holds indices into the vocab instead of the raw values
test_eq(to['a'], [1,2,3,1,3])
# `show` displays the decoded values
to.show()

Unnamed: 0,a
0,0
1,1
2,2
3,0
4,2


In [None]:
# skr adds
print(type(to.procs)) # Pipeline
print(type(cat['a'])) # CategoryMap
print(cat['a'].items) # same as vocab
print(cat['a'].o2i) # reverse dict

<class 'fastcore.transform.Pipeline'>
<class 'fastai.data.transforms.CategoryMap'>
['#na#', 0, 1, 2]
defaultdict(<class 'int'>, {'#na#': 0, 0: 1, 1: 2, 2: 3})


In [34]:
# say test set or inference
# create a new to for the testset with the same metadata & procs
# as we had before, so same vocab, same cont, cat vars, 
# way to do that is to start with same tabular object as before and
# pass in a new dataframe - here df1 passed in to tabular object to
# and we get back to1 a new tabular object with same metadata, vocab
# but with new data.
# Test-set / inference scenario: `to.new` wraps a new dataframe in a tabular object that
# shares the procs and metadata (cat/cont/y names, and hence the vocab) of `to`.
df1 = pd.DataFrame({'a':[1,0,3,-1,2]})
to1 = to.new(df1)
# Setup must not run again, since it would rebuild the vocab; `process` only applies the procs
to1.process()
# 3 and -1 are not in the vocab: values that weren't in the training df are sent to 0 (na)
test_eq(to1['a'], [2,1,0,0,3])
# Decoding gives back the original data, except that the two unknown values come back as '#na#'
to2 = cat.decode(to1)
test_eq(to2['a'], [1,0,'#na#','#na#',2])

Important: decoding in fastai does NOT always give you back exactly what you started with. For some transforms, such as normalization, it does; for others, such as `Categorify`, it does not (values outside the vocab come back as `#na#`).

In [35]:
#test with splits

# not necessary to pass in type name, can instantiate ourselves
# as in cat is instance of Categorify. Then no need to pull out the
# cat as to.procs.categorify
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2]})
# splits says first three elems of 'a' are in training set and last 2 in val set
# so elem 3 in 'a' is NOT in training set. So value 3 should NOT be part of vocab
# 
# We pass in splits as list of lists of indices to creation of tabular object.
# 
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]])
# vocab has only #na# and 0, 1, 2 and NO 3
test_eq(cat['a'], ['#na#',0,1,2])
# Check that vocab does not include value 3 - it does NOT
test_eq(to['a'], [1,2,3,0,3])

In [36]:
# skr adds - check dataset
print(to.train)
# Notice that the values for a are indices into the vocab
# ie indices 1, 2, 3 which point to values 0, 1, 2
print()
print(to.valid)
# Indices 0 and 3 into vocab so values #na# and 2.

   a
0  1
1  2
2  3

   a
3  0
4  3


In [37]:
to.n_subsets

2

In [38]:
# Other way to use Categorify 
# Create categorical col in Pandas itself, one reason to do this is defining not just categories
# but their order as well. Pandas is also efficient at dealing with categories.
df = pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)})
# Now when we use Categorify as before we get same result EXCEPT
to = TabularPandas(df, Categorify, 'a')
cat = to.procs.categorify
# The Categorify processor keeps the categories in the order defined in Pandas and matches them that way,
# and is possibly more efficient since it uses the Pandas internal _cat_codes code
test_eq(cat['a'], ['#na#','H','M','L'])
test_eq(to.items.a, [2,1,3,2])
to2 = cat.decode(to)
test_eq(to2['a'], ['M','H','L','M'])

In [40]:
#test with targets
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
test_eq(to['a'], [1, 2, 3, 0, 3])
test_eq(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2['b'], ['a', 'b', 'a', 'b', 'b'])

In [None]:
# duplicate?
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'b', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])
test_eq(to['b'], [0,1,0,1,1])
to2 = to.procs.decode(to)
test_eq(to2['b'], ['a', 'b', 'a', 'b', 'b'])

In [None]:
#test with targets and train
cat = Categorify()
df = pd.DataFrame({'a':[0,1,2,3,2], 'b': ['a', 'b', 'a', 'c', 'b']})
to = TabularPandas(df, cat, 'a', splits=[[0,1,2],[3,4]], y_names='b')
test_eq(to.vocab, ['a', 'b'])

In [None]:
#export
@Normalize
def setups(self, to:Tabular):
    "Compute per-column `means` and `stds` of the continuous variables and store them on `self`."
    # The statistics come from `to.train` when `to` has a `train` attribute and
    # from `to` itself otherwise, so the same code serves objects with and
    # without a train/valid split. Only the continuous columns (`conts`) are used.

    # `mean()`/`std()` give one value per continuous column (akin to `df.mean()`);
    # `ddof=0` is passed to `std`, and 1e-7 is added to every std so that the
    # division performed in `encodes` never divides by exactly zero.
    store_attr(means=dict(getattr(to, 'train', to).conts.mean()),
               stds=dict(getattr(to, 'train', to).conts.std(ddof=0)+1e-7))
    # Return the result of calling this transform on `to`.
    return self(to)

@Normalize
def encodes(self, to:Tabular):
    "Standardize the continuous columns of `to` with the stored `means` and `stds`."
    # All continuous columns are handled at once.
    centered = to.conts - self.means
    to.conts = centered / self.stds
    return to

@Normalize
def decodes(self, to:Tabular):
    "Reverse `encodes`: multiply the continuous columns by `stds`, then add `means`."
    rescaled = to.conts * self.stds
    to.conts = rescaled + self.means
    return to

In [None]:
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a')
x = np.array([0,1,2,3,4])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (x-m)/s)

In [None]:
df1 = pd.DataFrame({'a':[5,6,7]})
#provide df1 to create a new to using to
to1 = to.new(df1)
#only process no setup
to1.process()
# check normalize is applied using to's m and s
test_close(to1['a'].values, (np.array([5,6,7])-m)/s)
to2 = norm.decode(to1)
# decode gets back orig values
test_close(to2['a'].values, [5,6,7])

In [None]:
# using splits for training and validation
norm = Normalize()
df = pd.DataFrame({'a':[0,1,2,3,4]})
to = TabularPandas(df, norm, cont_names='a', splits=[[0,1,2],[3,4]])
# so m and s are of training set of to 
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to['a'].values, (np.array([0,1,2,3,4])-m)/s)

In [None]:
#export
class FillStrategy:
    "Namespace containing the various filling strategies."
    # Every strategy is called as `strategy(c, fill)`: `c` is the column to
    # summarise and `fill` a user-supplied value that only `constant` uses.
    def median(c, fill):
        "Median of column `c`."
        return c.median()

    def constant(c, fill):
        "The supplied `fill` value, whatever `c` contains."
        return fill

    def mode(c, fill):
        "Most frequent non-missing value of column `c`."
        counts = c.dropna().value_counts()
        return counts.idxmax()

Currently, filling with the `median`, a `constant`, and the `mode` are supported.

In [None]:
#export
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        # `fill_vals` maps a column name to the value handed to the strategy as `fill`.
        # A fresh `defaultdict(int)` per instance gives 0 for columns without an entry.
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr()

    def setups(self, dsets):
        "Compute one fill value for every continuous column of `dsets` that has missing entries."
        # `fill_strategy` is replaced by its name at the end of this method, so when
        # the same processor is set up a second time it is a string. Map the name
        # back to the matching `FillStrategy` function instead of calling the string.
        strategy = self.fill_strategy
        if isinstance(strategy, str): strategy = getattr(FillStrategy, strategy)
        # boolean per continuous column: does it contain any missing value?
        missing = pd.isnull(dsets.conts).any()
        # `na_dict` has one entry per column with missing values; the value is what
        # the strategy returns for that column and its configured fill value.
        store_attr(na_dict={n:strategy(dsets[n], self.fill_vals[n])
                            for n in missing[missing].keys()})
        self.fill_strategy = strategy.__name__

    def encodes(self, to):
        "Fill missing continuous values and, when `add_col` is set, add a boolean `<col>_na` column."
        missing = pd.isnull(to.conts)
        has_missing = missing.any()
        # A column that has NaNs now but had none at setup time has no fill value.
        for n in has_missing[has_missing].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        for n in self.na_dict.keys():
            # NOTE(review): this in-place `fillna` through `to[n]` only changes the
            # underlying data if `to[n]` is not a copy -- confirm with the pandas
            # version in use (copy-on-write changes this behaviour).
            to[n].fillna(self.na_dict[n], inplace=True)
            if self.add_col:
                # the new column is True where the original value was missing
                to.loc[:,n+'_na'] = missing[n]
                # adding the column to the data is not enough: it must also be
                # listed in `cat_names` of the tabular object
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')

In [None]:
show_doc(FillMissing, title_level=3)

<h3 id="FillMissing" class="doc_header"><code>class</code> <code>FillMissing</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>FillMissing</code>(**`fill_strategy`**=*`median`*, **`add_col`**=*`True`*, **`fill_vals`**=*`None`*) :: [`TabularProc`](/tabular.core.html#TabularProc)

Fill the missing values in continuous columns.

In [None]:
# Creating 3 different processes with 3 diff fill strategies 
fill1,fill2,fill3 = (FillMissing(fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
# df with missing value
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]})
df1 = df.copy(); df2 = df.copy()
# tos with 3 diff processes
tos = (TabularPandas(df, fill1, cont_names='a'),
       TabularPandas(df1, fill2, cont_names='a'),
       TabularPandas(df2, fill3, cont_names='a'))
# make sure that na_dict for 'a' col has appropriate fill value
# of median, constant or mode value of col 'a'. 
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

# check that to has cat_names added 'a_na'
# NOT enough to add to df, also need to add to cat_names of tabular object.
for t in tos: test_eq(t.cat_names, ['a_na'])

for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_['a'].values, np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))

In [None]:
fill = FillMissing() 
df = pd.DataFrame({'a':[0,1,np.nan,1,2,3,4], 'b': [0,1,2,3,4,5,6]})
to = TabularPandas(df, fill, cont_names=['a', 'b'])
test_eq(fill.na_dict, {'a': 1.5})
test_eq(to.cat_names, ['a_na'])
test_eq(to['a'].values, np.array([0, 1, 1.5, 1, 2, 3, 4]))
test_eq(to['a_na'].values, np.array([0, 0, 1, 0, 0, 0, 0]))
test_eq(to['b'].values, np.array([0,1,2,3,4,5,6]))

## TabularPandas Pipelines -

In [None]:
procs = [Normalize, Categorify, FillMissing, noop]
# a col cat b col cont
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]})
to = TabularPandas(df, procs, cat_names='a', cont_names='b')

# The procs will only work on cols of specific types 
# eg normalize only on cont cols, categorify only on cat cols
# ALSO: for cat cols we also categorize Dependent vars but for 
# normalize we don't normalize dependent vars but typically do
# a sigmoid in the model or something like that.

#Test setup and apply on df_main
test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})

In [None]:
#Test apply on y_names
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to['b'].values, (x-m)/s)
test_eq(to.classes, {'a': ['#na#',0,1,2], 'b_na': ['#na#',False,True]})
test_eq(to.vocab, ['a','b'])

In [None]:
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, 'a', 'b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to['a'], [1,2,3,2,2,3,1])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,1,2,1,1,1,1])
test_eq(to['c'], [1,0,1,0,0,1,0])

In [None]:
df = pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']})
to = TabularPandas(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

test_eq(to.cat_names, ['a', 'b_na'])
# coz of split there is no 2 in vocab of a, hence 0,2,0 last 
test_eq(to['a'], [1,2,2,1,0,2,0])
test_eq(df.a.dtype,int)
test_eq(to['b_na'], [1,2,1,1,1,1,1])
test_eq(to['c'], [1,0,0,0,1,0,1])

In [None]:
#export
def _maybe_expand(o): return o[:,None] if o.ndim==1 else o

To feed the model we need tensors: one for the categorical variables, one for the continuous variables and one for the dependent variables. The categorical and continuous variables have different data types, so they cannot share a single tensor (all elements of a tensor must have the same data type).

In [None]:
#export
class ReadTabBatch(ItemTransform):
    "Lazy transform, applied as batches are fetched, that turns tabular rows into tensors and back."
    # `to` is the tabular object used by `decodes` as the template for rebuilding.
    def __init__(self, to): self.to = to

    def encodes(self, to):
        "Return `(cats,)` or `(cats, conts)` tensors for `to`, followed by the targets when present."
        # Independent variables: categorical columns as a long tensor and, when
        # `to.with_cont` is set, continuous columns as a float tensor.
        if not to.with_cont: res = (tensor(to.cats).long(),)
        else: res = (tensor(to.cats).long(),tensor(to.conts).float())
        # The dependent variable is appended only if every name in `y_names`
        # is a column of `to.items`.
        ys = [n for n in to.y_names if n in to.items.columns]
        if len(ys) == len(to.y_names): res = res + (tensor(to.targ),)
        if to.device is not None: res = to_device(res, to.device)
        return res

    def decodes(self, o):
        "Rebuild a tabular object (via `self.to.new`) from the tensors in `o`."
        # drop empty parts and make every remaining part 2-d so they can be joined column-wise
        o = [_maybe_expand(o_) for o_ in to_np(o) if o_.size != 0]
        vals = np.concatenate(o, axis=1)
        # Try all column names first and fall back to the input names only
        # (presumably needed when the batch carries no targets -- TODO confirm).
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt and SystemExit are not swallowed by the fallback.
        try: df = pd.DataFrame(vals, columns=self.to.all_col_names)
        except Exception: df = pd.DataFrame(vals, columns=self.to.x_names)
        to = self.to.new(df)
        return to

In [None]:
#export
@typedispatch
def show_batch(x: Tabular, y, its, max_n=10, ctxs=None):
    "Show a `Tabular` batch by calling `x.show()`."
    # `y`, `its`, `max_n` and `ctxs` are accepted but not used in this body.
    x.show()

In [None]:
from torch.utils.data.dataloader import _MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter,_DatasetKind

In [None]:
_loaders = (_MultiProcessingDataLoaderIter,_SingleProcessDataLoaderIter)

In [None]:
#export
@delegates()
class TabDataLoader(TfmdDL):
    "A transformed `DataLoader` for Tabular data"
    # Everything is done a whole batch at a time (important for things like rapids):
    # rows are never pulled out one by one and collated afterwards, which is why
    # the per-item step `do_item` is replaced by a no-op.
    do_item = noops
    def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
        # When no `after_batch` is given, the default is the batch transforms of a
        # plain `TransformBlock` followed by `ReadTabBatch(dataset)`. An explicitly
        # supplied `after_batch` is used exactly as passed.
        if after_batch is None:
            default_tfms = L(TransformBlock().batch_tfms)
            after_batch = default_tfms+ReadTabBatch(dataset)
        super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

    def create_batch(self, b):
        "Take all samples of batch `b` directly from the dataset with `iloc` instead of collating items."
        return self.dataset.iloc[b]

TabularPandas._dl_type = TabDataLoader

## Integration example

For a more in-depth explanation, see the [tabular tutorial](http://docs.fast.ai/tutorial.tabular)

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_test.drop('salary', axis=1, inplace=True)
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

In [None]:
to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="salary", splits=splits)

In [None]:
dls = to.dataloaders()
dls.valid.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,False,48.0,202466.99966,13.0,>=50k
1,Private,Some-college,Married-civ-spouse,Craft-repair,Husband,White,False,24.000001,223810.998444,10.0,<50k
2,Private,Assoc-acdm,Married-civ-spouse,Prof-specialty,Husband,White,False,30.0,54608.000867,12.0,<50k
3,Private,HS-grad,Married-civ-spouse,Handlers-cleaners,Husband,White,False,39.0,201409.999946,9.0,<50k
4,State-gov,Some-college,Divorced,Prof-specialty,Not-in-family,Black,False,55.0,181641.000022,10.0,<50k
5,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,False,22.0,113549.998637,9.0,<50k
6,Private,Some-college,Never-married,Craft-repair,Not-in-family,White,False,44.0,109272.998341,10.0,<50k
7,Private,HS-grad,Divorced,Adm-clerical,Not-in-family,White,False,52.999999,195638.000069,9.0,<50k
8,Private,11th,Never-married,Craft-repair,Other-relative,White,False,32.0,375832.999583,7.0,<50k
9,Private,HS-grad,Married-civ-spouse,Adm-clerical,Wife,White,False,26.0,202091.000094,9.0,<50k


In [None]:
to.show()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
1093,?,Some-college,Married-civ-spouse,?,Husband,White,False,66.0,260111.0,10.0,>=50k
2768,Local-gov,Masters,Married-civ-spouse,Prof-specialty,Husband,White,False,60.0,141637.0,14.0,>=50k
1917,Private,Some-college,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,False,49.0,81973.0,10.0,>=50k
6176,Private,7th-8th,Never-married,Other-service,Own-child,White,False,39.0,194287.0,4.0,<50k
6195,Self-emp-not-inc,HS-grad,Married-civ-spouse,Other-service,Husband,White,False,37.0,35330.0,9.0,<50k
6102,Self-emp-not-inc,HS-grad,Divorced,Sales,Not-in-family,White,False,41.0,89942.0,9.0,<50k
8214,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,False,23.0,119838.0,13.0,<50k
9562,Private,Doctorate,Divorced,Prof-specialty,Not-in-family,White,False,29.0,195284.0,16.0,>=50k
1724,Private,10th,Separated,Craft-repair,Unmarried,White,False,32.0,184833.0,6.0,<50k
4423,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,False,36.0,437909.0,9.0,<50k


We can decode any set of transformed data by calling `to.decode_row` with our raw data:

In [None]:
row = to.items.iloc[0]
to.decode_row(row)

age                                  66
workclass                             ?
fnlwgt                           260111
education                  Some-college
education-num                        10
marital-status       Married-civ-spouse
occupation                            ?
relationship                    Husband
race                              White
sex                                Male
capital-gain                          0
capital-loss                          0
hours-per-week                       40
native-country            United-States
salary                            >=50k
education-num_na                  False
Name: 1093, dtype: object

In [None]:
to_tst = to.new(df_test)
to_tst.process()
to_tst.items.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,education-num_na
10000,0.461344,5,1.349813,10,1.17349,3,2,1,2,Male,0,0,40,Philippines,1
10001,-0.930039,5,1.262382,12,-0.431008,3,15,1,4,Male,0,0,40,United-States,1
10002,1.047189,5,0.154538,2,-1.233257,1,9,2,5,Female,0,0,37,United-States,1
10003,0.534575,5,-0.280595,12,-0.431008,7,2,5,5,Female,0,0,43,United-States,1
10004,0.754267,6,1.452827,9,0.371241,3,5,1,5,Male,0,0,60,United-States,1


In [None]:
tst_dl = dls.valid.new(to_tst)
tst_dl.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
0,Private,Bachelors,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,False,45.0,338104.998726,13.0
1,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,Other,False,26.0,328663.005114,9.0
2,Private,11th,Divorced,Other-service,Not-in-family,White,False,52.999999,209021.999726,7.0
3,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,False,46.0,162030.000378,9.0
4,Self-emp-inc,Assoc-voc,Married-civ-spouse,Exec-managerial,Husband,White,False,49.0,349230.002765,11.0
5,Local-gov,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,False,34.0,124826.998872,10.0
6,Self-emp-inc,Some-college,Married-civ-spouse,Sales,Husband,White,False,52.999999,290639.997877,10.0
7,Private,Some-college,Never-married,Sales,Own-child,White,False,19.0,106273.002965,10.0
8,Private,Some-college,Married-civ-spouse,Protective-serv,Husband,Black,False,72.000001,53683.999433,10.0
9,Private,Some-college,Never-married,Sales,Own-child,White,False,19.999999,505980.004135,10.0


## Other target types

### Multi-label categories

#### one-hot encoded label

In [None]:
def _mock_multi_label(df):
    sal,sex,white = [],[],[]
    for row in df.itertuples():
        sal.append(row.salary == '>=50k')
        sex.append(row.sex == ' Male')
        white.append(row.race == ' White')
    df['salary'] = np.array(sal)
    df['male']   = np.array(sex)
    df['white']  = np.array(white)
    return df

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

In [None]:
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,male,white
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,True,False,True
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,True,True,True
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,False,False,False
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,True,True,False
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,False,False,False


In [None]:
#export
@EncodedMultiCategorize
def setups(self, to:Tabular):
    "Set `c` to the size of the vocab, then return the result of calling the transform on `to`."
    self.c = len(self.vocab)
    return self(to)

@EncodedMultiCategorize
def encodes(self, to:Tabular):
    "Nothing to encode here: `to` is returned unchanged."
    return to

@EncodedMultiCategorize
def decodes(self, to:Tabular):
    "Apply an `== 1` comparison to the target columns through `to.transform`."
    def _is_one(col): return col==1
    to.transform(to.y_names, _is_one)
    return to

In [None]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))
y_names=["salary", "male", "white"]

In [None]:
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names=y_names, y_block=MultiCategoryBlock(encoded=True, vocab=y_names), splits=splits)

CPU times: user 73.8 ms, sys: 0 ns, total: 73.8 ms
Wall time: 75.2 ms


In [None]:
dls = to.dataloaders()
dls.valid.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary,male,white
0,Private,HS-grad,Never-married,Craft-repair,Not-in-family,White,False,44.0,221171.998543,9.0,False,True,True
1,Self-emp-not-inc,Assoc-voc,Married-civ-spouse,Farming-fishing,Husband,White,False,28.0,39388.002873,11.0,False,True,True
2,Private,Bachelors,Never-married,Prof-specialty,Own-child,White,False,24.000001,126612.99824,13.0,False,False,True
3,Private,HS-grad,Never-married,Adm-clerical,Unmarried,White,False,30.0,45781.000285,9.0,False,False,True
4,Private,HS-grad,Married-civ-spouse,#na#,Husband,White,True,40.0,104196.002008,10.0,False,True,True
5,Self-emp-not-inc,Masters,Married-civ-spouse,Exec-managerial,Husband,White,False,58.000001,130714.00295,14.0,False,True,True
6,Private,11th,Never-married,Adm-clerical,Unmarried,Black,False,23.000001,177087.000463,7.0,False,True,False
7,Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,False,36.0,99871.998153,9.0,False,True,False
8,Self-emp-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,54.000001,129431.998305,13.0,True,True,True
9,Private,Assoc-voc,Married-civ-spouse,Craft-repair,Husband,White,False,47.0,326856.996506,11.0,False,True,True


#### Not one-hot encoded

In [None]:
def _mock_multi_label(df):
    targ = []
    for row in df.itertuples():
        labels = []
        if row.salary == '>=50k': labels.append('>50k')
        if row.sex == ' Male':   labels.append('male')
        if row.race == ' White': labels.append('white')
        targ.append(' '.join(labels))
    df['target'] = np.array(targ)
    return df

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

In [None]:
df_main.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,target
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k,>50k white
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k,>50k male white
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k,
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k,>50k male
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k,


In [None]:
@MultiCategorize
def encodes(self, to:Tabular): 
    "Pass-through for `Tabular` inputs: `to` is returned unchanged."
    # Disabled code kept for reference; it is not executed.
    #to.transform(to.y_names, partial(_apply_cats, {n: self.vocab for n in to.y_names}, 0))
    return to
  
@MultiCategorize
def decodes(self, to:Tabular): 
    "Pass-through for `Tabular` inputs: `to` is returned unchanged."
    # Disabled code kept for reference; it is not executed.
    #to.transform(to.y_names, partial(_decode_cats, {n: self.vocab for n in to.y_names}))
    return to

In [None]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

In [None]:
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names="target", y_block=MultiCategoryBlock(), splits=splits)

CPU times: user 73.8 ms, sys: 1.13 ms, total: 74.9 ms
Wall time: 77.1 ms


In [None]:
to.procs[2].vocab

['-', '_', 'a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y']

### Regression

In [None]:
#export
@RegressionSetup
def setups(self, to:Tabular):
    "Set `c` to the number of target columns unless it already has a value."
    if self.c is None:
        self.c = len(to.y_names)
        return to
    # `c` was already set: nothing to do (and nothing is returned)
    return None

@RegressionSetup
def encodes(self, to:Tabular):
    "Pass-through: `to` is returned unchanged."
    return to

@RegressionSetup
def decodes(self, to:Tabular):
    "Pass-through: `to` is returned unchanged."
    return to

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')
df_main,df_test = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_main = _mock_multi_label(df_main)

In [None]:
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]
splits = RandomSplitter()(range_of(df_main))

In [None]:
%time to = TabularPandas(df_main, procs, cat_names, cont_names, y_names='age', splits=splits)

CPU times: user 85.6 ms, sys: 1.4 ms, total: 87 ms
Wall time: 92.8 ms


In [None]:
to.procs[-1].means

{'education-num': 10.0625, 'fnlwgt': 192767.350625}

In [None]:
dls = to.dataloaders()
dls.valid.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,fnlwgt,education-num,age
0,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,125784.000982,13.0,76.0
1,Local-gov,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,160472.001568,13.0,45.0
2,Self-emp-not-inc,HS-grad,Never-married,Sales,Not-in-family,White,False,195123.999982,9.0,41.0
3,Private,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,303154.995525,13.0,42.0
4,Self-emp-inc,HS-grad,Married-civ-spouse,Exec-managerial,Wife,White,False,154536.999077,9.0,58.0
5,?,HS-grad,Never-married,?,Own-child,Amer-Indian-Eskimo,False,99483.000273,9.0,31.0
6,Private,Bachelors,Divorced,Exec-managerial,Not-in-family,White,False,51099.994821,13.0,38.0
7,Private,HS-grad,Never-married,Adm-clerical,Other-relative,Black,False,429346.003824,9.0,22.0
8,?,Some-college,Never-married,?,Own-child,White,False,205940.000277,10.0,21.0
9,?,11th,Married-civ-spouse,?,Husband,White,False,49193.999511,7.0,64.0


## Not being used now - for multi-modal

In [None]:
class TensorTabular(fastuple):
    "Tuple of tabular tensors that builds and displays its own show contexts."
    def get_ctxs(self, max_n=10, **kwargs):
        "Return one empty row per sample to show: the size of the first element's first axis, capped at `max_n`."
        n_rows = min(self[0].shape[0], max_n)
        blank = pd.DataFrame(index = range(n_rows))
        return [blank.iloc[pos] for pos in range(n_rows)]

    def display(self, ctxs):
        "Gather `ctxs` into a DataFrame and hand it to `display_df`."
        display_df(pd.DataFrame(ctxs))

class TabularLine(pd.Series):
    "A line of a dataframe that knows how to show itself"
    def show(self, ctx=None, **kwargs):
        "Return `self` when no `ctx` is given; otherwise return the result of appending this line to `ctx`."
        if ctx is None: return self
        # `Series.append` was removed in pandas 2.0. `pd.concat` does the same job
        # and also exists in older pandas releases.
        if isinstance(ctx, pd.Series): return pd.concat([ctx, self])
        # any other kind of context keeps using its own `append`
        return ctx.append(self)

class ReadTabLine(ItemTransform):
    "Encode one row into a `TensorTabular` of (categorical, continuous) tensors and decode it into a `TabularLine`."
    # `proc` supplies `cat_names`, `cont_names`, `y_names` and a `decode` method.
    def __init__(self, proc): self.proc = proc

    def encodes(self, row):
        "Look up the categorical and continuous columns in `row` and return them as long / float tensors."
        cats,conts = (o.map(row.__getitem__) for o in (self.proc.cat_names,self.proc.cont_names))
        return TensorTabular(tensor(cats).long(),tensor(conts).float())

    def decodes(self, o):
        "Decode `o` through `self.proc` and return the values as a `TabularLine` keyed by column name."
        # NOTE(review): elsewhere in this notebook `TabularPandas` is called as
        # (df, procs, cat_names, cont_names, ...); here the column names start at
        # the second positional argument -- confirm this matches the constructor.
        to = TabularPandas(o, self.proc.cat_names, self.proc.cont_names, self.proc.y_names)
        to = self.proc.decode(to)
        # pair the concatenation of the first two elements of `to.items` with the cat + cont names
        return TabularLine(pd.Series({c: v for v,c in zip(to.items[0]+to.items[1], self.proc.cat_names+self.proc.cont_names)}))

class ReadTabTarget(ItemTransform):
    "Read the target of a row as int64 and decode an index back into a `Category`."
    def __init__(self, proc):
        # `proc` supplies `y_names` and `classes`
        self.proc = proc

    def encodes(self, row):
        "Return the `y_names` entry of `row` cast to int64."
        target = row[self.proc.y_names]
        return target.astype(np.int64)

    def decodes(self, o):
        "Look index `o` up in the classes of the target and wrap the result in a `Category`."
        y_classes = self.proc.classes[self.proc.y_names]
        return Category(y_classes[o])

In [None]:
# tds = TfmdDS(to.items, tfms=[[ReadTabLine(proc)], ReadTabTarget(proc)])
# enc = tds[1]
# test_eq(enc[0][0], tensor([2,1]))
# test_close(enc[0][1], tensor([-0.628828]))
# test_eq(enc[1], 1)

# dec = tds.decode(enc)
# assert isinstance(dec[0], TabularLine)
# test_close(dec[0], pd.Series({'a': 1, 'b_na': False, 'b': 1}))
# test_eq(dec[1], 'a')

# test_stdout(lambda: print(show_at(tds, 1)), """a               1
# b_na        False
# b               1
# category        a
# dtype: object""")

## Export -

In [None]:
#hide
from nbdev.export import notebook2script
notebook2script()

Converted 00_torch_core.ipynb.
Converted 01_layers.ipynb.
Converted 02_data.load.ipynb.
Converted 03_data.core.ipynb.
Converted 04_data.external.ipynb.
Converted 05_data.transforms.ipynb.
Converted 06_data.block.ipynb.
Converted 07_vision.core.ipynb.
Converted 08_vision.data.ipynb.
Converted 09_vision.augment.ipynb.
Converted 09b_vision.utils.ipynb.
Converted 09c_vision.widgets.ipynb.
Converted 10_tutorial.pets.ipynb.
Converted 11_vision.models.xresnet.ipynb.
Converted 12_optimizer.ipynb.
Converted 13_callback.core.ipynb.
Converted 13a_learner.ipynb.
Converted 13b_metrics.ipynb.
Converted 14_callback.schedule.ipynb.
Converted 14a_callback.data.ipynb.
Converted 15_callback.hook.ipynb.
Converted 15a_vision.models.unet.ipynb.
Converted 16_callback.progress.ipynb.
Converted 17_callback.tracker.ipynb.
Converted 18_callback.fp16.ipynb.
Converted 18a_callback.training.ipynb.
Converted 19_callback.mixup.ipynb.
Converted 20_interpret.ipynb.
Converted 20a_distributed.ipynb.
Converted 21_vision.l