In [None]:
# Mount Google Drive so the course files stored there are visible to Colab.
import os

from google.colab import drive

drive.mount('/content/gdrive', force_remount=True)

# Location of the course-v4 checkout inside Drive.
root_dir = "/content/gdrive/My Drive/"
base_dir = os.path.join(root_dir, 'fastai2_library/course-v4/')

# Work from the course directory from here on.
os.chdir(base_dir)

Mounted at /content/gdrive


In [None]:
!pwd
# If the os.chdir call above did not take effect, change to base_dir with the command below instead
# %cd "/content/gdrive/My Drive/fastai2_library/course-v4/"

/content/gdrive/My Drive/fastai2_library/course-v4


In [None]:
#hide
#skip
! [[ -e /content ]] && pip install -Uqq fastai  # upgrade fastai on colab

[K     |████████████████████████████████| 194kB 3.2MB/s 
[K     |████████████████████████████████| 51kB 4.8MB/s 
[?25h

In [None]:
%cd nbs

/content/gdrive/My Drive/fastai2_library/course-v4/nbs


# Tabular training

> How to use the tabular application in fastai

To illustrate the tabular application, we will use the example of the [Adult dataset](https://archive.ics.uci.edu/ml/datasets/Adult) where we have to predict if a person is earning more or less than $50k per year using some general data.

In [None]:
from fastai.tabular.all import *

We can download a sample of this dataset with the usual `untar_data` command:

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)
path.ls()

(#3) [Path('/root/.fastai/data/adult_sample/models'),Path('/root/.fastai/data/adult_sample/adult.csv'),Path('/root/.fastai/data/adult_sample/export.pkl')]

Then we can have a look at how the data is structured:

In [None]:
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
# How many rows
len(df)

32561

In [None]:
len(df.columns)

15

To start processing a table, we need to take the categorical values and convert them to ints. For the continuous variables, any missing values need to be replaced with something (the median, the mean, etc.). Typically JH also adds a binary column for each variable that had missing values: it is True wherever the value was missing and has been replaced. So we identify which columns are continuous through `cont_names` and which columns are categorical through `cat_names`. This enables the appropriate processing of each one.

Some of the columns are continuous (like age) and we will treat them as float numbers we can feed our model directly. Others are categorical (like workclass or education) and we will convert them to a unique index that we will feed to embedding layers. We can specify our categorical and continuous column names, as well as the name of the dependent variable in `TabularDataLoaders` factory methods:

In [None]:
'''
Init signature: FillMissing(x, **kwargs)
Source:        
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, fill_strategy=FillStrategy.median, add_col=True, fill_vals=None):
        if fill_vals is None: fill_vals = defaultdict(int)
        store_attr()

    def setups(self, dsets):
        missing = pd.isnull(dsets.conts).any()
        store_attr(na_dict={n:self.fill_strategy(dsets[n], self.fill_vals[n])
                            for n in missing[missing].keys()})
        self.fill_strategy = self.fill_strategy.__name__

    def encodes(self, to):
        missing = pd.isnull(to.conts)
        for n in missing.any()[missing.any()].keys():
            assert n in self.na_dict, f"nan values in `{n}` but not in setup training set"
        for n in self.na_dict.keys():
            to[n].fillna(self.na_dict[n], inplace=True)
            if self.add_col:
                to.loc[:,n+'_na'] = missing[n]
                if n+'_na' not in to.cat_names: to.cat_names.append(n+'_na')
File:           /usr/local/lib/python3.6/dist-packages/fastai/tabular/core.py
Type:           _TfmMeta
'''
FillMissing??

In [None]:
# FillMissing defaults to fill_strategy=FillStrategy.median, but you can choose another strategy
# (e.g. FillStrategy.constant, which fills with the value given in fill_vals)
'''
Init signature: FillStrategy(*args, **kwargs)
Source:        
class FillStrategy:
    "Namespace containing the various filling strategies."
    def median  (c,fill): return c.median()
    def constant(c,fill): return fill
    def mode    (c,fill): return c.dropna().value_counts().idxmax()
File:           /usr/local/lib/python3.6/dist-packages/fastai/tabular/core.py
Type:           type
'''
FillStrategy??

In [None]:
'''
Init signature: TabularDataLoaders(*args, **kwargs)
Source:        
class TabularDataLoaders(DataLoaders):
    "Basic wrapper around several `DataLoader`s with factory methods for tabular data"
    @classmethod
    @delegates(Tabular.dataloaders, but=["dl_type", "dl_kwargs"])
    def from_df(cls, df, path='.', procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None,
                valid_idx=None, **kwargs):
        "Create from `df` in `path` using `procs`"
        if cat_names is None: cat_names = []
        if cont_names is None: cont_names = list(set(df)-set(L(cat_names))-set(L(y_names)))
        splits = RandomSplitter()(df) if valid_idx is None else IndexSplitter(valid_idx)(df)
        to = TabularPandas(df, procs, cat_names, cont_names, y_names, splits=splits, y_block=y_block)
        return to.dataloaders(path=path, **kwargs)

    @classmethod
    def from_csv(cls, csv, skipinitialspace=True, **kwargs):
        "Create from `csv` file in `path` using `procs`"
        return cls.from_df(pd.read_csv(csv, skipinitialspace=skipinitialspace), **kwargs)

    @delegates(TabDataLoader.__init__)
    def test_dl(self, test_items, rm_type_tfms=None, process=True, **kwargs):
        to = self.train_ds.new(test_items)
        if process: to.process()
        return self.valid.new(to, **kwargs)
File:           /usr/local/lib/python3.6/dist-packages/fastai/tabular/data.py
Type:           type
'''
TabularDataLoaders??

In [None]:
# Build the DataLoaders straight from the CSV: `salary` is the target, and the
# listed categorical / continuous columns are run through the three procs.
dls = TabularDataLoaders.from_csv(
    path/'adult.csv',
    path=path,
    y_names="salary",
    cat_names=[
        'workclass', 'education', 'marital-status',
        'occupation', 'relationship', 'race',
    ],
    cont_names=['age', 'fnlwgt', 'education-num'],
    procs=[Categorify, FillMissing, Normalize],
)

The last part is the list of pre-processors we apply to our data:

- `Categorify` is going to take every categorical variable and make a map from integer to unique categories, then replace the values by the corresponding index.
- `FillMissing` will fill the missing values in the continuous variables by the median of existing values (you can choose a specific value if you prefer)
- `Normalize` will normalize the continuous variables (subtract the mean and divide by the std)



To further expose what's going on below the surface, let's rewrite this utilizing `fastai`'s `TabularPandas` class. We will need to make one adjustment, which is defining how we want to split our data. By default the factory method above used a random 80/20 split, so we will do the same:

We need to decide how to split the data into training and validation sets, and pass that to `TabularPandas` as `splits`:

In [None]:
# Random train/validation split over the rows of `df`, holding out 20% for validation.
splits = RandomSplitter(valid_pct=0.2)(range_of(df))

So a Tabular object (hence the variable name `to`) in fastai is given a dataframe, the names of the categorical variables, the names of the continuous variables, the way to split the dataframe into training and validation sets, the y variable we are trying to predict, and the processes (`procs`) to apply, which we will explore shortly.

In [None]:
'''
Init signature: TabularPandas(*args, **kwargs)
Source:        
class TabularPandas(Tabular):
    "A `Tabular` object with transforms"
    def transform(self, cols, f, all_col=True):
        if not all_col: cols = [c for c in cols if c in self.items.columns]
        if len(cols) > 0: self[cols] = self[cols].transform(f)
File:           /usr/local/lib/python3.6/dist-packages/fastai/tabular/core.py
Type:           type
'''
TabularPandas??

In [None]:
'''
Init signature: Tabular(*args, **kwargs)
Source:        
class Tabular(CollBase, GetAttr, FilteredBase):
    "A `DataFrame` wrapper that knows which cols are cont/cat/y, and returns rows in `__getitem__`"
    _default,with_cont='procs',True
    def __init__(self, df, procs=None, cat_names=None, cont_names=None, y_names=None, y_block=None, splits=None,
                 do_setup=True, device=None, inplace=False, reduce_memory=True):
                 if inplace and splits is not None and pd.options.mode.chained_assignment is not None:
            warn("Using inplace with splits will trigger a pandas error. Set `pd.options.mode.chained_assignment=None` to avoid it.")
            ..........
'''
Tabular??

In [None]:
# Same columns, target and procs as the factory-method call, but built
# explicitly as a TabularPandas object with our own train/validation split.
to = TabularPandas(
    df,
    procs=[Categorify, FillMissing, Normalize],
    y_names='salary',
    cat_names=[
        'workclass', 'education', 'marital-status',
        'occupation', 'relationship', 'race',
    ],
    cont_names=['age', 'fnlwgt', 'education-num'],
    splits=splits,
)

From a Tabular object we can build dataloaders.

So we can now finally build our `DataLoaders` again:

In [None]:
dls = to.dataloaders(bs=64)

> Later we will explore why using `TabularPandas` to preprocess will be valuable.

The `show_batch` method works like for every other application:

In [None]:
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,5th-6th,Separated,Machine-op-inspct,Not-in-family,White,False,56.0,301834.996801,3.0,<50k
1,Private,Some-college,Never-married,Other-service,Own-child,White,False,31.0,273323.998893,10.0,<50k
2,Private,5th-6th,Married-civ-spouse,Machine-op-inspct,Wife,Asian-Pac-Islander,False,43.0,143582.000969,3.0,<50k
3,Self-emp-not-inc,Some-college,Married-civ-spouse,Craft-repair,Husband,White,False,55.0,35339.999911,10.0,<50k
4,Federal-gov,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,False,39.0,30916.002866,10.0,>=50k
5,Private,Some-college,Never-married,Adm-clerical,Own-child,White,False,29.0,173651.999647,10.0,<50k
6,Federal-gov,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,44.0,320070.999875,9.0,>=50k
7,Private,Some-college,Never-married,Adm-clerical,Unmarried,White,False,24.0,176320.999915,10.0,<50k
8,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,False,64.0,169871.000543,9.0,<50k
9,Private,Masters,Married-civ-spouse,Exec-managerial,Husband,White,False,38.0,266645.000378,14.0,>=50k


We can define a model using the `tabular_learner` method. When we define our model, `fastai` will try to infer the loss function based on our `y_names` earlier. 

**Note**: Sometimes with tabular data, your `y`'s may be encoded (such as 0 and 1). In such a case you should explicitly pass `y_block = CategoryBlock` in your constructor so `fastai` won't presume you are doing regression.

In [None]:
'''
Signature: tabular_learner(dls, layers=None, emb_szs=None, config=None, n_out=None, y_range=None, loss_func=None, opt_func=<function Adam at 0x7fd8ad5d4950>, lr=0.001, splitter=<function trainable_params at 0x7fd8bb215d90>, cbs=None, metrics=None, path=None, model_dir='models', wd=None, wd_bn_bias=False, train_bn=True, moms=(0.95, 0.85, 0.95))
Source:   
@log_args(to_return=True, but_as=Learner.__init__)
@delegates(Learner.__init__)
def tabular_learner(dls, layers=None, emb_szs=None, config=None, n_out=None, y_range=None, **kwargs):
    "Get a `Learner` using `dls`, with `metrics`, including a `TabularModel` created using the remaining params."
    if config is None: config = tabular_config()
    if layers is None: layers = [200,100]
    to = dls.train_ds
    emb_szs = get_emb_sz(dls.train_ds, {} if emb_szs is None else emb_szs)
    if n_out is None: n_out = get_c(dls)
    assert n_out, "`n_out` is not defined, and could not be inferred from data, set `dls.c` or pass `n_out`"
    if y_range is None and 'y_range' in config: y_range = config.pop('y_range')
    model = TabularModel(emb_szs, len(dls.cont_names), n_out, layers, y_range=y_range, **config)
    return TabularLearner(dls, model, **kwargs)
File:      /usr/local/lib/python3.6/dist-packages/fastai/tabular/learner.py
Type:      function
'''
tabular_learner??

In [None]:
learn = tabular_learner(dls, metrics=accuracy)

And we can train that model with the `fit_one_cycle` method (the `fine_tune` method won't be useful here since we don't have a pretrained model).

In [None]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,accuracy,time
0,0.374992,0.346354,0.844287,00:06


We can then have a look at some predictions:

In [None]:
learn.show_results()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary,salary_pred
0,5.0,12.0,1.0,2.0,2.0,2.0,1.0,0.472543,0.992654,-0.423447,0.0,0.0
1,5.0,10.0,3.0,5.0,1.0,5.0,1.0,-0.188046,-1.492562,1.142143,1.0,1.0
2,5.0,2.0,1.0,13.0,5.0,5.0,1.0,0.252347,1.857686,-1.206242,0.0,0.0
3,7.0,16.0,3.0,13.0,1.0,5.0,1.0,-0.408243,-0.757394,-0.032049,0.0,0.0
4,5.0,10.0,3.0,13.0,1.0,5.0,1.0,-0.041249,-0.669592,1.142143,1.0,1.0
5,7.0,1.0,1.0,6.0,5.0,5.0,1.0,0.766139,-1.236949,-1.597639,0.0,0.0
6,5.0,12.0,1.0,8.0,2.0,5.0,1.0,-0.114647,0.976366,-0.423447,0.0,0.0
7,3.0,10.0,5.0,11.0,2.0,5.0,1.0,0.766139,-0.151366,1.142143,0.0,0.0
8,5.0,12.0,5.0,9.0,4.0,5.0,1.0,-1.289029,1.067568,-0.423447,0.0,0.0


Or use the predict method on a row:

In [None]:
# Predict on a single row of the raw dataframe; the call returns the processed row,
# the predicted class index and the per-class probabilities (inspected in the cells below).
row, clas, probs = learn.predict(df.iloc[0])

In [None]:
row.show()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num,salary
0,Private,Assoc-acdm,Married-civ-spouse,#na#,Wife,White,False,49.0,101320.000268,12.0,>=50k


In [None]:
type(row)

fastai.tabular.core.TabularPandas

In [None]:
type(clas)

torch.Tensor

In [None]:
type(probs)

torch.Tensor

In [None]:
clas, probs

(tensor(1), tensor([0.4880, 0.5120]))

To get predictions on a new dataframe, you can use the `test_dl` method of the `DataLoaders`. That dataframe does not need to have the dependent variable among its columns.

In [None]:
# Unlabelled copy of the data: `drop` returns a new frame, leaving `df` untouched.
test_df = df.drop(columns=['salary'])

# Test DataLoader built from the unlabelled frame via the learner's DataLoaders.
dl = learn.dls.test_dl(test_df)

Then `Learner.get_preds` will give you the predictions:

In [None]:
learn.get_preds(dl=dl)

(tensor([[0.4880, 0.5120],
         [0.4387, 0.5613],
         [0.9856, 0.0144],
         ...,
         [0.5484, 0.4516],
         [0.6578, 0.3422],
         [0.5948, 0.4052]]), None)

## `fastai` with Other Libraries

As mentioned earlier, `TabularPandas` is a powerful and easy preprocessing tool for tabular data. Integration with libraries such as Random Forests and XGBoost requires only one extra step, which the `.dataloaders` call did for us. Let's look at our `to` again. Its values are stored in a `DataFrame`-like object, from which we can extract the `cats`, `conts`, `xs` and `ys` if we want to:

In [None]:
to.xs[:3]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
10009,5,16,5,13,4,5,1,-1.362428,3.014706,-0.032049
21900,7,13,5,11,4,5,1,-0.775237,0.482885,1.53354
21860,5,2,6,9,5,3,1,-0.261445,0.513222,-1.206242


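To preprocess our data, `process` applies all of our `procs` in place. Note that `TabularPandas` already ran them when `to` was constructed (`do_setup=True` by default), so the `xs` shown above are already encoded and normalized; calling `process` again would re-apply the `procs` to that already-processed data: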

In [None]:
# `to` was already run through its procs when the TabularPandas object was
# constructed (do_setup=True by default), so `to.process()` must NOT be called
# again here: a second pass re-applies Categorify and Normalize to data that is
# already encoded, collapsing every categorical code to 0 and normalizing the
# continuous columns twice. The encoded features are ready to use as they are.
to.xs[:3]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
10009,0,0,0,0,0,0,1,-2.930406,-1.804725,-3.958567
21900,0,0,0,0,0,0,1,-2.887307,-1.804749,-3.3458
21860,0,0,0,0,0,0,1,-2.849595,-1.804749,-4.418143


Now that everything is encoded, you can then send this off to XGBoost or Random Forests by extracting the train and validation sets and their values:

In [None]:
# Training split: encoded feature frame plus the targets flattened to 1-D.
X_train = to.train.xs
y_train = to.train.ys.values.ravel()

# Validation split, extracted the same way.
X_test = to.valid.xs
y_test = to.valid.ys.values.ravel()

And now we can directly send this in!