In [None]:
from fastai import *          # Quick accesss to most common functionality
from fastai.tabular import *  # Quick accesss to tabular functionality     # Access to example data provided with fastai

# Tabular example

Tabular data should be in a Pandas `DataFrame`.

In [None]:
def remove_last_line(t:str):
    "Return `t` without its final line, i.e. everything before the last newline ('' when `t` has no newline)."
    # `rpartition` splits on the LAST newline; if there is none, the head part is ''.
    head, _, _ = t.rpartition('\n')
    return head

In [None]:
def codes(df, idx):
    "Category codes of row `idx` for every column of `df`, each shifted up by one."
    shifted = []
    for name in df.columns:
        # Take the single row first, then read its categorical code.
        one_row = df[name].iloc[[idx]]
        # +1 shift: pandas uses code -1 for a missing value, so it comes out as 0 here.
        shifted.append(one_row.cat.codes.iloc[0] + 1)
    return shifted

In [None]:
def def_emb_sz(df, n, sz_dict=None):
    "Return `(n_cat, emb_sz)` for the categorical column `n` of `df`; a size given in `sz_dict` overrides the rule of thumb."
    # `sz_dict` is optional: a missing dict means "no overrides" instead of failing on `None.get`.
    if sz_dict is None: sz_dict = {}
    n_cat = len(df[n].cat.categories)+1  # extra cat for NA
    # Rule of thumb: about half the cardinality, capped at 50.
    sz = sz_dict.get(n, min(50, (n_cat//2)+1))
    return n_cat,sz

In [None]:
def _text2html_table(items:Collection[Collection[str]], widths:Collection[int])->str:
    html_code = f"<table>"
    for w in widths: html_code += f"  <col width='{w}px'>"
    for line in items:
        html_code += "  <tr>\n"
        html_code += "\n".join([f"    <th>{o}</th>" for o in line if len(o) >= 1])
        html_code += "\n  </tr>\n"
    return html_code + "</table>\n"

In [None]:
class TabularLine(ItemBase):
    "A single tabular row: categorical codes in `cats`, continuous values in `conts`, plus the lookups needed to print them."
    def __init__(self, cats, conts, classes, names):
        self.cats, self.conts = cats, conts
        self.classes, self.names = classes, names
        # Tensor versions of both parts, kept together as a two-element list.
        self.data = [tensor(cats), tensor(conts)]

    def __str__(self):
        "One `name value` line per column: decoded categories first, then continuous values with 4 decimals."
        n_cat = len(self.cats)
        lines = []
        # The first `n_cat` entries of `names` belong to the categorical columns, the rest to the continuous ones.
        for code, name in zip(self.cats, self.names[:n_cat]):
            # Codes are shifted by one: 0 prints as 'nan', `k` maps to `classes[name][k-1]`.
            label = self.classes[name][code-1] if code != 0 else 'nan'
            lines.append(f"{name} {label}\n")
        for value, name in zip(self.conts, self.names[n_cat:]):
            lines.append(f'{name} {value:.4f}\n')
        return ''.join(lines)

    def show_batch(self, idxs:Collection[int], rows:int, ds:Dataset, figsize:Tuple[int,int]=(9,10))->None:
        "Display the first `rows` items of `ds` selected by `idxs` as an HTML table. `figsize` is accepted but not used here."
        from IPython.display import display, HTML
        # The header row is taken from the column names of the first item of `ds`.
        first, _ = ds[0]
        table = [first.names]
        for i in idxs[:rows]:
            item, _ = ds[i]
            # Decode each categorical code to its class label (0 is shown as 'nan').
            cat_cells = [str(item.classes[n][c-1]) if c != 0 else 'nan'
                         for c, n in zip(item.cats, self.names[:len(item.cats)])]
            cont_cells = [f'{c:.4f}' for c in item.conts]
            table.append(cat_cells + cont_cells)
        display(HTML(_text2html_table(table, [10] * len(table[0]))))

class TabularList(ItemList):
    "`ItemList` for tabular data: the dataframe is kept in `xtra` and the items are just row positions."
    def __init__(self, items:Iterator, cat_names:OptStrList=None, cont_names:OptStrList=None, create_func:Callable=None, 
                 path:PathOrStr='.', xtra=None):
        #dataframe is in xtra, items is just a range of index
        assert xtra is not None and len(xtra)==len(items), "Use from_df or from_csv"
        super().__init__(range(len(items)), create_func=create_func, path=path, xtra=xtra)
        self.cat_names,self.cont_names = cat_names,cont_names
    
    @classmethod
    def from_df(cls, df:DataFrame, path:PathOrStr='.', create_func:Callable=None, cat_names:OptStrList=None, 
                cont_names:OptStrList=None)->'ItemList':
        "Create the list from the rows of `df`, with `cat_names`/`cont_names` naming its categorical/continuous columns."
        res = cls(create_func=create_func, items=range(len(df)), path=path, xtra=df,
                  cat_names=cat_names, cont_names=cont_names)
        return res
    
    def new(self, items:Iterator, xtra:Any=None)->'TabularList':
        "Create a list of the same class over `items`/`xtra`, reusing this list's column names, `create_func` and `path`."
        return self.__class__(items=items, cat_names=self.cat_names, cont_names=self.cont_names,
                              create_func=self.create_func, path=self.path, xtra=xtra)
    
    def get(self, o): 
        "Build the `TabularLine` for row `o` from the arrays cached by `preprocess` (which must have been called first)."
        return TabularLine(self.codes[o], self.conts[o], self.classes, self.col_names)
    
    def get_emb_szs(self, sz_dict=None):
        "Return one `def_emb_sz` result per categorical column; `sz_dict` optionally overrides sizes by column name."
        # Normalize here so a missing `sz_dict` works regardless of how `def_emb_sz` treats `None`.
        sz_dict = {} if sz_dict is None else sz_dict
        return [def_emb_sz(self.xtra, n, sz_dict) for n in self.cat_names]
    
    def preprocess(self, tfms=None):
        "Apply `tfms` to the dataframe in `xtra`, then cache the codes, continuous values, classes and column names."
        tfms,new_tfms = ifnone(tfms,[]),[]
        for tfm in tfms:
            if isinstance(tfm, TabularTransform):
                # Already-instantiated transform: apply it with `test=True`.
                # NOTE(review): such instances are not added to `new_tfms`, so they do not appear in `preprocess_kwargs`.
                tfm(self.xtra, test=True)
            else:
                # Transform class: instantiate it with the current column names, apply it, and remember the instance.
                tfm = tfm(self.cat_names, self.cont_names)
                tfm(self.xtra)
                new_tfms.append(tfm)
            #cat and cont names may have been changed by transform (like Fill_NA)
            # Sync in BOTH branches, so a list processed with pre-built transforms stacks the same
            # columns as the list those transforms were built on.
            self.cat_names, self.cont_names = tfm.cat_names, tfm.cont_names
        # Category codes are shifted by one (pandas uses -1 for missing, which becomes 0).
        self.codes = np.stack([c.cat.codes.values for n,c in self.xtra[self.cat_names].items()], 1).astype(np.int64) + 1
        self.conts = np.stack([c.astype('float32').values for n,c in self.xtra[self.cont_names].items()], 1)
        self.classes = {n:c.cat.categories.values for n,c in self.xtra[self.cat_names].items()}
        # Column order used by `TabularLine`: categorical names first, then continuous names.
        self.col_names = list(self.xtra[self.cat_names].columns.values) 
        self.col_names += list(self.xtra[self.cont_names].columns.values)
        self.preprocess_kwargs = {'tfms':new_tfms}

In [None]:
class Normalize(TabularTransform):
    "Normalize the continuous variables: subtract the mean and divide by the standard deviation computed in `apply_train`."

    def apply_train(self, df:DataFrame):
        "Compute and store the mean/std of each column in `self.cont_names`, then normalize those columns of `df` in place."
        self.means,self.stds = {},{}
        for n in self.cont_names:
            self.means[n],self.stds[n] = df[n].mean(),df[n].std()
            # Whole-column assignment replaces the column, so an integer column can become float
            # without the incompatible-dtype setitem that `df.loc[:, n] = ...` triggers on recent pandas.
            # The 1e-7 keeps the division finite when a column has zero standard deviation.
            df[n] = (df[n]-self.means[n]) / (1e-7 + self.stds[n])

    def apply_test(self, df:DataFrame):
        "Normalize the `self.cont_names` columns of `df` in place with the statistics stored by `apply_train`."
        for n in self.cont_names:
            df[n] = (df[n]-self.means[n]) / (1e-7 + self.stds[n])

In [None]:
# Get the location of the ADULT_SAMPLE dataset through `untar_data`, then load its `adult.csv` into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Column roles used in the cells below: `dep_var` is the label column, while `cat_names` and
# `cont_names` list the categorical and continuous input columns given to `TabularList.from_df`.
dep_var = '>=50k'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']

In [None]:
# Build the inputs from `df`, split them with `random_split_by_pct` (default arguments),
# then take the labels from the `dep_var` column.
# NOTE(review): no seed is set in the visible cells, so the split presumably differs between runs.
sl = (TabularList.from_df(df, path, cat_names=cat_names, cont_names=cont_names)
      .random_split_by_pct()
      .label_from_df(cols=dep_var))

In [None]:
# Transform classes (not instances) are passed here, in the order they should be applied.
# Presumably this call is forwarded to `TabularList.preprocess`, which instantiates each class - confirm.
slp = sl.preprocess(tfms=[FillMissing, Categorify, Normalize])

In [None]:
# Turn the preprocessed lists into the `data` object used for display and training below (default arguments).
data = slp.databunch()

In [None]:
# Show 10 rows of a batch; the table that follows is the recorded output of this cell.
data.show_batch(rows=10)

workclass,education,marital-status,occupation,relationship,race,education-num_na,age,fnlwgt,education-num
Private,Some-college,Married-civ-spouse,Sales,Husband,White,False,1.5656,-0.6946,-0.0292
Private,Some-college,Divorced,Adm-clerical,Not-in-family,White,False,-0.1175,-0.0963,-0.0292
?,HS-grad,Married-civ-spouse,?,Husband,White,False,1.6388,-1.1494,-0.4224
?,Assoc-voc,Married-civ-spouse,?,Husband,White,False,0.6143,-0.8632,0.3640
Self-emp-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,False,0.1752,-0.4677,1.1503
Private,Some-college,Married-civ-spouse,Exec-managerial,Husband,White,False,0.2484,-0.3702,-0.0292
Private,Bachelors,Married-civ-spouse,nan,Husband,White,True,0.6143,1.6660,-0.0292
Private,HS-grad,Married-civ-spouse,Craft-repair,Husband,White,False,0.5411,0.5652,-0.4224
Private,Some-college,Never-married,Adm-clerical,Not-in-family,White,False,-0.7029,-0.3670,-0.0292
Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,False,-0.7029,-0.0527,-0.4224


In [None]:
# Create a tabular learner on `data` with `layers=[200,100]` that reports accuracy, then fit it.
# `fit(1, 1e-2)` is presumably 1 epoch at learning rate 1e-2; the recorded output below shows a single epoch.
learn = get_tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)

Total time: 00:03
epoch  train_loss  valid_loss  accuracy
1      0.362071    0.362204    0.832310  (00:03)

