In [None]:
from fastai import *          # Quick access to most common functionality
from fastai.tabular import *  # Quick access to tabular functionality     # Access to example data provided with fastai

# Tabular example

Tabular data should be in a Pandas `DataFrame`.

In [None]:
def remove_last_line(t:str):
    "Return `t` without its final line: everything before the last newline, or '' if `t` has no newline."
    # rpartition splits on the LAST '\n'; when there is none, the head part is the empty string.
    head, _sep, _last_line = t.rpartition('\n')
    return head

In [None]:
def codes(df, idx):
    "Return, for every column of `df`, the category code of the row at position `idx`, shifted up by one."
    shifted = []
    for name in df.columns:
        # A list indexer keeps a one-row Series, so the `.cat` accessor is still available.
        one_row = df[name].iloc[[idx]]
        # pandas codes a missing value as -1, so after the +1 shift it lands on 0.
        shifted.append(one_row.cat.codes.iloc[0] + 1)
    return shifted

In [None]:
def def_emb_sz(df, n, sz_dict):
    "Return `(n_cat, sz)` for categorical column `n` of `df`: its cardinality (+1 for NA) and its embedding size."
    cardinality = len(df[n].cat.categories) + 1  # extra cat for NA
    # An explicit entry in `sz_dict` wins; otherwise fall back to the rule of thumb, capped at 50.
    if n in sz_dict: width = sz_dict[n]
    else:            width = min(50, cardinality//2 + 1)
    return cardinality,width

In [None]:
class TabularLine(ItemBase):
    "One row of tabular data: its categorical values, their integer codes and its continuous values."
    def __init__(self, cats, codes, conts):
        self.cats = cats
        self.conts = conts
        # `data` pairs the categorical codes with the raw values of the continuous variables.
        self.data = [codes, conts.values]

    def __str__(self):
        "Text of the categorical values followed by the continuous ones, each with its last line dropped."
        # The dropped last line is presumably the pandas `Name: ..., dtype: ...` footer — TODO confirm.
        cat_txt = remove_last_line(str(self.cats))
        cont_txt = remove_last_line(str(self.conts))
        return f'{cat_txt}\n{cont_txt}'

    def show_batch(self, idxs:Collection[int], rows:int, ds:Dataset, figsize:Tuple[int,int]=(9,10))->None:
        "Display the first `rows` items of `ds` picked by `idxs` as an HTML table of `str(x)`/`str(y)` pairs."
        from IPython.display import clear_output, display, HTML
        header = [['text', 'label']]
        # Items are fetched lazily, one at a time, and stringified as they come.
        body = [[str(x), str(y)] for x,y in (ds[i] for i in idxs[:rows])]
        # NOTE(review): `_text2html_table` is not defined in this notebook; presumably it comes from
        # fastai — confirm it is actually in scope. `figsize` is not used here.
        display(HTML(_text2html_table(header + body, [90,10])))

class TabularList(ItemList):
    "`ItemList` whose data is a `DataFrame` kept in `xtra`; the items themselves are only row positions."
    def __init__(self, items:Iterator, cat_names:OptStrList=None, cont_names:OptStrList=None, create_func:Callable=None,
                 path:PathOrStr='.', xtra=None):
        # The dataframe travels in `xtra`; `items` only has to agree with it on length.
        # NOTE(review): the message mentions `from_csv`, which this class does not define itself.
        assert xtra is not None, "Use from_df or from_csv"
        assert len(xtra)==len(items), "Use from_df or from_csv"
        n_rows = len(items)
        super().__init__(range(n_rows), create_func=create_func, path=path, xtra=xtra)
        self.cat_names = cat_names
        self.cont_names = cont_names

    @classmethod
    def from_df(cls, df:DataFrame, path:PathOrStr='.', create_func:Callable=None, cat_names:OptStrList=None,
                cont_names:OptStrList=None)->'ItemList':
        "Build an instance of `cls` with one item per row of `df`, storing `df` itself as `xtra`."
        return cls(items=range(len(df)), cat_names=cat_names, cont_names=cont_names,
                   create_func=create_func, path=path, xtra=df)

    def new(self, items:Iterator, xtra:Any=None)->'TabularList':
        "Build a list of the same class around `items`/`xtra`, carrying over the names, `create_func` and `path`."
        carried = dict(cat_names=self.cat_names, cont_names=self.cont_names,
                       create_func=self.create_func, path=self.path)
        return self.__class__(items=items, xtra=xtra, **carried)

    def get(self, o):
        "Return the row at position `o` as a `TabularLine` (categorical values, their codes, continuous values)."
        cat_df = self.xtra[self.cat_names]
        return TabularLine(cat_df.iloc[o], codes(cat_df, o), self.xtra[self.cont_names].iloc[o])

    def get_emb_szs(self, sz_dict):
        "Return the `def_emb_sz` result for every categorical column, in `cat_names` order."
        szs = []
        for name in self.cat_names: szs.append(def_emb_sz(self.xtra, name, sz_dict))
        return szs

    def preprocess(self, tfms=None):
        "Apply `tfms` to `xtra` and store the transforms instantiated here in `preprocess_kwargs`."
        fitted = []
        for tfm in ifnone(tfms, []):
            if isinstance(tfm, TabularTransform):
                # Already an instance: only apply it, in test mode; it is not recorded again.
                tfm(self.xtra, test=True)
                continue
            # Otherwise `tfm` is expected to be callable with the current names to produce the transform.
            tfm = tfm(self.cat_names, self.cont_names)
            tfm(self.xtra)
            fitted.append(tfm)
            #cat and cont names may have been changed by transform (like Fill_NA)
            self.cat_names, self.cont_names = tfm.cat_names, tfm.cont_names
        self.preprocess_kwargs = {'tfms':fitted}

In [None]:
class Normalize(TabularTransform):
    "Standardize the continuous variables using the mean and std computed in `apply_train`."

    def apply_train(self, df:DataFrame):
        "Compute and store the mean/std of each column in `cont_names`, then standardize those columns of `df`."
        self.means, self.stds = {}, {}
        for name in self.cont_names:
            col = df.loc[:,name]
            mean, std = col.mean(), col.std()
            self.means[name], self.stds[name] = mean, std
            # The 1e-7 keeps the division defined when a column has zero std.
            df.loc[:,name] = (col - mean) / (1e-7 + std)

    def apply_test(self, df:DataFrame):
        "Standardize the columns in `cont_names` of `df` with the statistics stored by `apply_train`."
        for name in self.cont_names:
            col = df.loc[:,name]
            df.loc[:,name] = (col - self.means[name]) / (1e-7 + self.stds[name])


In [None]:
# Get the local path of the ADULT_SAMPLE dataset (presumably `untar_data` downloads/extracts it
# when needed — confirm against the fastai docs), then read `adult.csv` from it into a DataFrame.
path = untar_data(URLs.ADULT_SAMPLE)
df = pd.read_csv(path/'adult.csv')

In [None]:
# Column used for the labels, then the input columns split into categorical and continuous variables.
dep_var = '>=50k'
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']

In [None]:
# Build the item list from `df`, split it with `random_split_by_pct` (default arguments),
# and label the items from the `dep_var` column using `CategoryList`.
sl = (TabularList.from_df(df, path, cat_names=cat_names, cont_names=cont_names)
      .random_split_by_pct()
      .label_from_df(CategoryList, dep_var))

In [None]:
# The transforms are passed as classes, not instances: `TabularList.preprocess` instantiates each one
# with the current cat/cont names before applying it (assumes `sl.preprocess` forwards `tfms` to it — TODO confirm).
slp = sl.preprocess(tfms=[FillMissing, Categorify, Normalize])

In [None]:
# Turn the preprocessed lists into the data object used below (`data.train_dl`, `get_tabular_learner`).
data = slp.databunch()

In [None]:
# Pull a single (inputs, targets) batch from the training dataloader for inspection.
x,y = next(iter(data.train_dl))

In [None]:
# Display the batch inputs `x`.
x

[[tensor([5, 6, 2, 5, 5, 5, 5, 5, 8, 3, 3, 8, 5, 5, 1, 5, 5, 5, 5, 5, 8, 5, 5, 7,
          5, 8, 5, 8, 5, 2, 5, 8, 7, 5, 5, 5, 5, 7, 5, 5, 5, 5, 5, 5, 7, 7, 5, 5,
          5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 7, 5, 1, 5, 2], device='cuda:0'),
  tensor([12,  8, 12, 16, 16, 10, 16, 13, 11, 12,  1, 16,  5, 12,  2, 12,  9, 15,
          16, 13, 12, 12, 16, 12, 10, 12, 16, 13, 16, 10,  1, 10, 12, 10, 12, 11,
           7, 15, 12, 16, 10, 12,  9,  9,  6,  8, 10, 12, 10, 16, 10, 12, 10, 12,
           1, 16, 16, 12, 16, 15, 12,  8, 12, 16], device='cuda:0'),
  tensor([3, 1, 1, 5, 5, 1, 7, 7, 3, 3, 3, 5, 3, 4, 5, 5, 1, 3, 5, 5, 1, 5, 5, 3,
          5, 3, 5, 3, 3, 5, 5, 3, 3, 5, 3, 3, 3, 1, 3, 1, 5, 1, 5, 3, 3, 3, 5, 5,
          3, 5, 5, 7, 5, 1, 3, 1, 3, 7, 1, 3, 3, 4, 5, 1], device='cuda:0'),
  tensor([ 9,  5,  2,  4,  2,  5,  8, 13, 11, 11,  9,  9,  8,  9,  1,  4,  2,  5,
          15,  5,  5, 13,  2,  6, 14, 12,  9, 11,  4, 11,  4,  9,  0, 11,  8, 11,
           4, 11,  5,  2,  2,  2, 14,  

In [None]:
# Build a tabular learner tracking accuracy (`layers=[200,100]` is presumably the hidden layer sizes — confirm),
# then call `fit(1, 1e-2)` (presumably one epoch at learning rate 1e-2 — confirm).
# NOTE(review): the output recorded for this cell is
# "AttributeError: 'CategoryList' object has no attribute 'c'", so it does not currently run to completion.
learn = get_tabular_learner(data, layers=[200,100], metrics=accuracy)
learn.fit(1, 1e-2)

AttributeError: 'CategoryList' object has no attribute 'c'