In [None]:
#export
from local.imports import *
from local.test import *
from local.core import *
from local.data.transform import *
from local.data.core import *
from local.data.external import *
from local.notebook.showdoc import show_doc

In [None]:
#default_exp tabular.core

# Tabular core

> Basic functions to preprocess tabular data before assembling it in a `DataBunch`.

In [None]:
path = untar_data(URLs.ADULT_SAMPLE)

In [None]:
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [None]:
#export
@docs
class TabularProc():
    "Base class to write a tabular processor for dataframes"
    # Per-method documentation lives in `_docs` instead of inline docstrings
    # (presumably picked up by the `@docs` decorator - confirm against its definition).
    _docs = {
        'setup': "Use `df_trn` to set its state and process `df_trn`",
        '__call__': "Process `df` with the state computed on `df_trn`",
    }

    def __init__(self, cat_names, cont_names):
        # Names of the categorical and continuous columns this processor works on
        self.cat_names = cat_names
        self.cont_names = cont_names

    # Both hooks are no-ops here; subclasses override them.
    def setup(self, df_trn):
        pass

    def __call__(self, df):
        pass

In [None]:
show_doc(TabularProc.setup)

<h4 id="<code>TabularProc.setup</code>" class="doc_header"><code>TabularProc.setup</code><a href="https://github.com/fastai/fastai_docs/tree/master/dev/__main__.py#L7" class="source_link" style="float:right">[source]</a></h4>

> <code>TabularProc.setup</code>(**`df_trn`**)

Use `df_trn` to set its state and process `df_trn`

In [None]:
show_doc(TabularProc.__call__)

<h4 id="<code>TabularProc.__call__</code>" class="doc_header"><code>TabularProc.__call__</code><a href="https://github.com/fastai/fastai_docs/tree/master/dev/__main__.py#L8" class="source_link" style="float:right">[source]</a></h4>

> <code>TabularProc.__call__</code>(**`df`**)

Process `df` with the state computed on `df_trn`

In [None]:
#export
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

In [None]:
#export
class Categorify(TabularProc):
    "Transform the categorical variables to that type."
    def setup(self, df_trn):
        "Convert each column in `cat_names` of `df_trn` to an ordered categorical and remember its categories"
        self.categories = {}
        for n in self.cat_names:
            # Build the ordered categorical once and read the categories from it directly,
            # instead of reading them back from the frame after assignment.
            col = df_trn[n].astype('category').cat.as_ordered()
            self.categories[n] = col.cat.categories
            # Whole-column assignment replaces the column together with its dtype.
            # `df.loc[:, n] = ...` may instead write the values into the existing column and
            # keep its previous dtype, which makes a later `.cat` access fail.
            df_trn[n] = col

    def __call__(self, df):
        "Convert each column in `cat_names` of `df` using the categories recorded in `setup`"
        for n in self.cat_names:
            # Values absent from the recorded categories become missing (NaN) in the categorical
            df[n] = pd.Categorical(df[n], categories=self.categories[n], ordered=True)

In [None]:
#export
class Normalize(TabularProc):
    "Normalize the continuous variables."
    def setup(self, df_trn):
        "Compute the mean and std of each column in `cont_names` on `df_trn`, then normalize `df_trn` in place"
        self.means,self.stds = {},{}
        for n in self.cont_names:
            # Validate the frame that was passed in (`df_trn`), not a global `df` that may
            # not exist or may be a different dataframe.
            assert is_numeric_dtype(df_trn[n]), (f"""Cannot normalize '{n}' column as it isn't numerical.
                Are you sure it doesn't belong in the categorical set of columns?""")
            self.means[n],self.stds[n] = df_trn[n].mean(),df_trn[n].std()
        self.__call__(df_trn)

    def __call__(self, df):
        "Normalize the columns in `cont_names` of `df` in place with the statistics stored by `setup`"
        # The 1e-7 term keeps the division finite when a column has zero standard deviation
        for n in self.cont_names: df[n] = (df[n]-self.means[n]) / (1e-7 + self.stds[n])

In [None]:
#export
# Create the `FillStrategy` class dynamically from the three strategy names.
# NOTE(review): presumably each name becomes a class attribute (the rest of this notebook
# reads `FillStrategy.median` and `FillStrategy.constant`) - confirm against `mk_class`.
mk_class('FillStrategy', 'median', 'constant', 'most_common')
FillStrategy.__doc__ = "Namespace containing the various filling strategies"

In [None]:
#export
# `FillStrategy` is created by a function call rather than a `class` statement.
# NOTE(review): presumably `_all_` makes the export tooling list it explicitly - confirm.
_all_ = ['FillStrategy']

In [None]:
show_doc(FillStrategy, title_level=3)

<h3 id="<code>class</code> <code>FillStrategy</code>" class="doc_header"><code>class</code> <code>FillStrategy</code><a href="" class="source_link" style="float:right">[source]</a></h3>

> <code>FillStrategy</code>(**\*`args`**, **\*\*`kwargs`**)

Namespace containing the various filling strategies

In [None]:
#export
class FillMissing(TabularProc):
    "Fill the missing values in continuous columns."
    def __init__(self, cat_names, cont_names, fill_strategy=FillStrategy.median, add_col=True, fill_val=0.):
        super().__init__(cat_names, cont_names)
        # fill_strategy: how the replacement value is chosen (median, constant or most common value)
        # add_col: whether to add a boolean `<name>_na` column flagging the rows that were filled
        # fill_val: replacement value used by the `constant` strategy
        self.fill_strategy,self.add_col,self.fill_val = fill_strategy,add_col,fill_val

    def setup(self, df_trn):
        "Compute a filler for each column of `cont_names` with missing values in `df_trn`, then fill `df_trn` in place"
        self.na_dict = {}
        for name in self.cont_names:
            # Only columns with at least one missing value in the training set get a filler
            if pd.isnull(df_trn[name]).sum():
                if self.fill_strategy == FillStrategy.median: filler = df_trn[name].median()
                elif self.fill_strategy == FillStrategy.constant: filler = self.fill_val
                else: filler = df_trn[name].dropna().value_counts().idxmax()
                self.na_dict[name] = filler
                if self.add_col and name+'_na' not in self.cat_names:
                    # Rebind to a new list instead of calling `append`, so the list object the
                    # caller passed to `__init__` is not modified behind their back. The updated
                    # names are available on `self.cat_names`.
                    self.cat_names = [*self.cat_names, name+'_na']
        self.__call__(df_trn)

    def __call__(self, df):
        "Fill `df` in place with the fillers computed in `setup`; raise if an unseen column has missing values"
        for name in self.cont_names:
            if name in self.na_dict:
                # Record which rows were missing before they are overwritten by the filler
                if self.add_col: df[name+'_na'] = pd.isnull(df[name])
                df[name] = df[name].fillna(self.na_dict[name])
            elif pd.isnull(df[name]).sum() != 0:
                # No filler was computed for this column because the training set had no missing values in it
                raise Exception(f"""There are nan values in field {name} but there were none in the training set. 
                Please fix those manually.""")

In [None]:
#export
class TabularPreprocessor():
    "An object that will preprocess dataframes using `procs`"
    def __init__(self, cat_names, cont_names, procs, inplace=True):
        # procs: proc classes (or factories) called as `p(cat_names, cont_names)` during `setup`
        # inplace: when False, `setup` and `__call__` work on a copy of the dataframe they receive
        self.cat_names,self.cont_names,self.procs,self.inplace = cat_names,cont_names,procs,inplace

    def setup(self, df_trn):
        "Instantiate each proc, set it up on `df_trn` in order, and return the processed training dataframe"
        df_trn = df_trn if self.inplace else df_trn.copy()
        fitted = []
        for p in self.procs:
            # After a first `setup`, `self.procs` holds proc instances rather than classes or
            # factories. Set those up again instead of calling them with
            # `(cat_names, cont_names)`, so that `setup` can be re-run (e.g. when a notebook
            # cell is executed twice).
            already_built = hasattr(p, 'setup') and not isinstance(p, type)
            p_ = p if already_built else p(self.cat_names, self.cont_names)
            p_.setup(df_trn)
            # A proc may change the column lists (e.g. by adding a column), so read them back
            self.cat_names,self.cont_names = p_.cat_names,p_.cont_names
            fitted.append(p_)
        # Commit only once every proc succeeded, so a failure leaves `self.procs` usable for a retry
        self.procs = fitted
        return df_trn

    def __call__(self, df):
        "Apply every proc set up in `setup` to `df`, in order, and return the processed dataframe"
        df = df if self.inplace else df.copy()
        for p in self.procs: p(df)
        return df

In [None]:
# Columns of the adult sample treated as categorical and as continuous in this demo
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
# Proc classes, not instances: `TabularPreprocessor.setup` instantiates and applies them in this order
procs = [FillMissing, Categorify, Normalize]
# inplace=False: `setup` and `__call__` work on copies and return them
proc = TabularPreprocessor(cat_names, cont_names, procs, inplace=False)

In [None]:
df = pd.read_csv(path/'adult.csv')
# Positional 80/20 split: the first 80% of the rows for training, the rest for validation
cut = int(0.8*len(df))
df_trn,df_val = df.iloc[:cut],df.iloc[cut:]

In [None]:
# Keep the processed copy under its own name: a later cell displays `df_trn1`, and
# rebinding `df_trn` would lose the raw training split when this cell is re-run.
df_trn1 = proc.setup(df_trn)

In [None]:
df_val1 = proc(df_val)

In [None]:
df_trn1.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,education-num_na
0,0.754358,Private,-0.840264,Assoc-acdm,0.754968,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k,False
1,0.388642,Private,0.444009,Masters,1.54086,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k,False
2,-0.050219,Private,-0.88896,HS-grad,-0.030925,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k,True
3,-0.050219,Self-emp-inc,-0.730951,Prof-school,1.933807,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k,False
4,0.242355,Self-emp-not-inc,-1.020663,7th-8th,-0.030925,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k,True
