# Use fastai_v1 for (pre-)processing of tabular data

... or in other words, a fastai v1 equivalent of `proc_df()`, so that the processed data can be used with other libraries, e.g. scikit-learn models such as Random Forest.

In [1]:
from fastai.tabular import *  
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

## Get the data

In [2]:
# Fetch the Adult sample dataset through fastai's `untar_data`; the returned
# location is kept in `path` and displayed as the cell output.
path = untar_data(URLs.ADULT_SAMPLE)
path

PosixPath('/home/gautam/.fastai/data/adult_sample')

In [3]:
# Read the raw table from `adult.csv` in the dataset folder and preview the first rows.
df = pd.read_csv(path/'adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [4]:
# Column roles: the label to predict and the columns to treat as categorical.
dep_var = 'salary'
cat_names = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Preprocessing steps handed to fastai, in the order they are listed.
procs = [FillMissing, Categorify, Normalize]

# Indices of the final 2000 rows of `df`, passed to fastai as the validation rows.
valid_idx = range(len(df) - 2000, len(df))

# Build the DataBunch from the frame and show one sample batch of processed rows.
data = TabularDataBunch.from_df(
    path, df, dep_var,
    valid_idx=valid_idx,
    procs=procs,
    cat_names=cat_names,
)
data.show_batch()

workclass,education,marital-status,occupation,relationship,race,sex,native-country,education-num_na,capital-loss,education-num,fnlwgt,age,capital-gain,hours-per-week,target
Private,Some-college,Never-married,Handlers-cleaners,Not-in-family,White,Male,United-States,False,-0.2168,-0.0297,0.2848,-1.3638,-0.1459,-0.8437,<50k
Private,11th,Never-married,Other-service,Own-child,White,Female,United-States,False,-0.2168,-1.2052,0.4809,-1.5102,-0.1459,-1.6516,<50k
Private,HS-grad,Married-civ-spouse,Sales,Husband,White,Male,United-States,False,-0.2168,-0.4216,0.0778,-0.9244,-0.1459,1.984,<50k
Private,Doctorate,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,False,-0.2168,2.3212,0.5144,1.7118,1.872,0.7721,>=50k
Private,Bachelors,Divorced,Prof-specialty,Unmarried,White,Female,Germany,False,-0.2168,1.1457,-0.5488,-0.4118,-0.0232,-0.0358,<50k


In [5]:
# No `cont_names` was passed to `from_df`, so fastai falls back to its default:
# set(df)-set(cat_names)-{dep_var}. Print the resulting list of continuous columns.
print(data.train_ds.cont_names)

['capital-loss', 'education-num', 'fnlwgt', 'age', 'capital-gain', 'hours-per-week']


In [6]:
# Shapes of the processed training arrays: categorical codes, continuous values, labels.
# `codes` has one more column than `cat_names` lists; the show_batch output above
# contains an extra `education-num_na` column, which presumably accounts for it
# (looks like it is added by FillMissing — TODO confirm).
data.train_ds.x.codes.shape, data.train_ds.x.conts.shape, data.train_ds.y.items.shape

((30561, 9), (30561, 6), (30561,))

In [7]:
# Assemble a plain NumPy feature matrix for scikit-learn: the categorical code
# columns first, then the continuous columns, placed side by side.
# NOTE(review): only `data.train_ds` is exported here; the rows selected by
# `valid_idx` are not part of X / y.
X = np.hstack((data.train_ds.x.codes, data.train_ds.x.conts))
# Labels for the same rows.
y = data.train_ds.y.items
X.shape, y.shape

((30561, 15), (30561,))

## Split into training and validation sets

In [8]:
# Hold out 25% of the rows for validation (the fraction scikit-learn applies when
# `test_size` is left as None); the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, random_state=42
)
# Show both shapes plus the combined row count as a sanity check against X.
X_train.shape, X_valid.shape, len(X_train) + len(X_valid)

((22920, 15), (7641, 15), 30561)

## Train RandomForest model

In [9]:
# Fit a small random forest (10 trees, using all CPU cores).
# `random_state` is fixed so that Restart & Run All reproduces the same model and
# scores; without it each run draws different bootstrap samples and feature
# subsets, so the accuracies shown below would change from run to run.
m = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42)
m.fit(X_train, y_train)
# Mean accuracy on the training rows vs. the held-out validation rows.
m.score(X_train, y_train), m.score(X_valid, y_valid)

(0.9872600349040139, 0.8564324041355843)