# Current Population Survey FastAI

We are going to use nearly all of the columns(numerical and categorical) for predicting 'ismarried' and 'faminc_50' using FastAI's tabular_learner

The results obtained using FastAI's learner are significantly better than the FFNN. We are able to achieve scores above .99 for predicting each category.

In [1]:
from fastai.tabular import * 
from fastai.tabular.transform import *
from fastai.tabular.data import *

In [2]:
path = '../datasets'

In [3]:
df = pd.read_csv("../datasets/CPS2016_UPDATE.csv")
df.head()

Unnamed: 0,age,weekly_hrs,educ,fam_income,num_in_house,num_child,sex,marital,race,region,state,citizen,worker_class,industry,occupation,ismarried,faminc_50
0,28,40,11,11,6,4,1,6,6,4,15,1,4,5,4,0,0
1,44,50,13,16,5,2,1,1,6,3,10,1,2,10,2,1,1
2,23,20,9,14,3,0,1,6,6,2,26,1,4,8,10,0,1
3,20,20,7,9,5,0,2,6,6,3,45,2,4,11,3,0,0
4,20,20,9,13,4,0,1,6,6,4,15,1,4,9,3,0,1


In [4]:
procs = [FillMissing, Categorify, Normalize]
valid_idx = range(len(df)-40000, len(df))

# Predict if an individual is married

In [5]:
dep_var = 'ismarried'
cat_names = ['age', 'weekly_hrs', 'educ','num_in_house', 'fam_income',
             'num_child', 'sex', 'race', 'state', 'industry', 'occupation']

In [6]:
data = TabularDataBunch.from_df(path, df, dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)
print(data.train_ds.cont_names)  # `cont_names` defaults to: set(df)-set(cat_names)-{dep_var}

['worker_class', 'citizen', 'marital', 'faminc_50', 'region']


In [7]:
(cat_x,cont_x),y = next(iter(data.train_dl))
for o in (cat_x, cont_x, y): print(to_np(o[:5]))

[[39 41 13  1 ...  6  5 10  2]
 [47 41 12  2 ...  2 44 12  3]
 [ 6 36 10  1 ...  2  1 11  3]
 [41 23 13  1 ...  1 31 11  3]
 [47 21 13  2 ...  1 27  8  1]]
[[ 1.53493  -0.564308  0.684005  0.833958  1.157395]
 [ 0.234825 -0.564308  0.252384  0.833958  0.185165]
 [ 0.234825 -0.564308  1.115625 -1.199022  0.185165]
 [ 1.53493  -0.564308  1.115625 -1.199022 -1.759295]
 [ 0.234825 -0.564308 -1.042476  0.833958  1.157395]]
[0 0 0 0 1]


In [None]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [None]:
learn.fit(5)

epoch,train_loss,valid_loss,accuracy,time
0,0.01807,0.002573,0.999525,01:03
1,0.009104,0.003516,0.998625,00:59
2,0.011314,0.003849,0.998675,00:59
3,0.008792,0.002742,0.999075,00:59
4,0.005759,0.001904,0.999425,00:59


In [None]:
learn.predict(df.iloc[0])

(Category 0, tensor(0), tensor([1.0000e+00, 3.5616e-06]))

# Predict if Family Income > 50,000

In [None]:
dep_var = 'faminc_50'
cat_names = ['age', 'weekly_hrs', 'educ','num_in_house', 
             'num_child', 'sex', 'race', 'state', 'marital',
             'occupation', 'industry']

In [None]:
data = TabularDataBunch.from_df(path, df, dep_var, valid_idx=valid_idx, procs=procs, cat_names=cat_names)
print(data.train_ds.cont_names)  # `cont_names` defaults to: set(df)-set(cat_names)-{dep_var}

['worker_class', 'ismarried', 'citizen', 'fam_income', 'region']


In [None]:
(cat_x,cont_x),y = next(iter(data.train_dl))
for o in (cat_x, cont_x, y): print(to_np(o[:5]))

[[42 41 10  1 ...  9  5  2 12]
 [14 41 14  2 ... 12  6  2 10]
 [ 3 21  6  3 ...  2  6  3 11]
 [60 31  9  2 ...  8  4  3 12]
 [ 6 26 10  4 ... 19  6  8 11]]
[[ 0.234825 -0.93144  -0.564308  0.133828  0.185165]
 [-2.365385 -0.93144  -0.564308  1.164909  1.157395]
 [-1.06528  -0.93144  -0.564308  0.133828  1.157395]
 [ 0.234825 -0.93144   0.790759 -0.381712  0.185165]
 [ 0.234825 -0.93144  -0.564308  0.907139  0.185165]]
[1 1 1 0 1]


In [None]:
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [None]:
learn.fit(5)

epoch,train_loss,valid_loss,accuracy,time
0,0.092278,0.02727,0.9999,01:01
1,0.062569,0.015363,0.9999,01:00
2,0.069672,0.024755,0.9957,00:58


In [None]:
learn.predict(df.iloc[0])