# Current Population Survey FastAI

We use nearly all of the columns (both numerical and categorical) to predict `ismarried` and `faminc_50` with fastai's `tabular_learner`.

In [1]:
from fastai.tabular import * 
from fastai.tabular.transform import *
from fastai.tabular.data import *

In [2]:
# Directory that holds the dataset; it is also handed to
# TabularDataBunch.from_df as its `path` argument further down.
path = "../datasets"

In [3]:
# Load the CPS 2016 CSV from the shared data directory. The file name is built
# from `path` so the CSV location and the directory handed to
# TabularDataBunch.from_df stay in sync instead of hard-coding '../datasets' twice.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index -- consider `index_col=0` after confirming nothing relies on it.
df = pd.read_csv(f"{path}/CPS2016_UPDATE.csv")
df.head()

Unnamed: 0,age,weekly_hrs,educ,fam_income,num_in_house,num_child,sex,marital,race,region,state,citizen,worker_class,industry,occupation,ismarried,faminc_50
0,28,40,11,11,6,4,1,6,6,4,15,1,4,5,4,0,0
1,44,50,13,16,5,2,1,1,6,3,10,1,2,10,2,1,1
2,23,20,9,14,3,0,1,6,6,2,26,1,4,8,10,0,1
3,20,20,7,9,5,0,2,6,6,3,45,2,4,11,3,0,0
4,20,20,9,13,4,0,1,6,6,4,15,1,4,9,3,0,1


In [4]:
# Tabular preprocessing transforms handed to TabularDataBunch.from_df via `procs=`.
procs = [
    FillMissing,
    Categorify,
    Normalize,
]

# Hold out the final 40,000 rows of the frame as the validation set.
n_rows = len(df)
valid_idx = range(n_rows - 40000, n_rows)

# Predict if an individual is married

In [5]:
# Target and feature columns for the marriage model.
dep_var = "ismarried"

# Columns handled as categorical inputs.
cat_names = [
    "num_child",
    "sex",
    "race",
    "state",
    "industry",
    "occupation",
]

# Columns handled as continuous inputs.
cont_names = [
    "age",
    "weekly_hrs",
    "educ",
    "num_in_house",
    "fam_income",
]

In [6]:
# Assemble the DataBunch for the `ismarried` task from the full frame, with the
# rows in `valid_idx` passed as the validation indices.
data = TabularDataBunch.from_df(
    path,
    df,
    dep_var,
    valid_idx=valid_idx,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
)
# `cont_names` is passed explicitly here, so this simply echoes that list back;
# if it were omitted it would default to: set(df)-set(cat_names)-{dep_var}
print(data.train_ds.cont_names)

['age', 'weekly_hrs', 'educ', 'num_in_house', 'fam_income']


In [7]:
# Grab a single training batch and show the first five rows of each part:
# categorical inputs, continuous inputs, then the targets.
(cat_x, cont_x), y = next(iter(data.train_dl))
print(to_np(cat_x[:5]))
print(to_np(cont_x[:5]))
print(to_np(y[:5]))

[[ 1  1  4 29  5  4]
 [ 2  2  2  9  4  5]
 [ 1  2  2 31 11  3]
 [ 2  1  1  5  6 10]
 [ 1  1  2 10 13  3]]
[[-0.390002  1.7643    0.965331  1.009929  0.649369]
 [-1.133753  0.638793 -0.578774  1.589523 -0.123942]
 [-0.761878  0.638793 -0.578774 -1.308446 -2.443873]
 [-0.080106 -1.912355  0.965331 -0.149259  0.649369]
 [ 0.22979   0.638793 -0.192747 -1.308446 -0.639482]]
[0 0 0 1 0]


In [8]:
# Build the learner for the `ismarried` DataBunch; `layers` carries the two
# layer sizes and accuracy is the metric reported during training.
learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
)

In [9]:
# Train for 10 epochs.
# NOTE(review): in the recorded run valid_loss was lowest at epoch 2 (0.3778)
# and drifted up to 0.4031 by epoch 9 while train_loss kept falling, which
# looks like overfitting -- consider fewer epochs or a different schedule.
learn.fit(epochs=10)

epoch,train_loss,valid_loss,accuracy,time
0,0.37359,0.388197,0.83425,00:57
1,0.378469,0.390568,0.835325,00:55
2,0.367815,0.377841,0.83845,00:55
3,0.361387,0.384005,0.834975,00:55
4,0.356391,0.390451,0.833775,00:56
5,0.360532,0.388292,0.836825,00:56
6,0.352827,0.389239,0.834775,00:55
7,0.340881,0.385161,0.833925,00:56
8,0.342444,0.391996,0.833025,00:59
9,0.324568,0.403108,0.82875,00:57


In [10]:
# Run a single-row prediction on the first row of the frame. Row 0 is not in
# `valid_idx` (the last 40,000 rows), so this is a row the model trained on.
# NOTE(review): the recorded output predicts Category 1, while df.head() shows
# ismarried == 0 for this row -- worth a closer look.
sample_row = df.iloc[0]
learn.predict(sample_row)

(Category 1, tensor(1), tensor([0.1302, 0.8698]))

# Predict if Family Income > 50,000

In [11]:
# Target and feature columns for the family-income model. `fam_income` is left
# out of the inputs here, and `marital` joins the categorical columns.
dep_var = "faminc_50"

# Columns handled as categorical inputs.
cat_names = [
    "num_child",
    "sex",
    "race",
    "state",
    "industry",
    "occupation",
    "marital",
]

# Columns handled as continuous inputs.
cont_names = [
    "age",
    "weekly_hrs",
    "educ",
    "num_in_house",
]

In [12]:
# Rebuild the DataBunch for the `faminc_50` task with the same validation
# indices and preprocessing, but the income-specific column lists.
data = TabularDataBunch.from_df(
    path,
    df,
    dep_var,
    valid_idx=valid_idx,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
)
# `cont_names` is passed explicitly here, so this simply echoes that list back;
# if it were omitted it would default to: set(df)-set(cat_names)-{dep_var}
print(data.train_ds.cont_names)

['age', 'weekly_hrs', 'educ', 'num_in_house']


In [13]:
# Grab a single training batch from the income DataBunch and show the first
# five rows of each part: categorical inputs, continuous inputs, then targets.
(cat_x, cont_x), y = next(iter(data.train_dl))
print(to_np(cat_x[:5]))
print(to_np(cont_x[:5]))
print(to_np(y[:5]))

[[ 3  1  4 43  7  2  1]
 [ 1  2  2 11 10  9  6]
 [ 1  2  1  5 11  3  6]
 [ 2  1  6 34 13  7  1]
 [ 1  1  4  5  6 10  1]]
[[ 0.539686  0.638793 -0.192747  0.430335]
 [ 0.787603  0.638793 -0.192747 -0.149259]
 [-1.195732 -0.486713 -0.192747 -0.149259]
 [-0.266044  0.638793 -0.192747 -0.149259]
 [ 1.159478  0.638793 -0.192747 -0.728853]]
[0 1 1 1 0]


In [14]:
# Build a fresh learner for the `faminc_50` DataBunch; `layers` carries the two
# layer sizes and accuracy is the metric reported during training.
learn = tabular_learner(
    data,
    layers=[200, 100],
    metrics=accuracy,
)

In [15]:
# Train for 10 epochs.
# NOTE(review): in the recorded run valid_loss was already at its lowest after
# epoch 0 (0.4983) and rose to 0.5204 by epoch 9 while train_loss kept falling,
# so the extra epochs do not appear to help -- consider training for fewer.
learn.fit(epochs=10)

epoch,train_loss,valid_loss,accuracy,time
0,0.553457,0.498349,0.7581,00:56
1,0.542192,0.506058,0.7546,00:55
2,0.541313,0.506232,0.7557,00:56
3,0.53739,0.509122,0.755175,00:56
4,0.519991,0.506522,0.75575,00:57
5,0.52626,0.505602,0.756125,00:59
6,0.518558,0.513284,0.74845,00:58
7,0.515915,0.513022,0.752325,00:56
8,0.511044,0.51599,0.7524,00:57
9,0.504449,0.520386,0.74555,00:55


In [16]:
# Run a single-row prediction on the first row of the frame. Row 0 is not in
# `valid_idx` (the last 40,000 rows), so this is a row the model trained on.
# NOTE(review): the recorded output is a near coin-flip for Category 1
# (p = 0.5366), while df.head() shows faminc_50 == 0 for this row.
sample_row = df.iloc[0]
learn.predict(sample_row)

(Category 1, tensor(1), tensor([0.4634, 0.5366]))