In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/census-income-dataset/adult.data
/kaggle/input/census-income-dataset/adult.test


In [2]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [3]:
from fastbook import *
from fastai.tabular.all import *

# Folder that holds the census-income files (adult.data and adult.test).
path = Path('/kaggle/input') / 'census-income-dataset'
path

Path('/kaggle/input/census-income-dataset')

In [4]:
# List the entries of the dataset folder.
# NOTE(review): `ls` is not a standard pathlib method; presumably it is added to Path by the fastai imports above.
path.ls()

(#2) [Path('/kaggle/input/census-income-dataset/adult.data'),Path('/kaggle/input/census-income-dataset/adult.test')]

Let's use pandas to look at the dataset before we use the fastai library.

According to Barry Becker, the columns are in the following order:
* **age**: continuous.
* **workclass**: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
* **fnlwgt**: continuous.
* **education**: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* **education-num**: continuous.
* **marital-status**: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* **occupation**: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* **relationship**: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
* **race**: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black.
* **sex**: Female, Male.
* **capital-gain**: continuous.
* **capital-loss**: continuous.
* **hours-per-week**: continuous.
* **native-country**: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

http://archive.ics.uci.edu/ml/datasets/Census+Income

In [5]:
# The data files have no header row, so the column names are supplied explicitly.
# NOTE(review): 'fnlwggt' is spelled 'fnlwgt' in the dataset description; the
# spelling is kept here because later cells refer to the column by this name.
column_names = [
    'age', 'workclass', 'fnlwggt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'salary',
]

# Both files are read with exactly the same options.
csv_options = dict(low_memory=False, header=None, names=column_names)
df_train = pd.read_csv(path/'adult.data', **csv_options)
df_test = pd.read_csv(path/'adult.test', **csv_options)

In [6]:
# The dataset has no header row, so the column names were set when loading it with pandas.
# Check that the DataFrame carries the expected column names.
df_train.columns

Index(['age', 'workclass', 'fnlwggt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [7]:
# Look at the first rows of the training data.
df_train.head()

Unnamed: 0,age,workclass,fnlwggt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
# Look at the last rows of the training data.
df_train.tail()

Unnamed: 0,age,workclass,fnlwggt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [9]:
# Check the distinct values of the second column, 'workclass'.
# NOTE(review): the output below includes '?', which is not one of the documented
# workclass values; it presumably marks unknown entries.
df_train['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov', 'Local-gov', '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'], dtype=object)

# **We will use the fastai library to prepare the DataLoaders.**

`Categorify` is a TabularProc that replaces a column with a numeric categorical column. `FillMissing` is a TabularProc that replaces missing values with the median of the column, and creates a new Boolean column that is set to True for any row where the value was missing. These two transforms are needed for nearly every tabular dataset you will use, so this is a good starting point for your data processing. The `procs` list below also includes `Normalize`, which rescales each continuous column by subtracting its mean and dividing by its standard deviation:


In [10]:
# Preprocessing transforms handed to the DataLoaders through `procs`.
procs = [Categorify, FillMissing, Normalize]

# Name of the target column.
label_column = 'salary'

# Columns passed as categorical inputs (cat_names).
category_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Columns passed as continuous inputs (cont_names).
continuous_columns = [
    'age', 'fnlwggt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

In [11]:
# Distinct values of the target column ('salary').
df_train[label_column].unique()

array(['<=50K', '>50K'], dtype=object)

In [12]:
# Build the DataLoaders from the in-memory DataFrame rather than from a CSV file.
# Reference: https://docs.fast.ai/tutorial.tabular.html

# No valid_idx is passed, so the train/validation split is drawn at random.
# Seed the random number generators first so that the split, and therefore the
# metrics reported further down, are the same on every Restart & Run All.
set_seed(42)

# The dataset folder sits under the read-only /kaggle/input tree, so the current
# working directory is used as the DataLoaders' path instead. Learners built on
# these DataLoaders use that path by default when they write files.
dls = TabularDataLoaders.from_df(
    df_train,
    path='.',
    y_names=label_column,
    cat_names=category_columns,
    cont_names=continuous_columns,
    procs=procs,
    bs=64,
)

In [13]:
# show_batch works the same way as in every other fastai application:
# it displays a sample of rows from one batch.
dls.show_batch()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwggt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,Self-emp-inc,Some-college,Married-civ-spouse,Sales,Husband,White,Male,United-States,84.0,172906.999416,10.0,-4.5e-05,3e-06,35.0,>50K
1,Private,Bachelors,Divorced,Exec-managerial,Not-in-family,White,Male,United-States,39.0,114543.996898,13.0,-4.5e-05,3e-06,45.0,>50K
2,Private,Bachelors,Married-spouse-absent,Adm-clerical,Not-in-family,Amer-Indian-Eskimo,Male,Philippines,49.0,190319.000008,13.0,-4.5e-05,3e-06,40.0,<=50K
3,Private,HS-grad,Never-married,Handlers-cleaners,Not-in-family,White,Male,United-States,48.0,39986.001304,9.0,-4.5e-05,3e-06,40.0,<=50K
4,State-gov,Bachelors,Never-married,Prof-specialty,Unmarried,White,Female,United-States,40.0,119577.997766,13.0,-4.5e-05,3e-06,20.0,<=50K
5,Private,HS-grad,Never-married,Other-service,Own-child,White,Female,United-States,35.0,255702.001646,9.0,-4.5e-05,3e-06,27.0,<=50K
6,State-gov,Prof-school,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,43.0,345968.993915,15.0,-4.5e-05,3e-06,50.0,>50K
7,Private,Some-college,Never-married,Sales,Not-in-family,White,Male,United-States,20.0,32426.003365,10.0,-4.5e-05,3e-06,25.0,<=50K
8,Private,HS-grad,Married-civ-spouse,Transport-moving,Husband,White,Male,United-States,60.999999,198078.000321,9.0,-4.5e-05,3e-06,40.0,>50K
9,Private,Some-college,Married-civ-spouse,Sales,Husband,White,Male,United-States,56.0,257554.999904,10.0,-4.5e-05,3e-06,40.0,<=50K


In [14]:
# Define a model with tabular_learner, tracking accuracy as the metric.
# When the model is defined, fastai will try to infer the loss function from the
# y_names given to the DataLoaders earlier.
learn=tabular_learner(dls, metrics=accuracy)

In [15]:
# Train the model for a single epoch with the one-cycle schedule. fine_tune would
# not be useful here because there is no pretrained model to start from.
learn.fit_one_cycle(n_epoch=1)

epoch,train_loss,valid_loss,accuracy,time
0,0.327059,0.326583,0.847512,00:07


In [16]:
# Look at some rows together with the model's prediction (the salary_pred column in the output).
learn.show_results()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwggt,education-num,capital-gain,capital-loss,hours-per-week,salary,salary_pred
0,5.0,16.0,3.0,8.0,1.0,5.0,2.0,40.0,-0.26398,0.259236,-0.02957,-0.146174,-0.218637,0.773539,0.0,0.0
1,5.0,15.0,3.0,11.0,1.0,5.0,2.0,40.0,-0.1175,-0.646348,1.914559,1.848703,-0.218637,0.368706,1.0,1.0
2,5.0,11.0,3.0,14.0,6.0,5.0,1.0,1.0,-0.26398,1.4521,2.303384,-0.146174,-0.218637,-1.655458,0.0,1.0
3,5.0,13.0,5.0,11.0,2.0,3.0,2.0,40.0,-0.630178,1.008156,1.525733,-0.146174,-0.218637,-0.845793,0.0,0.0
4,8.0,12.0,3.0,15.0,1.0,5.0,2.0,40.0,-1.069617,0.074408,-0.418395,-0.146174,-0.218637,0.692572,1.0,0.0
5,3.0,13.0,3.0,11.0,1.0,5.0,2.0,40.0,0.614897,-0.596857,1.525733,-0.146174,-0.218637,-0.036127,1.0,1.0
6,1.0,10.0,5.0,1.0,2.0,5.0,2.0,40.0,1.640253,-1.389546,1.136907,-0.146174,-0.218637,-2.708024,1.0,0.0
7,5.0,16.0,3.0,13.0,1.0,5.0,2.0,40.0,0.321938,0.086078,-0.02957,-0.146174,-0.218637,-0.19806,1.0,0.0
8,5.0,16.0,5.0,9.0,2.0,5.0,1.0,40.0,0.028979,0.48212,-0.02957,-0.146174,-0.218637,-1.250625,0.0,0.0


In [17]:
# Or use the predict method on a single row (here the first row of the test DataFrame).
# It returns three values, unpacked as the row, the predicted class and the class probabilities.
row, clas, probs=learn.predict(df_test.iloc[0])

In [18]:
# Display the row returned by learn.predict.
# NOTE(review): the salary shown here is presumably the model's decoded prediction, not the true label — confirm.
row.show()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwggt,education-num,capital-gain,capital-loss,hours-per-week,salary
0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,United-States,25.0,226802.001355,7.0,-4.5e-05,3e-06,40.0,<=50K


In [19]:
# Predicted class index and the probabilities of the two salary classes.
clas, probs

(tensor(0), tensor([9.9923e-01, 7.7435e-04]))

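After training for only one epoch, we got about 85% accuracy. The model predicted class 0 for `df_test.iloc[0]`, meaning salary <=50K. Note that the `salary` value displayed by `row.show()` above is the model's decoded prediction rather than the true label, so to check whether the prediction is correct, compare it with `df_test.iloc[0]['salary']`.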

In [20]:
# Train a second, freshly created learner for longer: 10 epochs under a single one-cycle schedule.
learn_2 = tabular_learner(dls, metrics=accuracy)
learn_2.fit_one_cycle(n_epoch=10)

epoch,train_loss,valid_loss,accuracy,time
0,0.345868,0.336266,0.845362,00:06
1,0.335995,0.332475,0.846437,00:06
2,0.325225,0.326833,0.848741,00:06
3,0.322463,0.331097,0.846591,00:06
4,0.301132,0.316258,0.852733,00:07
5,0.309468,0.319684,0.85258,00:06
6,0.300698,0.323759,0.849202,00:06
7,0.291629,0.321811,0.852119,00:06
8,0.295297,0.320659,0.850584,00:06
9,0.28826,0.325345,0.849662,00:07


In [21]:
# Look at some rows together with the second model's prediction (the salary_pred column in the output).
learn_2.show_results()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,fnlwggt,education-num,capital-gain,capital-loss,hours-per-week,salary,salary_pred
0,5.0,12.0,5.0,8.0,4.0,5.0,2.0,40.0,-1.435815,1.762656,-0.418395,-0.146174,-0.218637,0.125806,0.0,0.0
1,7.0,12.0,3.0,4.0,1.0,5.0,2.0,40.0,1.859973,0.227752,-0.418395,-0.146174,-0.218637,0.773539,0.0,0.0
2,5.0,16.0,5.0,13.0,4.0,5.0,1.0,40.0,-1.435815,-0.62557,-0.02957,-0.146174,-0.218637,-1.655458,0.0,0.0
3,5.0,13.0,3.0,11.0,1.0,5.0,2.0,40.0,-0.556939,2.67902,1.525733,0.541358,-0.218637,-0.036127,1.0,1.0
4,3.0,9.0,1.0,5.0,2.0,5.0,1.0,40.0,0.321938,0.321532,0.359256,-0.146174,-0.218637,-0.036127,0.0,0.0
5,5.0,16.0,5.0,8.0,4.0,5.0,1.0,40.0,-0.776658,1.020261,-0.02957,-0.146174,-0.218637,-0.036127,0.0,0.0
6,1.0,15.0,5.0,1.0,2.0,5.0,2.0,40.0,1.933212,0.327311,1.914559,3.189775,-0.218637,3.202535,1.0,1.0
7,5.0,10.0,5.0,2.0,4.0,5.0,2.0,40.0,-1.069617,0.366815,1.136907,-0.146174,-0.218637,-0.19806,0.0,0.0
8,8.0,16.0,5.0,2.0,4.0,5.0,2.0,40.0,-1.435815,-0.732307,-0.02957,-0.146174,-0.218637,-2.465124,0.0,0.0


In [22]:
# Print a layer-by-layer summary of the network: output shapes, parameter counts and whether each layer is trainable.

learn_2.summary()

TabularModel (Input shape: ['64 x 8', '64 x 6'])
Layer (type)         Output Shape         Param #    Trainable 
                     64 x 6              
Embedding                                 60         True      
____________________________________________________________________________
                     64 x 8              
Embedding                                 136        True      
____________________________________________________________________________
                     64 x 5              
Embedding                                 40         True      
____________________________________________________________________________
                     64 x 8              
Embedding                                 128        True      
____________________________________________________________________________
                     64 x 5              
Embedding                                 35         True      
_________________________________________________