In [1]:
import pandas as pd
from fastai.tabular import *
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Obtain the Adult sample dataset and read its CSV into a DataFrame.
# `path` is joined with '/' below, so it is presumably a pathlib.Path to the
# extracted dataset directory.
path = untar_data(URLs.ADULT_SAMPLE)
adult_csv = path/'adult.csv'
df = pd.read_csv(adult_csv)

In [3]:
# Peek at the first five raw rows to see column names and value formats.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k


In [4]:
# Column dtypes: the object (string) columns have to be encoded numerically
# before they can be fed to the classifier further down.
df.dtypes

age                 int64
workclass          object
fnlwgt              int64
education          object
education-num     float64
marital-status     object
occupation         object
relationship       object
race               object
sex                object
capital-gain        int64
capital-loss        int64
hours-per-week      int64
native-country     object
salary             object
dtype: object

In [5]:
# Count missing values per column. Nothing is filled here; this only shows
# which columns need imputation in the following cells.
df.isna().sum()

age                 0
workclass           0
fnlwgt              0
education           0
education-num     487
marital-status      0
occupation        512
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
native-country      0
salary              0
dtype: int64

In [6]:
# Record which rows lacked a value *before* imputing, as 0/1 indicator columns
# named 'missing_<column>', so that information survives the fill below.
for source_col in ('education-num', 'occupation'):
    df['missing_' + source_col] = df[source_col].isnull().map({True: 1, False: 0})
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,missing_education-num,missing_occupation
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,,Wife,White,Female,0,1902,40,United-States,>=50k,0,1
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k,0,0
2,38,Private,96185,HS-grad,,Divorced,,Unmarried,Black,Female,0,0,32,United-States,<50k,1,1
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k,0,0
4,42,Self-emp-not-inc,82297,7th-8th,,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k,1,0


In [7]:
# Replace missing education-num entries with the median of the observed values.
education_num = df['education-num']
val = education_num.median()
df['education-num'] = education_num.fillna(value=val)

In [8]:
# Give rows without an occupation their own explicit label instead of NaN,
# so "missing" becomes a regular category when the column is encoded.
val = 'no_occupation'
occupation = df['occupation']
df['occupation'] = occupation.fillna(value=val)

In [9]:
# Check the result of the imputation on the first five rows.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,missing_education-num,missing_occupation
0,49,Private,101320,Assoc-acdm,12.0,Married-civ-spouse,no_occupation,Wife,White,Female,0,1902,40,United-States,>=50k,0,1
1,44,Private,236746,Masters,14.0,Divorced,Exec-managerial,Not-in-family,White,Male,10520,0,45,United-States,>=50k,0,0
2,38,Private,96185,HS-grad,10.0,Divorced,no_occupation,Unmarried,Black,Female,0,0,32,United-States,<50k,1,1
3,38,Self-emp-inc,112847,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>=50k,0,0
4,42,Self-emp-not-inc,82297,7th-8th,10.0,Married-civ-spouse,Other-service,Wife,Black,Female,0,0,50,United-States,<50k,1,0


In [10]:
# Convert every string (object-dtype) column to pandas' categorical dtype.
# The column list is taken once up front, before any column is reassigned.
object_cols = df.select_dtypes(include=['object']).columns
for col in object_cols:
    df[col] = df[col].astype('category')

In [11]:
# Replace each categorical column by its integer category codes so the whole
# frame is numeric. This includes the 'salary' target, which is categorical too.
category_cols = df.select_dtypes(include=['category']).columns
for col in category_cols:
    df[col] = df[col].cat.codes

In [12]:
# Check that every column is numeric after encoding.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary,missing_education-num,missing_occupation
0,49,4,101320,7,12.0,2,15,5,4,0,0,1902,40,39,1,0,1
1,44,4,236746,12,14.0,0,4,1,4,1,10520,0,45,39,1,0,0
2,38,4,96185,11,10.0,0,15,4,2,0,0,0,32,39,0,1,1
3,38,5,112847,14,15.0,2,10,0,1,1,0,0,40,39,1,0,0
4,42,6,82297,5,10.0,2,8,5,2,0,0,0,50,39,0,1,0


In [13]:
def split_vals(a, n):
    """Split `a` at position `n` into two independent copies.

    Args:
        a: Any object that supports slicing and whose slices have a
            ``.copy()`` method (e.g. a DataFrame, Series, or list).
        n: Index of the first element that goes into the second part.

    Returns:
        A tuple ``(head, tail)`` where ``head`` is a copy of ``a[:n]`` and
        ``tail`` is a copy of ``a[n:]``.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

In [14]:
# Separate the encoded target ('salary') from the feature columns.
y = df['salary']
X = df.drop(columns=['salary'])

In [15]:
# Total number of rows, used to size the validation split below.
df.shape[0]

32561

In [16]:
# Hold out the last 20% of rows (in file order, no shuffling) for validation
# and train on everything before them.
n_rows = len(df)
n_valid = int(n_rows * 0.2)
n_trn = n_rows - n_valid

X_train, X_valid = split_vals(X, n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Sanity check: features and labels have matching row counts.
X_train.shape, y_train.shape, X_valid.shape

((26049, 16), (26049,), (6512, 16))

In [17]:
# Random forest of 20 trees; max_features=0.5 lets each split consider half of
# the features, and n_jobs=-1 uses all available cores.
# random_state fixes the forest's internal randomness so that a fresh
# Restart & Run All rebuilds the same model and validation score.
clf = RandomForestClassifier(n_jobs=-1, max_features=0.5, n_estimators=20, random_state=42)

In [18]:
# Train the forest on the training split. The return value of fit() is bound
# back to `clf`; the assignment also keeps the cell from echoing a result.
clf = clf.fit(X_train, y_train)

In [19]:
# Predict the encoded salary class for every held-out validation row.
predictions = clf.predict(X_valid)

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
# Fraction of validation rows whose predicted class matches the true label.
accuracy_score(y_true=y_valid, y_pred=predictions)

0.863482800982801