In [1]:
%matplotlib inline

import matplotlib as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Column labels, in file order, passed to read_csv via `names=` because the
# CSV files are read with header=None.
names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'class',
]

In [3]:
# Load the train and test splits. Neither file is read with a header row,
# so the column labels are supplied explicitly through `names`.
adult_train_df = pd.read_csv(
    filepath_or_buffer='../data/adult.data.csv',
    names=names,
    header=None,
)
adult_test_df = pd.read_csv(
    filepath_or_buffer='../data/adult.test.csv',
    names=names,
    header=None,
)

In [4]:
# (rows, columns) of the train split followed by the test split.
tuple(frame.shape for frame in (adult_train_df, adult_test_df))

((32561, 15), (16281, 15))

In [5]:
# Preview the first five rows of the training split.
adult_train_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
# Preview the first five rows of the test split.
adult_test_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


## Fill the nan/missing values

In [7]:
# Per-column count of NaN entries in the training split.
adult_train_df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [8]:
# Per-column count of NaN entries in the test split (the previous cell
# already covers the training split, so checking it again adds nothing).
adult_test_df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

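Missing values in this data are encoded as the string " ?" (with a leading space) rather than as NaN, which is why `isnull()` reports zero above. Let's count them and convert them to NaNs.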

In [9]:
# Count, for every string (object-dtype) column of the training split, how
# many values contain the ' ?' placeholder.
for col in adult_train_df.columns:
    if adult_train_df[col].dtype == object:
        # regex=False searches for the literal text ' ?'. The previous
        # pattern ' \?' relied on an invalid escape sequence in a non-raw
        # string, which Python flags with a warning.
        print(col, adult_train_df[col].str.contains(' ?', regex=False).sum())

workclass 1836
education 0
marital-status 0
occupation 1843
relationship 0
race 0
sex 0
native-country 583
class 0


Option 1 (not used, kept for reference): replace " ?" only in the columns known to contain it:

In [10]:
#columns_with_na_vals = ['workclass', 'occupation', 'native-country']

In [11]:
#for col in columns_with_na_vals:
#    adult_train_df[col] = adult_train_df[col].replace(' ?', np.nan)

Option 2 (used): replace " ?" with NaN across the whole DataFrame, for both the test and the train set:

In [12]:
# Turn every cell that is exactly ' ?' into NaN in the test split.
adult_test_df = adult_test_df.replace(to_replace=' ?', value=np.nan)

In [13]:
# Turn every cell that is exactly ' ?' into NaN in the training split.
adult_train_df = adult_train_df.replace(to_replace=' ?', value=np.nan)

Check that all were replaced:

In [14]:
# Re-count the ' ?' placeholder in every string (object-dtype) column of the
# training split; each count should now be 0.
for col in adult_train_df.columns:
    if adult_train_df[col].dtype == object:
        # regex=False searches for the literal text ' ?'. The previous
        # pattern ' \?' relied on an invalid escape sequence in a non-raw
        # string, which Python flags with a warning.
        print(col, adult_train_df[col].str.contains(' ?', regex=False).sum())

workclass 0
education 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
native-country 0
class 0


## Check the data types

In [15]:
# Side-by-side view of each column's dtype in the two splits: one row per
# column, one column per split.
pd.DataFrame(
    [adult_train_df.dtypes, adult_test_df.dtypes],
    index=['train set', 'test set'],
).transpose()

Unnamed: 0,train set,test set
age,int64,int64
workclass,object,object
fnlwgt,int64,int64
education,object,object
education-num,int64,int64
marital-status,object,object
occupation,object,object
relationship,object,object
race,object,object
sex,object,object


In [17]:
# Persist the training DataFrame to disk as a pickle file.
adult_train_df.to_pickle(path='../data/adult_train.p')