In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [2]:
# Column labels applied to both CSV files, in file order (15 entries).
names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'salary',
]

In [3]:
# Neither file carries a header row, so the column labels come from `names`.
# The test file is read with its first line skipped (skiprows=1).
adult_train_df = pd.read_csv('../data/adult.data.txt', header=None, names=names)
adult_test_df = pd.read_csv(
    '../data/adult.test.txt',
    header=None,
    skiprows=1,
    names=names,
)

In [4]:
# (rows, columns) of each split, training split first.
tuple(frame.shape for frame in (adult_train_df, adult_test_df))

((32561, 15), (16281, 15))

In [5]:
# Four randomly chosen training rows; the fixed seed keeps the pick repeatable.
adult_train_df.sample(n=4, random_state=42)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
14160,27,Private,160178,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,38,United-States,<=50K
27048,45,State-gov,50567,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
28868,29,Private,185908,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,55,United-States,>50K
5667,30,Private,190040,Bachelors,13,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K


In [5]:
# Four randomly chosen test rows, drawn with the same fixed seed as the training sample.
adult_test_df.sample(n=4, random_state=42)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
13633,29,Private,189346,HS-grad,9,Never-married,Transport-moving,Unmarried,White,Male,0,0,40,United-States,<=50K.
1921,31,Private,137076,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,<=50K.
12140,52,Federal-gov,35546,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,<=50K.
9933,54,Local-gov,116428,10th,6,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,<=50K.


## Find and drop the NaN/missing values

In [6]:
# First five rows of the test split (5 is the pandas default, written out explicitly).
adult_test_df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [7]:
# Per-column count of NaN entries in the training split.
adult_train_df.isnull().sum(axis=0)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [8]:
# Per-column count of NaN entries in the test split.
adult_test_df.isnull().sum(axis=0)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

In [9]:
# For every string (object-dtype) column of the training split, print how many
# values contain the ' ?' marker (a space followed by a question mark).
# The pattern is a raw string: in a normal string literal `\?` is an invalid
# escape sequence that Python warns about, while r' \?' hands the regex engine
# the same escaped, literal question mark without the warning.
for col in adult_train_df.columns:
    if adult_train_df[col].dtype == object:
        print(col, adult_train_df[col].str.contains(r' \?').sum())

workclass 1836
education 0
marital-status 0
occupation 1843
relationship 0
race 0
sex 0
native-country 583
class 0


In [10]:
# Columns for which the ' ?' count above was non-zero.
columns_with_na_vals = [
    'workclass',
    'occupation',
    'native-country',
]

In [11]:
# Turn the ' ?' placeholder into a real NaN so that pandas' missing-value tools
# (isnull / dropna) can see it.
# This is applied to BOTH splits: the test split's head() output also shows '?'
# entries, and if they are left as plain strings a later dropna() on the test
# frame removes nothing and '?' ends up as an ordinary category value.
# NOTE(review): assumes the test file uses the same ' ?' spelling (with the
# leading space) as the training file -- confirm against the raw data.
for frame in (adult_train_df, adult_test_df):
    for col in columns_with_na_vals:
        frame[col] = frame[col].replace(to_replace=' ?', value=np.nan)

In [12]:
# Re-run the ' ?' count on the training split's string (object-dtype) columns to
# confirm the marker is gone after the replacement.
# Raw string on purpose: `\?` inside a normal string literal is an invalid
# escape sequence that Python warns about; r' \?' is the same regex, warning-free.
for col in adult_train_df.columns:
    if adult_train_df[col].dtype == object:
        print(col, adult_train_df[col].str.contains(r' \?').sum())

workclass 0
education 0
marital-status 0
occupation 0
relationship 0
race 0
sex 0
native-country 0
class 0


In [14]:
# Remove every row that holds at least one NaN from both splits
# (axis=0 / how='any' are the pandas defaults, written out explicitly),
# then display the training split's per-column null counts as a check.
adult_train_df, adult_test_df = (
    frame.dropna(axis=0, how='any')
    for frame in (adult_train_df, adult_test_df)
)
adult_train_df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
class             0
dtype: int64

## Check the data types

In [13]:
# Side-by-side table of column dtypes: one row per column, one column per split.
pd.concat(
    [adult_train_df.dtypes, adult_test_df.dtypes],
    axis=1,
    keys=['train_set', 'test_set'],
)

Unnamed: 0,train_set,test_set
age,int64,int64
workclass,object,object
fnlwgt,int64,int64
education,object,object
education-num,int64,int64
marital-status,object,object
occupation,object,object
relationship,object,object
race,object,object
sex,object,object


In [14]:
# Convert every string (object-dtype) column to pandas' 'category' dtype.
# Each split is converted independently, column by column, in place.
for frame in (adult_train_df, adult_test_df):
    object_cols = [col for col in frame.columns if frame[col].dtype == object]
    for col in object_cols:
        frame[col] = frame[col].astype('category')

In [15]:
# Same dtype comparison as before, now showing the result of the category conversion.
pd.concat(
    [adult_train_df.dtypes, adult_test_df.dtypes],
    axis=1,
    keys=['train_set', 'test_set'],
)

Unnamed: 0,train_set,test_set
age,int64,int64
workclass,category,category
fnlwgt,int64,int64
education,category,category
education-num,int64,int64
marital-status,category,category
occupation,category,category
relationship,category,category
race,category,category
sex,category,category


In [16]:
# Persist the cleaned training split as a pickle file.
adult_train_df.to_pickle(path='../data/adult_train.p')

In [18]:
# Persist the cleaned test split as a pickle file.
adult_test_df.to_pickle(path='../data/adult_test.p')