In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd

In [3]:
from folktables import ACSDataSource, ACSIncome
import folktables
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the 2014 1-Year ACS person survey. download=True lets folktables fetch
# the raw files when they are not already on disk, so this cell also works on
# a fresh checkout (previously the comment said "download" but the flag was off).
data_source = ACSDataSource(survey_year=2014, horizon='1-Year', survey='person')
acs_data14 = data_source.get_data(download=True)

In [4]:
# Cutoff used to binarise the PINCP target: the positive class is PINCP > income_thresh.
income_thresh = 30000

In [5]:
# Income-prediction task: six person-level features, a binary PINCP target and
# SEX as the group attribute, restricted by folktables.adult_filter.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'WKHP',
        'SEX',
    ],
    target='PINCP',
    # Positive class is PINCP strictly above income_thresh; the global is looked
    # up each time the transform runs, so keep it unchanged after this cell.
    target_transform=lambda x: x > income_thresh,
    group='SEX',
    preprocess=folktables.adult_filter,
    # `nan` has to be passed by keyword: the second positional parameter of
    # np.nan_to_num is `copy`, so nan_to_num(x, -1) silently filled NaNs with 0.0
    # instead of the intended -1 sentinel.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

In [6]:
# Materialise the task as pandas objects; the third return value is not used here.
features14, labels14, _ = ACSIncomeNew.df_to_pandas(acs_data14)

In [8]:
# Number of positive labels per column. DataFrame.sum() reduces column-wise by
# default; np.sum(df) forwards axis=None, which pandas is deprecating for frames.
labels14.sum()

PINCP    854636
dtype: int64

In [9]:
def preprocessAdultDataFeatures(features):
    """Scale and encode the raw ACS feature frame into a fixed numeric layout.

    The input frame is copied first, so the caller's data is never modified.

    Processing steps:
      * ``AGEP``, ``WKHP`` and ``SCHL`` are divided by the fixed maxima
        99, 99 and 24 respectively.
      * ``SEX`` is shifted down by one (codes 1/2 become 0/1).
      * Each of the above is applied only while the column's maximum is
        greater than 1, so data that is already scaled is left alone.
      * ``MAR`` and ``COW`` are cast to ``int`` and collapsed into 0/1
        indicator columns by code membership.

    Parameters
    ----------
    features : pandas.DataFrame
        Must contain the columns ``AGEP``, ``COW``, ``SCHL``, ``MAR``,
        ``WKHP`` and ``SEX``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index. ``COW`` and ``MAR`` are removed and
        the integer indicators ``MAR_married``, ``MAR_divorced``,
        ``COW_private``, ``COW_public`` and ``COW_self`` are appended in that
        order after the remaining columns.

    Notes
    -----
    The indicator columns are always produced, even when some ``MAR``/``COW``
    codes do not occur in ``features`` (building them from one-hot dummies
    raised ``KeyError`` in that case). Codes outside the listed groups
    (``MAR`` 5, ``COW`` 8, or anything unexpected) map to all-zero indicators
    and act as the reference category.
    """
    # Never touch the caller's frame: all writes below go to this copy.
    features = features.copy()

    # Fixed divisors for the numeric columns (age, weekly work hours, education).
    column_maxima = {'AGEP': 99, 'WKHP': 99, 'SCHL': 24}
    for column, max_value in column_maxima.items():
        # The "> 1" guard skips columns that were already rescaled.
        if np.max(features[column]) > 1:
            features[column] = features[column] / max_value

    # Sex preprocessing: shift the codes down by one.
    # NOTE(review): a frame whose SEX values are all 1 cannot be told apart from
    # already-shifted data by this guard and is therefore left unchanged.
    if np.max(features['SEX']) > 1:
        features['SEX'] = features['SEX'] - 1

    # Categorical preprocessing: integer codes for the two categorical columns.
    marital_code = features['MAR'].astype(int)
    worker_class_code = features['COW'].astype(int)

    encoded = features.drop(columns=['COW', 'MAR'])

    # Group MAR codes: {1, 2} -> married, {3, 4} -> divorced, 5 -> reference.
    # NOTE(review): the ACS data dictionary appears to list MAR=2 as "widowed";
    # the original grouping is kept as is - confirm it is intended.
    encoded['MAR_married'] = marital_code.isin([1, 2]).astype(int)
    encoded['MAR_divorced'] = marital_code.isin([3, 4]).astype(int)

    # Group COW codes: {1, 2} -> private, {3, 4, 5} -> public, {6, 7} -> self,
    # 8 -> reference.
    encoded['COW_private'] = worker_class_code.isin([1, 2]).astype(int)
    encoded['COW_public'] = worker_class_code.isin([3, 4, 5]).astype(int)
    encoded['COW_self'] = worker_class_code.isin([6, 7]).astype(int)

    return encoded

In [10]:
# Hand over a copy so the raw features14 frame keeps its original values no
# matter what the preprocessing does to its argument (keeps this cell re-runnable).
preprocessed_features14 = preprocessAdultDataFeatures(features14.copy())

In [11]:
# First rows of the preprocessed feature frame.
preprocessed_features14.head()

Unnamed: 0,AGEP,SCHL,WKHP,SEX,MAR_married,MAR_divorced,COW_private,COW_public,COW_self
0,0.494949,0.666667,0.606061,0.0,1,0,0,0,1
1,0.515152,0.666667,0.40404,0.0,1,0,1,0,0
2,0.535354,0.666667,0.40404,1.0,1,0,1,0,0
3,0.515152,0.666667,0.40404,0.0,1,0,1,0,0
4,0.484848,0.833333,0.40404,1.0,1,0,0,1,0


In [12]:
# Column totals of the preprocessed features. DataFrame.sum() reduces
# column-wise by default; np.sum(df) forwards axis=None, which pandas is
# deprecating for frames.
preprocessed_features14.sum()

AGEP            6.933271e+05
SCHL            1.222079e+06
WKHP            6.109391e+05
SEX             7.611050e+05
MAR_married     9.137360e+05
MAR_divorced    2.044550e+05
COW_private     1.171981e+06
COW_public      2.472630e+05
COW_self        1.639750e+05
dtype: float64

In [13]:
# Persist the preprocessed 2014 data. The target folder is created first
# because to_csv does not create missing directories.
out_dir14 = Path("data/preprocessed14")
out_dir14.mkdir(parents=True, exist_ok=True)

preprocessed_features14.to_csv(out_dir14 / "features.csv", index=False)

# Re-encode the labels before saving: 2*label - 1 (the output below shows -1/1).
save_label14 = 2*labels14 - 1
save_label14.to_csv(out_dir14 / "labels.csv", index=False)

In [4]:
# Reload the CSVs written above so they can be compared with the in-memory frames.
read_features14 = pd.read_csv("data/preprocessed14/features.csv")
read_label14 = pd.read_csv("data/preprocessed14/labels.csv")

In [15]:
# First rows of the labels as read back from labels.csv.
read_label14.head()

Unnamed: 0,PINCP
0,1
1,-1
2,-1
3,-1
4,1


In [16]:
# First rows of the features as read back from features.csv.
read_features14.head()

Unnamed: 0,AGEP,SCHL,WKHP,SEX,MAR_married,MAR_divorced,COW_private,COW_public,COW_self
0,0.494949,0.666667,0.606061,0.0,1,0,0,0,1
1,0.515152,0.666667,0.40404,0.0,1,0,1,0,0
2,0.535354,0.666667,0.40404,1.0,1,0,1,0,0
3,0.515152,0.666667,0.40404,0.0,1,0,1,0,0
4,0.484848,0.833333,0.40404,1.0,1,0,0,1,0


In [17]:
# First rows of the in-memory labels that were written to labels.csv.
save_label14.head()

Unnamed: 0,PINCP
0,1
1,-1
2,-1
3,-1
4,1


In [18]:
# First rows of the in-memory features that were written to features.csv.
preprocessed_features14.head()

Unnamed: 0,AGEP,SCHL,WKHP,SEX,MAR_married,MAR_divorced,COW_private,COW_public,COW_self
0,0.494949,0.666667,0.606061,0.0,1,0,0,0,1
1,0.515152,0.666667,0.40404,0.0,1,0,1,0,0
2,0.535354,0.666667,0.40404,1.0,1,0,1,0,0
3,0.515152,0.666667,0.40404,0.0,1,0,1,0,0
4,0.484848,0.833333,0.40404,1.0,1,0,0,1,0


In [23]:
# Round-trip check for the features: fail loudly under "Run All" instead of
# merely displaying False, while still showing the result as the cell output.
features_match = np.allclose(read_features14, preprocessed_features14)
assert features_match, "features.csv does not match preprocessed_features14"
features_match

True

In [24]:
# Round-trip check for the labels: reduce the element-wise comparison to a
# single plain bool, assert it, and still show it as the cell output.
labels_match = bool((read_label14 == save_label14).to_numpy().all())
assert labels_match, "labels.csv does not match save_label14"
labels_match

True