In [1]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and use mode 2 so edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
import pandas as pd
import seaborn as sns

from source.folktables.data_loader import load_folktables_data

In [3]:
# Suppress warnings: the environment variable covers Python processes that
# read PYTHONWARNINGS at startup, the filter covers the current kernel.
os.environ["PYTHONWARNINGS"] = "ignore"
warnings.filterwarnings('ignore')

# Default figure size (width, height) applied through seaborn's rc settings.
sns.set(
    rc={'figure.figsize':(15, 5)},
)

In [4]:
try:
    from folktables import ACSDataSource, ACSEmployment
except:
    !pip install folktables
    from folktables import ACSDataSource, ACSEmployment

In [5]:
# Parameters of the dataset to export; each entry is forwarded to
# load_folktables_data below and state/year are reused in the output file name.
DATASET_CONFIG = dict(
    state=["NY"],       # list of state codes
    year='2018',        # survey year, kept as a string
    task=ACSEmployment, # folktables task object
)

In [6]:
# Load features and labels for the configured task/state/year.
# NOTE(review): without_nulls=True appears to make the loader impute missing
# values (see the "Impute values" line in its printed log) — confirm in data_loader.
X_data, y_data = load_folktables_data(
    task=DATASET_CONFIG['task'],
    state=DATASET_CONFIG['state'],
    year=DATASET_CONFIG['year'],
    without_nulls=True,
)

Original: 24 mb
Optimized: 12 mb

Dataset shape before handling nulls:  (196967, 16)
Impute values:  {'SCHL': 0, 'ESP': 0, 'MIG': 0, 'MIL': 0, 'DREM': 0}
Dataset shape after handling nulls:  (196967, 16)

Rechecking if there are nulls in X_data:
AGEP        0
SCHL        0
MAR         0
RELP        0
DIS         0
ESP         0
CIT         0
MIG         0
MIL         0
ANC         0
NATIVITY    0
DEAR        0
DEYE        0
DREM        0
SEX         0
RAC1P       0
dtype: int64


In [7]:
# Count how many rows carry each label value, to see how balanced the target is.
y_data.value_counts()

0    105498
1     91469
Name: ESR, dtype: int64

In [8]:
# Preview the first rows of the feature frame.
X_data.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P
0,26,21.0,5,17,2,0.0,5,1.0,4.0,1,2,2,2,2.0,2,1
1,21,20.0,5,17,2,0.0,1,3.0,4.0,1,1,2,2,2.0,1,1
2,18,16.0,5,17,2,0.0,2,3.0,4.0,1,1,2,2,2.0,2,8
3,85,16.0,2,16,1,0.0,1,1.0,4.0,4,1,1,2,1.0,2,1
4,19,19.0,5,17,2,0.0,1,1.0,4.0,2,1,2,2,2.0,2,1


In [9]:
# Preview the first label values.
y_data.head()

0    0
1    1
2    0
3    0
4    1
Name: ESR, dtype: int64

In [10]:
# Put features and label side by side in a single frame (label becomes the last column).
full_df = pd.concat((X_data, y_data), axis='columns')

In [11]:
# These columns show up as floats in the feature preview (e.g. 21.0); cast them
# to integers in one pass so they are written out as whole numbers.
columns_to_cast = ['SCHL', 'ESP', 'MIG', 'MIL', 'DREM']
full_df = full_df.astype({name: 'int' for name in columns_to_cast})

full_df.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,26,21,5,17,2,0,5,1,4,1,2,2,2,2,2,1,0
1,21,20,5,17,2,0,1,3,4,1,1,2,2,2,1,1,1
2,18,16,5,17,2,0,2,3,4,1,1,2,2,2,2,8,0
3,85,16,2,16,1,0,1,1,4,4,1,1,2,1,2,1,0
4,19,19,5,17,2,0,1,1,4,2,1,2,2,2,2,1,1


In [12]:
# Write the combined frame (features + label) to CSV without the index column.
# The file name uses the first configured state and the year.
output_path = f'../datasets/folktables-{DATASET_CONFIG["state"][0]}-{DATASET_CONFIG["year"]}.csv'
# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise the export fails on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
full_df.to_csv(output_path, index=False)