In [1]:
# !pip install folktables

In [1]:
#info about Data
# https://arxiv.org/pdf/2108.04884.pdf

#ACSDataSource ,ACSEmployment, ACSPublicCoverage, ACSIncome

-> (using ACSEmployment) from the paper: predict whether an individual is employed, after filtering the ACS PUMS data
sample to only include individuals between the ages of 16 and 90  

In [2]:
import pandas as pd
import numpy as np
import copy

import torch
import torch.nn as nn
from sklearn import preprocessing
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from folktables import ACSDataSource, ACSEmployment,ACSIncome
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import tqdm
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
from torchvision import models
from torchsummary import summary
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [3]:
# detail about the state: load_acs.py

In [4]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_data = data_source.get_data(states=["AL"], download=True)

#  Info about the data

B.4 ACSEmployment

Predict whether an adult is employed.

Target: ESR (Employment status recode): an individual’s label is 1 if ESR == 1, and 0 otherwise.
Features:

• AGEP (Age): Range of values:
– 0 - 99 (integers)
– 0 indicates less than 1 year old.

• SCHL (Educational attainment): Range of values:
– N/A (less than 3 years old)
– 1: No schooling completed
– 2: Nursery school/preschool
– 3: Kindergarten
– 4: Grade 1
– 5: Grade 2
– 6: Grade 3
– 7: Grade 4
– 8: Grade 5
– 9: Grade 6
– 10: Grade 7
– 11: Grade 8
– 12: Grade 9
– 13: Grade 10
– 14: Grade 11
– 15: 12th Grade - no diploma
– 16: Regular high school diploma
– 17: GED or alternative credential
– 18: Some college but less than 1 year
– 19: 1 or more years of college credit but no degree
– 20: Associate’s degree
– 21: Bachelor’s degree
– 22: Master’s degree
– 23: Professional degree beyond a bachelor’s degree
– 24: Doctorate degree

• MAR (Marital status): Range of values:
– 1: Married
– 2: Widowed
– 3: Divorced
– 4: Separated
– 5: Never married or under 15 years old

• SEX (Sex): Range of values:
– 1: Male
– 2: Female

• DIS (Disability recode): Range of values:
– 1: With a disability
– 2: Without a disability

• ESP (Employment status of parents): Range of values:
– N/A (not own child of householder, and not child in subfamily)
– 1: Living with two parents: both parents in labor force
– 2: Living with two parents: Father only in labor force
– 3: Living with two parents: Mother only in labor force
– 4: Living with two parents: Neither parent in labor force
– 5: Living with father: Father in the labor force
– 6: Living with father: Father not in labor force
– 7: Living with mother: Mother in the labor force
– 8: Living with mother: Mother not in labor force

• MIG (Mobility status (lived here 1 year ago): Range of values:
– N/A (less than 1 year old)
– 1: Yes, same house (nonmovers)
– 2: No, outside US and Puerto Rico
– 3: No, different house in US or Puerto Rico

• CIT (Citizenship status): Range of values:
– 1: Born in the U.S.
– 2: Born in Puerto Rico, Guam, the U.S. Virgin Islands, or the Northern Marianas
– 3: Born abroad of American parent(s)
– 4: U.S. citizen by naturalization
– 5: Not a citizen of the U.S.

• MIL (Military service): Range of values:
– N/A (less than 17 years old)
– 1: Now on active duty
– 2: On active duty in the past, but not now
– 3: Only on active duty for training in Reserves/National Guard
– 4: Never served in the military

• ANC (Ancestry recode): Range of values:
– 1: Single
– 2: Multiple
– 3: Unclassified
– 4: Not reported
– 8: Suppressed for data year 2018 for select PUMAs

• NATIVITY (Nativity): Range of values:
– 1: Native
– 2: Foreign born

• RELP (Relationship): Range of values:
– 0: Reference person
– 1: Husband/wife
– 2: Biological son or daughter
– 3: Adopted son or daughter
– 4: Stepson or stepdaughter
– 5: Brother or sister
– 6: Father or mother
– 7: Grandchild
– 8: Parent-in-law
– 9: Son-in-law or daughter-in-law
– 10: Other relative
– 11: Roomer or boarder
– 12: Housemate or roommate
– 13: Unmarried partner
– 14: Foster child
– 15: Other nonrelative
– 16: Institutionalized group quarters population
– 17: Noninstitutionalized group quarters population

• DEAR (Hearing difficulty): Range of values:
– 1: Yes
– 2: No

• DEYE (Vision difficulty): Range of values:
– 1: Yes
– 2: No

• DREM (Cognitive difficulty): Range of values:
– N/A (less than 5 years old)
– 1: Yes
– 2: No

• RAC1P (Recoded detailed race code): Range of values:
– 1: White alone
– 2: Black or African American alone
– 3: American Indian alone
– 4: Alaska Native alone
– 5: American Indian and Alaska Native tribes specified, or American Indian or Alaska Native, not specified and no other races
– 6: Asian alone
– 7: Native Hawaiian and Other Pacific Islander alone
– 8: Some Other Race alone
– 9: Two or More Races

• GCL (Grandparents living with grandchildren): Range of values:
– N/A (less than 30 years/institutional GQ)
– 1: Yes
– 2: No

Filters:
• AGEP (Age) must be greater than 16 and less than 90.
• PWGTP (Person weight) must be greater than or equal to 1

In [5]:
# features_np, label_np, group_np = ACSEmployment.df_to_numpy(acs_data)
features, label, group = ACSEmployment.df_to_pandas(acs_data)

In [6]:
features

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P
0,19.0,18.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0
1,18.0,18.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0
2,53.0,17.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0
3,28.0,19.0,5.0,16.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0
4,25.0,12.0,5.0,16.0,1.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47772,18.0,16.0,5.0,2.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0
47773,15.0,11.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0
47774,10.0,6.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0
47775,4.0,1.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0


In [7]:
label

Unnamed: 0,ESR
0,False
1,False
2,False
3,False
4,False
...,...
47772,True
47773,False
47774,False
47775,False


In [8]:
merged_df=pd.concat([features, label], axis=1)
merged_df.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,19.0,18.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,False
1,18.0,18.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,False
2,53.0,17.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,False
3,28.0,19.0,5.0,16.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
4,25.0,12.0,5.0,16.0,1.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,False


In [9]:
len(merged_df)

47777

In [10]:
group.value_counts()

RAC1P
1        35551
2        10115
9          835
6          556
8          491
3          182
5           33
7           13
4            1
dtype: int64

In [11]:
merged_df["RAC1P"].value_counts()

# 1: White alone – 2: Black or African American alone

1.0    35551
2.0    10115
9.0      835
6.0      556
8.0      491
3.0      182
5.0       33
7.0       13
4.0        1
Name: RAC1P, dtype: int64

In [12]:
pd.unique(merged_df["RAC1P"])

array([1., 2., 8., 9., 6., 3., 5., 7., 4.])

In [13]:
#  for Black and white people
filtered_df= merged_df[["SEX","RAC1P","ESR"]][merged_df['RAC1P'].isin([1,2])]
print("len(filtered_df): ", len(filtered_df),"\n")
print(filtered_df["ESR"].value_counts(),"\n")
print(filtered_df["SEX"].value_counts(),"\n")
print(filtered_df["RAC1P"].value_counts(),"\n")
filtered_df.head() 

len(filtered_df):  45666 

False    26881
True     18785
Name: ESR, dtype: int64 

2.0    23737
1.0    21929
Name: SEX, dtype: int64 

1.0    35551
2.0    10115
Name: RAC1P, dtype: int64 



Unnamed: 0,SEX,RAC1P,ESR
0,2.0,1.0,False
1,2.0,2.0,False
2,1.0,1.0,False
3,1.0,1.0,False
4,2.0,1.0,False


In [14]:

# 1- Male 2- Female
# 1- White 2-Black

counts_frq = filtered_df.groupby(['SEX', 'RAC1P']).size()
counts_frq

SEX  RAC1P
1.0  1.0      17309
     2.0       4620
2.0  1.0      18242
     2.0       5495
dtype: int64

In [15]:
merged_df

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,19.0,18.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,False
1,18.0,18.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,False
2,53.0,17.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,False
3,28.0,19.0,5.0,16.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
4,25.0,12.0,5.0,16.0,1.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,1.0,2.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47772,18.0,16.0,5.0,2.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,True
47773,15.0,11.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
47774,10.0,6.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
47775,4.0,1.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,False


In [16]:
# # filter the dataframe for each combination

#White Men
WM = merged_df.loc[(merged_df['SEX'] == 1) & (merged_df['RAC1P'] == 1)]
#Black Men
BM = merged_df.loc[(merged_df['SEX'] == 1) & (merged_df['RAC1P'] == 2)]
#white Women
WW = merged_df.loc[(merged_df['SEX'] == 2) & (merged_df['RAC1P'] == 1)]
#Black Women
BW = merged_df.loc[(merged_df['SEX'] == 2) & (merged_df['RAC1P'] == 2)]

In [17]:
WM

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
2,53.0,17.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,1.0,1.0,1.0,False
3,28.0,19.0,5.0,16.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
7,38.0,12.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,4.0,1.0,1.0,2.0,2.0,1.0,1.0,False
8,41.0,16.0,5.0,17.0,2.0,0.0,1.0,1.0,4.0,4.0,1.0,2.0,2.0,2.0,1.0,1.0,True
14,21.0,19.0,5.0,17.0,2.0,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47768,76.0,13.0,3.0,0.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,1.0,2.0,2.0,1.0,1.0,False
47773,15.0,11.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
47774,10.0,6.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
47775,4.0,1.0,5.0,2.0,2.0,7.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,0.0,1.0,1.0,False


# Removing Sex and Gen from 4 Groups

In [18]:
WM.columns

Index(['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC',
       'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P', 'ESR'],
      dtype='object')

In [19]:
WM_updated=WM[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM', 'ESR']]
BM_updated=BM[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM', 'ESR']]
WW_updated=WW[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM', 'ESR']]
BW_updated=BW[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM', 'ESR']]

# Grouping

In [20]:
#Group-1
#Train set: White men+black men+Black women—- validation set: black men
WM_BM_BW_train = pd.concat([WM_updated,BM_updated,BW_updated])
BM_validation=BM_updated.copy()

#Group-2
#Train set: White women+black women+black men—- validation set: black women
WW_BW_BM_train=pd.concat([WW_updated,BW_updated,BM_updated])
BW_validation=BW_updated.copy()

#Group-3
#Train set: black men+white men+white women+validation set: white men
BM_WM_WW_train=pd.concat([BM_updated,WM_updated,WW_updated])
WM_validation=WM_updated.copy()

#Group-4
#Train set: white women+white men+black women, validation set: white women
WW_WM_BW_train=pd.concat([WW_updated,WM_updated,BW_updated])
WW_validation=WW_updated.copy()

In [21]:

print("BM" ,len(BM),"RAC: ", pd.unique(BM['RAC1P']),"SEX: ", pd.unique(BM['SEX']))
print("BW" ,len(BW),"RAC: ", pd.unique(BW['RAC1P']),"SEX: ", pd.unique(BW['SEX']))
print("WM" ,len(WM),"RAC: ", pd.unique(WM['RAC1P']),"SEX: ", pd.unique(WM['SEX']))
print("WW" ,len(WW),"RAC: ", pd.unique(WW['RAC1P']),"SEX: ", pd.unique(WW['SEX']))

BM 4620 RAC:  [2.] SEX:  [1.]
BW 5495 RAC:  [2.] SEX:  [2.]
WM 17309 RAC:  [1.] SEX:  [1.]
WW 18242 RAC:  [1.] SEX:  [2.]


In [22]:
data_to_use_train= WM_BM_BW_train
data_to_use_val= BM_validation


X_train= data_to_use_train[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM']]
y_train= data_to_use_train[['ESR']]

# X_train = torch.from_numpy(X_train.to_numpy()).float()
# y_train = torch.squeeze(torch.from_numpy(y_train.to_numpy()).float())

X_valid = data_to_use_val[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM']]
y_valid = data_to_use_val[['ESR']]

# X_valid = torch.from_numpy(X_valid.to_numpy()).float()
# y_valid = torch.squeeze(torch.from_numpy(y_valid.to_numpy()).float())

# y_train = torch.nn.functional.one_hot(y_train.long())
# y_val = torch.nn.functional.one_hot(y_valid.long())


# X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
print("TYPE X: ",type(X_train)," Type Y", type(y_train),"\n")
print("len(X_train),len(y_train), len(X_val),len(y_val): ",len(X_train),len(y_train), len(X_valid),len(y_valid))
print("X_train.shape, y_train.shape ",X_train.shape, y_train.shape,"\n")
print("X_valid.shape, y_valid.shape ",X_valid.shape, y_valid.shape,"\n")

X_train

TYPE X:  <class 'pandas.core.frame.DataFrame'>  Type Y <class 'pandas.core.frame.DataFrame'> 

len(X_train),len(y_train), len(X_val),len(y_val):  27424 27424 4620 4620
X_train.shape, y_train.shape  (27424, 14) (27424, 1) 

X_valid.shape, y_valid.shape  (4620, 14) (4620, 1) 



Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM
2,53.0,17.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,1.0
3,28.0,19.0,5.0,16.0,2.0,0.0,1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0
7,38.0,12.0,5.0,16.0,1.0,0.0,1.0,1.0,4.0,4.0,1.0,1.0,2.0,2.0
8,41.0,16.0,5.0,17.0,2.0,0.0,1.0,1.0,4.0,4.0,1.0,2.0,2.0,2.0
14,21.0,19.0,5.0,17.0,2.0,0.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47710,28.0,19.0,1.0,1.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0
47750,56.0,16.0,5.0,0.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0
47751,25.0,21.0,5.0,2.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0
47764,81.0,14.0,1.0,1.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0


In [23]:
y_train

Unnamed: 0,ESR
2,False
3,False
7,False
8,True
14,False
...,...
47710,True
47750,False
47751,True
47764,False


In [24]:
X_valid = preprocessing.normalize(X_valid)
X_train = preprocessing.normalize(X_train)
print("X_train ",X_train)

# Binary encoding of labels
encoder = LabelEncoder()
encoder.fit(y_train)
y_train = encoder.transform(y_train)
y_valid = encoder.transform(y_valid)

# Convert to 2D PyTorch tensors
X_train= torch.tensor(X_train, dtype=torch.float32)
X_valid= torch.tensor(X_valid, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1)
y_valid = torch.tensor(y_valid, dtype=torch.float32).reshape(-1, 1)

X_train  [[0.90734272 0.29103446 0.08559837 ... 0.03423935 0.03423935 0.01711967]
 [0.73531602 0.49896444 0.13130643 ... 0.05252257 0.05252257 0.05252257]
 [0.86858561 0.27429019 0.11428758 ... 0.02285752 0.04571503 0.04571503]
 ...
 [0.74337629 0.62443609 0.14867526 ... 0.0594701  0.0594701  0.0594701 ]
 [0.98263061 0.16983739 0.01213124 ... 0.02426248 0.02426248 0.02426248]
 [0.95612824 0.26838687 0.05032254 ... 0.03354836 0.03354836 0.03354836]]


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [25]:
X_train,X_valid

(tensor([[0.9073, 0.2910, 0.0856,  ..., 0.0342, 0.0342, 0.0171],
         [0.7353, 0.4990, 0.1313,  ..., 0.0525, 0.0525, 0.0525],
         [0.8686, 0.2743, 0.1143,  ..., 0.0229, 0.0457, 0.0457],
         ...,
         [0.7434, 0.6244, 0.1487,  ..., 0.0595, 0.0595, 0.0595],
         [0.9826, 0.1698, 0.0121,  ..., 0.0243, 0.0243, 0.0243],
         [0.9561, 0.2684, 0.0503,  ..., 0.0335, 0.0335, 0.0335]]),
 tensor([[0.9311, 0.2722, 0.0143,  ..., 0.0287, 0.0287, 0.0143],
         [0.9255, 0.2468, 0.0771,  ..., 0.0308, 0.0308, 0.0154],
         [0.8398, 0.3632, 0.0908,  ..., 0.0454, 0.0454, 0.0454],
         ...,
         [0.8279, 0.5424, 0.0285,  ..., 0.0571, 0.0571, 0.0571],
         [0.4376, 0.2188, 0.3647,  ..., 0.1459, 0.1459, 0.1459],
         [0.9803, 0.1845, 0.0115,  ..., 0.0231, 0.0231, 0.0231]]))

In [26]:
y_train,y_valid

(tensor([[0.],
         [0.],
         [0.],
         ...,
         [1.],
         [0.],
         [0.]]),
 tensor([[0.],
         [0.],
         [0.],
         ...,
         [0.],
         [0.],
         [0.]]))

In [27]:
X_train.shape, y_train.shape

(torch.Size([27424, 14]), torch.Size([27424, 1]))

In [28]:
class Deep_wide_Net(nn.Module):
    def __init__(self):
        super().__init__()
        # 14 : input shape
        self.layer1 = nn.Linear(14, 1024)
        self.act1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)
        
        self.layer2 = nn.Linear(1024, 512)
        self.dropout2 = nn.Dropout(p=0.5)
        self.act2 = nn.ReLU()
        
        self.layer3 = nn.Linear(512, 256)
        self.act3 = nn.ReLU()
        
        self.layer4 = nn.Linear(256, 128)
        self.act4 = nn.ReLU()
        
        self.layer5 = nn.Linear(128, 60)
        self.act5 = nn.ReLU()
        
        self.output = nn.Linear(60, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.dropout1(x)
        x = self.act2(self.layer2(x))
        x = self.dropout2(x)
        x = self.act3(self.layer3(x))
        x = self.act4(self.layer4(x))
        x = self.act5(self.layer5(x))
        x = self.sigmoid(self.output(x))
        return x
    

In [29]:
class DeepNet(nn.Module):
    def __init__(self):
        super().__init__()
        # 14 : input shape
        self.layer1 = nn.Linear(14, 512)
        self.act1 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.5)
        self.layer2 = nn.Linear(512, 256)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(256, 60)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(60, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.dropout1(x)
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x
    

In [30]:
def model_train(model, X_train, y_train, X_val, y_val):
    # Loss function and optimizer
    loss_fn = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Number of epochs and batch size
    n_epochs = 10
    batch_size = 1000

    # Initialize best accuracy and weights
    best_acc = -np.inf
    best_weights = None

    # Train the model
    for epoch in range(n_epochs):
        model.train()
        with tqdm.tqdm(range(len(X_train) // batch_size), unit="batch", mininterval=0, disable=True) as bar:
            for i in bar:
                # Get a batch of data
                X_batch = X_train[i * batch_size:(i + 1) * batch_size]
                y_batch = y_train[i * batch_size:(i + 1) * batch_size]

                # Forward pass
                y_pred = model(X_batch)

                # Calculate the loss
                loss = loss_fn(y_pred, y_batch)

                # Backpropagate the loss
                optimizer.zero_grad()
                loss.backward()

                # Update the weights
                optimizer.step()

                # Calculate the accuracy
                acc = (y_pred.round() == y_batch).float().mean()

                # Update the best accuracy and weights
                if acc > best_acc:
                    best_acc = acc
                    best_weights = copy.deepcopy(model.state_dict())

                # Track training loss
                train_loss = loss.item()

                # Evaluate the model on the validation set
                model.eval()
                y_pred = model(X_val)
                val_acc = (y_pred.round() == y_val).float().mean()

                # Track validation loss
                val_loss = loss_fn(y_pred, y_val).item()

        print("Epoch: %d, Training Loss: %.4f, Validation Loss: %.4f" % (epoch, train_loss, val_loss))

    # Return the best accuracy and validation accuracy
    return best_acc, val_acc



kfold = StratifiedKFold(n_splits=2, shuffle=True)

cv_scores_deep = []
cv_val_scores_deep = []

for train, test in kfold.split(X_train, y_train):
    
    model = DeepNet()
#     model = Deep_wide_Net()
    acc, val_acc = model_train(model, X_train[train], y_train[train], X_train[test], y_train[test])
    print("Accuracy (deep): %.2f, Validation Acc: %.2f" % (acc, val_acc))
    cv_scores_deep.append(acc)
    cv_val_scores_deep.append(val_acc)
    
    
deep_acc = np.mean(cv_scores_deep)
deep_std = np.std(cv_scores_deep)
deep_val_acc = np.mean(cv_val_scores_deep)
deep_val_std = np.std(cv_val_scores_deep)
print("Deep: %.2f%% (+/- %.2f%%)" % (deep_acc*100, deep_std*100))
print("Deep Validation: %.2f%% (+/- %.2f%%)" % (deep_val_acc*100, deep_val_std*100))

Epoch: 0, Training Loss: 0.6381, Validation Loss: 0.6507
Epoch: 1, Training Loss: 0.5896, Validation Loss: 0.5800
Epoch: 2, Training Loss: 0.5424, Validation Loss: 0.5425
Epoch: 3, Training Loss: 0.5003, Validation Loss: 0.5288
Epoch: 4, Training Loss: 0.4905, Validation Loss: 0.5073
Epoch: 5, Training Loss: 0.4800, Validation Loss: 0.4901
Epoch: 6, Training Loss: 0.4796, Validation Loss: 0.4972
Epoch: 7, Training Loss: 0.4831, Validation Loss: 0.5149
Epoch: 8, Training Loss: 0.4688, Validation Loss: 0.5172
Epoch: 9, Training Loss: 0.4700, Validation Loss: 0.4579
Accuracy (deep): 0.82, Validation Acc: 0.78
Epoch: 0, Training Loss: 0.6317, Validation Loss: 0.6515
Epoch: 1, Training Loss: 0.5962, Validation Loss: 0.5858
Epoch: 2, Training Loss: 0.5418, Validation Loss: 0.5480
Epoch: 3, Training Loss: 0.4960, Validation Loss: 0.5389
Epoch: 4, Training Loss: 0.4940, Validation Loss: 0.5376
Epoch: 5, Training Loss: 0.4894, Validation Loss: 0.5375
Epoch: 6, Training Loss: 0.4720, Validation 

In [31]:
# name_of_model='WM_BM_BW_deep_wide.pth'
name_of_model='WM_BM_BW.pth'

In [32]:
torch.save(model.state_dict(), name_of_model)

In [50]:
model = DeepNet()
# model = Deep_wide_Net()
model.load_state_dict(torch.load(name_of_model))
model.eval()

DeepNet(
  (layer1): Linear(in_features=14, out_features=512, bias=True)
  (act1): ReLU()
  (dropout1): Dropout(p=0.5, inplace=False)
  (layer2): Linear(in_features=512, out_features=256, bias=True)
  (act2): ReLU()
  (layer3): Linear(in_features=256, out_features=60, bias=True)
  (act3): ReLU()
  (output): Linear(in_features=60, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

# Testing 

In [51]:
import pandas as pd
import numpy as np
from folktables import ACSDataSource, ACSEmployment,ACSIncome
import matplotlib.pyplot as plt

In [35]:
data_source_test = ACSDataSource(survey_year='2017', horizon='1-Year', survey='person')
acs_data_test = data_source_test.get_data(states=["AL"], download=True)
features_test, label_test, group_test = ACSEmployment.df_to_pandas(acs_data_test)
print(features_test.head(),"\n\n Lable: \n ", label_test.head())

   AGEP  SCHL  MAR  RELP  DIS  ESP  CIT  MIG  MIL  ANC  NATIVITY  DEAR  DEYE  \
0  73.0  10.0  4.0   0.0  1.0  0.0  1.0  1.0  4.0  1.0       1.0   2.0   2.0   
1  31.0  21.0  5.0   0.0  2.0  0.0  1.0  1.0  4.0  2.0       1.0   2.0   2.0   
2  41.0  17.0  1.0   0.0  2.0  0.0  1.0  1.0  4.0  1.0       1.0   2.0   2.0   
3  48.0  16.0  1.0   1.0  2.0  0.0  1.0  1.0  4.0  1.0       1.0   2.0   2.0   
4  16.0  13.0  5.0   2.0  2.0  2.0  1.0  1.0  0.0  1.0       1.0   2.0   2.0   

   DREM  SEX  RAC1P  
0   1.0  1.0    2.0  
1   2.0  2.0    1.0  
2   2.0  1.0    1.0  
3   2.0  2.0    1.0  
4   2.0  1.0    1.0   

 Lable: 
       ESR
0  False
1   True
2   True
3  False
4  False


In [36]:
len(features),len(features_test)

(47777, 47645)

In [37]:
merged_df_test=pd.concat([features_test, label_test], axis=1)
merged_df_test.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,73.0,10.0,4.0,0.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,False
1,31.0,21.0,5.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,True
2,41.0,17.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,True
3,48.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,False
4,16.0,13.0,5.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False


In [38]:
WM_test = merged_df_test.loc[(merged_df_test['SEX'] == 1) & (merged_df_test['RAC1P'] == 1)]


In [39]:
# # filter the dataframe for each combination

#White Men
WM_test = merged_df_test.loc[(merged_df_test['SEX'] == 1) & (merged_df_test['RAC1P'] == 1)]
#Black Men
BM_test = merged_df_test.loc[(merged_df_test['SEX'] == 1) & (merged_df_test['RAC1P'] == 2)]
#white Women
WW_test = merged_df_test.loc[(merged_df_test['SEX'] == 2) & (merged_df_test['RAC1P'] == 1)]
#Black Women
BW_test = merged_df_test.loc[(merged_df_test['SEX'] == 2) & (merged_df_test['RAC1P'] == 2)]

In [40]:
WM_BM_BW_test= pd.concat([WM_test,BM_test,BW_test])
WM_BM_BW_test

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
2,41.0,17.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,True
4,16.0,13.0,5.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False
10,32.0,16.0,1.0,0.0,2.0,0.0,5.0,3.0,4.0,1.0,2.0,2.0,2.0,2.0,1.0,1.0,True
18,80.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,False
19,32.0,22.0,5.0,0.0,2.0,0.0,1.0,1.0,4.0,4.0,1.0,2.0,2.0,2.0,1.0,1.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47605,24.0,19.0,5.0,17.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,False
47623,20.0,16.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,True
47628,17.0,16.0,5.0,17.0,2.0,0.0,1.0,3.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,True
47635,19.0,19.0,5.0,17.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,False


In [41]:
# filtered_df_test= merged_df_test[["SEX","RAC1P","ESR"]][merged_df_test['RAC1P'].isin([1,2])]
# filtered_df_test.head() 

In [42]:
# counts_frq_test = filtered_df_test.groupby(['SEX', 'RAC1P']).size()
# counts_frq_test

In [43]:
merged_df_test.head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,73.0,10.0,4.0,0.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,False
1,31.0,21.0,5.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,1.0,True
2,41.0,17.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,True
3,48.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,4.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,False
4,16.0,13.0,5.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0,False


In [44]:
X_test= WM_BM_BW_test[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM']]
y_test= WM_BM_BW_test[['ESR']]

# X_test= merged_df_test[['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC','NATIVITY', 'DEAR', 'DEYE', 'DREM']]
# y_test= merged_df_test[['ESR']]



X_test = preprocessing.normalize(X_test)
encoder = LabelEncoder()
encoder.fit(y_test)
y_test = encoder.transform(y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


In [45]:
len(y_test)

27424

In [46]:
# y_test = encoder.transform(y_test)

X_test= torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)
print(X_test)
print(y_test)

# Test saved model

# name_of_model='WM_BM_BW.pt'

# model = DeepNet()
# model.load_state_dict(torch.load(name_of_model))
# model.eval()

tensor([[0.9152, 0.3795, 0.0223,  ..., 0.0446, 0.0446, 0.0446],
        [0.7318, 0.5946, 0.2287,  ..., 0.0915, 0.0915, 0.0915],
        [0.8703, 0.4351, 0.0272,  ..., 0.0544, 0.0544, 0.0544],
        ...,
        [0.5648, 0.5316, 0.1661,  ..., 0.0664, 0.0664, 0.0664],
        [0.5803, 0.5803, 0.1527,  ..., 0.0611, 0.0611, 0.0611],
        [0.6742, 0.5394, 0.1348,  ..., 0.0539, 0.0539, 0.0539]])
tensor([[1.],
        [0.],
        [1.],
        ...,
        [1.],
        [0.],
        [0.]])


In [47]:
pred_out=model(X_test)
pred_out,len(pred_out)

(tensor([[0.8054],
         [0.1220],
         [0.8115],
         ...,
         [0.1040],
         [0.2291],
         [0.3556]], grad_fn=<SigmoidBackward0>),
 27424)

In [48]:
y_pred = model(X_test)
y_pred

tensor([[0.8054],
        [0.1220],
        [0.8115],
        ...,
        [0.1040],
        [0.2291],
        [0.3556]], grad_fn=<SigmoidBackward0>)

In [49]:
y_pred_labels = (y_pred > 0.7).float() 
accuracy_score(y_test, y_pred_labels)

0.7505834305717619

In [60]:
# WM_BM_BW
# Test Acc: 0.5947

# WW_BW_BM
#Test Accuracy: 0.5947

#WW_WM_BW_
#Test Accuracy: 0.5947