In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import dice_ml
from dice_ml.utils import helpers 

In [33]:
# Load the Adult Income dataset through the dice_ml helper imported above.
df = helpers.load_adult_income_dataset()

In [34]:
# Inspect column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26048 entries, 0 to 26047
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             26048 non-null  int64 
 1   workclass       26048 non-null  object
 2   education       26048 non-null  object
 3   marital_status  26048 non-null  object
 4   occupation      26048 non-null  object
 5   race            26048 non-null  object
 6   gender          26048 non-null  object
 7   hours_per_week  26048 non-null  int64 
 8   income          26048 non-null  int64 
dtypes: int64(3), object(6)
memory usage: 1.8+ MB


In [35]:
# Preview the first rows to see what the raw categorical values look like.
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,race,gender,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,White,Female,60,0
1,30,Self-Employed,Assoc,Married,Professional,White,Male,65,1
2,32,Private,Some-college,Married,White-Collar,White,Male,50,0
3,20,Private,Some-college,Single,Service,White,Female,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,White,Male,50,0


In [36]:
# Remove the `race` and `gender` columns from the working frame.
df = df.drop(columns=['race', 'gender'])

# Keep a seeded random draw of 500 rows from each income class, giving a
# balanced 500/500 subset. Note that BOTH classes are cut down to the same
# fixed size; the minority class is not kept in full.
df_majority = df.loc[df['income'] == 0].sample(n=500, random_state=0)
df_minority = df.loc[df['income'] == 1].sample(n=500, random_state=0)

# Stack the two draws (majority rows first), then shuffle the row order so a
# model is not fed the two classes as two contiguous runs of similar samples.
df2 = pd.concat([df_majority, df_minority]).sample(frac=1, random_state=0)

In [37]:
# Save the balanced sample with the categorical columns still un-encoded.
# to_csv keeps its default index=True here, so the shuffled row labels are
# written as the first (unnamed) column of the file.
# NOTE(review): assumes the relative "data/" directory already exists — confirm.
df2.to_csv("data/income-ceml.csv")

In [38]:
# Sanity check: 500 rows were drawn per income class, so expect 1000 rows.
df2.shape

(1000, 7)

## Workclass column - need dummies for all categories

In [7]:
# Count the distinct workclass values; with drop_first=True in the next cell,
# one fewer dummy column than this is produced.
df.workclass.nunique()

4

In [8]:
# One-hot encode `workclass`. drop_first=True omits the first category level,
# so k distinct values yield k-1 indicator columns.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which would turn
# the 0/1 indicators recorded in this notebook (and written to the CSV) into
# True/False.
workclass_dummies = pd.get_dummies(
    df['workclass'], prefix='workclass', drop_first=True, dtype='uint8'
)

## Marital Status column - get dummies

In [9]:
# Count the distinct marital_status values; with drop_first=True in the next
# cell, one fewer dummy column than this is produced.
df.marital_status.nunique()

5

In [10]:
# One-hot encode `marital_status`. drop_first=True omits the first category
# level, so k distinct values yield k-1 indicator columns.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which would turn
# the 0/1 indicators recorded in this notebook (and written to the CSV) into
# True/False.
marital_dummies = pd.get_dummies(
    df['marital_status'], prefix='marital_status', drop_first=True, dtype='uint8'
)

## Education column - need dummies

In [11]:
# Count the distinct education values; with drop_first=True in the next cell,
# one fewer dummy column than this is produced.
df.education.nunique()

8

In [12]:
# One-hot encode `education`. drop_first=True omits the first category level,
# so k distinct values yield k-1 indicator columns.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which would turn
# the 0/1 indicators recorded in this notebook (and written to the CSV) into
# True/False.
edu_dummies = pd.get_dummies(
    df['education'], prefix='education', drop_first=True, dtype='uint8'
)

## Occupation column - need dummies

In [13]:
# Count the distinct occupation values; with drop_first=True in the next cell,
# one fewer dummy column than this is produced.
df.occupation.nunique()

6

In [14]:
# One-hot encode `occupation`. drop_first=True omits the first category level,
# so k distinct values yield k-1 indicator columns.
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool, which would turn
# the 0/1 indicators recorded in this notebook (and written to the CSV) into
# True/False.
occupation_dummies = pd.get_dummies(
    df['occupation'], prefix='occupation', drop_first=True, dtype='uint8'
)

## Removing columns which we will not be using

In [15]:
# Re-check the frame before the dummy columns are attached; `race` and
# `gender` were already dropped in an earlier cell.
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,hours_per_week,income
0,28,Private,Bachelors,Single,White-Collar,60,0
1,30,Self-Employed,Assoc,Married,Professional,65,1
2,32,Private,Some-college,Married,White-Collar,50,0
3,20,Private,Some-college,Single,Service,35,0
4,41,Self-Employed,Some-college,Married,White-Collar,50,0


## Adding the dummy variables back in 

In [16]:
# Attach the four dummy blocks to the right of the existing columns; concat
# along the column axis lines the rows up by index label.
df = pd.concat(
    [df, workclass_dummies, marital_dummies, edu_dummies, occupation_dummies],
    axis="columns",
)

In [17]:
# Confirm the dummy columns now sit alongside the original categorical columns.
df.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,hours_per_week,income,workclass_Other/Unknown,workclass_Private,workclass_Self-Employed,...,education_HS-grad,education_Masters,education_Prof-school,education_School,education_Some-college,occupation_Other/Unknown,occupation_Professional,occupation_Sales,occupation_Service,occupation_White-Collar
0,28,Private,Bachelors,Single,White-Collar,60,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,30,Self-Employed,Assoc,Married,Professional,65,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,32,Private,Some-college,Married,White-Collar,50,0,0,1,0,...,0,0,0,0,1,0,0,0,0,1
3,20,Private,Some-college,Single,Service,35,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0
4,41,Self-Employed,Some-college,Married,White-Collar,50,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1


In [18]:
# List every column and dtype after the concat; the raw categorical columns
# are still present at this point and are dropped in the next step.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26048 entries, 0 to 26047
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   age                       26048 non-null  int64 
 1   workclass                 26048 non-null  object
 2   education                 26048 non-null  object
 3   marital_status            26048 non-null  object
 4   occupation                26048 non-null  object
 5   hours_per_week            26048 non-null  int64 
 6   income                    26048 non-null  int64 
 7   workclass_Other/Unknown   26048 non-null  uint8 
 8   workclass_Private         26048 non-null  uint8 
 9   workclass_Self-Employed   26048 non-null  uint8 
 10  marital_status_Married    26048 non-null  uint8 
 11  marital_status_Separated  26048 non-null  uint8 
 12  marital_status_Single     26048 non-null  uint8 
 13  marital_status_Widowed    26048 non-null  uint8 
 14  education_Bachelors   

In [19]:
# The dummy columns now carry this information, so remove the four raw
# categorical columns: workclass, education, marital_status and occupation.
df = df.drop(columns=['workclass', 'education', 'marital_status', 'occupation'])

In [20]:
# Verify that the raw categorical columns are gone and only the numeric and
# dummy columns are left.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26048 entries, 0 to 26047
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   age                       26048 non-null  int64
 1   hours_per_week            26048 non-null  int64
 2   income                    26048 non-null  int64
 3   workclass_Other/Unknown   26048 non-null  uint8
 4   workclass_Private         26048 non-null  uint8
 5   workclass_Self-Employed   26048 non-null  uint8
 6   marital_status_Married    26048 non-null  uint8
 7   marital_status_Separated  26048 non-null  uint8
 8   marital_status_Single     26048 non-null  uint8
 9   marital_status_Widowed    26048 non-null  uint8
 10  education_Bachelors       26048 non-null  uint8
 11  education_Doctorate       26048 non-null  uint8
 12  education_HS-grad         26048 non-null  uint8
 13  education_Masters         26048 non-null  uint8
 14  education_Prof-school     26048 non-nu

In [21]:
# Check the class balance of the target before resampling.
df.income.value_counts()

0    19820
1     6228
Name: income, dtype: int64

In [22]:
# Split by target class and keep a seeded random draw of 500 rows from each,
# giving a balanced 500/500 subset. Note that BOTH classes are cut down to the
# same fixed size; the minority class is not kept in full.
df_majority = df.loc[df['income'] == 0].sample(n=500, random_state=0)
df_minority = df.loc[df['income'] == 1].sample(n=500, random_state=0)

# Stack the two draws (majority rows first), then shuffle the row order so a
# model is not fed the two classes as two contiguous runs of similar samples.
# `df` is rebound to the 1000-row subset from here on.
df = pd.concat([df_majority, df_minority]).sample(frac=1, random_state=0)

In [23]:
# Save the balanced, one-hot-encoded sample as CSV.
# to_csv keeps its default index=True here, so the shuffled row labels are
# written as the first (unnamed) column of the file.
# NOTE(review): assumes the relative "data/" directory already exists — confirm.

df.to_csv("data/income-cleaned.csv")

In [24]:
# Sanity check: 500 rows were drawn per income class, so expect 1000 rows.
df.shape

(1000, 22)