In [None]:
import pandas as pd
import numpy as np

#The files adult.data and adult.test were extracted from the file 'adult.zip', 
#available at the UCI Machine Learning Repository https://archive.ics.uci.edu/dataset/2/adult

# Load the training split; the raw file has no header row.
data_split = pd.read_csv('adult.data', header=None)

# Show the first line of adult.test before skipping it when parsing the file.
with open('adult.test', 'r') as file:
    first_line = file.readline()
    print("Line to drop: ", first_line)

test_split = pd.read_csv('adult.test', header=None, skiprows=1)

#Column names as shown in https://archive.ics.uci.edu/dataset/2/adult
column_names = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income"
]

data_split.columns = column_names
test_split.columns = column_names


def _drop_missing(split):
    """Return a new frame without the rows of `split` that have missing values.

    The ' ?' string (note the leading space) is treated as the missing-value
    marker: it is first turned into NaN, then every row containing a NaN is
    dropped. The input frame is not modified.
    """
    return split.replace(' ?', np.nan).dropna()


#For simplicity, we drop the rows with missing values in both splits.
data_split = _drop_missing(data_split)
test_split = _drop_missing(test_split)

print('Training instances last index:', len(data_split)-1)
# After concatenation the training rows come first, so:
# Instances up to the index len(data_split) (not including) are training instances
# Instances from the index len(data_split) onwards are test instances
# (in the recorded run below, that boundary index is 30162).

# ignore_index=True already yields a fresh 0..n-1 index, so no extra reset_index is needed.
combined_data = pd.concat([data_split, test_split], ignore_index=True)

Line to drop:  |1x3 Cross validator

Training instances last index: 30161


In [154]:
# Count how many rows carry each distinct raw value of the income column.
combined_data.value_counts(subset='income')

income
<=50K     22654
<=50K.    11360
>50K       7508
>50K.      3700
Name: count, dtype: int64

In [155]:
# Target column order: binarized_race is placed right after the original race column.
cols = column_names[:9] + ['binarized_race'] + column_names[9:]

# The whole transformation is one chained expression, so combined_data is only
# rebound if every step succeeds. If this cell is re-run by accident, the lookup
# of the already-dropped 'race' column raises before anything is assigned, and
# the previously binarized labels are left untouched instead of being zeroed.
combined_data = (
    combined_data
    .assign(
        # Binary label: 1 for ' >50K' / ' >50K.' (both spellings occur in the
        # income column), 0 for everything else.
        income=lambda frame: frame['income'].isin([' >50K', ' >50K.']).astype('int64'),
        # New column which binarizes "race" into "White" and "Non-White".
        binarized_race=lambda frame: frame['race'].eq(' White').map(
            {True: 'White', False: 'Non-White'}
        ),
    )
    .loc[:, cols]
    # "education" is dropped as redundant given "education_num";
    # "race" is dropped because it has been replaced by "binarized_race".
    .drop(columns=['education', 'race'])
)
# Caveat from the original author: treating education as a numerical value can
# induce bias, for example more false positives for people with a higher education level.

In [156]:
# ' Wife' and ' Husband' are merged into a single ' Married' value so that the
# relationship column does not serve as a proxy for sex; other values are kept.
is_spouse = combined_data['relationship'].isin([' Wife', ' Husband'])
combined_data["relationship"] = combined_data['relationship'].mask(is_spouse, ' Married')

In [157]:
# As in the FiFAR dataset generation, the dataset configuration (later written
# to dataset_cfg.yaml) is assembled here.

# Columns treated as categorical variables.
categorical_columns = [
    "workclass",
    "marital_status",
    "occupation",
    "relationship",
    "binarized_race",
    "sex",
    "native_country",
]

config = {
    # Column roles; protected attributes and the label are added to this dict later.
    "data_cols": {"categorical": categorical_columns},
    # For each categorical column, the list of values it takes in combined_data,
    # in order of first appearance.
    "categorical_dict": {
        name: combined_data[name].unique().tolist() for name in categorical_columns
    },
}

In [158]:
# Protected attributes: two categorical ones identified by their protected class
# value, and a numeric one (age) described by a threshold of 50.
# NOTE(review): presumably "higher" means ages above the threshold form the
# protected group - confirm against the code that consumes this config.
sex_attribute = dict(feature="sex", type="categorical", protected_class=" Female")
race_attribute = dict(feature="binarized_race", type="categorical", protected_class="Non-White")
age_attribute = dict(feature="age", type="numeric", protected_threshold=50, protected_class="higher")

config["data_cols"].update(
    protected=[sex_attribute, race_attribute, age_attribute],
    # Name of the (binarized) target column.
    label="income",
)

In [159]:
import yaml

# Write the configuration with safe_dump: it only emits standard YAML tags, so
# the file can be read back with yaml.safe_load. If a non-plain object (e.g. a
# numpy scalar) ever ends up in config, this raises instead of silently writing
# a python-specific tag. For the plain str/int/list/dict values built above the
# output is the same as with yaml.dump.
with open('dataset_cfg.yaml', 'w') as file:
    yaml.safe_dump(config, file)

#Finally, we save the dataset as a parquet file
combined_data.to_parquet('preprocessed.parquet')