In [105]:
import pandas as pd
import numpy as np

In [126]:
def preprocess_adult_data(filepath,
                          categorical_features,
                          label_name="income",
                          favorable_label=">50K",
                          protected_attribute_name='gender',
                          privileged_class='Male',
                          features_to_drop=["fnlwgt"],
                          missing_val="?"):
    """Read a CSV of records and return a cleaned, one-hot encoded DataFrame.

    Processing steps, in order:

    1. Read ``filepath`` with ``pd.read_csv``.
    2. Replace every occurrence of ``missing_val`` with NaN and drop each row
       that contains at least one NaN.
    3. Binarize ``label_name`` (1 where it equals ``favorable_label``, else 0)
       and ``protected_attribute_name`` (1 where it equals
       ``privileged_class``, else 0).
    4. Replace the ``age`` column with a categorical ``age_cat`` column built
       by ``pd.cut`` over fixed age buckets.
    5. One-hot encode every column listed in ``categorical_features``; the
       dummy columns are named ``<feature>_<value>`` and appended after the
       remaining columns, in the order the features are listed.
    6. Drop the columns listed in ``features_to_drop``.

    Parameters
    ----------
    filepath : path or buffer accepted by ``pd.read_csv``
        Location of the raw CSV. It must contain an ``age`` column plus the
        label, protected-attribute, categorical and to-be-dropped columns.
    categorical_features : list-like of str
        Columns to one-hot encode. May include ``"age_cat"``, which this
        function creates.
    label_name : str
        Name of the target column.
    favorable_label : object
        Value of the target column that is mapped to 1.
    protected_attribute_name : str
        Name of the protected-attribute column.
    privileged_class : object
        Value of the protected attribute that is mapped to 1.
    features_to_drop : list of str
        Columns removed at the very end. The default list is only read,
        never mutated.
    missing_val : object
        Placeholder that marks a missing value in the raw file.

    Returns
    -------
    pandas.DataFrame
        The processed frame. Its index is the original row index with the
        incomplete rows removed (it is not reset).

    Raises
    ------
    KeyError
        If a referenced column is not present in the data.
    """
    # pd.cut uses right-closed intervals by default: (1, 18], (18, 25], ...
    # Ages outside (1, 90] get no bucket (NaN) and therefore no dummy flag.
    age_buckets = [1, 18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 90]
    bin_labels = ["1-18", "18-25", "25-30", "30-35", "35-40", "40-45",
                  "45-50", "50-55", "55-60", "60-65", "65-90"]

    # Each step returns a new frame, so nothing is mutated in place. The
    # three ``assign`` calls are kept separate (and use callables) so every
    # step sees the result of the previous one, as sequential statements would.
    adult_df = (
        pd.read_csv(filepath)
        .replace(missing_val, np.nan)
        .dropna()
        .assign(**{label_name:
                   lambda df: (df[label_name] == favorable_label) * 1})
        .assign(**{protected_attribute_name:
                   lambda df: (df[protected_attribute_name] == privileged_class) * 1})
        .assign(age_cat=lambda df: pd.cut(df["age"], bins=age_buckets, labels=bin_labels))
        .drop(columns=["age"])
    )

    # One get_dummies call encodes all requested columns at once instead of
    # re-concatenating the whole frame once per feature. Column naming
    # (``<feature>_<value>``) and ordering match the per-feature approach.
    columns_to_encode = list(categorical_features)
    if columns_to_encode:
        adult_df = pd.get_dummies(adult_df, columns=columns_to_encode)

    return adult_df.drop(columns=features_to_drop)

### Adult Dataset
# Raw input file and the destination for the processed copy.
src_filepath = "../data/Adult Dataset/raw_adult.csv"
dest_filepath = "../data/Adult Dataset/processed_adult.csv"

# Columns to one-hot encode. "age_cat" is not in the raw file: it is the
# bucketed age column that preprocess_adult_data derives from "age".
categorical_features = [
    "age_cat",
    "workclass",
    "occupation",
    "education",
    "relationship",
    "race",
    "marital-status",
    "native-country",
]

adult_df = preprocess_adult_data(src_filepath, categorical_features)

# Persist the processed frame without writing the row index as a column.
adult_df.to_csv(dest_filepath, index=False)

# Last expression of the cell: shows the first rows as the cell output.
adult_df.head()



Unnamed: 0,educational-num,gender,capital-gain,capital-loss,hours-per-week,income,age_cat_1-18,age_cat_18-25,age_cat_25-30,age_cat_30-35,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,7,1,0,0,40,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,9,1,0,0,50,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,12,1,0,0,40,1,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,10,1,7688,0,40,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6,1,0,0,30,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
