<a href="https://colab.research.google.com/github/cweidig/measuring_bias/blob/main/measure_bias_preprocess_1load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# read the file mortgage_data_project.pkl from Google drive with pandas' read_pickle()
import pandas as pd
import seaborn as sns
import numpy as np

# Load the datset into a pandas dataframe
from joblib import load

df = load("/content/drive/My Drive/mortgage_data_preprocessed.pkl.gz")

In [5]:
# (i) analyzing the number of rows and columns

df.shape

(165950, 36)

In [None]:
# (ii) visualizing the first few lines of the dataset
df.head(10)

In [11]:
# (iii) listing the names of each variable and type

df.columns
df.dtypes

loan_amount_000s                                                                      int64
applicant_income_000s                                                               float64
population                                                                          float64
minority_population                                                                 float64
hud_median_family_income                                                            float64
tract_to_msamd_income                                                               float64
number_of_owner_occupied_units                                                      float64
number_of_1_to_4_family_units                                                       float64
applicant_ethnicity_name_Hispanic or Latino                                         float64
applicant_ethnicity_name_Not Hispanic or Latino                                     float64
applicant_race_name_1_American Indian or Alaska Native                          

In [10]:
# remove redundent vars
keep_vars = ['agency_abbr', 'loan_type_name', 'loan_amount_000s', 'owner_occupancy_name', 'loan_purpose_name', 'property_type_name', 'applicant_ethnicity_name', 'applicant_race_name_1', 'applicant_sex_name', 'applicant_income_000s', 'population', 'minority_population', 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units', 'number_of_1_to_4_family_units', 'action_taken_name']

keep_vars_df = df[keep_vars]

keep_vars_df.head()

KeyError: "['agency_abbr', 'loan_type_name', 'owner_occupancy_name', 'loan_purpose_name', 'property_type_name', 'applicant_ethnicity_name', 'applicant_race_name_1', 'applicant_sex_name'] not in index"

In [None]:
# categorical variables
cat_variables = ['applicant_ethnicity_name', 'applicant_race_name_1', 'applicant_sex_name', 'agency_abbr',
                                 'owner_occupancy_name', 'property_type_name', 'loan_purpose_name', 'loan_type_name']

### Pre-processing
# Mapping categorical variables to one-hot encoding
df_cat = pd.DataFrame(index=df.index)

# one-hot encoding of categorical variables
from sklearn.preprocessing import OneHotEncoder

# I will do a loop for pedagogical reasons, but it is not entirely necessary
for cat in cat_variables:
    # one-hot encoding fitting
    one_hot_func = OneHotEncoder().fit(df[[cat]])

    # mapping
    cat_mapped = one_hot_func.transform(df[[cat]]).toarray()

    # storing
    for (k, cat_label) in enumerate(one_hot_func.categories_[0]):
        df_cat[cat + "_" + cat_label] = cat_mapped[:, k]

assert(df_cat.shape == (165950,27))

In [None]:
# other integer variables
int_variables = ['loan_amount_000s', 'applicant_income_000s', 'population', 'minority_population',
                 'hud_median_family_income', 'tract_to_msamd_income', 'number_of_owner_occupied_units',
                 'number_of_1_to_4_family_units']


# target variable
output_variable = ['action_taken_name']

# consolidating a final dataset
df_final = pd.concat([df[int_variables], df_cat, (df[output_variable] == "Application denied by financial institution").copy()], axis=1)

assert(df_final.shape == (165950,36))

In [None]:
# Store the dataset in pickled dictionary
df_final.to_pickle("/content/drive/My Drive/mortgage_data_preprocessed.pkl.gz")