# License

Copyright 2019 Navdeep Gill, Patrick Hall, Kim Montgomery, Nick Schmidt

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

**DISCLAIMER**: This notebook is not legal compliance advice.

In [1]:
"""
Load the training and test data
"""

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Directory that holds the HMDA CSV files
main = '~/article-information-2019/data/output/'

# Names of the training and test files inside that directory
train_filename = 'hmda_train.csv'
test_filename = 'hmda_test.csv'

# Build the full paths, then read both datasets into DataFrames
train_path = main + train_filename
test_path = main + test_filename
TRAIN = pd.read_csv(train_path)
TEST = pd.read_csv(test_path)

# Record the columns present in the raw training data and the target name
target_column = 'high_priced'
training_columns = TRAIN.columns.tolist()

In [2]:

# Convert debt_to_income_ratio from bucket labels / numeric strings to numeric values

import numpy as np
import statistics 

def transform_debt_to_income_ratio(x):
    """Convert a raw debt-to-income entry to a numeric value where possible.

    Known bucket labels (for example ``'30%-<36%'``) are replaced by a single
    representative number. Any other value is passed through ``int()``; if
    that conversion is not possible (NaN, ``None``, non-numeric text), the
    value is returned unchanged so that missing entries stay missing.

    Parameters
    ----------
    x : object
        A single cell value from the ``debt_to_income_ratio`` column.

    Returns
    -------
    object
        The representative integer for a known bucket, ``int(x)`` when ``x``
        is convertible, otherwise ``x`` itself.
    """
    # Representative value assigned to each bucket label.
    bucket_values = {
        '30%-<36%': 33,
        '20%-<30%': 25,
        '50%-60%': 55,
        '<20%': 10,
        '>60%': 80,
    }

    # Only strings can be bucket labels; the isinstance guard also keeps
    # unhashable inputs away from the dict lookup.
    if isinstance(x, str) and x in bucket_values:
        return bucket_values[x]

    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # int() raises ValueError for NaN and non-numeric text, TypeError for
        # None, and OverflowError for infinities. Leave such values untouched;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return x

def isnan(x):
    """Return the result of ``np.isnan`` for ``x`` (true when ``x`` is NaN)."""
    is_missing = np.isnan(x)
    return is_missing
        
def replace_nan(x, med):
    """Return ``med`` when ``x`` is NaN; otherwise return ``x`` unchanged."""
    return med if np.isnan(x) else x

# Convert the debt-to-income ratio bucket labels / numeric strings to numbers
TRAIN["debt_to_income_ratio"] = TRAIN["debt_to_income_ratio"].apply(transform_debt_to_income_ratio)
TEST["debt_to_income_ratio"] = TEST["debt_to_income_ratio"].apply(transform_debt_to_income_ratio)

# Median of the observed training values, used for imputation below.
# Series.median() skips NaN; statistics.median() sorts the data and therefore
# gives an undefined, order-dependent result when NaN values are present.
median = TRAIN["debt_to_income_ratio"].median()

# Create a flag for missing values (must run before the values are imputed)
TRAIN["debt_to_income_ratio_missing"] = TRAIN["debt_to_income_ratio"].isna()
TEST["debt_to_income_ratio_missing"] = TEST["debt_to_income_ratio"].isna()

# Replace missing values in both datasets with the training median
TRAIN["debt_to_income_ratio"] = TRAIN["debt_to_income_ratio"].fillna(median)
TEST["debt_to_income_ratio"] = TEST["debt_to_income_ratio"].fillna(median)

In [3]:
# Derive the loan-to-value ratio as loan_amount divided by property_value

TRAIN["loan_to_value_ratio"] = TRAIN["loan_amount"].div(TRAIN["property_value"])
TEST["loan_to_value_ratio"] = TEST["loan_amount"].div(TEST["property_value"])

In [4]:
# Standardize some of the features

from sklearn.preprocessing import StandardScaler

# Columns to standardize; each is replaced by a "<name>_std" column
features = [
    "loan_amount",
    "loan_to_value_ratio",
    "no_intro_rate_period",
    "intro_rate_period",
    "property_value",
    "income",
    "debt_to_income_ratio",
]
std_columns = [item + "_std" for item in features]

# The scaler is fitted on the training data only and then applied to both
# datasets, so the test set is scaled with the training statistics.
scaler = StandardScaler()
scaler.fit(TRAIN[features])

# scaler.transform returns a plain array, so the source frame's index is passed
# explicitly. pd.concat(axis=1) aligns on index, and a fresh default index
# would misalign rows if TRAIN/TEST ever carried a non-default index.
std_features_TRAIN = pd.DataFrame(
    scaler.transform(TRAIN[features]), columns=std_columns, index=TRAIN.index
)
std_features_TEST = pd.DataFrame(
    scaler.transform(TEST[features]), columns=std_columns, index=TEST.index
)

TRAIN = pd.concat([TRAIN, std_features_TRAIN], axis=1)
TEST = pd.concat([TEST, std_features_TEST], axis=1)

# Drop the unscaled originals now that the standardized copies are attached
TRAIN = TRAIN.drop(features, axis=1)
TEST = TEST.drop(features, axis=1)

In [5]:
# Write the processed training and test sets as CSV files, without the index

train_output_path = main + 'hmda_train_processed.csv'
test_output_path = main + 'hmda_test_processed.csv'
TRAIN.to_csv(train_output_path, index=False)
TEST.to_csv(test_output_path, index=False)