Before we look for loan biases in this housing data, we must import and clean the data set so that we can perform our analyses.
### Import Data

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import hashlib
from functools import reduce

# Read the expected column names from headers.txt: the name is taken from the
# second comma-separated field of every line in that file.
# The number of names is later used to validate the width of each data row.
columnNames = []
with open('headers.txt', 'r') as headerFile:
    columnNames.extend(fields[1] for fields in csv.reader(headerFile, delimiter=','))

numCols = len(columnNames)

In [29]:
# Filter hmda_lar.csv row by row, writing every row that passes the checks to
# valid_rows_sample_small.csv and keeping counters so that all rows can be
# accounted for afterwards.
# The header line is not treated specially: it goes through the same checks as
# the data rows, so when it passes it is counted in totalRows/keptRows and is
# written to the output file like any other row.
invalidCols = 0
duplicateRows = 0  # filled in later, once the kept rows have been de-duplicated
keptRows = 0
missingCols = 0
totalRows = 0

# A row is discarded once at least this many of its fields are empty
# (half of the column count, rounded down).
maxMissingFields = int(0.5 * numCols)

# newline='' is how the csv module expects its files to be opened; without it
# the writer can emit blank lines between rows on platforms that translate
# line endings.
with open('hmda_lar.csv', 'r', newline='') as dataFile, \
        open('valid_rows_sample_small.csv', 'w', newline='') as outFile:
    dataReader = csv.reader(dataFile, delimiter=',')
    outWriter = csv.writer(outFile, delimiter=',')
    for row in dataReader:
        totalRows += 1
        # Ignore rows with an incorrect number of columns
        if len(row) != numCols:
            invalidCols += 1
            continue
        # Count the empty fields in the row
        missingFields = sum(1 for field in row if field == "")  # do not change "" to ''
        if missingFields >= maxMissingFields:
            missingCols += 1
            continue
        keptRows += 1
        outWriter.writerow(row)
print("Dropped: %d Missing: %d   Kept: %d   Total: %d" % (invalidCols, missingCols,
                                                                             keptRows, totalRows))

# If we only drop duplicates that match on all fields these are the results.   
# Dropped:     Duplicates:     De-duplicated:     Total:    

Dropped: 0 Missing: 0   Kept: 439655   Total: 439655


In [30]:
# Load the filtered rows back into pandas. Every column is requested as text
# (dtype='unicode'); numeric conversion is done explicitly in a later cell.
# NOTE(review): `error_bad_lines` was superseded by `on_bad_lines` in newer
# pandas releases - confirm which pandas version this notebook targets.
df_dup = pd.read_csv(
    "valid_rows_sample_small.csv",
    sep=',',
    engine='python',
    error_bad_lines=False,
    dtype='unicode',
)

In [31]:
# Drop rows that are identical in every column, keeping the first occurrence.
# The source frame is df_dup, the frame loaded from valid_rows_sample_small.csv.
df_dedup = df_dup.drop_duplicates(keep='first')
# Number of rows removed by de-duplication
duplicateRows = df_dup.shape[0] - df_dedup.shape[0]
print("Duplicates: %d" % duplicateRows)

Duplicates: 0


In [32]:
# Guarantees all rows are accounted for after filtering data: every row read is
# either dropped for a wrong column count, dropped for missing fields, or kept.
# duplicateRows is deliberately left out of the sum: duplicates are found among
# the rows already counted in keptRows, so adding them would count them twice
# and make this check fail whenever any duplicate exists.
invalidCols + missingCols + keptRows == totalRows

True

The following dictionary contains suggested data types for the corresponding columns. Column headers that do not appear in it are best represented as strings.

In [33]:
# Suggested Python type for each listed column, keyed by column name.
# Insertion order is the order in which the columns are converted.
colToType = dict(
    tract_to_msamd_income=float,
    rate_spread=float,
    population=int,
    # NOTE(review): mapped to bool, but the conversion loop in this notebook only
    # handles int and float, so this column is left as read - confirm intent.
    minority_population=bool,
    number_of_owner_occupied_units=int,
    number_of_1_to_4_family_units=int,
    loan_amount_000s=float,
    hud_median_family_income=float,
    applicant_income_000s=float,
    sequence_number=int,
    census_tract_number=float,
    as_of_year=int,
    application_date_indicator=int,
)

In [34]:
def convertToBool(x):
    """Return True only when x equals the string 'True'; any other value gives False."""
    return bool(x == 'True')

In [35]:
# df_test is another name for df_dedup (no copy is made), so the column
# conversions below are visible through both names.
df_test = df_dedup
# Use Pandas drop_duplicates() as evidence that dataset is deduplicated
print("Deduplicated Valid Rows: %d\tFully Deduplicated: %r"
      % (len(df_test), len(df_test) == len(df_test.drop_duplicates())))
print("Columns: %d" % len(df_test.columns.values))

# Convert types of columns. Only int and float entries of colToType are
# converted; columns mapped to any other type are left as they were read.
for colName, colType in colToType.items():
    if colType == int:
        # Integers cannot hold NaN, so a missing value (the text 'nan' or a real
        # NaN) becomes 0 before the cast.
        df_test[colName] = df_test[colName].apply(
            lambda x: 0 if x == 'nan' or pd.isnull(x) else x).astype(int)
    elif colType == float:
        # The text 'nan' becomes a real NaN; everything else is cast as is.
        df_test[colName] = df_test[colName].apply(
            lambda x: float('nan') if x == 'nan' else x).astype(float)

Deduplicated Valid Rows: 439654	Fully Deduplicated: True
Columns: 47


Some fields may hold values of incompatible types. This can occur when no data is stored for a variable, when a user did not complete the course or course registration, or when a column contains multiple data types. For example, a string representation of an age cannot be compared to a number. If a user entered N/A or left a field blank, the value may be recorded inconsistently as NA, na, or NaN.

In [36]:
# Turn the textual placeholders for missing data ("nan" and "None") into a real
# NaN everywhere in the frame; df_test is modified in place.
df_test.replace(["nan", "None"], np.nan, inplace=True)

In [45]:
# This data set is specific to New York State, so there is no need to keep the
# state name and abbreviation (NY).
# DataFrame.drop returns a new frame rather than modifying df_test, so the
# result is assigned back; otherwise the two columns would never be removed.
df_test = df_test.drop(["state_name", "state_abbr"], axis=1)