In [None]:
import numpy as np
import pandas as pd
import math

In [None]:
##############################################################
# Data munging 


# hmda_init() – Read the data files and return a pointer or 
# object containing the expanded HMDA data
##############################################################
def hmda_init(loanfile=r"E:\JOB\Job_Application_Materials\Interview_by_company\Capital One\data-challenge-data-master\data_copy\2012_to_2014_loans_data.csv",
              institutionfile=r"E:\JOB\Job_Application_Materials\Interview_by_company\Capital One\data-challenge-data-master\data_copy\2012_to_2014_institutions_data.csv"):
    """
    Read the HMDA loans and institutions CSV files, fix column dtypes,
    left-join the institutions onto the loans and add a loan-amount bucket.

    Parameters
    ----------
    loanfile, string
        the location path of the loans csv file. The default is a
        machine-specific absolute path; pass your own location when the
        data lives elsewhere.

    institutionfile, string
        the location path of the institution csv file (same remark about
        the default as for ``loanfile``).

    Returns
    ----------
    DataFrame
        One row per loan, with the institution columns joined on
        (As_of_Year, Agency_Code, Respondent_ID) and an extra
        ``Loan_Amount_Bucket`` column: ``Loan_Amount_000`` divided by 100
        and rounded up, with every bucket above 10 collapsed into 11.
    """

    def toObj(s, data):
        """
        Cast the columns listed in ``s`` to object dtype, modifying
        ``data`` in place.

        Parameters
        ----------
        s, list
            list of column names

        data, DataFrame
            the dataset to modify
        """
        for col in s:
            data[col] = data[col].astype(object)

    def objToFloat(s, data):
        """
        Cast the columns listed in ``s`` to float, modifying ``data`` in
        place. String cells that equal 'NA' once surrounding whitespace is
        stripped (e.g. 'NA  ') become np.nan first; because np.nan is a
        float, the resulting columns are float rather than int.

        Parameters
        ----------
        s, list
            list of column names

        data, DataFrame
            the dataset to modify
        """
        for col in s:
            cleaned = data[col].apply(
                lambda x: np.nan if isinstance(x, str) and x.strip() == 'NA' else x)
            data[col] = cleaned.astype(float)

    ##############################################################
    # Read the csv files and fix up the column formats
    ##############################################################
    loans = pd.read_csv(loanfile,
                        header=0,
                        low_memory=False)

    # identifier / descriptive columns are stored as objects
    toObj(s=['Agency_Code',
             'Census_Tract_Number',
             'County_Code',
             'MSA_MD',
             'Respondent_ID',
             'Sequence_Number',
             'State_Code',
             'MSA_MD_Description',
             'Loan_Purpose_Description',
             'Agency_Code_Description',
             'Lien_Status_Description',
             'Loan_Type_Description',
             'State',
             'County_Name',
             'Conventional_Status',
             'Conforming_Status',
             'Conventional_Conforming_Flag'],
          data=loans)

    # numeric columns that may carry 'NA' placeholders
    objToFloat(s=['Applicant_Income_000',
                  'FFIEC_Median_Family_Income',
                  'Number_of_Owner_Occupied_Units',
                  'Tract_to_MSA_MD_Income_Pct'],
               data=loans)

    institution = pd.read_csv(institutionfile,
                              header=0,
                              low_memory=False)

    # Agency_Code must have the same dtype on both sides of the join
    toObj(s=['Agency_Code'], data=institution)

    ##############################################################
    # Join loan data and institution data
    ##############################################################
    # join key: As_of_Year + Agency_Code + Respondent_ID
    df = loans.merge(right=institution, how='left',
                     on=['As_of_Year', 'Agency_Code', 'Respondent_ID'])

    ##############################################################
    # Bucket loan amount
    ##############################################################
    # Round up to the next multiple of 100k. np.ceil is used rather than
    # (x + 0.5).round() because numpy rounds halves to the nearest even
    # number, which would put exact multiples of 100 in inconsistent buckets.
    # Loans above 1000k share bucket 11; only the bucket column is capped,
    # every other column of those rows is left untouched.
    df['Loan_Amount_Bucket'] = np.ceil(df['Loan_Amount_000'] / 100).clip(upper=11)

    return df

In [None]:
# Read the csv files into a DataFrame using hmda_init's default file locations
df_hmda = hmda_init()
# head() is not the last expression of the cell, so it is displayed explicitly;
# a bare call here would have its output discarded by the notebook.
display(df_hmda.head())
df_hmda.info()

In [None]:
##############################################################
# Quality check 

# Data cleaning
##############################################################
 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Remove fully duplicated rows, retaining the first occurrence of each
df_hmda1 = df_hmda.drop_duplicates()

In [None]:
# Print the column dtypes and non-null counts of the de-duplicated frame
df_hmda1.info()

In [None]:
# Missing values per column

# Percentage of NaN values in each column. Taking the mean of the boolean
# mask divides by the frame's actual number of rows, so the result stays
# correct if the row count changes (no hard-coded total).
sr = 100 * df_hmda1.isnull().mean()
sr.sort_values(ascending=False)

# Observation from the original run: about 8.96% of the applicant income
# values are missing

In [None]:
# Summary statistics (with extra percentiles) for every numeric column,
# shown with one row per variable
(
    df_hmda1
    .describe(include=[np.number],
              percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
    .T
)

# The minimum values of As_of_Year, Applicant_Income_000, Loan_Amount_000 and
# Conforming_Limit_000 look unreasonable; these variables are checked next.

In [None]:
##################################
# Check the values of As_of_Year
##################################

# relative frequency of each year, most frequent first (NaN excluded)
df_hmda1['As_of_Year'].value_counts(normalize=True)

In [None]:
# Inspect the rows whose As_of_Year equals 11

df_hmda1.loc[df_hmda1['As_of_Year'] == 11]

In [None]:
# Remove the rows whose As_of_Year equals 11. The rows to keep are selected
# into a fresh copy instead of dropping from the existing frame in place.
# NOTE(review): 11 is also the value hmda_init writes for its top loan-amount
# bucket -- confirm whether these rows originate there before discarding them.
df_hmda1 = df_hmda1[df_hmda1['As_of_Year'] != 11].copy()

In [None]:
# Re-check the relative frequency of each As_of_Year value after the cleanup
df_hmda1['As_of_Year'].value_counts(normalize=True)