### Saving the work


*Purpose: This notebook breaks down step-by-step a simple wrangling approach to creating a training data of 50,000 records (data is already pre-recorded randomly in original raw csv) for your single year (i.e. 2013, 2015, and 2017)*.

Documentation:  
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_sql.html  
https://stackoverflow.com/questions/23103962/how-to-write-dataframe-to-postgres-table                    
https://github.com/metabase/metabase/issues/7214  
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html

In [1]:
# Importing Libraries.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

import os
import psycopg2
import pandas.io.sql as psql
import sqlalchemy
from sqlalchemy import create_engine

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from pylab import*
from matplotlib.ticker import LogLocator

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [13]:
# Creating a new engine to specify the "reporting_user" permissions
postgres_host = 'REDACT'  
postgres_port = '5432' 
postgres_username = 'reporting_user'
postgres_password = 'REDACT'
postgres_dbname = "paddle_loan_canoe"
postgres_str = ('postgresql://{username}:{password}@{host}:{port}/{dbname}'
                .format(username = postgres_username,
                        password = postgres_password,
                        host = postgres_host,
                        port = postgres_port,
                        dbname = postgres_dbname)
               )

# Creating another engine that connects to teammate's AWS
postgres_host2 = 'database-1.cogr19hev0gd.us-east-2.rds.amazonaws.com'  
postgres_port2 = '5432' 
postgres_username2 = 'REDACT' 
postgres_password2 = 'REDACT'
postgres_dbname2 = "postgres"
postgres_str2 = ('postgresql://{username}:{password}@{host}:{port}/{dbname}'
                .format(username = postgres_username2,
                        password = postgres_password2,
                        host = postgres_host2,
                        port = postgres_port2,
                        dbname = postgres_dbname2)
               )


# Creating the connection.
cnx = create_engine(postgres_str)


# Creating the connections.
cnx2 = create_engine(postgres_str2)

In [4]:
#  Reading YOUR HMDA YEAR (this case below is for 2013 -- a single year) dataset; join population and education datasets appropriately for YOUR YEAR 
#  for the first 25,000 rows -- as a dataframe using pandas: df.

df = pd.read_sql_query ('''SELECT
                              --> a. main: casting a few key MORTGAGE data fields:
                                   CAST(us13.action_taken_name As varchar(56)) As outcome, us13.as_of_year As year,
                                   CAST(denial_reason_name_1 As varchar(56)) dn_reason1 , CAST(us13.agency_name As varchar(56)) As agency,
                                   CAST(us13.state_name As varchar(28)) As state,         CAST(us13.county_name As varchar(56)) As county,
                                   CAST(us13.loan_type_name As varchar(56)) As ln_type,   CAST(us13.loan_purpose_name As varchar(56)) As ln_purp,
                                   us13.loan_amount_000s As ln_amt_000s, us13.hud_median_family_income As hud_med_fm_inc, population as pop,

                                       --two embedded fuctions and one CASE below: assigns hierarchy in CASE, and converts to num in two steps
                                   CAST ( CAST ( CASE
                                                     WHEN us13.rate_spread = '' Then '0'
                                                     ELSE us13.rate_spread
                                                 END As varchar(5)
                                               ) As numeric
                                        )
                                   As rt_spread,
                                       --categorize loan application outcome into two buckets: "Approved", "Denied, Not Accepted, or Withdrawn"
                                   CASE
                                       WHEN us13.action_taken_name In ('Loan originated', 'Loan purchased by the institution')
                                           THEN 'Approved or Loan Purchased by the Institution'
                                       ELSE 'Denied, Not Accepted, or Withdrawn'
                                   END outcome_bucket,
                              --*
                              --> b. macro-econ: casting and joining a few key EDUCATION data fields:
                                   CAST(educ17."Perc_adults w_less than a HS diploma_2013-17" As int)  As prc_blw_HS__2013_17_5yrAvg,
                                   CAST(educ17."Perc_adults w_ HS diploma only_2013-17" As int)        As prc_HS__2013_17_5yrAvg,
                                   CAST(educ17."Perc_adults w_BA deg or higher_2013-17" As int)        As prc_BA_plus__2013_17_5yrAvg,
                              --*
                              --> c. macro-econ: casting and joining a few key POPULATION data fields:
                                   CAST(pop13.r_birth_2013 AS INT)                                     As r_birth_2013,
                                   CAST(pop13.r_international_mig_2013 AS INT)                         As r_intl_mig_2013,
                                   CAST(pop13.r_natural_inc_2013 AS INT)                               As r_natural_inc_2013
                              --*
                           FROM usa_mortgage_market.hmda_lar_2013 us13
                           LEFT OUTER JOIN v__macro_economic_indicators.education__acs_1970_to_2017_5yravgs educ17
                                   ON us13.county_name = educ17."Area name"
                           LEFT OUTER JOIN v__macro_economic_indicators.populationestimates__usda_ers_2010_to_2018 pop13
                                   ON us13.county_name = pop13.area_name
                           LIMIT 25000''', cnx)

# Using pandas to view the first 5 rows (NB: why does it start at 0?).
df.head(5)

Unnamed: 0,outcome,year,dn_reason1,agency,state,county,ln_type,ln_purp,ln_amt_000s,hud_med_fm_inc,pop,rt_spread,outcome_bucket,prc_blw_hs__2013_17_5yravg,prc_hs__2013_17_5yravg,prc_ba_plus__2013_17_5yravg,r_birth_2013,r_intl_mig_2013,r_natural_inc_2013
0,Application denied by financial institution,2013,Collateral,Department of Housing and Urban Development,Michigan,Oakland County,FHA-insured,Refinancing,83,72400,3094,0.0,"Denied, Not Accepted, or Withdrawn",6,19,46,,,
1,Loan purchased by the institution,2013,,Consumer Financial Protection Bureau,Wyoming,Laramie County,VA-guaranteed,Home purchase,267,66300,4182,0.0,Approved or Loan Purchased by the Institution,7,25,29,,,
2,File closed for incompleteness,2013,,Department of Housing and Urban Development,Minnesota,Hennepin County,FHA-insured,Home purchase,88,82300,2940,0.0,"Denied, Not Accepted, or Withdrawn",7,17,48,,,
3,Loan purchased by the institution,2013,,Consumer Financial Protection Bureau,Ohio,Franklin County,FHA-insured,Home purchase,78,67900,4918,0.0,Approved or Loan Purchased by the Institution,26,27,16,9.0,0.0,0.0
4,Loan purchased by the institution,2013,,Consumer Financial Protection Bureau,Ohio,Franklin County,FHA-insured,Home purchase,78,67900,4918,0.0,Approved or Loan Purchased by the Institution,14,33,21,9.0,0.0,0.0


In [5]:
# Using PostgreSQL to count and group by the merged "r_" variables to see null values with "" appearing as INT
df_test = pd.read_sql_query ('''WITH count_r_vars AS 
                                ( SELECT 
                              
                                   CAST(us13.action_taken_name As varchar(56)) As outcome, us13.as_of_year As year,
                                   CAST(denial_reason_name_1 As varchar(56)) dn_reason1 , CAST(us13.agency_name As varchar(56)) As agency,
                                   CAST(us13.state_name As varchar(28)) As state,         CAST(us13.county_name As varchar(56)) As county,
                                   CAST(educ17."Perc_adults w_less than a HS diploma_2013-17" As int)  As prc_blw_HS__2013_17_5yrAvg,
                                   CAST(educ17."Perc_adults w_ HS diploma only_2013-17" As int)        As prc_HS__2013_17_5yrAvg,
                                   CAST(educ17."Perc_adults w_BA deg or higher_2013-17" As int)        As prc_BA_plus__2013_17_5yrAvg,
                                   CAST(pop13.r_birth_2013 AS INT)                                     As r_birth_2013,
                                   CAST(pop13.r_international_mig_2013 AS INT)                         As r_intl_mig_2013,
                                   CAST(pop13.r_natural_inc_2013 AS INT)                               As r_natural_inc_2013


                                   FROM usa_mortgage_market.hmda_lar_2013 us13
                                   LEFT OUTER JOIN v__macro_economic_indicators.education__acs_1970_to_2017_5yravgs educ17
                                           ON us13.county_name = educ17."Area name"
                                   LEFT OUTER JOIN v__macro_economic_indicators.populationestimates__usda_ers_2010_to_2018 pop13
                                           ON us13.county_name = pop13.area_name
                                   LIMIT 25000
                                ) 
                                SELECT 'r_birth_2013' As r_var__nm, COUNT(*) As null_counts FROM count_r_vars WHERE r_birth_2013 IS NULL
                                    UNION ALL
                                SELECT 'r_intl_mig_2013' As r_var__nm, COUNT(*) As null_counts  FROM count_r_vars WHERE r_birth_2013 IS NULL
                                    UNION ALL
                                SELECT 'r_nat_inc_2013' As r_var_nm, COUNT(*) As null_counts FROM count_r_vars WHERE r_natural_inc_2013 IS NULL
                                '''           
                             , cnx)
df_test.head()

Unnamed: 0,r_var__nm,null_counts
0,r_birth_2013,665
1,r_intl_mig_2013,665
2,r_nat_inc_2013,665


---

In [6]:
# Limit your prelimary analysis to just loan applications for $700K or less

df2 = df[df.ln_amt_000s < 700]

Note (For the years from 2013-2017): Since the ``` > $100K``` median household incomes appear to be outliers, we'll replace them with ```= $91K```, since it is the top of the upper wishker and therefore falls within the last quartile. Note, this is for preliminary modeling only and could not "standardized" so simply in our final model.

In [7]:
df3 = df2.dropna(subset=['r_natural_inc_2013', 'r_birth_2013', 'r_intl_mig_2013'])

In [8]:
df3_dtype = {'outcome': sqlalchemy.types.VARCHAR(length=56),        'year':  sqlalchemy.types.INTEGER(),
             'dn_reason1': sqlalchemy.types.VARCHAR(length=56),     'agency': sqlalchemy.types.VARCHAR(length=56), 
             'state': sqlalchemy.types.VARCHAR(length=28),          'county': sqlalchemy.types.VARCHAR(length=56), 
             'ln_type': sqlalchemy.types.VARCHAR(length=56),        'ln_purp': sqlalchemy.types.VARCHAR(length=56),
             'ln_amt_000s': sqlalchemy.types.INTEGER(),             'hud_med_fm_inc': sqlalchemy.types.INTEGER(),
             'pop': sqlalchemy.types.INTEGER(),                     'rt_spread': sqlalchemy.types.NUMERIC(),
             'outcome_bucket': sqlalchemy.types.VARCHAR(length=56), 'prc_blw_HS__2013_17_5yrAvg': sqlalchemy.types.INTEGER(),
             'prc_HS__2013_17_5yrAvg': sqlalchemy.types.INTEGER(),  'prc_BA_plus__2013_17_5yrAvg': sqlalchemy.types.INTEGER(),
             'r_birth_2013': sqlalchemy.types.INTEGER(),            'r_intl_mig_2013': sqlalchemy.types.INTEGER(),
             'r_natural_inc_2013': sqlalchemy.types.INTEGER()
            }

In [19]:
# Using pandas to write Dataframe to PostgreSQL and replacing table if it already exists
df3.to_sql(name='loans_2013__training', schema='aa_testing', chunksize=250,
           dtype= df3_dtype, method=None, con=cnx2, if_exists='replace', index=False)


In [25]:
# Check to see the training data has been written
df_check = psql.read_sql_query('Select * FROM aa_testing.loans_2013__training', cnx2)
df_check.info

<bound method DataFrame.info of                                      outcome  year dn_reason1  \
0          Loan purchased by the institution  2013              
1          Loan purchased by the institution  2013              
2          Loan purchased by the institution  2013              
3          Loan purchased by the institution  2013              
4          Loan purchased by the institution  2013              
5          Loan purchased by the institution  2013              
6          Loan purchased by the institution  2013              
7          Loan purchased by the institution  2013              
8          Loan purchased by the institution  2013              
9          Loan purchased by the institution  2013              
10         Loan purchased by the institution  2013              
11         Loan purchased by the institution  2013              
12         Loan purchased by the institution  2013              
13         Loan purchased by the institution  2013        