# Project 4 ETL Notebook

### Create Database Connection

#### Run

In [2]:
# imports
import sqlalchemy
import psycopg2
from sqlalchemy import create_engine
from sql_config import protocol, username, password, host, port, database_name
import pandas as pd
import os

# create db connection and engine
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)

#### Optional - Run to test the database connection

In [3]:
test_df = pd.read_sql_query('select * from proj4_sch.test_temp', con=engine)
test_df

Unnamed: 0,column1,column2


#### Read Data File
#### Run

In [4]:
import pandas as pd

# This is the data file.
file_one = os.path.join('.','archive', 'application_data.csv')

# read in the CSV file.
app_df = pd.read_csv(file_one, encoding="utf-8")
app_df.head()


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### Load the data file into your db instance
#### Run
Note: This may take a few minutes.  Three tests it ran in under 3 minutes.
The last test took over 12 minutes.

In [13]:
app_df.to_sql('app_data',con=engine,schema='proj4_sch',if_exists='replace',index=False)

511

#### Read the first 20 records
#### Run

In [5]:
pd.read_sql_query('select * from proj4_sch.app_data limit 20', con=engine)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,244150,0,1,1,0,1,0,225000.0,792477.0,23301.0,...,0,0,0,0,,,,,,
1,133904,0,1,1,0,1,0,180000.0,314100.0,15241.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,4.0
2,418636,0,1,2,0,0,0,67500.0,254700.0,25321.5,...,0,0,0,0,,,,,,
3,442110,0,1,2,1,0,2,202500.0,400500.0,19264.5,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
4,102595,0,1,1,0,1,0,90000.0,152820.0,8901.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
5,442044,0,1,1,0,0,0,63000.0,76410.0,7573.5,...,0,0,0,0,,,,,,
6,270868,0,1,1,0,0,0,157500.0,808650.0,26217.0,...,0,0,0,0,0.0,0.0,0.0,3.0,0.0,1.0
7,319155,0,1,1,0,1,0,180000.0,545040.0,25537.5,...,0,0,0,0,,,,,,
8,338130,0,2,1,1,1,0,82800.0,135000.0,6750.0,...,0,0,0,0,,,,,,
9,108578,1,1,1,0,0,0,112500.0,269550.0,24723.0,...,0,0,0,0,,,,,,


### Pause Here
### Run proj4.sql (see Readme.md)
### Run db_views.sql (see Readme.md)

#### Code to test database views
Note: Please create the views (db_views.sql) before running the below code.

In [9]:
# This is a test 
pd.read_sql_query('select * from proj4_sch.vw_target_with_ext limit 20', con=engine).head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### Code to test database views
Note: Please create the views (db_views.sql) before running the below code.

In [10]:
pd.read_sql_query('select * from proj4_sch.vw_target_with_ext', con=engine).head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,1129500.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,,,,,,
4,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### TEST CODE used for Transforming the db columns

In [None]:
# # This is the data file.
# file_one = os.path.join('.','archive', 'application_data.csv')

# # read in the CSV file.
# app_df = pd.read_csv(file_one, encoding="utf-8")
# app_df.head()

In [8]:
tmp_df = app_df
cols_df = pd.DataFrame(tmp_df.columns)
cols_df.reset_index()

# Save the file
file_col = os.path.join('.','archive', 'cols.csv')
cols_df.to_csv(file_col, index=False)

In [19]:
nm_cont_type = tmp_df['OCCUPATION_TYPE'].unique()
nm_cont_type

array(['Laborers', 'Core staff', 'Accountants', 'Managers', nan,
       'Drivers', 'Sales staff', 'Cleaning staff', 'Cooking staff',
       'Private service staff', 'Medicine staff', 'Security staff',
       'High skill tech staff', 'Waiters/barmen staff',
       'Low-skill Laborers', 'Realty agents', 'Secretaries', 'IT staff',
       'HR staff'], dtype=object)

In [20]:
tmp_cols_df = app_df
contract_type = tmp_cols_df['EMERGENCYSTATE_MODE'].unique()
contract_type

array(['No', nan, 'Yes'], dtype=object)

In [21]:
len(contract_type)
for i in range(len(contract_type)):
    col_type = f"update proj4_sch.app_data set \"EMERGENCYSTATE_MODE\" = {i+1} where \"EMERGENCYSTATE_MODE\" = '{contract_type[i]}';"
    c_type = col_type.replace('\'',"'")
    print(c_type)

update proj4_sch.app_data set "EMERGENCYSTATE_MODE" = 1 where "EMERGENCYSTATE_MODE" = 'No';
update proj4_sch.app_data set "EMERGENCYSTATE_MODE" = 2 where "EMERGENCYSTATE_MODE" = 'nan';
update proj4_sch.app_data set "EMERGENCYSTATE_MODE" = 3 where "EMERGENCYSTATE_MODE" = 'Yes';


#### Close DB Connection
#### Run when finished

In [22]:
engine.dispose()