# Lending Club Loan Data - ETL Pipeline

In [17]:
import sqlite3

import pandas as pd
import numpy as np
from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import *

## Part 2. Data Pipeline Engineering

In this section I will create an ETL pipeline using an SQLite database and the SQLAlchemy library to define a data model, write queries and manipulate the SQL database using python.

### 2.1 Creating a database

I will use an SQLite database that comes pre-installed with python. The first step is to create a new database from the command line and then connecting to it. 



`$ touch loans.db`

In [29]:
engine = create_engine('sqlite:///loans.db')

### 2.2 Defining a data model / schema

I will start by defining a data schema. I will create a table in my database using the data types I determined in my exploratory analysis (see `Loan Data - EDA.ipynb` notebook), denoting `loan_id` as the primary key column.

In [30]:
Base = declarative_base()


In [27]:
class Loans(Base):
    __tablename__ = "loans"
    loan_id = Column(String, primary_key=True)
    member_id = Column(String)
    term = Column(String)
    grade = Column(String)
    url = Column(String)
    sub_grade = Column(String)
    emp_title = Column(String)
    emp_length = Column(String)
    home_ownership = Column(String)
    verification_status = Column(String)
    loan_status = Column(String)
    pymnt_plan = Column(String)
    desc = Column(String)
    purpose = Column(String)
    title = Column(String)
    zip_code = Column(String)
    addr_state = Column(String)
    earliest_cr_line = Column(String)
    initial_list_status = Column(String)
    application_type = Column(String)
    verification_status_joint = Column(String)
    sec_app_earliest_cr_line = Column(String)
    hardship_flag = Column(String)
    hardship_type = Column(String)
    hardship_reason = Column(String)
    hardship_status = Column(String)
    hardship_loan_status = Column(String)
    disbursement_method = Column(String)
    debt_settlement_flag = Column(String)
    settlement_status = Column(String)
    policy_code = Column(Integer)
    loan_amnt = Column(Float)
    funded_amnt = Column(Float)
    revol_bal = Column(Float)
    funded_amnt_inv = Column(Float)
    int_rate = Column(Float)
    installment = Column(Float)
    annual_inc = Column(Float)
    dti = Column(Float)
    delinq_2yrs = Column(Float)
    inq_last_6mths = Column(Float)
    mths_since_last_delinq = Column(Float)
    mths_since_last_record = Column(Float)
    open_acc = Column(Float)
    pub_rec = Column(Float)
    revol_util = Column(Float)
    total_acc = Column(Float)
    out_prncp = Column(Float)
    out_prncp_inv = Column(Float)
    total_pymnt = Column(Float)
    total_pymnt_inv = Column(Float)
    total_rec_prncp = Column(Float)
    total_rec_int = Column(Float)
    total_rec_late_fee = Column(Float)
    recoveries = Column(Float)
    collection_recovery_fee = Column(Float)
    last_pymnt_amnt = Column(Float)
    collections_12_mths_ex_med = Column(Float)
    mths_since_last_major_derog = Column(Float)
    annual_inc_joint = Column(Float)
    dti_joint = Column(Float)
    acc_now_delinq = Column(Float)
    tot_coll_amt = Column(Float)
    tot_cur_bal = Column(Float)
    open_acc_6m = Column(Float)
    open_act_il = Column(Float)
    open_il_12m = Column(Float)
    open_il_24m = Column(Float)
    mths_since_rcnt_il = Column(Float)
    total_bal_il = Column(Float)
    il_util = Column(Float)
    open_rv_12m = Column(Float)
    open_rv_24m = Column(Float)
    max_bal_bc = Column(Float)
    all_util = Column(Float)
    total_rev_hi_lim = Column(Float)
    inq_fi = Column(Float)
    total_cu_tl = Column(Float)
    inq_last_12m = Column(Float)
    acc_open_past_24mths = Column(Float)
    avg_cur_bal = Column(Float)
    bc_open_to_buy = Column(Float)
    bc_util = Column(Float)
    chargeoff_within_12_mths = Column(Float)
    delinq_amnt = Column(Float)
    mo_sin_old_il_acct = Column(Float)
    mo_sin_old_rev_tl_op = Column(Float)
    mo_sin_rcnt_rev_tl_op = Column(Float)
    mo_sin_rcnt_tl = Column(Float)
    mort_acc = Column(Float)
    mths_since_recent_bc = Column(Float)
    mths_since_recent_bc_dlq = Column(Float)
    mths_since_recent_inq = Column(Float)
    mths_since_recent_revol_delinq = Column(Float)
    num_accts_ever_120_pd = Column(Float)
    num_actv_bc_tl = Column(Float)
    num_actv_rev_tl = Column(Float)
    num_bc_sats = Column(Float)
    num_bc_tl = Column(Float)
    num_il_tl = Column(Float)
    num_op_rev_tl = Column(Float)
    num_rev_accts = Column(Float)
    num_rev_tl_bal_gt_0 = Column(Float)
    num_sats = Column(Float)
    num_tl_120dpd_2m = Column(Float)
    num_tl_30dpd = Column(Float)
    num_tl_90g_dpd_24m = Column(Float)
    num_tl_op_past_12m = Column(Float)
    pct_tl_nvr_dlq = Column(Float)
    percent_bc_gt_75 = Column(Float)
    pub_rec_bankruptcies = Column(Float)
    tax_liens = Column(Float)
    tot_hi_cred_lim = Column(Float)
    total_bal_ex_mort = Column(Float)
    total_bc_limit = Column(Float)
    total_il_high_credit_limit = Column(Float)
    revol_bal_joint = Column(Float)
    sec_app_inq_last_6mths = Column(Float)
    sec_app_mort_acc = Column(Float)
    sec_app_open_acc = Column(Float)
    sec_app_revol_util = Column(Float)
    sec_app_open_act_il = Column(Float)
    sec_app_num_rev_accts = Column(Float)
    sec_app_chargeoff_within_12_mths = Column(Float)
    sec_app_collections_12_mths_ex_med = Column(Float)
    sec_app_mths_since_last_major_derog = Column(Float)
    deferral_term = Column(Float)
    hardship_amount = Column(Float)
    hardship_length = Column(Float)
    hardship_dpd = Column(Float)
    orig_projected_additional_accrued_interest = Column(Float)
    hardship_payoff_balance_amount = Column(Float)
    hardship_last_payment_amount = Column(Float)
    settlement_amount = Column(Float)
    settlement_percentage = Column(Float)
    settlement_term = Column(Float)
    issue_d = Column(DateTime)
    last_pymnt_d = Column(DateTime)
    next_pymnt_d = Column(DateTime)
    last_credit_pull_d = Column(DateTime)
    hardship_start_date = Column(DateTime)
    hardship_end_date = Column(DateTime)
    payment_plan_start_date = Column(DateTime)
    debt_settlement_flag_date = Column(DateTime)
    settlement_date = Column(DateTime)

In [31]:
# creating the loans table if it doesn't exist
Loans.__table__.create(bind=engine, checkfirst=True)

### 2.3 Extracting the data from a file

In [53]:
# loading the data from csv file using pandas
data = pd.read_csv('/Users/christina/Desktop/LC/data/loan.csv', low_memory=False)

### 2.4 Transforming the data 

Editing the primary key column.

In [54]:
# since the `id` column is not populated we will reset the index and set it as our unique indentifier (loan_id)
# (assuming that each row represents one loan and there are no duplicates)
data = data.reset_index().rename(columns={"index": "loan_id"})

# drop id since its empty
data = data.drop(columns='id')

In [55]:
data.head()

Unnamed: 0,loan_id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,0,,2500,2500,2500.0,36 months,13.56,84.92,C,C1,...,,,Cash,N,,,,,,
1,1,,30000,30000,30000.0,60 months,18.94,777.23,D,D2,...,,,Cash,N,,,,,,
2,2,,5000,5000,5000.0,36 months,17.97,180.69,D,D1,...,,,Cash,N,,,,,,
3,3,,4000,4000,4000.0,36 months,18.94,146.51,D,D2,...,,,Cash,N,,,,,,
4,4,,30000,30000,30000.0,60 months,16.14,731.78,C,C4,...,,,Cash,N,,,,,,


In [56]:
# converting each row of the pandas dataframe to a dictionary
loans = []

for i, row in data.iterrows():
    loans.append(row.to_dict())

In [57]:
loans[0]

{'loan_id': 0,
 'member_id': nan,
 'loan_amnt': 2500,
 'funded_amnt': 2500,
 'funded_amnt_inv': 2500.0,
 'term': ' 36 months',
 'int_rate': 13.56,
 'installment': 84.92,
 'grade': 'C',
 'sub_grade': 'C1',
 'emp_title': 'Chef',
 'emp_length': '10+ years',
 'home_ownership': 'RENT',
 'annual_inc': 55000.0,
 'verification_status': 'Not Verified',
 'issue_d': 'Dec-2018',
 'loan_status': 'Current',
 'pymnt_plan': 'n',
 'url': nan,
 'desc': nan,
 'purpose': 'debt_consolidation',
 'title': 'Debt consolidation',
 'zip_code': '109xx',
 'addr_state': 'NY',
 'dti': 18.24,
 'delinq_2yrs': 0.0,
 'earliest_cr_line': 'Apr-2001',
 'inq_last_6mths': 1.0,
 'mths_since_last_delinq': nan,
 'mths_since_last_record': 45.0,
 'open_acc': 9.0,
 'pub_rec': 1.0,
 'revol_bal': 4341,
 'revol_util': 10.3,
 'total_acc': 34.0,
 'initial_list_status': 'w',
 'out_prncp': 2386.02,
 'out_prncp_inv': 2386.02,
 'total_pymnt': 167.02,
 'total_pymnt_inv': 167.02,
 'total_rec_prncp': 113.98,
 'total_rec_int': 53.04,
 'total_r

### 2.5 Loading the data into the database table

The data now is in a form that can be loaded into the database. 

In [None]:
Session = sessionmaker(bind=engine)
session = Session()

for loan in loans:
    row = Loans(**loan)
    session.add(row)
    
session.commit()