<a href="https://colab.research.google.com/github/davisdw/Lending_Tree_Loan_Prediction_Analysis/blob/main/pyspark_data_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing & Exporting CSV Data

**Read the accepted and rejected loan CSV files from the Amazon S3 bucket**

**Remove unneeded columns from both tables**

**Review the dataset and perform data wrangling and cleaning**

**Merge two datasets together**

**Export the cleaned_df dataset back to the S3 bucket to prepare it for modeling, prediction, and visualization**

In [47]:
import boto3
import pandas as pd
from io import StringIO # uses this library for data conversion
import awsKeyConfig
import io
import numpy as np

# Build the S3 client with boto3, taking the AWS credential keys from the
# awsKeyConfig module.
aws_credentials = {
    "aws_access_key_id": awsKeyConfig.keyID,
    "aws_secret_access_key": awsKeyConfig.secretKey,
}

s3 = boto3.client("s3", region_name="us-east-1", **aws_credentials)

**Data Wrangling and Cleaning for Loan Accepted Dataset**

In [52]:
# Download the accepted-loans CSV object (Bucket name + Key <file_name>) from S3
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='accepted_2007_to_2018Q4.csv')

# Report the HTTP status of the download before the expensive read of the body.
# The message names get_object, which is the call actually made here.
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

# Read the whole object body and decode it to text
data = obj['Body'].read().decode('utf-8')

# Use StringIO to convert the string data to a file-like object
data_file = StringIO(data)

# Create a DataFrame from the CSV data.
# low_memory=False parses each column in one pass so it gets a single dtype.
# NOTE(review): the recorded output shows a warning raised on this read_csv call,
# presumably pandas' mixed-type DtypeWarning - confirm it disappears on re-run.
accepted_df = pd.read_csv(data_file, low_memory=False)

accepted_df.head()


Successful S3 put_object response. Status - 200


  accepted_df = pd.read_csv(data_file)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [59]:
# View the size of the accepted table as a (rows, columns) tuple
accepted_df.shape

(2260701, 151)

In [64]:
# Columns of the accepted table that are required for the analysis
accepted_columns = [
    "loan_amnt",
    "term",
    "int_rate",
    "installment",
    "grade",
    "sub_grade",
    "emp_title",
    "emp_length",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "issue_d",
    "loan_status",
    "purpose",
    "addr_state",
    "dti",
    "fico_range_low",
    "fico_range_high",
]

# Keep only those columns, in the order listed above
clean_accept_df = accepted_df[accepted_columns]

clean_accept_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,dti,fico_range_low,fico_range_high
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,debt_consolidation,PA,5.91,675.0,679.0
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,small_business,SD,16.06,715.0,719.0
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,home_improvement,IL,10.78,695.0,699.0
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,debt_consolidation,NJ,17.06,785.0,789.0
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,major_purchase,PA,25.37,695.0,699.0


In [63]:
# View the (rows, columns) shape of the dataset after removing the unneeded columns
clean_accept_df.shape

(2260701, 18)

In [62]:
# Print the data type of every column in the accepted table. The row/column
# display limits are lifted inside the context manager so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(clean_accept_df.dtypes)



loan_amnt              float64
term                    object
int_rate               float64
installment            float64
grade                   object
sub_grade               object
emp_title               object
emp_length              object
home_ownership          object
annual_inc             float64
verification_status     object
issue_d                 object
loan_status             object
purpose                 object
addr_state              object
dti                    float64
fico_range_low         float64
fico_range_high        float64
dtype: object


In [57]:
# Report, per column, how many distinct values it holds (missing values count as one value)
for column_name in clean_accept_df.columns:
    distinct_count = clean_accept_df[column_name].nunique(dropna=False)
    print(column_name, distinct_count)

loan_amnt 1573
term 3
int_rate 674
installment 93302
grade 8
sub_grade 36
emp_title 512695
emp_length 12
home_ownership 7
annual_inc 89369
verification_status 4
issue_d 140
loan_status 10
purpose 15
addr_state 52
dti 10846
fico_range_low 49
fico_range_high 49




**Data Wrangling and Cleaning for Loan Rejected Dataset**



In [71]:
# Download the rejected-loans CSV object from S3
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='rejected_2007_to_2018Q4.csv')

# Report the HTTP status of the download before the expensive read of the body.
# The message names get_object, which is the call actually made here.
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

# Read the whole object body and decode it to text
data = obj['Body'].read().decode('utf-8')

# Use StringIO to convert the string data to a file-like object
data_file = StringIO(data)

# Create a DataFrame from the CSV data
rejected_df = pd.read_csv(data_file)

rejected_df.head()

Successful S3 put_object response. Status - 200


Unnamed: 0,amt_requested,date,purpose,risk_score,dti,zip_code,state,employment_length,policy_code
0,1000.0,2007-05-26,Wedding Covered but No Honeymoon,693.0,10%,481xx,NM,4 years,0.0
1,1000.0,2007-05-26,Consolidating Debt,703.0,10%,010xx,MA,< 1 year,0.0
2,11000.0,2007-05-27,Want to consolidate my debt,715.0,10%,212xx,MD,1 year,0.0
3,6000.0,2007-05-27,waksman,698.0,38.64%,017xx,MA,< 1 year,0.0
4,1500.0,2007-05-27,mdrigo,509.0,9.43%,209xx,MD,< 1 year,0.0


In [72]:
# View the size of the rejected table as a (rows, columns) tuple
rejected_df.shape

(27648741, 9)

In [73]:
# Build the cleaned rejected table by removing the policy_code column
clean_reject_df = rejected_df.drop('policy_code', axis='columns')

clean_reject_df.head()

Unnamed: 0,amt_requested,date,purpose,risk_score,dti,zip_code,state,employment_length
0,1000.0,2007-05-26,Wedding Covered but No Honeymoon,693.0,10%,481xx,NM,4 years
1,1000.0,2007-05-26,Consolidating Debt,703.0,10%,010xx,MA,< 1 year
2,11000.0,2007-05-27,Want to consolidate my debt,715.0,10%,212xx,MD,1 year
3,6000.0,2007-05-27,waksman,698.0,38.64%,017xx,MA,< 1 year
4,1500.0,2007-05-27,mdrigo,509.0,9.43%,209xx,MD,< 1 year


In [74]:
# View the (rows, columns) shape of the dataset after removing the unneeded column
clean_reject_df.shape

(27648741, 8)

In [75]:
# Print the data type of every column in the rejected table. The row/column
# display limits are lifted inside the context manager so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(clean_reject_df.dtypes)

amt_requested        float64
date                  object
purpose               object
risk_score           float64
dti                   object
zip_code              object
state                 object
employment_length     object
dtype: object


In [76]:
# Report, per column, how many distinct values it holds (missing values count as one value)
for column_name in clean_reject_df.columns:
    distinct_count = clean_reject_df[column_name].nunique(dropna=False)
    print(column_name, distinct_count)

amt_requested 3640
date 4238
purpose 73928
risk_score 693
dti 126145
zip_code 1002
state 52
employment_length 12




**Merging two datasets**


In [77]:
# Quick review of the first rows of clean_accept_df before merging
clean_accept_df.head()


Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,dti,fico_range_low,fico_range_high
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,debt_consolidation,PA,5.91,675.0,679.0
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,small_business,SD,16.06,715.0,719.0
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,home_improvement,IL,10.78,695.0,699.0
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,debt_consolidation,NJ,17.06,785.0,789.0
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,major_purchase,PA,25.37,695.0,699.0


In [78]:
# Quick review of the first rows of clean_reject_df before merging
clean_reject_df.head()

Unnamed: 0,amt_requested,date,purpose,risk_score,dti,zip_code,state,employment_length
0,1000.0,2007-05-26,Wedding Covered but No Honeymoon,693.0,10%,481xx,NM,4 years
1,1000.0,2007-05-26,Consolidating Debt,703.0,10%,010xx,MA,< 1 year
2,11000.0,2007-05-27,Want to consolidate my debt,715.0,10%,212xx,MD,1 year
3,6000.0,2007-05-27,waksman,698.0,38.64%,017xx,MA,< 1 year
4,1500.0,2007-05-27,mdrigo,509.0,9.43%,209xx,MD,< 1 year


In [None]:
# TODO: add a new column called loan_request_id to both tables, giving each record (row) a unique identifier.
# Not implemented yet: this cell currently only displays clean_accept_df.
clean_accept_df

In [None]:
# Add a column to both tables that indicates whether the applicant's loan was approved,
# using the values 0 for approved applicants (accept_df) and 1 for denied applicants (reject_df).

In [None]:
# Combine the accepted and rejected applications into one table.
#
# The two tables describe different applicants, so they are stacked row-wise.
# A column-wise pd.concat(..., axis=1, join="inner") would match rows purely by
# their positional index, gluing accepted row i onto the unrelated rejected row i
# and producing duplicate "purpose" and "dti" columns.

# Rejected-table column names mapped to the accepted-table names they correspond to.
# NOTE(review): mapping inferred from the column names and sample rows - confirm.
reject_to_accept_columns = {
    "amt_requested": "loan_amnt",
    "date": "issue_d",
    "state": "addr_state",
    "employment_length": "emp_length",
}

# loan_denied marks the outcome: 0 = approved (accepted table), 1 = denied (rejected table).
accepted_part = clean_accept_df.assign(loan_denied=0)
rejected_part = (
    clean_reject_df
    .rename(columns=reject_to_accept_columns)
    .assign(
        # Rejected dti is text such as "10%"; strip the sign so it is numeric
        # like the accepted dti. Unparseable values become NaN.
        dti=lambda frame: pd.to_numeric(
            frame["dti"].astype(str).str.rstrip("%"), errors="coerce"
        ),
        loan_denied=1,
    )
)

# Columns present in only one table are filled with NaN for the other table's rows.
joined_loan_df = pd.concat(
    [accepted_part, rejected_part], axis=0, ignore_index=True, sort=False
)

# loan_request_id gives every row of the combined table a unique identifier.
joined_loan_df.insert(0, "loan_request_id", np.arange(1, len(joined_loan_df) + 1))

# NOTE(review): issue_d still mixes "Dec-2015" and "2007-05-26" style dates -
# normalise it before modeling.
joined_loan_df.head()

**Write Output files back to s3**

--Once the dataset is formatted, wrangled, and cleaned, we write the data back to S3

In [None]:
# Dummy DataFrame created to test whether data can be written to the AWS S3 output folder.
# The code is wrapped in a string literal, so it is not executed.

"""  
data = [['tom', 10], ['nick', 15], ['juli', 14], ['solyiah', 10]]
test_df = pd.DataFrame(data, columns=['Name', 'Age'])

test_df
"""

In [None]:
# Destination bucket and object key for the exported CSV
bucket_name = "davis-data-cloud-of-wonders"
path = "output/output_data.csv"

# Serialise the joined table to CSV text in memory, without the index column
csv_payload = joined_loan_df.to_csv(index=False)

# Upload the CSV text to the chosen bucket and key in S3
response = s3.put_object(Body=csv_payload, Bucket=bucket_name, Key=path)

# Report the HTTP status code returned for the upload
status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status != 200:
    print(f"Unsuccessful S3 put_object response. Status - {status}")
else:
    print(f"Successful S3 put_object response. Status - {status}")