<a href="https://colab.research.google.com/github/davisdw/Lending_Tree_Loan_Prediction_Analysis/blob/main/pyspark_data_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing & Exporting CSV Data

**Reads the CSV files, accepted & rejected loans from Amazon AWS s3 Bucket**

**Reduce Un-needed Columns from both tables**

**Review the dataset and perform data wrangling and cleaning**

**Merge two datasets together**

**Export the cleaned_df dataset back to s3 bucket to prep for running modeling, prediction and visualization**

In [1]:
import boto3
import pandas as pd
from io import StringIO # uses this library for data conversion
import awsKeyConfig
import io
import datetime as dt


# Use boto3 to get the object from S3
# Retrieve the aws credential keys from the awsKeyConfig.py file

s3 = boto3.client('s3',
aws_access_key_id=awsKeyConfig.keyID,
aws_secret_access_key=awsKeyConfig.secretKey,
region_name='us-east-1'
)

In [2]:
# Retrieves the Bucket Name and Key <file_name> and 
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='accepted_2007_to_2018Q4.csv')
data = obj['Body'].read().decode('utf-8')

# Validates whether the connection to s3 is successfull or fail 
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 put_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 put_object response. Status - {status}")


# Use StringIO to convert the string data to a file-like object
data_file = StringIO(data)

# Create a DataFrame from the CSV data
loan_df = pd.read_csv(data_file)

loan_df.head()


Successful S3 put_object response. Status - 200


  loan_df = pd.read_csv(data_file)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


**Data Cleaning for Loan Dataset**

Removing un-needed dataset columns

Correcting datatypes

In [3]:
# view number of columns and rows
loan_df.shape

(2260701, 151)

In [10]:
# selected the required columns needed for analysis
clean_loan_df = loan_df[["loan_amnt",
                    "term",
                    "int_rate",
                    "application_type", # single or joint account
                    "installment",
                    "grade", 
                    "sub_grade",
                    "emp_title",
                    "emp_length",
                    "home_ownership",
                    "annual_inc",
                    "annual_inc_joint",
                    "verification_status",
                    "issue_d",
                    "loan_status",
                    "delinq_2yrs", #  number of past 30+ days delinquent marks in past two-year history
                    "purpose",
                    "addr_state",
                    "dti",
                    "fico_range_low",
                    "fico_range_high" ]]

clean_loan_df.head()

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,annual_inc_joint,verification_status,issue_d,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_range_low,fico_range_high
0,3600.0,36 months,13.99,Individual,123.03,C,C4,leadman,10+ years,MORTGAGE,...,,Not Verified,Dec-2015,Fully Paid,0.0,debt_consolidation,PA,5.91,675.0,679.0
1,24700.0,36 months,11.99,Individual,820.28,C,C1,Engineer,10+ years,MORTGAGE,...,,Not Verified,Dec-2015,Fully Paid,1.0,small_business,SD,16.06,715.0,719.0
2,20000.0,60 months,10.78,Joint App,432.66,B,B4,truck driver,10+ years,MORTGAGE,...,71000.0,Not Verified,Dec-2015,Fully Paid,0.0,home_improvement,IL,10.78,695.0,699.0
3,35000.0,60 months,14.85,Individual,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,...,,Source Verified,Dec-2015,Current,0.0,debt_consolidation,NJ,17.06,785.0,789.0
4,10400.0,60 months,22.45,Individual,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,...,,Source Verified,Dec-2015,Fully Paid,1.0,major_purchase,PA,25.37,695.0,699.0


In [11]:
# Take two fico scores columns and find an average for the credit score and place it in new columns
clean_loan_df["fico_score"] = clean_loan_df[["fico_range_low", "fico_range_high"]].mean(axis=1)
clean_loan_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_loan_df["fico_score"] = clean_loan_df[["fico_range_low", "fico_range_high"]].mean(axis=1)


Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,verification_status,issue_d,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_range_low,fico_range_high,fico_score
0,3600.0,36 months,13.99,Individual,123.03,C,C4,leadman,10+ years,MORTGAGE,...,Not Verified,Dec-2015,Fully Paid,0.0,debt_consolidation,PA,5.91,675.0,679.0,677.0
1,24700.0,36 months,11.99,Individual,820.28,C,C1,Engineer,10+ years,MORTGAGE,...,Not Verified,Dec-2015,Fully Paid,1.0,small_business,SD,16.06,715.0,719.0,717.0
2,20000.0,60 months,10.78,Joint App,432.66,B,B4,truck driver,10+ years,MORTGAGE,...,Not Verified,Dec-2015,Fully Paid,0.0,home_improvement,IL,10.78,695.0,699.0,697.0
3,35000.0,60 months,14.85,Individual,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,...,Source Verified,Dec-2015,Current,0.0,debt_consolidation,NJ,17.06,785.0,789.0,787.0
4,10400.0,60 months,22.45,Individual,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,...,Source Verified,Dec-2015,Fully Paid,1.0,major_purchase,PA,25.37,695.0,699.0,697.0


In [12]:
# Drop the fico range columns
clean_loan_df = clean_loan_df.drop(columns=["fico_range_low", "fico_range_high"])

In [13]:
# split issue_d to year column
dt_year = pd.to_datetime(clean_loan_df['issue_d'])
clean_loan_df['issued_year'] = dt_year

  dt_year = pd.to_datetime(clean_loan_df['issue_d'])


In [14]:
clean_loan_df = clean_loan_df.drop(columns=['issue_d'])

In [15]:
# reviewing the cleaned data
clean_loan_df.head()

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,annual_inc_joint,verification_status,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_score,issued_year
0,3600.0,36 months,13.99,Individual,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,,Not Verified,Fully Paid,0.0,debt_consolidation,PA,5.91,677.0,2015-12-01
1,24700.0,36 months,11.99,Individual,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,,Not Verified,Fully Paid,1.0,small_business,SD,16.06,717.0,2015-12-01
2,20000.0,60 months,10.78,Joint App,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,71000.0,Not Verified,Fully Paid,0.0,home_improvement,IL,10.78,697.0,2015-12-01
3,35000.0,60 months,14.85,Individual,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,,Source Verified,Current,0.0,debt_consolidation,NJ,17.06,787.0,2015-12-01
4,10400.0,60 months,22.45,Individual,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,,Source Verified,Fully Paid,1.0,major_purchase,PA,25.37,697.0,2015-12-01



**Data Wrangling and Transformationa**

Set target values for loan_status

Remove un-needed loan_status data values


In [16]:
# Review the number of unique values in each column (particularly loan_status)
for a in clean_loan_df:
    print(a, len(clean_loan_df[a].unique()))

loan_amnt 1573
term 3
int_rate 674
application_type 3
installment 93302
grade 8
sub_grade 36
emp_title 512695
emp_length 12
home_ownership 7
annual_inc 89369
annual_inc_joint 17634
verification_status 4
loan_status 10
delinq_2yrs 38
purpose 15
addr_state 52
dti 10846
fico_score 49
issued_year 140


In [17]:
# reviewing the dataset for the loan status 
clean_loan_df['loan_status'].value_counts()

loan_status
Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: count, dtype: int64

In [18]:
# Determine any NaN values in the loan_status
clean_loan_df['loan_status'].isna().value_counts()

loan_status
False    2260668
True          33
Name: count, dtype: int64

In [19]:
# Creating an target column to determine the loan_status default: 
# I've included the 'Charged Off' loan status as where it would consider an loss which 30 days after loan goes into default
target = [1 if i == 'Default' or i == 'Charged Off' else 0 for i in clean_loan_df['loan_status']]
clean_loan_df['default_loan'] = target
clean_loan_df['default_loan'].value_counts()

default_loan
0    1992102
1     268599
Name: count, dtype: int64

In [26]:
# reviewing the default_loan column
clean_loan_df['default_loan'].sample(n=10)

2088917    0
30996      0
168290     0
1462182    0
1158864    0
1670677    1
1189517    0
297777     0
2167449    1
1613907    0
Name: default_loan, dtype: int64

In [21]:
clean_loan_df.shape

(2260701, 21)

**Clearing "Does note meet the credit policy" Status**

Found some records with a loan_status of "Does not meet the credit policy". Potentially, these may be older loans that would simply not be accepted under LendingClubs current criteria. As these data points will provide no value moving forward, I've decided ecluded them from the data. Similiarily, recently issued loans could mislead the analysis, as no payment has been expected yet.

In [29]:
# calcuate number of rows with "does not meet the criteria" under loan status 
i = len(clean_loan_df)

clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "Does not meet the credit policy. Status:Fully Paid"])
clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "Does not meet the credit policy. Status:Charged Off"])
clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "Issued"])
clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "In Grace Period"])
a = len(clean_loan_df)
print(f"Total columns dropped {i-a} rows, a {((i-a)/((a+i)/2))*100}% reduction in rows")

Total columns dropped 11185 rows, a 0.49598500471263357% reduction in rows


In [30]:
# review the loan_status datatype values after removing "does not meet criteria status"
clean_loan_df['loan_status'].value_counts()

loan_status
Fully Paid            1076751
Current                878317
Charged Off            268559
Late (31-120 days)      21467
Late (16-30 days)        4349
Default                    40
Name: count, dtype: int64

In [31]:
# view number of objects for categorical features
clean_loan_df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)


term                        2
application_type            2
grade                       7
sub_grade                  35
emp_title              509936
emp_length                 11
home_ownership              6
verification_status         3
loan_status                 6
purpose                    14
addr_state                 51
dtype: int64

In [33]:
# Final review the cleaned dataset before writing to s3 bucket
clean_loan_df.sample(n=10)

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,emp_title,emp_length,home_ownership,...,annual_inc_joint,verification_status,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_score,issued_year,default_loan
1542092,27000.0,36 months,5.31,Individual,812.98,A,A1,Property Manager,5 years,MORTGAGE,...,,Source Verified,Fully Paid,0.0,debt_consolidation,FL,20.16,772.0,2018-05-01,0
2179955,15000.0,36 months,16.99,Individual,534.72,D,D1,Driver,< 1 year,RENT,...,,Not Verified,Charged Off,0.0,debt_consolidation,TX,27.81,662.0,2016-12-01,1
2066060,10000.0,36 months,10.42,Individual,324.65,B,B3,service manager,10+ years,MORTGAGE,...,,Not Verified,Fully Paid,0.0,debt_consolidation,PA,15.08,732.0,2017-12-01,0
113650,25000.0,60 months,13.33,Individual,573.06,C,C3,FIELD APPRAISER,10+ years,MORTGAGE,...,,Source Verified,Fully Paid,0.0,credit_card,NV,9.54,662.0,2015-10-01,0
1674321,11125.0,36 months,14.99,Individual,385.6,C,C4,Screener,1 year,OWN,...,,Verified,Charged Off,0.0,debt_consolidation,FL,14.74,682.0,2017-03-01,1
723439,13600.0,60 months,25.88,Individual,406.23,F,F4,Mortgage Advocate,2 years,OWN,...,,Source Verified,Charged Off,0.0,debt_consolidation,MI,20.95,667.0,2016-05-01,1
704449,10000.0,36 months,13.67,Individual,340.18,C,C3,Bartender,5 years,OWN,...,,Source Verified,Current,0.0,debt_consolidation,NC,30.74,682.0,2016-05-01,0
137549,15000.0,36 months,13.99,Individual,512.6,C,C4,General Manager,10+ years,RENT,...,,Verified,Fully Paid,0.0,credit_card,CA,18.21,717.0,2015-09-01,0
3377,4000.0,36 months,15.77,Individual,140.18,D,D1,Corporate Auditor,6 years,MORTGAGE,...,,Not Verified,Fully Paid,0.0,vacation,KY,30.39,687.0,2015-12-01,0
2136531,35000.0,60 months,17.09,Joint App,871.54,D,D1,Nurse,< 1 year,RENT,...,200000.0,Verified,Current,0.0,small_business,NJ,5.42,722.0,2017-10-01,0


**Write Output files back to s3**

--Once the dataset is formatted wrangled and cleaned we're outputting the data

In [35]:
# variables for buck name and path to save the output file to
bucket_name = "davis-data-cloud-of-wonders"
path = "clean_loan_data.csv"

# converts string into file format before saving the file
with io.StringIO() as csv_buffer:
    clean_loan_df.to_csv(csv_buffer, index=False)

    # place file into the specified buck and path location in the s3 directory
    response = s3.put_object(
        Bucket= bucket_name, Key=path, Body=csv_buffer.getvalue()
    )

    # Validates whether the connection to s3 is successfull or fail 
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [None]:

# This is an dummy dataframe, i've created to test to see if able to write data into AWS s3 output_file folder 

""" 
data1 = {'ID': [1, 2, 3, 4], 'Name': ['tom','nick','juli','solyiah'], 'Age' : [10, 15, 14, 10]}
data2 = {'ID': [5, 6, 7, 8], 'Name': ['dick', 'joe', 'harry', 'jake'], 'Age': [21, 30, 45, 30]}

test_df_1 = pd.DataFrame(data1)
test_df_2 = pd.DataFrame(data2)

joined_test_df = pd.merge(test_df_1, test_df_2)

joined_test_df.head()

# test_df_2.head()
# test_df_1.head()

"""