<a href="https://colab.research.google.com/github/davisdw/Lending_Tree_Loan_Prediction_Analysis/blob/main/pyspark_data_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing & Exporting CSV Data

**Reads the CSV files, accepted & rejected loans from Amazon AWS s3 Bucket**

**Reduce Un-needed Columns from both tables**

**Review the dataset and perform data wrangling and cleaning**

**Merge two datasets together**

**Export the cleaned_df dataset back to s3 bucket to prep for running modeling, prediction and visualization**

In [1]:
import boto3
import pandas as pd
from io import StringIO # uses this library for data conversion
import awsKeyConfig
import io
import numpy as np

# Use boto3 to get the object from S3
# Retrieve the aws credential keys

s3 = boto3.client('s3',
aws_access_key_id=awsKeyConfig.keyID,
aws_secret_access_key=awsKeyConfig.secretKey,
region_name='us-east-1'
)

**Data Wrangling and Cleaning for Loan Accepted Dataset**

In [2]:
# Retrieves the Bucket Name and Key <file_name> and 
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='accepted_2007_to_2018Q4.csv')
data = obj['Body'].read().decode('utf-8')

# Validates whether the connection to s3 is successfull or fail 
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 put_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 put_object response. Status - {status}")


# Use StringIO to convert the string data to a file-like object
data_file = StringIO(data)

# Create a DataFrame from the CSV data
accepted_df = pd.read_csv(data_file)

accepted_df.head()


Successful S3 put_object response. Status - 200


  accepted_df = pd.read_csv(data_file)


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [3]:
# view number of columns and rows
accepted_df.shape

(2260701, 151)

In [5]:
# selected the required columns needed for analysis
clean_accept_df = accepted_df[["loan_amnt",
                    "term",
                    "int_rate",
                    "installment",
                    "grade",
                    "sub_grade",
                    "emp_title",
                    "emp_length",
                    "home_ownership",
                    "annual_inc",
                    "verification_status",
                    "issue_d",
                    "loan_status",
                    "purpose",
                    "addr_state",
                    "dti",
                    "fico_range_low",
                    "fico_range_high" ]]

clean_accept_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,dti,fico_range_low,fico_range_high
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,debt_consolidation,PA,5.91,675.0,679.0
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,small_business,SD,16.06,715.0,719.0
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,home_improvement,IL,10.78,695.0,699.0
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,debt_consolidation,NJ,17.06,785.0,789.0
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,major_purchase,PA,25.37,695.0,699.0


In [6]:
# Take two fico scores columns and find an average for the credit score and place it in new columns
clean_accept_df["fico_score"] = clean_accept_df[["fico_range_low", "fico_range_high"]].mean(axis=1)
clean_accept_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_accept_df["fico_score"] = clean_accept_df[["fico_range_low", "fico_range_high"]].mean(axis=1)


Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,purpose,addr_state,dti,fico_range_low,fico_range_high,fico_score
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,debt_consolidation,PA,5.91,675.0,679.0,677.0
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,small_business,SD,16.06,715.0,719.0,717.0
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,home_improvement,IL,10.78,695.0,699.0,697.0
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,debt_consolidation,NJ,17.06,785.0,789.0,787.0
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,major_purchase,PA,25.37,695.0,699.0,697.0


In [7]:
clean_accept_df = clean_accept_df.drop(columns=["fico_range_low", "fico_range_high"])


In [9]:
# split issue_d to year column
dt_year = pd.to_datetime(clean_accept_df['issue_d'])
clean_accept_df['issued_year'] = dt_year

  dt_year = pd.to_datetime(clean_accept_df['issue_d'])


In [10]:
# Drop the fico range columns
clean_accept_df = clean_accept_df.drop(columns=['issue_d'])

In [11]:
# reviewing the cleaned data
clean_accept_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,fico_score,issued_year
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Fully Paid,debt_consolidation,PA,5.91,677.0,2015-12-01
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Fully Paid,small_business,SD,16.06,717.0,2015-12-01
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Fully Paid,home_improvement,IL,10.78,697.0,2015-12-01
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Current,debt_consolidation,NJ,17.06,787.0,2015-12-01
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Fully Paid,major_purchase,PA,25.37,697.0,2015-12-01


In [12]:
# Review the number of unique values in each column (particularly loan_status)
for a in clean_accept_df:
    print(a, len(clean_accept_df[a].unique()))

loan_amnt 1573
term 3
int_rate 674
installment 93302
grade 8
sub_grade 36
emp_title 512695
emp_length 12
home_ownership 7
annual_inc 89369
verification_status 4
loan_status 10
purpose 15
addr_state 52
dti 10846
fico_score 49
issued_year 140


In [13]:
# reviewing the dataset for the loan status 
clean_accept_df['loan_status'].value_counts()

loan_status
Fully Paid                                             1076751
Current                                                 878317
Charged Off                                             268559
Late (31-120 days)                                       21467
In Grace Period                                           8436
Late (16-30 days)                                         4349
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     40
Name: count, dtype: int64

In [14]:
# Determine any NaN values in the loan_status
clean_accept_df['loan_status'].isna().value_counts()

loan_status
False    2260668
True          33
Name: count, dtype: int64

In [15]:
# Creating an target column to determine the loan_status default: 
# I've included the 'Charged Off' loan status as where it would consider an loss which 30 days after loan goes into default
target = [1 if i == 'Default' or i == 'Charged Off' else 0 for i in clean_accept_df['loan_status']]
clean_accept_df['default_loan'] = target
clean_accept_df['default_loan'].value_counts()

default_loan
0    1992102
1     268599
Name: count, dtype: int64

In [16]:
# reviewing the default_loan column
clean_accept_df['default_loan'].sample(n=10)

1984586    0
914136     1
375743     0
1295506    1
470201     0
1221994    0
779586     0
1140105    0
260962     0
1889910    0
Name: default_loan, dtype: int64

In [17]:
clean_accept_df.shape

(2260701, 18)

In [18]:
# calcuate number of rows with "does not meet the criteria" under loan status 
i = len(clean_accept_df)

clean_accept_df = pd.DataFrame(clean_accept_df[clean_accept_df['loan_status'] != "Does not meet the credit policy. Status:Fully Paid"])
clean_accept_df = pd.DataFrame(clean_accept_df[clean_accept_df['loan_status'] != "Does not meet the credit policy. Status:Charged Off"])
clean_accept_df = pd.DataFrame(clean_accept_df[clean_accept_df['loan_status'] != "Issued"])
clean_accept_df = pd.DataFrame(clean_accept_df[clean_accept_df['loan_status'] != "In Grace Period"])
a = len(clean_accept_df)
print(f"Total columns dropped {i-a} rows, a {((i-a)/((a+i)/2))*100}% reduction in rows")

Total columns dropped 11185 rows, a 0.49598500471263357% reduction in rows


In [19]:
# review the loan_status datatype values after removing "does not meet criteria status"
clean_accept_df['loan_status'].value_counts()


loan_status
Fully Paid            1076751
Current                878317
Charged Off            268559
Late (31-120 days)      21467
Late (16-30 days)        4349
Default                    40
Name: count, dtype: int64

In [20]:
# view number of objects for categorical features
clean_accept_df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)


term                        2
grade                       7
sub_grade                  35
emp_title              509936
emp_length                 11
home_ownership              6
verification_status         3
loan_status                 6
purpose                    14
addr_state                 51
dtype: int64

In [21]:
# Final review the cleaned dataset before writing to s3 bucket
clean_accept_df.sample(n=10)

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,fico_score,issued_year,default_loan
1274549,5000.0,36 months,12.99,168.45,C,C1,Claims Representative,< 1 year,MORTGAGE,40000.0,Not Verified,Fully Paid,medical,NV,30.3,762.0,2014-05-01,0
1751315,11200.0,36 months,13.53,380.24,B,B5,Expeditor,10+ years,MORTGAGE,43500.0,Not Verified,Fully Paid,debt_consolidation,NC,17.71,677.0,2013-12-01,0
115241,14000.0,36 months,12.29,466.95,C,C1,Owner,1 year,MORTGAGE,60000.0,Source Verified,Fully Paid,home_improvement,MA,7.24,792.0,2015-10-01,0
2133573,32250.0,60 months,13.59,743.57,C,C2,Director,3 years,MORTGAGE,180000.0,Verified,Current,home_improvement,CO,27.92,682.0,2017-10-01,0
1117527,16000.0,60 months,13.66,369.48,C,C3,internal auditor senior lll,4 years,RENT,77000.0,Source Verified,Charged Off,home_improvement,AZ,19.12,692.0,2014-12-01,1
398894,20000.0,60 months,13.66,461.85,C,C3,Senior Property Manager,4 years,RENT,68000.0,Source Verified,Fully Paid,debt_consolidation,DC,20.6,667.0,2015-01-01,0
1756279,12175.0,60 months,14.47,286.27,C,C2,Operator 3,10+ years,MORTGAGE,35898.0,Verified,Fully Paid,debt_consolidation,TX,29.56,717.0,2013-12-01,0
1226293,20000.0,60 months,14.49,470.47,C,C4,Teacher,10+ years,MORTGAGE,85000.0,Source Verified,Fully Paid,debt_consolidation,PA,25.54,662.0,2014-07-01,0
1687186,4800.0,36 months,11.39,158.04,B,B3,Director,1 year,RENT,99779.0,Source Verified,Current,vacation,RI,12.1,687.0,2017-03-01,0
1293293,7500.0,36 months,6.62,230.28,A,A2,Owner Relations Manager,1 year,RENT,55000.0,Source Verified,Fully Paid,major_purchase,NY,18.92,717.0,2014-04-01,0


In [22]:
# variables for buck name and path to save the output file to
bucket_name = "davis-data-cloud-of-wonders"
path = "clean_loan_data.csv"

# converts string into file format before saving the file
with io.StringIO() as csv_buffer:
    clean_accept_df.to_csv(csv_buffer, index=False)

    # place file into the specified buck and path location in the s3 directory
    response = s3.put_object(
        Bucket= bucket_name, Key=path, Body=csv_buffer.getvalue()
    )

    # Validates whether the connection to s3 is successfull or fail 
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200
