<a href="https://colab.research.google.com/github/davisdw/Lending_Tree_Loan_Prediction_Analysis/blob/main/pyspark_data_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing & Exporting CSV Data

**Reads the CSV files, accepted & rejected loans from Amazon AWS s3 Bucket**

**Reduce Un-needed Columns from both tables**

**Review the dataset and perform data wrangling and cleaning**

**Merge two datasets together**

**Export the cleaned_df dataset back to s3 bucket to prep for running modeling, prediction and visualization**

In [21]:
import boto3
import pandas as pd
from io import StringIO # uses this library for data conversion
import awsKeyConfig
import io
import datetime as dt
import warnings

warnings.filterwarnings('ignore')


# Use boto3 to get the object from S3
# Retrieve the aws credential keys from the awsKeyConfig.py file

s3 = boto3.client('s3',
aws_access_key_id=awsKeyConfig.keyID,
aws_secret_access_key=awsKeyConfig.secretKey,
region_name='us-east-1'
)

In [22]:
# Retrieves the Bucket Name and Key <file_name> and 
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='accepted_2007_to_2018Q4.csv')
data = obj['Body'].read().decode('utf-8')

# Validates whether the connection to s3 is successfull or fail 
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 put_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 put_object response. Status - {status}")


# Use StringIO to convert the string data to a file-like object
data_file = StringIO(data)

# Create a DataFrame from the CSV data
loan_df = pd.read_csv(data_file)

loan_df.head()


Successful S3 put_object response. Status - 200


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


**Data Cleaning for Loan Dataset**

Removing un-needed dataset columns

Correcting datatypes

In [23]:
# view number of columns and rows
loan_df.shape

(2260701, 151)

In [24]:
# selected the required columns needed for analysis
clean_loan_df = loan_df[["loan_amnt",
                    "term",
                    "int_rate",
                    "application_type", # single or joint account
                    "installment",
                    "grade", 
                    "sub_grade",
                    "home_ownership",
                    "annual_inc",
                    "verification_status",
                    "issue_d",
                    "loan_status",
                    "delinq_2yrs", #  number of past 30+ days delinquent marks in past two-year history
                    "purpose",
                    "addr_state",
                    "dti",
                    "fico_range_low",
                    "fico_range_high" ]]

clean_loan_df.head()

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,home_ownership,annual_inc,verification_status,issue_d,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_range_low,fico_range_high
0,3600.0,36 months,13.99,Individual,123.03,C,C4,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,0.0,debt_consolidation,PA,5.91,675.0,679.0
1,24700.0,36 months,11.99,Individual,820.28,C,C1,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,1.0,small_business,SD,16.06,715.0,719.0
2,20000.0,60 months,10.78,Joint App,432.66,B,B4,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,0.0,home_improvement,IL,10.78,695.0,699.0
3,35000.0,60 months,14.85,Individual,829.9,C,C5,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,0.0,debt_consolidation,NJ,17.06,785.0,789.0
4,10400.0,60 months,22.45,Individual,289.91,F,F1,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,1.0,major_purchase,PA,25.37,695.0,699.0


In [25]:
# Take two fico scores columns and find an average for the credit score and place it in new columns
clean_loan_df["fico_score"] = clean_loan_df[["fico_range_low", "fico_range_high"]].mean(axis=1)
clean_loan_df.head()

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,home_ownership,annual_inc,verification_status,issue_d,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_range_low,fico_range_high,fico_score
0,3600.0,36 months,13.99,Individual,123.03,C,C4,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,0.0,debt_consolidation,PA,5.91,675.0,679.0,677.0
1,24700.0,36 months,11.99,Individual,820.28,C,C1,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,1.0,small_business,SD,16.06,715.0,719.0,717.0
2,20000.0,60 months,10.78,Joint App,432.66,B,B4,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,0.0,home_improvement,IL,10.78,695.0,699.0,697.0
3,35000.0,60 months,14.85,Individual,829.9,C,C5,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,0.0,debt_consolidation,NJ,17.06,785.0,789.0,787.0
4,10400.0,60 months,22.45,Individual,289.91,F,F1,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,1.0,major_purchase,PA,25.37,695.0,699.0,697.0


In [26]:
# Drop the fico range columns
clean_loan_df = clean_loan_df.drop(columns=["fico_range_low", "fico_range_high"])

In [27]:
# Extract year column 
clean_loan_df['issued_year'] = pd.to_datetime(clean_loan_df['issue_d'])
clean_loan_df['issued_year'] = clean_loan_df['issued_year'].dt.year.astype(str)


In [28]:
clean_loan_df = clean_loan_df.drop(columns=['issue_d'])

In [18]:
# Determine any NaN values in the loan_status
clean_loan_df.isna().mean()

loan_amnt              0.000015
term                   0.000015
int_rate               0.000015
application_type       0.000015
installment            0.000015
grade                  0.000015
sub_grade              0.000015
home_ownership         0.000015
annual_inc             0.000016
verification_status    0.000015
loan_status            0.000015
delinq_2yrs            0.000027
purpose                0.000015
addr_state             0.000015
dti                    0.000771
fico_score             0.000015
issued_year            0.000000
dtype: float64

In [29]:
# dropping any NaN from rows
clean_loan_df = clean_loan_df.dropna()

In [30]:
#review after dropping NAN columns
clean_loan_df.isna().mean()

loan_amnt              0.0
term                   0.0
int_rate               0.0
application_type       0.0
installment            0.0
grade                  0.0
sub_grade              0.0
home_ownership         0.0
annual_inc             0.0
verification_status    0.0
loan_status            0.0
delinq_2yrs            0.0
purpose                0.0
addr_state             0.0
dti                    0.0
fico_score             0.0
issued_year            0.0
dtype: float64

In [31]:
clean_loan_df.shape

(2258928, 17)

In [33]:
# reviewing the cleaned data
clean_loan_df.head()

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,home_ownership,annual_inc,verification_status,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_score,issued_year
0,3600.0,36 months,13.99,Individual,123.03,C,C4,MORTGAGE,55000.0,Not Verified,Fully Paid,0.0,debt_consolidation,PA,5.91,677.0,2015.0
1,24700.0,36 months,11.99,Individual,820.28,C,C1,MORTGAGE,65000.0,Not Verified,Fully Paid,1.0,small_business,SD,16.06,717.0,2015.0
2,20000.0,60 months,10.78,Joint App,432.66,B,B4,MORTGAGE,63000.0,Not Verified,Fully Paid,0.0,home_improvement,IL,10.78,697.0,2015.0
3,35000.0,60 months,14.85,Individual,829.9,C,C5,MORTGAGE,110000.0,Source Verified,Current,0.0,debt_consolidation,NJ,17.06,787.0,2015.0
4,10400.0,60 months,22.45,Individual,289.91,F,F1,MORTGAGE,104433.0,Source Verified,Fully Paid,1.0,major_purchase,PA,25.37,697.0,2015.0



**Data Wrangling and Transformationa**

Set target values for loan_status

Remove un-needed loan_status data values


In [34]:
# Review the number of unique values in each column (particularly loan_status)
for a in clean_loan_df:
    print(a, len(clean_loan_df[a].unique()))

loan_amnt 1572
term 2
int_rate 673
application_type 2
installment 93262
grade 7
sub_grade 35
home_ownership 6
annual_inc 89361
verification_status 3
loan_status 9
delinq_2yrs 37
purpose 14
addr_state 51
dti 10845
fico_score 48
issued_year 12


In [35]:
# reviewing the dataset for the loan status 
clean_loan_df['loan_status'].value_counts()

loan_status
Fully Paid                                             1076448
Current                                                 877018
Charged Off                                             268488
Late (31-120 days)                                       21443
In Grace Period                                           8427
Late (16-30 days)                                         4344
Does not meet the credit policy. Status:Fully Paid        1962
Does not meet the credit policy. Status:Charged Off        758
Default                                                     40
Name: count, dtype: int64

In [36]:
# Creating an target column to determine the loan_status default: 
# I've included the 'Charged Off' loan status as where it would consider an loss which 30 days after loan goes into default
target = [1 if i == 'Default' or i == 'Charged Off' else 0 for i in clean_loan_df['loan_status']]
clean_loan_df['default_loan'] = target
clean_loan_df['default_loan'].value_counts()

default_loan
0    1990400
1     268528
Name: count, dtype: int64

In [37]:
# reviewing the default_loan column
clean_loan_df['default_loan'].sample(n=10)

1565632    0
1691816    0
1552558    0
1261326    1
1190708    0
1773619    1
1761506    0
1967654    0
2151988    1
1509602    0
Name: default_loan, dtype: int64

In [38]:
clean_loan_df.shape

(2258928, 18)

**Clearing "Does note meet the credit policy" Status**

Found some records with a loan_status of "Does not meet the credit policy". Potentially, these may be older loans that would simply not be accepted under LendingClubs current criteria. As these data points will provide no value moving forward, I've decided ecluded them from the data. Similiarily, recently issued loans could mislead the analysis, as no payment has been expected yet.

In [39]:
# calcuate number of rows with "does not meet the criteria" under loan status 
i = len(clean_loan_df)

clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "Does not meet the credit policy. Status:Fully Paid"])
clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "Does not meet the credit policy. Status:Charged Off"])
clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "Issued"])
clean_loan_df = pd.DataFrame(clean_loan_df[clean_loan_df['loan_status'] != "In Grace Period"])
a = len(clean_loan_df)
print(f"Total columns dropped {i-a} rows, a {((i-a)/((a+i)/2))*100}% reduction in rows")

Total columns dropped 11147 rows, a 0.49468470229606565% reduction in rows


In [40]:
# review the loan_status datatype values after removing "does not meet criteria status"
clean_loan_df['loan_status'].value_counts()

loan_status
Fully Paid            1076448
Current                877018
Charged Off            268488
Late (31-120 days)      21443
Late (16-30 days)        4344
Default                    40
Name: count, dtype: int64

In [41]:
# view number of objects for categorical features
clean_loan_df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)


term                    2
application_type        2
grade                   7
sub_grade              35
home_ownership          6
verification_status     3
loan_status             6
purpose                14
addr_state             51
issued_year            12
dtype: int64

In [42]:
# Final review the cleaned dataset before writing to s3 bucket
clean_loan_df.sample(n=10)

Unnamed: 0,loan_amnt,term,int_rate,application_type,installment,grade,sub_grade,home_ownership,annual_inc,verification_status,loan_status,delinq_2yrs,purpose,addr_state,dti,fico_score,issued_year,default_loan
887948,15000.0,60 months,9.44,Individual,314.59,B,B1,MORTGAGE,85000.0,Not Verified,Current,0.0,debt_consolidation,SC,29.87,737.0,2017.0,0
557368,40000.0,60 months,10.42,Joint App,858.18,B,B3,MORTGAGE,80100.0,Verified,Current,0.0,debt_consolidation,NJ,33.09,737.0,2017.0,0
2223159,40000.0,60 months,14.99,Individual,951.39,C,C4,MORTGAGE,155000.0,Verified,Fully Paid,0.0,debt_consolidation,NJ,13.5,757.0,2016.0,0
614224,10000.0,36 months,10.91,Individual,326.97,B,B4,MORTGAGE,39600.0,Not Verified,Fully Paid,0.0,credit_card,VA,27.24,707.0,2017.0,0
181312,10000.0,60 months,18.25,Individual,255.3,E,E1,MORTGAGE,156000.0,Verified,Current,1.0,moving,FL,10.19,667.0,2015.0,0
1485031,17000.0,36 months,6.19,Individual,518.64,A,A2,MORTGAGE,52138.0,Verified,Current,0.0,credit_card,OH,25.6,812.0,2018.0,0
94357,23925.0,60 months,16.55,Individual,588.83,D,D2,RENT,63000.0,Verified,Current,0.0,debt_consolidation,IL,16.91,662.0,2015.0,0
999719,40000.0,36 months,6.49,Individual,1225.78,A,A2,MORTGAGE,187000.0,Source Verified,Charged Off,0.0,debt_consolidation,AZ,4.66,742.0,2016.0,1
375786,16000.0,36 months,6.39,Individual,489.59,A,A2,MORTGAGE,121000.0,Verified,Fully Paid,0.0,debt_consolidation,CT,5.81,737.0,2015.0,0
1873183,18000.0,60 months,23.63,Individual,513.97,F,F3,RENT,52800.0,Verified,Charged Off,0.0,debt_consolidation,CA,26.61,662.0,2013.0,1


**Write Output files back to s3**

--Once the dataset is formatted wrangled and cleaned we're outputting the data

In [43]:
# variables for buck name and path to save the output file to
bucket_name = "davis-data-cloud-of-wonders"
path = "clean_loan_data.csv"

# converts string into file format before saving the file
with io.StringIO() as csv_buffer:
    clean_loan_df.to_csv(csv_buffer, index=False)

    # place file into the specified buck and path location in the s3 directory
    response = s3.put_object(
        Bucket= bucket_name, Key=path, Body=csv_buffer.getvalue()
    )

    # Validates whether the connection to s3 is successfull or fail 
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status == 200:
        print(f"Successful S3 put_object response. Status - {status}")
    else:
        print(f"Unsuccessful S3 put_object response. Status - {status}")

Successful S3 put_object response. Status - 200


In [44]:

# This is an dummy dataframe, i've created to test to see if able to write data into AWS s3 output_file folder 

""" 
data1 = {'ID': [1, 2, 3, 4], 'Name': ['tom','nick','juli','solyiah'], 'Age' : [10, 15, 14, 10]}
data2 = {'ID': [5, 6, 7, 8], 'Name': ['dick', 'joe', 'harry', 'jake'], 'Age': [21, 30, 45, 30]}

test_df_1 = pd.DataFrame(data1)
test_df_2 = pd.DataFrame(data2)

joined_test_df = pd.merge(test_df_1, test_df_2)

joined_test_df.head()

# test_df_2.head()
# test_df_1.head()

"""

" \ndata1 = {'ID': [1, 2, 3, 4], 'Name': ['tom','nick','juli','solyiah'], 'Age' : [10, 15, 14, 10]}\ndata2 = {'ID': [5, 6, 7, 8], 'Name': ['dick', 'joe', 'harry', 'jake'], 'Age': [21, 30, 45, 30]}\n\ntest_df_1 = pd.DataFrame(data1)\ntest_df_2 = pd.DataFrame(data2)\n\njoined_test_df = pd.merge(test_df_1, test_df_2)\n\njoined_test_df.head()\n\n# test_df_2.head()\n# test_df_1.head()\n\n"