<a href="https://colab.research.google.com/github/davisdw/Lending_Tree_Loan_Prediction_Analysis/blob/main/pyspark_data_load.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing & Exporting CSV Data

**Read the accepted and rejected loan CSV files from an Amazon S3 bucket**

**Remove unneeded columns from both tables**

**Review the dataset and perform data wrangling and cleaning**

**Merge two datasets together**

**Export the cleaned_df dataset back to s3 bucket to prep for running modeling, prediction and visualization**

In [4]:
import boto3
import pandas as pd
from io import StringIO # uses this library for data conversion
import awsKeyConfig
import io
import numpy as np

# Build the S3 client that the rest of the notebook uses for downloads and uploads.
# The access key pair is read from the local awsKeyConfig module rather than being
# written into the notebook itself.
s3 = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=awsKeyConfig.keyID,
    aws_secret_access_key=awsKeyConfig.secretKey,
)

**Data Wrangling and Cleaning for Loan Accepted Dataset**

In [None]:
# Fetch the accepted-loans CSV object from S3 (bucket name + object key) and
# decode its body into text.
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='accepted_2007_to_2018Q4.csv')
data = obj['Body'].read().decode('utf-8')

# Report the HTTP status of the download. This is a get_object call, so the
# message names that operation (it previously said put_object).
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")


# Wrap the decoded text in a file-like object so pandas can parse it
data_file = StringIO(data)

# Parse the CSV text into the raw accepted-loans DataFrame
accepted_df = pd.read_csv(data_file)

accepted_df.head()


In [None]:
# Show the (rows, columns) shape of the raw accepted-loans frame
accepted_df.shape

In [None]:
# Keep only the columns required for the analysis.
# .copy() makes clean_accept_df an independent frame: a later cell adds a column
# to it, and assigning into a bare column selection can raise pandas'
# SettingWithCopyWarning.
clean_accept_df = accepted_df[["loan_amnt",
                    "term",
                    "int_rate",
                    "installment",
                    "grade",
                    "sub_grade",
                    "emp_title",
                    "emp_length",
                    "home_ownership",
                    "annual_inc",
                    "verification_status",
                    "issue_d",
                    "loan_status",
                    "purpose",
                    "addr_state",
                    "dti",
                    "fico_range_low",
                    "fico_range_high" ]].copy()

clean_accept_df.head()

In [None]:
# Show the (rows, columns) shape after the column selection, for comparison with the raw frame
clean_accept_df.shape

In [None]:
# Print the dtype of every column in the accepted table.
# The option_context lifts pandas' row/column display limits for this block only,
# so the listing is not truncated.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(clean_accept_df.dtypes)



In [None]:
# For each column of the accepted table, print its name and how many distinct
# values it holds (as returned by Series.unique()).
for a in clean_accept_df.columns:
    distinct_values = clean_accept_df[a].unique()
    print(a, len(distinct_values))



**Data Wrangling and Cleaning for Loan Rejected Dataset**



In [None]:
# Fetch the rejected-loans CSV object from S3 and decode its body into text.

obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='rejected_2007_to_2018Q4.csv')
data = obj['Body'].read().decode('utf-8')

# Report the HTTP status of the download. This is a get_object call, so the
# message names that operation (it previously said put_object).
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

# Wrap the decoded text in a file-like object so pandas can parse it
data_file = StringIO(data)

# Parse the CSV text into the raw rejected-loans DataFrame
rejected_df = pd.read_csv(data_file)

rejected_df.head()

In [None]:
# Show the (rows, columns) shape of the raw rejected-loans frame
rejected_df.shape

In [None]:
# Build the cleaned rejected-loans frame by removing the policy_code column;
# rejected_df itself is left untouched.
clean_reject_df = rejected_df.drop('policy_code', axis=1)
clean_reject_df.head()

In [None]:
# Rename some of the columns to match the clean_accept_df dataframe.
# TODO: the mapping is still a placeholder. In Python `{...}` is a set literal holding
# the Ellipsis object, not a dict, so this call cannot perform any rename until a real
# {"old_name": "new_name"} mapping is supplied.
# NOTE(review): inplace=True mutates clean_reject_df rather than producing a new frame.
clean_reject_df.rename(columns={...}, inplace= True) # ADD IN COLUMNS TO RENAME

In [None]:
# Show the (rows, columns) shape of the cleaned rejected-loans frame
clean_reject_df.shape

In [None]:
# Print the dtype of every column in the rejected table.
# The option_context lifts pandas' row/column display limits for this block only.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(clean_reject_df.dtypes)

In [None]:
# For each column of the rejected table, print its name and how many distinct
# values it holds (as returned by Series.unique()).
for b in clean_reject_df.columns:
    distinct_values = clean_reject_df[b].unique()
    print(b, len(distinct_values))



**Merging two datasets**


In [None]:
# Preview the first rows of clean_accept_df before the two tables are combined
clean_accept_df.head()


In [None]:
# Preview the first rows of clean_reject_df before the two tables are combined
clean_reject_df.head()

In [None]:
# Add an is_approve column to both tables recording the outcome of each application.
# Every row of the accepted table was approved and every row of the rejected table was
# denied, so no per-row condition is needed: each table gets one constant value.
# Encoding follows the original note: 0 = approved (accepted table), 1 = denied (rejected table).
# NOTE(review): a column named is_approve where 0 means "approved" is easy to misread;
# confirm this encoding with whatever consumes the exported file.
# .assign returns a new frame, so re-running this cell is safe and the frames are not
# modified in place.
clean_accept_df = clean_accept_df.assign(is_approve=0)
clean_reject_df = clean_reject_df.assign(is_approve=1)

In [None]:
# Stack the accepted and rejected tables into one frame.
# pd.merge with no keys inner-joins on every shared column, including is_approve, whose
# values differ between the two tables, so it would return no rows. The goal here is to
# append the rows of one table to the other, which is what pd.concat does.
# join="inner" keeps only the columns present in both tables; ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each table's own index.
joined_loan_df = pd.concat([clean_accept_df, clean_reject_df], join="inner", ignore_index=True)
joined_loan_df.head()

In [None]:
# Add a 1-based running number as a unique identifier for each row of the combined frame.
# The range must be sized from joined_loan_df itself; sizing it from clean_reject_df
# produces a length mismatch whenever the two frames differ in row count.
joined_loan_df['loan_app_id'] = range(1, len(joined_loan_df) + 1)

**Write Output files back to s3**

Once the dataset has been wrangled and cleaned, the combined data is written back to the S3 bucket.

In [29]:

# Dummy DataFrames I created to test whether data can be written to the AWS S3 output_file folder.
# The whole snippet is wrapped in a string literal, so none of it runs when the cell executes.

""" 
data1 = {'ID': [1, 2, 3, 4], 'Name': ['tom','nick','juli','solyiah'], 'Age' : [10, 15, 14, 10]}
data2 = {'ID': [5, 6, 7, 8], 'Name': ['dick', 'joe', 'harry', 'jake'], 'Age': [21, 30, 45, 30]}

test_df_1 = pd.DataFrame(data1)
test_df_2 = pd.DataFrame(data2)

joined_test_df = pd.merge(test_df_1, test_df_2)

joined_test_df.head()

# test_df_2.head()
# test_df_1.head()

"""

Unnamed: 0,ID,Name,Age


In [None]:
# Destination bucket and object key for the exported CSV
bucket_name = "davis-data-cloud-of-wonders"
path = "output/output_data.csv"

# Serialise the combined frame to CSV text in an in-memory buffer, then upload that text
with io.StringIO() as csv_buffer:
    joined_loan_df.to_csv(csv_buffer, index=False)
    csv_payload = csv_buffer.getvalue()

    # Store the CSV text under the chosen key in the bucket
    response = s3.put_object(Bucket=bucket_name, Body=csv_payload, Key=path)

    # Report whether the upload returned HTTP 200
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

    if status != 200:
        print(f"Unsuccessful S3 put_object response. Status - {status}")
    else:
        print(f"Successful S3 put_object response. Status - {status}")