# PPP Loan Analysis
Team 22032: Danny Rivas, Javan Reuto

Date: 03/05/22

---

First we begin, we'll first import libraries we will be using. 

In [9]:
import pandas as pd
import numpy as np
from numpy import nan
import re
import datetime as dt
import time

Here is an overview what we'll cover:
* [**Part One: Data Preparation**](#part01)
    * [Load the Data](#load-the-data)
    * [Data Cleaning](#data-cleaning)
    * [Location Cleaning](#location-cleaning)
    * [Combining & Removing Duplicates](#combining--removing-duplicates)
* [**Part Two: Analysis**](#part02)
    * [Defining Characteristics Of Removed Loans](#defining-characteristics-of-removed-loans)
    * [Removed Loans Versus All Loans](#removed-loans-versus-all-loans)
* [**Part Three: Prediction**](#part03)

## Part One: Data Preparation <a class="anchor" id="part01"></a>

### Load the Data <a class="anchor" id="load-the-data"></a>

In [10]:
# Load files as dataframe
ga_full_df = pd.read_csv('ppp_applicants_ga_full.csv')
ga_removed_df = pd.read_excel('ppp-removed-ga.xlsx')
test_df = pd.read_csv('test_file.csv')

### Data Cleaning <a class="anchor" id="data-cleaning"></a>

In [11]:
# Creating function to clean dataframes - cleaner
def cleaner(dataframe):
    
    # Creating nested function to format string columns
    def uppercase_columns(df, column):
        df[column] = df[column].str.upper()
        return df
    
    # Invoking functions
    dataframe = uppercase_columns(dataframe, 'name')
    dataframe = uppercase_columns(dataframe, 'address')
    dataframe = uppercase_columns(dataframe, 'city')
    dataframe = uppercase_columns(dataframe, 'business_type')
    dataframe = uppercase_columns(dataframe, 'loan_status')
    dataframe = uppercase_columns(dataframe, 'lender')
    dataframe = uppercase_columns(dataframe, 'servicing_lender_name')
    dataframe = uppercase_columns(dataframe, 'servicing_lender_address')
    dataframe = uppercase_columns(dataframe, 'servicing_lender_city')
    dataframe = uppercase_columns(dataframe, 'business_age_description')
    dataframe = uppercase_columns(dataframe, 'project_city')
    dataframe = uppercase_columns(dataframe, 'project_county_name')
    dataframe = uppercase_columns(dataframe, 'originating_lender_city')
    
    # Creating nested function to format dates
    def format_date(df, column):
        df[column] = pd.to_datetime(df[column], format="%Y-%m-%d")
        return df
    
    # Invoking format_date function to date columns
    dataframe = format_date(dataframe,'date_approved')
    dataframe = format_date(dataframe,'loan_status_date')
    dataframe = format_date(dataframe,'forgiveness_date')

    def standard_zip(df,zip_col):
        df[zip_col] = df[zip_col].str[:5]
        return df

    dataframe = standard_zip(dataframe,'zip')
    dataframe = standard_zip(dataframe, 'servicing_lender_zip')
    dataframe = standard_zip(dataframe, 'project_zip')

    # Set loan number as index
    dataframe = dataframe.set_index('loan_number')
    
    # Return dataframe
    return dataframe

# Invoke function on dataframes
ga_full_df = cleaner(ga_full_df)
ga_removed_df = cleaner(ga_removed_df)


In [12]:
# Adding dummy variable for removed dataset
ga_full_df["Removed"] = 0
ga_removed_df["Removed"] = 1

#Combining both datasets in ga_df
ga_df = pd.concat([ga_full_df, ga_removed_df])

### Location Cleaning <a class="anchor" id="location-cleaning"></a>

In [13]:

# Create function to clean city names and zip codes
def standard_city(df, zip_col, city_col):
    start = time.process_time() #Start timer
    city_dict = dict(zip(df[zip_col], df[city_col])) # Create dictionary
    df[city_col] = df[zip_col] 
    df[city_col] = df[city_col].replace(city_dict) # Replace city values with name from dictionary
    print(f"{city_col} finished in {(time.process_time() - start):.2f} seconds.") # End timer
    return df

# Invoking function on all our zip codes and cities
ga_df = standard_city(ga_df,'zip', 'city')
ga_df = standard_city(ga_df, 'servicing_lender_zip','servicing_lender_city')
ga_df = standard_city(ga_df, 'project_zip', 'project_city')


city finished in 12.73 seconds.
servicing_lender_city finished in 12.21 seconds.
project_city finished in 12.76 seconds.


### Combining & Removing Duplicates <a class="anchor" id="combining--removing-duplicates"></a>

In [14]:
# Sorting by index
ga_df = ga_df.sort_index()

# Removing any duplicates based on index: loan number
ga_df = ga_df[~ga_df.index.duplicated(keep='first')]

# Updating dataframes 
ga_full_df = ga_df[ga_df["Removed"] == 0]
ga_removed_df = ga_df[ga_df["Removed"] == 1]



## Part Two: Analysis <a class="anchor" id="part02"></a>

### Defining Characteristics Of Removed Loans <a class="anchor" id="defining-characteristics-of-removed-loans"></a>

### Removed Loans Versus All Loans <a class="anchor" id="removed-loans-versus-all-loans"></a>

## Part Three: Prediction <a class="anchor" id="part03"></a>