# Superday

In [37]:
import pandas as pd

In [38]:
df = pd.read_csv('Pre-Super_Day_candidate_dataset__28candidate_29.csv')
print(df.shape)
df.head(5)

(100000, 14)


Unnamed: 0,User ID,applications,Reason,Loan_Amount,FICO_score,Fico_Score_group,Employment_Status,Employment_Sector,Monthly_Gross_Income,Monthly_Housing_Payment,Ever_Bankrupt_or_Foreclose,Lender,Approved,bounty
0,00007820-89cb-4c1d-9940-eb270d605a35,1,cover_an_unexpected_cost,100000,669,fair,full_time,consumer_discretionary,5024,927,0,B,0,0
1,00012b55-514c-421e-9c76-3300abbc1134,1,credit_card_refinancing,70000,594,fair,full_time,information_technology,5764,1177,0,B,0,0
2,000157c1-b6a3-4c86-82c7-9ec1bda3799a,1,home_improvement,10000,596,fair,full_time,information_technology,4017,1487,0,A,0,0
3,00020400-efab-4b10-8812-2a0aaf774841,1,home_improvement,100000,642,fair,part_time,energy,3129,904,0,A,0,0
4,0002f737-0cda-48fb-91ed-533f3d0eab05,1,major_purchase,30000,642,fair,full_time,energy,4220,1620,0,A,0,0


## Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder

In [39]:
df['Fico_Score_group'].unique()

array(['fair', 'poor', 'good', 'very_good', 'excellent'], dtype=object)

In [40]:
df['Employment_Status'].unique()

array(['full_time', 'part_time', 'unemployed'], dtype=object)

In [47]:
# Encode ordinal categorical variables in order and non-ordinal with one hot encoding 

fico_group_map = { 'poor': 0, 'fair': 1, 'good': 2, 'very_good': 3, 'excellent': 4 }
employment_status_map = { 'unemployed': 0, 'part_time': 1, 'full_time': 2 }

df_encoded = df.copy()
df_encoded = pd.get_dummies(df_encoded, columns=['Reason'], prefix=['Reason'])
df_encoded = pd.get_dummies(df_encoded, columns=['Employment_Sector'], prefix=['Employment_Sector'])
df_encoded["Fico_Score_group"] = df_encoded["Fico_Score_group"].map(fico_group_map)
df_encoded["Employment_Status"] = df_encoded["Employment_Status"].map(employment_status_map)
df_encoded.head(2)

Unnamed: 0,User ID,applications,Loan_Amount,FICO_score,Fico_Score_group,Employment_Status,Monthly_Gross_Income,Monthly_Housing_Payment,Ever_Bankrupt_or_Foreclose,Lender,...,Employment_Sector_consumer_discretionary,Employment_Sector_consumer_staples,Employment_Sector_energy,Employment_Sector_financials,Employment_Sector_health_care,Employment_Sector_industrials,Employment_Sector_information_technology,Employment_Sector_materials,Employment_Sector_real_estate,Employment_Sector_utilities
0,00007820-89cb-4c1d-9940-eb270d605a35,1,100000,669,1,2,5024,927,0,B,...,True,False,False,False,False,False,False,False,False,False
1,00012b55-514c-421e-9c76-3300abbc1134,1,70000,594,1,2,5764,1177,0,B,...,False,False,False,False,False,False,True,False,False,False


## Tell us about the variables
_Possible things to consider: Which variables are the most helpful in understanding if a customer is going to be approved or denied for a loan? Are there certain variables that are not useful to collect?_

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
features = df.drop('Lender', axis=1)  # Features
target = df['Lender']
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(features, target)

ValueError: could not convert string to float: '00007820-89cb-4c1d-9940-eb270d605a35'

## Tell us about the lenders
_Possible things to consider: What is each Lender’s average approval rate? How does their revenue per approval differ? Are there any clear differences between the three different lenders on what type of customers they approve?_

Percent% approval by Vendor

In [9]:
df[['Lender', 'Approved']].groupby('Lender').mean() * 100

Unnamed: 0_level_0,Approved
Lender,Unnamed: 1_level_1
A,10.965455
B,7.127273
C,17.057143


In [24]:
approved = df[df['Approved'] == 1]

In [25]:
approved[(approved['Lender'] == 'A')].describe()

Unnamed: 0,applications,Loan_Amount,FICO_score,Monthly_Gross_Income,Monthly_Housing_Payment,Ever_Bankrupt_or_Foreclose,Approved,bounty
count,6031.0,6031.0,6031.0,6031.0,6031.0,6031.0,6031.0,6031.0
mean,1.0,42417.509534,697.358481,7506.998674,1697.039794,0.005472,1.0,250.0
std,0.0,28226.467212,74.113488,3389.702307,670.735658,0.073775,0.0,0.0
min,1.0,5000.0,353.0,2007.0,300.0,0.0,1.0,250.0
25%,1.0,20000.0,659.5,4857.5,1263.0,0.0,1.0,250.0
50%,1.0,30000.0,702.0,7272.0,1724.0,0.0,1.0,250.0
75%,1.0,70000.0,739.0,9612.0,2109.5,0.0,1.0,250.0
max,1.0,100000.0,850.0,19997.0,3299.0,1.0,1.0,250.0


In [27]:
approved[(approved['Lender'] == 'B')].describe()

Unnamed: 0,applications,Loan_Amount,FICO_score,Monthly_Gross_Income,Monthly_Housing_Payment,Ever_Bankrupt_or_Foreclose,Approved,bounty
count,1960.0,1960.0,1960.0,1960.0,1960.0,1960.0,1960.0,1960.0
mean,1.0,41790.816327,732.108673,8053.57602,1694.571939,0.0,1.0,350.0
std,0.0,27723.422857,53.841987,3256.772291,667.931901,0.0,0.0,0.0
min,1.0,5000.0,600.0,2014.0,300.0,0.0,1.0,350.0
25%,1.0,20000.0,690.0,5569.5,1248.0,0.0,1.0,350.0
50%,1.0,30000.0,724.0,7891.5,1710.5,0.0,1.0,350.0
75%,1.0,60000.0,772.0,10016.0,2109.25,0.0,1.0,350.0
max,1.0,100000.0,850.0,19909.0,3287.0,0.0,1.0,350.0


In [28]:
approved[(approved['Lender'] == 'C')].describe()

Unnamed: 0,applications,Loan_Amount,FICO_score,Monthly_Gross_Income,Monthly_Housing_Payment,Ever_Bankrupt_or_Foreclose,Approved,bounty
count,2985.0,2985.0,2985.0,2985.0,2985.0,2985.0,2985.0,2985.0
mean,1.0,41135.678392,674.770519,6322.161809,1344.649581,0.01608,1.0,150.0
std,0.0,27824.071795,81.548255,3137.702364,672.804328,0.125806,0.0,0.0
min,1.0,5000.0,358.0,2008.0,301.0,0.0,1.0,150.0
25%,1.0,20000.0,622.0,3936.0,791.0,0.0,1.0,150.0
50%,1.0,30000.0,682.0,5634.0,1260.0,0.0,1.0,150.0
75%,1.0,60000.0,724.0,8253.0,1875.0,0.0,1.0,150.0
max,1.0,100000.0,850.0,19982.0,3297.0,1.0,1.0,150.0


### Takeaways
- Lender C is the most likely to approve applications at 17% approval, more than twice as much as vendor B at 7%
- Lender B targets the 'Prime' sector, while Lender C targets the lowest earners and Lender A splits the difference. Approved applicants for Lender B have an average income of `8053` per month, compared to Lender C at `6322`. Average Fico score for lender B is `732` compared to C's average of `674`.
- Lender B never approves a person with a foreclosure
- The average loan amount approved by each vendor is roughly the same at ~`$41,000`
- Lender B pays the most for each conversion at `$350`, followed by Lender A at `$250` and Lender C at `150`


Lender B pays the most for each conversion, but also only approves "highest quality" candidate. Lender C approves more people but also pays less. Lender A splits the difference

## Tell us about which customers we should match to each lender