## Prediction Model

**Creating a predictive analysis to determine company's loan decisions that were offered and rejected by the Lending Club**





In [1]:
# First import librar dependacies : 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from plotly import tools
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

%matplotlib inline

In [2]:
# Run Boto3 to establish data connection to s3 bucket
import boto3
import awsKeyConfig
from io import StringIO

s3 = boto3.client('s3',
aws_access_key_id=awsKeyConfig.keyID,
aws_secret_access_key=awsKeyConfig.secretKey,
region_name='us-east-1'
)

In [3]:
# Retrieves the Bucket Name and Key <file_name> and 
obj = s3.get_object(Bucket='davis-data-cloud-of-wonders', Key='clean_loan_data.csv')
data = obj['Body'].read().decode('utf-8')

# Validates whether the connection to s3 is successfull or fail 
status = obj.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 put_object response. Status - {status}")
else:
    print(f"Unsuccessful S3 put_object response. Status - {status}")


# Use StringIO to convert the string data to a file-like object
data_file = StringIO(data)

# Create a DataFrame from the CSV data
loan_df = pd.read_csv(data_file)

loan_df.head()

Successful S3 put_object response. Status - 200


Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,fico_score,issued_year,default_loan
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Fully Paid,debt_consolidation,PA,5.91,677.0,2015-12-01,0
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Fully Paid,small_business,SD,16.06,717.0,2015-12-01,0
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Fully Paid,home_improvement,IL,10.78,697.0,2015-12-01,0
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Current,debt_consolidation,NJ,17.06,787.0,2015-12-01,0
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Fully Paid,major_purchase,PA,25.37,697.0,2015-12-01,0


In [4]:
# quick look at loan_status data types
loan_df['loan_status'].value_counts()

loan_status
Fully Paid            1076751
Current                878317
Charged Off            268559
Late (31-120 days)      21467
Late (16-30 days)        4349
Default                    40
Name: count, dtype: int64

In [5]:
# create an comparison between good and bad loan by clustering loan_status

bad_loan = ['Default', 'Charged Off', 'Late (31-120 days)']

loan_df['loan_condition'] = np.nan # omits the non-value

def loan_condition(status):
    if status in bad_loan:
        return 'Bad Loan'
    else:
        return 'Good Loan'
    
    
loan_df['loan_condition'] = loan_df['loan_status'].apply(loan_condition)

In [6]:
# review the regions dataset by unique value
loan_df['addr_state'].unique()

array(['PA', 'SD', 'IL', 'NJ', 'GA', 'MN', 'SC', 'RI', 'TX', 'NC', 'CA',
       'VA', 'AZ', 'NY', 'IN', 'MD', 'KS', 'NM', 'AL', 'WA', 'MO', 'OH',
       'LA', 'FL', 'CO', 'MI', 'TN', 'DC', 'MA', 'WI', 'HI', 'VT', 'DE',
       'NH', 'NE', 'CT', 'OR', 'AR', 'MT', 'NV', 'WV', 'WY', 'OK', 'KY',
       'MS', 'ME', 'UT', 'ND', 'AK', nan, 'ID', 'IA'], dtype=object)

In [7]:
# going to take data by regions : North_East, South_East, Mid_West, South_West, West
west = ['CA', 'OR', 'UT','WA', 'CO', 'NV', 'AK', 'MT', 'HI', 'WY', 'ID']
south_west = ['AZ', 'TX', 'NM', 'OK']
south_east = ['GA', 'NC', 'VA', 'FL', 'KY', 'SC', 'LA', 'AL', 'WV', 'DC', 'AR', 'DE', 'MS', 'TN' ]
mid_west = ['IL', 'MO', 'MN', 'OH', 'WI', 'KS', 'MI', 'SD', 'IA', 'NE', 'IN', 'ND']
north_east = ['CT', 'NY', 'PA', 'NJ', 'RI','MA', 'MD', 'VT', 'NH', 'ME']

loan_df['region'] = np.nan

def finding_regions(state):
    if state in west:
        return 'West'
    elif state in south_west:
        return 'South West'
    elif state in south_east:
        return 'South East'
    elif state in mid_west:
        return 'Mid West'
    elif state in north_east:
        return 'North East'
    


loan_df['region'] = loan_df['addr_state'].apply(finding_regions)

In [8]:
loan_df.head()

Unnamed: 0,loan_amnt,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,addr_state,dti,fico_score,issued_year,default_loan,loan_condition,region
0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Fully Paid,debt_consolidation,PA,5.91,677.0,2015-12-01,0,Good Loan,North East
1,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Fully Paid,small_business,SD,16.06,717.0,2015-12-01,0,Good Loan,Mid West
2,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Fully Paid,home_improvement,IL,10.78,697.0,2015-12-01,0,Good Loan,Mid West
3,35000.0,60 months,14.85,829.9,C,C5,Information Systems Officer,10+ years,MORTGAGE,110000.0,Source Verified,Current,debt_consolidation,NJ,17.06,787.0,2015-12-01,0,Good Loan,North East
4,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Fully Paid,major_purchase,PA,25.37,697.0,2015-12-01,0,Good Loan,North East


In [9]:
# creating dataframe called bad loan and retriving count of bad loans
bad_loan_df = loan_df.loc[loan_df['loan_condition'] == 'Bad Loan']
bad_loan_df['loan_condition'].value_counts()

loan_condition
Bad Loan    290066
Name: count, dtype: int64

In [10]:
# going to use an pandas function called crosstab 
# Crosstab builds a cross-tabulation table that can show the frequency with which certain groups of data appear.

loan_cross_tab = pd.crosstab(bad_loan_df['region'], bad_loan_df['loan_status']).apply(lambda x: x/x.sum() * 100)
num_loan_status = pd.crosstab(bad_loan_df['region'], bad_loan_df['loan_status'])

# using lambda to round the values
loan_cross_tab['Charged Off'] = loan_cross_tab['Charged Off'].apply(lambda x: round(x, 2))
loan_cross_tab['Default'] = loan_cross_tab['Default'].apply(lambda x: round(x, 2))
loan_cross_tab['Late (31-120 days)'] = loan_cross_tab['Late (31-120 days)'].apply(lambda x: round(x, 2))

# adding all the values in an Total column
num_loan_status['Total'] = num_loan_status.sum(axis=1)

num_loan_status

loan_status,Charged Off,Default,Late (31-120 days),Total
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Mid West,46377,7,3476,49860
North East,62535,13,5181,67729
South East,67229,7,5549,72785
South West,32712,6,2671,35389
West,59706,7,4590,64303


In [11]:
# adding each loan status values to a list 
charged = loan_cross_tab['Charged Off'].values.tolist()
default = loan_cross_tab['Default'].values.tolist()
late = loan_cross_tab['Late (31-120 days)'].values.tolist()

In [12]:
# Import Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample


In [17]:
# Define X and y
X = loan_df.drop(columns=['default_loan', 'loan_condition', 'region'])  # Features
y = loan_df['default_loan']  # Target variable


In [18]:
# Step 1: Determine the sample size
sample_size = 10000  # Adjust the sample size as needed

# Take a sample of the dataset
X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, stratify=y, random_state=42)

print("Sample Size:", sample_size)  # Print the sample size
print("X_sample shape:", X_sample.shape)  # Print the shape of X_sample
print("y_sample shape:", y_sample.shape)  # Print the shape of y_sample


Sample Size: 10000
X_sample shape: (10000, 17)
y_sample shape: (10000,)


In [19]:
# Step 2: Data Preprocessing
# Preprocessing categorical variables
categorical_features = X_sample.select_dtypes(include=['object']).columns.tolist()
numeric_features = X_sample.select_dtypes(include=['float64']).columns.tolist()

# Pipeline for preprocessing
# Preprocessing pipeline with imputation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])


In [20]:
# Step 3: Model Selection and Training
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

model.fit(X_sample, y_sample)



In [21]:
# Step 4: Model Evaluation
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1747
           1       1.00      1.00      1.00       253

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

Confusion Matrix:
[[1747    0]
 [   0  253]]


In [24]:
# Define the list of columns you want to include in the weighted average analysis
selected_columns = ['loan_amnt', 'int_rate', 'installment', 'annual_inc', 'delinq_2yrs', 'dti', 'fico_score', 'issued_year']

# Step 5: Weighted Averages Analysis
if hasattr(model.named_steps['classifier'], 'feature_importances_'):
    feature_importances = model.named_steps['classifier'].feature_importances_
    sorted_indices = np.argsort(feature_importances)[::-1]
    print("Feature Importances:")
    for idx in sorted_indices:
        if idx < len(selected_columns):  # Ensure the index is within the range of selected columns
            column_name = selected_columns[idx]
            print(f"{column_name}: {feature_importances[idx]}")
else:
    print("Model does not support feature importances.")

Feature Importances:
int_rate: 0.026624897114637412
installment: 0.01723936800454756
delinq_2yrs: 0.017136835955951113
annual_inc: 0.016395874650246202
dti: 0.015120195885442288
loan_amnt: 0.014833422858088191
fico_score: 0.004147001531150149
issued_year: 0.004123760892931875
