# Lending Club Case Study

Imports and basic set-up

In [None]:
import warnings
from os import getcwd
from os.path import join

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and chained-assignment
# warnings raised by the libraries used below are hidden as well.
warnings.filterwarnings('ignore')
# Remove pandas' display limits so that frames and long text cells are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
# Use seaborn's "darkgrid" theme for every figure.
sns.set_style('darkgrid')
# plt.style.use('seaborn-v0_8-pastel')

In [None]:
# The project root is taken to be the current working directory.
PRJ_DIR = getcwd()
# Input files are read from the "data" sub-directory of the project root.
DATA_DIR = join(PRJ_DIR, 'data')

## Step 0: Reading data

In [None]:
# Load the raw loan records from <DATA_DIR>/loan.csv and preview the first 10 rows.
df = pd.read_csv(join(DATA_DIR, 'loan.csv'))
df.head(10)

In [None]:
# Load the data dictionary from <DATA_DIR>/Data_Dictionary.xlsx.
data_dict = pd.read_excel(join(DATA_DIR, 'Data_Dictionary.xlsx'))
# Discard every dictionary row that has a missing value in any of its cells.
data_dict = data_dict.dropna()
# Show 5 randomly chosen dictionary entries.
data_dict.sample(5)

## Step 1: Cleaning

### Choosing columns
There are columns with `NaN` values.

In [None]:
df.isna().sum()

As seen above, some columns do not contain any values at all, and some others contain only a few non-null values.

In [None]:
# Number of columns in which MORE than half of the rows are null. The strict
# comparison matches the column filter applied next, which keeps every column
# whose null share is at most 50%.
(df.isna().mean() > 0.5).sum()

57 columns have more than 50% of their values as null. I am choosing not to use them and am dropping them from the analysis.

In [None]:
# Keep the columns in which at most half of the rows are null.
null_counts = df.isna().sum()
max_nulls = 0.50 * df.shape[0]
keep_mask = null_counts <= max_nulls
column_names = df.columns[keep_mask].tolist()
data_df = df[column_names]
# The unfiltered frame is not used again; release it.
del df
data_df.sample(3)

Another thing to check in the remaining columns is the number of unique values found in each column.

In [None]:
data_df.nunique()

There are columns which contain only 1 value for all the rows. We are choosing not to use them either, as they do not add any information with respect to our target variable.

In [None]:
(data_df.nunique() <= 1).sum()

9 columns will be additionally removed from our analysis.

In [None]:
# Keep only the columns that hold more than one distinct value.
# NOTE: nunique() ignores NaN by default, so a column holding a single value
# plus nulls is treated as constant and dropped too.
has_variation = data_df.nunique() > 1
column_names = data_df.columns[has_variation]
# Take an explicit copy: the cleaning cells that follow add and overwrite
# columns on this frame, which must not be a flagged slice of the previous one.
data_df = data_df[column_names].copy()
data_df.sample(3)

In [None]:
# Per-column summary: name, number of nulls, number of distinct values and the
# description from the data dictionary.
# The dictionary is turned into a name -> description lookup once (first entry
# wins for a repeated name). A column without a dictionary entry gets NaN as its
# description instead of aborting the whole cell with an IndexError.
descriptions = (data_dict.drop_duplicates(subset='LoanStatNew')
                         .set_index('LoanStatNew')['Description'])
column_desc_df = pd.DataFrame({
    'Column_Name': data_df.columns,
    'Num_NAs': data_df.isna().sum().values,
    'Num_unique_vals': data_df.nunique().values,
    'Description': descriptions.reindex(data_df.columns).values,
})
column_desc_df

In [None]:
data_df.info()

In [None]:
def get_earliest_yr(x, pivot=11):
    """Expand the two-digit year of a ``<month>-<yy>`` string to four digits.

    Args:
        x: Text whose second ``-``-separated field is a two-digit year,
            e.g. ``"Jan-85"``.
        pivot: Largest two-digit year that is placed in the 2000s; every
            larger value is placed in the 1900s. Defaults to 11.

    Returns:
        The four-digit year as a string, e.g. ``"1985"``.

    Raises:
        IndexError: If ``x`` contains no ``-`` separator.
        ValueError: If the year field is not an integer.
    """
    two_digit_yr = int(x.split("-")[1])
    century = 2000 if 0 <= two_digit_yr <= pivot else 1900
    return str(century + two_digit_yr)

In [None]:
def _term_in_months(raw):
    """Return the leading number of a term string as an int."""
    months, *_ = raw.strip().split(' ')
    return int(months)


def _rate_as_float(raw):
    """Return a percentage string as a float, without the '%' sign."""
    return float(raw.strip().strip('%'))


# Convert the textual term and interest-rate columns to numbers.
data_df['term'] = data_df['term'].map(_term_in_months)
data_df['int_rate'] = data_df['int_rate'].map(_rate_as_float)

In [None]:
# Split the issue date on '-' once, then derive the month and the year.
issue_parts = data_df['issue_d'].map(lambda raw: raw.split('-'))
data_df['issue_d_month'] = issue_parts.map(lambda parts: parts[0])
# The second field is prefixed with "20" to form the year.
data_df['issue_d_year'] = issue_parts.map(lambda parts: "20" + parts[1])

In [None]:
# Derive month and four-digit year from the earliest credit line date.
cr_line = data_df['earliest_cr_line']
data_df['earliest_cr_line_month'] = cr_line.map(lambda raw: raw.split('-')[0])
data_df['earliest_cr_line_year'] = cr_line.map(get_earliest_yr)

In [None]:
def _upper_emp_title(raw):
    """Strip and upper-case an employer title; float entries pass through unchanged."""
    if isinstance(raw, float):
        return raw
    return raw.strip().upper()


data_df['emp_title'] = data_df['emp_title'].map(_upper_emp_title)

In [None]:
# Any value whose text contains '<' is replaced by the single label '<1 year'.
data_df['emp_length'] = data_df['emp_length'].map(
    lambda raw: '<1 year' if '<' in str(raw) else raw
)

In [None]:
# The raw date columns have been split into month/year parts above; drop them together with the URL.
obsolete_columns = ['issue_d', 'earliest_cr_line', 'url']
data_df = data_df.drop(columns=obsolete_columns)

## Step 2: Uni-variate Analysis

### Annual Income

In [None]:
data_df.annual_inc.describe()

In [None]:
# Box plot of annual income before outlier removal.
# The series is passed as `y` explicitly: a bare positional argument is read as
# `x` by some seaborn versions and as `data` by others, so the orientation of
# the box would otherwise depend on the installed version.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.annual_inc)
plt.title('Annual Income')
plt.show()

Here we encounter some outliers. It is evident from the 5-point summary that 75% of the annual incomes are below USD 100K, but the highest income is USD 6M. Analyzing this column while considering all the values would not yield correct results, and therefore we choose to drop rows where the annual income is greater than USD 150K.

In [None]:
# Keep only the rows whose annual income is at most USD 150K.
# The explicit copy makes the result an independent frame, so the column
# assignments made further down do not write into a slice of the old frame.
data_df = data_df[data_df.annual_inc <= 150_000].copy()
data_df.annual_inc.describe()

In [None]:
# Box plot of annual income after outlier removal.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument; the x axis carries no information, so its
# ticks are removed.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.annual_inc)
plt.title('Annual Income')
plt.xticks(ticks=[])
plt.show()

In [None]:
# Histogram of annual income with a KDE overlay; both axis labels are blanked.
_, income_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data_df.annual_inc, kde=True, ax=income_ax)
income_ax.set_title('Annual Income')
income_ax.set_xlabel("")
income_ax.set_ylabel("")
plt.show()

We observe that:
- The median annual income is USD 57K
- The income of most of the applicants lies below USD 80K

### Loan amount

In [None]:
data_df.loan_amnt.describe()

In [None]:
# Box plot of the requested loan amount.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.loan_amnt)
plt.title('Loan Amount')
plt.show()

We observe that:
- The median amount that is applied for is USD 9600
- Only 25% of the applicants have applied for amounts greater than USD 15000

In [None]:
# Histogram (20 bins) of the requested loan amount with a KDE overlay.
_, loan_amnt_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data_df.loan_amnt, bins=20, kde=True, ax=loan_amnt_ax)
loan_amnt_ax.set_title('Loan Amount')
plt.show()

### Funded Amount

In [None]:
data_df.funded_amnt.describe()

In [None]:
# Box plot of the funded amount.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.funded_amnt)
plt.title('Funded Amount')
plt.show()

In [None]:
# Histogram of the funded amount with a KDE overlay.
_, funded_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data_df.funded_amnt, kde=True, ax=funded_ax)
funded_ax.set_title('Funded Amount')
plt.show()

We observe that:
- The median amount that is funded by the club is approximately USD 9400

### Amount Funded by Investor

In [None]:
data_df.funded_amnt_inv.describe()

In [None]:
# Box plot of the amount funded by investors.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument; the x axis carries no information, so its
# ticks are removed. The title previously read "Invertors".
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.funded_amnt_inv)
plt.title('Amount Funded by Investors')
plt.xticks(ticks=[])
plt.show()

In [None]:
# Histogram (20 bins) of the amount funded by investors with a KDE overlay.
# The title previously read "Invertors".
plt.figure(figsize=(10, 5))
sns.histplot(data_df.funded_amnt_inv, bins=20, kde=True)
plt.title('Amount Funded by Investors')
plt.show()

We observe that:
- The median amount that is funded by the investors is approximately USD 8600
- Only 25% of the loans have been funded for more than USD 14000

### Term of the loan

In [None]:
# Number of loans per term.
# The bars are ordered by term length and the tick labels are built from the
# plotted index, so a label can never end up under the wrong bar (value_counts()
# alone orders by frequency, which hard-coded labels would silently rely on).
term_counts = data_df.term.value_counts().sort_index()
plt.figure(figsize=(4, 5))
term_counts.plot.bar()
plt.title("Term of the loan")
plt.xticks(ticks=range(len(term_counts)),
           labels=[f'{months} months' for months in term_counts.index],
           rotation=0)
plt.xlabel("")
plt.ylabel("Number of loans")
plt.show()

We have loans with 2 types of terms:
- 36 months or 3 years
- 60 months or 5 years

We observe that the number of loans issued for the 36-month period is far greater than the number issued for the 60-month period.

### Interest Rates

In [None]:
data_df.int_rate.describe()

In [None]:
# Box plot of the interest rate.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument; the x axis carries no information, so its
# ticks are removed.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.int_rate)
plt.title('Interest Rate')
plt.xticks(ticks=[])
plt.show()

In [None]:
# Histogram (50 bins) of the interest rate with a KDE overlay; the y label is cleared.
_, rate_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data_df.int_rate, bins=50, kde=True, ax=rate_ax)
rate_ax.set_title('Interest Rate')
rate_ax.set_ylabel(None)
plt.show()

We observe that:
- **More than 75%** of the loans have an interest rate less than **15%**
- **50%** of the loans have an interest rate between ~**9%** and ~**14.5%** 

### Installments

In [None]:
data_df.installment.describe()

In [None]:
# Box plot of the installment.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument; the x axis carries no information, so its
# ticks are removed.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.installment)
plt.title('Installments')
plt.xticks(ticks=[])
plt.show()

In [None]:
# Histogram (50 bins) of the installment with a KDE overlay; the y label is cleared.
_, installment_ax = plt.subplots(figsize=(10, 5))
sns.histplot(data_df.installment, kde=True, bins=50, ax=installment_ax)
installment_ax.set_title('Installments')
installment_ax.set_ylabel(None)
plt.show()

We observe that:
- Median installment paid is ~**USD 275**
- **More than 75%** of the monthly installments are **below USD 415**

### Grade and Sub-Grade

In [None]:
# Number of loans per grade, most frequent grade first.
_, grade_ax = plt.subplots(figsize=(5, 4))
data_df.grade.value_counts().plot.bar(ax=grade_ax, rot=0)
grade_ax.set_title("Grade of loans")
grade_ax.set_xlabel("")
grade_ax.set_ylabel("Number of loans")
plt.show()

In [None]:
# Number of loans per sub-grade, most frequent sub-grade first.
_, sub_grade_ax = plt.subplots(figsize=(10, 5))
data_df.sub_grade.value_counts().plot.bar(ax=sub_grade_ax, rot=0)
sub_grade_ax.set_title("Sub-Grade of loans")
sub_grade_ax.set_xlabel("")
sub_grade_ax.set_ylabel("Number of loans")
plt.show()

We observe that:
- Most of the loans are **Grade B**, followed by **A**, and **C**
- Most of the loans are of sub-grade **A4**, followed by **B3** and **A5**

### Employee Title

> **NOTE**: The values of this column are text. We observe that values indicating the same employer name are repeated with different cases and spellings. We tackled the case differences earlier by converting every value to upper case.

In [None]:
data_df.emp_title.value_counts()[:10]

We observed that:
- The most frequently occurring employer among the loan applicants is the **US Army**

### Employee Length

In [None]:
data_df.emp_length.value_counts()

In [None]:
# Number of loans per employment-length bucket, most frequent bucket first.
# The title previously read "Employement".
plt.figure(figsize=(10, 5))
data_df.emp_length.value_counts().plot.bar()
plt.title("Years of Continuous Employment")
plt.xticks(rotation=0)
plt.xlabel("")
plt.ylabel("Number of loans")
plt.show()

We observe that:
- More than **8300** applicants have **more than 10 years** of continuous employment
- Applicants with **less than 1 year** of experience are a distant second

### Home Ownership

In [None]:
data_df.home_ownership.value_counts()

In [None]:
# Number of loans per home-ownership category, most frequent category first.
_, ownership_ax = plt.subplots(figsize=(10, 5))
data_df.home_ownership.value_counts().plot.bar(ax=ownership_ax, rot=0)
ownership_ax.set_title("Type of home ownership")
ownership_ax.set_xlabel("")
ownership_ax.set_ylabel("Number of loans")
plt.show()

We observe that:
- More than **18500** applicants are renters, closely followed by applicants with a mortgage

### Verification Status of Income

In [None]:
data_df.verification_status.value_counts()

In [None]:
# Number of loans per income-verification status, most frequent status first.
_, verification_ax = plt.subplots(figsize=(10, 5))
data_df.verification_status.value_counts().plot.bar(ax=verification_ax, rot=0)
verification_ax.set_title("Status of Income/Income source Verification")
verification_ax.set_xlabel("")
verification_ax.set_ylabel("Number of loans")
plt.show()

We observe that:
- ~**44%** applicants' income is not verified
- Only ~**25%** of the applicants' source of income is verified

### Loan Status

In [None]:
data_df.loan_status.value_counts() # / (~data_df.loan_status.isna()).sum()

In [None]:
# Number of loans per loan status, most frequent status first.
_, status_ax = plt.subplots(figsize=(10, 5))
data_df.loan_status.value_counts().plot.bar(ax=status_ax, rot=0)
status_ax.set_title("Status of loan")
status_ax.set_xlabel("")
status_ax.set_ylabel("Number of loans")
plt.show()

We observe that:
- More than **31K** loans have been fully paid
- **~5.5K** loans are actually bad loans i.e. the applicants defaulted

### Description of the loan

In [None]:
# Strip and lower-case the free-text loan description; non-string entries are
# left untouched. The result is stored back in the frame (it was previously
# computed and then discarded), and only a short preview is displayed.
data_df['desc'] = data_df['desc'].apply(lambda x: x.strip().lower() if isinstance(x, str) else x)
data_df['desc'].head()

### Purpose of Loan

In [None]:
data_df.purpose.value_counts() / (~data_df.purpose.isna()).sum()

In [None]:
# Horizontal bar chart of the number of loans per purpose category.
_, purpose_ax = plt.subplots(figsize=(10, 5))
data_df.purpose.value_counts().plot.barh(ax=purpose_ax)
purpose_ax.set_title("Category of loan purpose")
purpose_ax.set_ylabel("")
purpose_ax.set_xlabel("Number of loans")
plt.show()

We observe that:
- The most popular reason for applying for a loan is **debt consolidation (~47%)**
- The fewest loan applications are for **Renewable Energy**, **Education**, and **Housing**

### Title

In [None]:
def _lower_loan_title(raw):
    """Strip and lower-case a loan title; float entries pass through unchanged."""
    if isinstance(raw, float):
        return raw
    return raw.strip().lower()


data_df['title'] = data_df['title'].map(_lower_loan_title)

In [None]:
data_df.title.value_counts()[:10]

We observe that:
- Most loans are applied for **Debt Consolidation**

### State of residence 

In [None]:
data_df.addr_state.value_counts()

In [None]:
# The five states with the highest number of applicants.
_, top_states_ax = plt.subplots(figsize=(10, 5))
data_df.addr_state.value_counts().head(5).plot.bar(ax=top_states_ax, rot=0)
top_states_ax.set_title("Top-5 States with most applicants")
top_states_ax.set_xlabel("")
top_states_ax.set_ylabel("Number of loans")
plt.show()

In [None]:
# The five states with the lowest number of applicants.
_, bottom_states_ax = plt.subplots(figsize=(10, 5))
data_df.addr_state.value_counts().tail(5).plot.bar(ax=bottom_states_ax, rot=0)
bottom_states_ax.set_title("Top-5 States with least applicants")
bottom_states_ax.set_xlabel("")
bottom_states_ax.set_ylabel("Number of loans")
plt.show()

We observe that:
- Most applicants are from California, New York, and Florida
- Iowa, Nebraska, and Maine are the states with least number of loan applications

### Debt-To-Income Ratio

In [None]:
data_df.dti.describe()

In [None]:
# Box plot of the debt-to-income ratio.
# `y=` pins a vertical box regardless of how the installed seaborn version
# interprets a positional argument; the x axis carries no information, so its
# ticks are removed.
plt.figure(figsize=(3, 8))
sns.boxplot(y=data_df.dti)
plt.title('Debt to Income Ratio')
plt.xticks(ticks=[])
plt.show()

In [None]:
# Histogram of the debt-to-income ratio with a KDE overlay.
# The title now names the plotted variable; it previously carried the title of
# the investor-funding plot.
plt.figure(figsize=(10, 5))
sns.histplot(data_df.dti, kde=True)
plt.title('Debt to Income Ratio')
plt.show()

We observe that:
- The median ratio is 13.5, which is quite good
- 25% of the applicants have DTI > 18.7

### Delinquency in the past 2 years

In [None]:
data_df.delinq_2yrs.value_counts()

In [None]:
# Number of loans per delinquency count, leaving out the most frequent count so
# that the remaining bars are readable.
# value_counts() is indexed by the delinquency count itself (numbers), so the
# first row is skipped with an explicit positional .iloc slice rather than a
# bare [1:], which reads like a label lookup. The title previously read
# "delinqueny".
plt.figure(figsize=(10, 5))
data_df.delinq_2yrs.value_counts().iloc[1:].plot.bar()
plt.title("Number of delinquency cases")
plt.xticks(rotation=0)
plt.xlabel("")
plt.ylabel("Number of loans")
plt.show()

We observe that:
- Most of the applicants have not shown any delinquent behaviour in the past 2 years
- Fewer than 100 applicants have a record of 4 or more delinquency incidents in the past 2 years

### Inquiries made in the past 6 months

In [None]:
data_df.inq_last_6mths.value_counts()

In [None]:
# Number of loans per count of credit inquiries made in the last 6 months.
# The title now names the plotted variable; it previously carried the title of
# the delinquency plot.
plt.figure(figsize=(10, 5))
data_df.inq_last_6mths.value_counts().plot.bar()
plt.title("Number of inquiries in the last 6 months")
plt.xticks(rotation=0)
plt.xlabel("")
plt.ylabel("Number of loans")
plt.show()