In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.impute import SimpleImputer

## About

This notebook documents the process of cleaning the accepted loans dataset in preparation for dimensionality reduction later.

## Loading the Accepted Loans dataset

In [2]:
accepted_path = './Datasets/accepted_2007_to_2018Q4.csv'
# low_memory=False lets pandas infer every column's dtype from the whole file
# rather than chunk by chunk. NOTE(review): the original load emitted a warning
# (presumably the mixed-type DtypeWarning); this option avoids that class of
# warning at the cost of a higher peak memory use while parsing.
data_accepted = pd.read_csv(accepted_path, low_memory=False)

  data_accepted = pd.read_csv(accepted_path)


In [3]:
data_accepted.shape

(2260701, 151)

Starting with 151 features

## Missing Data Handling

In [4]:
# Features whose share of missing values exceeds this fraction are discarded
missing_threshold = 0.1

missing_share = data_accepted.isnull().sum() / data_accepted.shape[0]
sufficiently_complete = missing_share <= missing_threshold

# Keep the well-populated features, then drop rows lacking a grade or sub-grade
data_accepted = data_accepted.loc[:, sufficiently_complete].dropna(subset=['grade', 'sub_grade'])

In [5]:
data_accepted.shape

(2260668, 92)

- By removing features with more than 10% missing values, we reduced the dataset's dimensionality by 59 features (from 151 to 92).
- More comprehensive prediction of missing values using various ML models could work as an alternative to simple threshold removal.

In [6]:
data_accepted.duplicated().sum()

0

- No duplicates in the data

## High cardinality handling
- Threshold of 50 unique categories was selected

In [7]:
# Object-typed columns with more than 50 distinct values, in original column order
is_object_column = data_accepted.dtypes == 'object'
distinct_counts = data_accepted.loc[:, is_object_column].nunique()
high_cardinality_columns = distinct_counts[distinct_counts > 50].index.tolist()

## Taking a look at filtered high cardinality features

In [8]:
data_accepted[high_cardinality_columns].head()

Unnamed: 0,id,emp_title,issue_d,url,title,zip_code,addr_state,earliest_cr_line,last_pymnt_d,last_credit_pull_d
0,68407277,leadman,Dec-2015,https://lendingclub.com/browse/loanDetail.acti...,Debt consolidation,190xx,PA,Aug-2003,Jan-2019,Mar-2019
1,68355089,Engineer,Dec-2015,https://lendingclub.com/browse/loanDetail.acti...,Business,577xx,SD,Dec-1999,Jun-2016,Mar-2019
2,68341763,truck driver,Dec-2015,https://lendingclub.com/browse/loanDetail.acti...,,605xx,IL,Aug-2000,Jun-2017,Mar-2019
3,66310712,Information Systems Officer,Dec-2015,https://lendingclub.com/browse/loanDetail.acti...,Debt consolidation,076xx,NJ,Sep-2008,Feb-2019,Mar-2019
4,68476807,Contract Specialist,Dec-2015,https://lendingclub.com/browse/loanDetail.acti...,Major purchase,174xx,PA,Jun-1998,Jul-2016,Mar-2018


In [9]:
data_accepted[high_cardinality_columns].nunique()

id                    2260668
emp_title              512694
issue_d                   139
url                   2260668
title                   63154
zip_code                  956
addr_state                 51
earliest_cr_line          754
last_pymnt_d              136
last_credit_pull_d        141
dtype: int64

In [10]:
# Discard the high-cardinality columns that will not be kept as features
data_accepted = data_accepted.drop(columns=['url', 'title', 'id', 'zip_code'])

- Removing url, title, id and zip_code.

- Standardizing format of emp_title feature

In [11]:
# Normalise job titles: lower-case, underscores -> spaces, keep only letters and
# whitespace, trim the ends. The vectorised .str methods propagate NaN, so missing
# titles stay missing without a str(x) -> 'nan' -> NaN round trip (which would also
# turn a genuine title that cleans to "nan" into a missing value).
emp_title_clean = (
    data_accepted['emp_title']
    .str.lower()
    .str.replace('_', ' ', regex=False)
    .str.replace(r'[^a-z\s]', '', regex=True)
    .str.strip()
    # A title with no letters left (e.g. purely numeric) carries no information
    .replace('', np.nan)
)

# Keep the 20 most frequent titles; every other non-missing title becomes 'other'
top_20_titles = emp_title_clean.value_counts().nlargest(20).index.tolist()
keep_unchanged = emp_title_clean.isin(top_20_titles) | emp_title_clean.isna()
data_accepted['emp_title'] = emp_title_clean.where(keep_unchanged, 'other')

In [12]:
data_accepted['emp_title'].nunique()

21

- We are left with 21 unique categories: the top 20 titles plus 'other', which represents every title outside the top 20.


## Datetime feature conversion


In [13]:
# Every date-like column is stored as text such as "Dec-2015" (abbreviated month, year)
date_columns = ['issue_d', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']
for date_column in date_columns:
    data_accepted[date_column] = pd.to_datetime(data_accepted[date_column], format='%b-%Y')

In [14]:
data_accepted.dtypes.value_counts()

float64           68
object            16
datetime64[ns]     4
Name: count, dtype: int64

In [15]:
data_accepted['addr_state'].nunique()

51

- The addr_state feature will be kept with its 51 categories, as it might carry useful signal for modeling later.

In [16]:
data_accepted.shape

(2260668, 88)

- **Features reduced by 63 during this process, with 88 still left and over 2.26 million data points**

In [17]:
# Numeric slice of the frame and the number of missing entries in each of its columns
numerical_columns = data_accepted.select_dtypes(include=['float64', 'int64'])
missing_values = numerical_columns.isna().sum()

In [18]:
missing_values.describe()

count        68.000000
mean      35166.617647
std       39010.214282
min           0.000000
25%           0.000000
50%        1756.500000
75%       70276.000000
max      153657.000000
dtype: float64

In [19]:
numerical_columns.shape

(2260668, 68)

In [20]:
# Quantify, per numeric feature, how many values fall outside the 1.5*IQR fences.
# The percentages collected here drive the choice of imputation strategy later on.

numerical_columns = data_accepted.select_dtypes(include=['float64', 'int64']).columns
outliers_info = {}
for column in numerical_columns:
    values = data_accepted[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Count with a boolean mask; filtering the whole frame just to take its length
    # would copy every column of every matching row on each iteration.
    # NaN compares False on both sides, so missing values are never counted.
    outliers_count = int(((values < lower_bound) | (values > upper_bound)).sum())
    non_nan_count = int(values.count())
    # The percentage is relative to observed (non-missing) values only
    outliers_percentage = (outliers_count / non_nan_count) * 100 if non_nan_count > 0 else 0
    outliers_info[column] = {'count': outliers_count, 'percentage': outliers_percentage}

# Most outlier-heavy features first
sorted_outliers_info = dict(sorted(outliers_info.items(), key=lambda item: item[1]['count'], reverse=True))

for column, info in sorted_outliers_info.items():
    print(f"{column}: {info['count']} outliers ({info['percentage']:.2f}%)")

num_accts_ever_120_pd: 502976 outliers (22.96%)
delinq_2yrs: 421531 outliers (18.65%)
pub_rec: 357881 outliers (15.83%)
tot_coll_amt: 334263 outliers (15.26%)
last_pymnt_amnt: 313415 outliers (13.86%)
pub_rec_bankruptcies: 271920 outliers (12.04%)
out_prncp_inv: 212305 outliers (9.39%)
out_prncp: 212242 outliers (9.39%)
mths_since_recent_bc: 186169 outliers (8.51%)
recoveries: 185432 outliers (8.20%)
bc_open_to_buy: 180429 outliers (8.25%)
collection_recovery_fee: 177013 outliers (7.83%)
mo_sin_rcnt_rev_tl_op: 171177 outliers (7.81%)
total_rec_int: 163420 outliers (7.23%)
pct_tl_nvr_dlq: 148872 outliers (6.80%)
revol_bal: 137095 outliers (6.06%)
total_bal_ex_mort: 131127 outliers (5.93%)
total_bc_limit: 129699 outliers (5.87%)
total_rev_hi_lim: 127152 outliers (5.80%)
avg_cur_bal: 118015 outliers (5.39%)
num_tl_90g_dpd_24m: 117332 outliers (5.36%)
total_il_high_credit_limit: 111518 outliers (5.09%)
mo_sin_rcnt_tl: 110296 outliers (5.04%)
annual_inc: 110041 outliers (4.87%)
num_bc_sats:

## Imputation of missing values based on number of outliers

In [21]:
outliers_df = pd.DataFrame(sorted_outliers_info).T

median_imputer = SimpleImputer(strategy='median')
mean_imputer = SimpleImputer(strategy='mean')

# NOTE: the log transform rewrites columns of data_accepted, so this cell is not
# idempotent; re-run the notebook from the top instead of re-executing it.
for column, stats in outliers_df.iterrows():
    outlier_pct = stats['percentage']
    if outlier_pct > 5:
        # High share of outliers: compress strictly positive values with log1p
        # (zeros, negatives and NaN are left untouched), then impute with the median.
        values = data_accepted[column]
        positive = values > 0
        data_accepted[column] = values.mask(positive, np.log1p(values.where(positive)))
        imputer = median_imputer
    elif 1 <= outlier_pct <= 5:
        # Moderate share of outliers: median imputation, no transformation
        imputer = median_imputer
    elif outlier_pct < 1:
        # Low share of outliers: mean imputation
        imputer = mean_imputer
    else:
        # Percentage is not comparable (NaN): leave the column untouched
        continue

    # fit_transform returns an (n, 1) array; flatten it before assigning the column
    data_accepted[column] = imputer.fit_transform(data_accepted[[column]]).ravel()

- High percentage of outliers (over 5%): apply a log1p transformation to the positive values in the column to reduce the impact of outliers, then fill missing values with the median.
- Moderate percentage of outliers (1% to 5%): fill missing values with the median, without any prior transformation.
- Low percentage of outliers (less than 1%): fill missing values with the mean.

In [22]:
# Per-column null counts, restricted to the columns that still contain gaps
null_counts = data_accepted.isnull().sum()
null_counts[null_counts != 0]

emp_title             166969
emp_length            146907
earliest_cr_line          29
last_pymnt_d            2427
last_credit_pull_d        72
dtype: int64

In [23]:
# Remove the rows that are missing any of these three date fields
data_accepted = data_accepted.dropna(
    subset=['earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d']
)

In [24]:
# Per-column null counts, restricted to the columns that still contain gaps
null_counts = data_accepted.isnull().sum()
null_counts[null_counts != 0]

emp_title     166677
emp_length    146657
dtype: int64

In [25]:
# Seeded generator shared by every imputation call, so a fresh
# "Restart & Run All" reproduces exactly the same imputed values.
_IMPUTATION_RNG = np.random.default_rng(42)


def proportional_imputation(column, frame=None, rng=None):
    """Fill the missing values of a column by sampling from its observed values.

    Each missing entry is replaced by a value drawn at random, with probability
    equal to that value's relative frequency among the non-missing entries, so
    the column's empirical distribution is preserved in expectation.

    Parameters
    ----------
    column : str
        Name of the column to impute.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``data_accepted``.
    rng : numpy.random.Generator, optional
        Source of randomness. Defaults to a module-level generator with a fixed
        seed, which keeps successive calls reproducible without reusing the
        same random stream for different columns.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed values to sample from.
    """
    if frame is None:
        frame = data_accepted
    if rng is None:
        rng = _IMPUTATION_RNG

    missing_mask = frame[column].isnull()
    missing_count = int(missing_mask.sum())
    if missing_count == 0:
        # Nothing to fill; also avoids consuming random numbers needlessly
        return

    distribution = frame[column].value_counts(normalize=True)
    if distribution.empty:
        raise ValueError(
            f"Cannot impute '{column}': it has no observed values to sample from."
        )

    imputed_values = rng.choice(
        distribution.index.to_numpy(), size=missing_count, p=distribution.to_numpy()
    )
    frame.loc[missing_mask, column] = imputed_values

# Fill the two remaining categorical columns, in this order
for categorical_column in ('emp_length', 'emp_title'):
    proportional_imputation(categorical_column)

- Proportional imputation of the missing categorical values in these two features: each gap is filled by sampling from the column's observed category frequencies.

In [26]:
data_accepted.isnull().sum().sum()

0

No missing values.

In [27]:
data_accepted.dtypes.value_counts()

float64           68
object            16
datetime64[ns]     4
Name: count, dtype: int64

## Saving the dataset

In [28]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file
data_accepted.to_csv('./Datasets/Data Accepted Cleaned For Dimensionality Reduction.csv',index=False)

## What I could have done better:

- Use more advanced imputation techniques.
- Streamline this process using pipeline objects.