In [1]:
# Import Dependencies and Configure Environment
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# imbalanced data
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek

# reproducibility
RND = 42
os.environ['PYTHONHASHSEED'] = str(RND)
np.random.seed(RND)
random.seed(RND)


In [2]:
# Load Dataset and Check Basic Information
df = pd.read_csv("accepted_2007_to_2018Q4.csv")
print("Dataset shape:", df.shape)


Dataset shape: (2260701, 151)


In [3]:
# Preview Dataset
df.head()


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,...,,,Cash,N,,,,,,
1,68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,...,,,Cash,N,,,,,,
2,68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,...,,,Cash,N,,,,,,
3,66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,...,,,Cash,N,,,,,,
4,68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,...,,,Cash,N,,,,,,


In [4]:
# Display Column Names
df.columns


Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade',
       ...
       'hardship_payoff_balance_amount', 'hardship_last_payment_amount',
       'disbursement_method', 'debt_settlement_flag',
       'debt_settlement_flag_date', 'settlement_status', 'settlement_date',
       'settlement_amount', 'settlement_percentage', 'settlement_term'],
      dtype='object', length=151)

In [5]:
# Filter Relevant Loan Statuses and Create Binary Target Variable
keep_status = ['Fully Paid', 'Charged Off', 'Default']
df = df[df['loan_status'].isin(keep_status)].copy()
df['target'] = df['loan_status'].apply(lambda x: 1 if x in ['Charged Off', 'Default'] else 0)
print("Target variable distribution:\n", df['target'].value_counts(normalize=True))


Target variable distribution:
 target
0    0.80035
1    0.19965
Name: proportion, dtype: float64


In [6]:
# Select Numeric and Categorical Features
num_cols = [
    'loan_amnt', 'funded_amnt', 'int_rate', 'installment', 'annual_inc',
    'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec',
    'revol_bal', 'revol_util', 'total_acc', 'emp_length'
]
cat_cols = ['term', 'grade', 'home_ownership', 'purpose', 'verification_status']
df = df[num_cols + cat_cols + ['target']].copy()
df = df.dropna(subset=['loan_amnt', 'int_rate', 'annual_inc', 'dti'])


In [7]:
# Clean and Convert Interest Rate and Employment Length Columns
df['int_rate'] = df['int_rate'].astype(str).str.rstrip('%').astype(float)
df['emp_length'] = (
    df['emp_length']
    .astype(str)
    .str.extract('(\d+)')
    .fillna(0)
    .astype(int)
)


In [8]:
# Encode Categorical Columns using One-Hot Encoding
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)


In [9]:
# Scale Numeric Columns using StandardScaler
scaler = StandardScaler()
num_scaled = pd.DataFrame(
    scaler.fit_transform(df[num_cols]),
    columns=num_cols
)
df_final = pd.concat([num_scaled, df.drop(columns=num_cols)], axis=1)


In [10]:
# Save Processed Dataset for Downstream Modeling
df_final.to_csv("processed_loan_data.csv", index=False)
print("processed_loan_data.csv saved with shape:", df_final.shape)


processed_loan_data.csv saved with shape: (1803164, 42)
