# Loan Repayment Prediction

This notebook walks through preprocessing the data, exploring the features, and building a neural-network model that predicts loan repayment on the LendingClub dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the column-description metadata, indexed by feature name ('LoanStatNew').
# NOTE(review): '.../DATA' looks like a typo for '../DATA' — confirm the data directory.
data_info = pd.read_csv('.../DATA/lending_club_info.csv', index_col='LoanStatNew')
# NOTE(review): the column label was spelled 'Descripton'; assumed to be a typo
# for 'Description' — confirm against the CSV header.
print(data_info.loc['revol_util', 'Description'])


def feat_info(col_name):
    """Print the description of a dataset column.

    Looks up ``col_name`` in the ``data_info`` metadata frame and prints the
    text stored in its 'Description' column.

    Parameters
    ----------
    col_name : str
        Feature name as it appears in the 'LoanStatNew' index of ``data_info``.

    Raises
    ------
    KeyError
        If ``col_name`` is not present in the metadata index.
    """
    print(data_info.loc[col_name, 'Description'])


feat_info('mort_acc')  # Display description of 'mort_acc'

# Load the main dataset.
# The original read 'lending_club_info.csv' here, which is the metadata file
# loaded above and has none of the loan columns used below.
# NOTE(review): the loan file name is assumed — confirm it matches the file in DATA/.
df = pd.read_csv('.../DATA/lending_club_loan_two.csv')
df.info()  # View dataset info

In [None]:
# Countplot for loan status (shows the class balance of the target)
sns.countplot(x='loan_status', data=df)

# Distribution of loan amounts.
# `data=df` is required: without it seaborn cannot resolve the column name
# 'loan_amnt' and raises instead of plotting.
plt.figure(figsize=(12, 4))
sns.histplot(x='loan_amnt', data=df, kde=False, bins=40)

In [None]:
# Correlation heatmap.
# Restrict to numeric columns first: the frame still contains string columns at
# this point, and DataFrame.corr() raises on those in pandas >= 2.0.
plt.figure(figsize=(12, 6))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True, cmap='viridis')
plt.yticks(rotation=0)

In [None]:
# Feature info and scatter plot
feat_info('installment')
feat_info('loan_amnt')
sns.scatterplot(x='installment', y='loan_amnt', data=df)

# Boxplot for loan status vs. loan amount.
# Open a new figure first; otherwise the boxplot is drawn onto the same Axes
# as the scatter plot above and the two overlap.
plt.figure()
sns.boxplot(x='loan_status', y='loan_amnt', data=df)

# Summary statistics grouped by loan status (last expression, so it is displayed)
df.groupby('loan_status')['loan_amnt'].describe()

In [None]:
# Unique values in 'grade' and 'sub_grade'.
# Only the last expression of a cell is displayed, so print these explicitly;
# as bare expressions in the middle of the cell their results were discarded.
print(sorted(df['grade'].unique()))
print(sorted(df['sub_grade'].unique()))
feat_info('sub_grade')

# Countplot for grades with loan status as hue
sns.countplot(x='grade', data=df, hue='loan_status')

In [None]:
# Sub-grade counts split by loan status, with sub-grades in sorted order
sub_order = list(df['sub_grade'].unique())
sub_order.sort()

plt.figure(figsize=(12, 4))
sns.countplot(
    data=df,
    x='sub_grade',
    hue='loan_status',
    order=sub_order,
    palette='coolwarm',
)

In [None]:
# Focus on grades F and G
f_and_g = df[df['grade'].isin(['F', 'G'])]

# Pass an explicit sorted order; without it the sub-grades appear in the order
# they are first encountered in the data, which makes the bars hard to compare.
f_and_g_order = sorted(f_and_g['sub_grade'].unique())
plt.figure(figsize=(12, 4))
sns.countplot(x='sub_grade', data=f_and_g, order=f_and_g_order,
              palette='coolwarm', hue='loan_status')

In [None]:
# Map loan status to binary labels (1 = repaid, 0 = charged off).
# Any status other than these two maps to NaN.
df['loan_repaid'] = df['loan_status'].map({'Fully Paid': 1, 'Charged Off': 0})

# Correlation of every numeric feature with loan_repaid.
# Non-numeric columns are excluded first because DataFrame.corr() raises on
# string columns in pandas >= 2.0.
numeric_corr = df.select_dtypes(include='number').corr()
numeric_corr['loan_repaid'].sort_values().drop('loan_repaid').plot(kind='bar')

In [None]:
# Handling missing data: percentage of missing values per column.
# Printed explicitly because a bare expression in the middle of a cell is not displayed.
print(100 * df.isnull().sum() / len(df))

# Drop 'emp_title' as it's high-cardinality
df = df.drop('emp_title', axis=1)


def _emp_length_key(label):
    """Sort key ordering employment-length labels by tenure instead of alphabetically.

    Returns ``(years, rank)`` where ``years`` is the integer formed by the
    digits in ``label`` (0 if there are none) and ``rank`` is 0 for labels
    starting with '<' so they sort before the plain label with the same number.
    """
    digits = ''.join(ch for ch in label if ch.isdigit())
    years = int(digits) if digits else 0
    rank = 0 if label.strip().startswith('<') else 1
    return (years, rank)


# Visualize employment length distribution.
# Plain sorted() is lexicographic, which would place a label like '10+ years'
# before '2 years'; sort on the numeric part instead.
# NOTE(review): assumes labels look like '< 1 year', '1 year', ..., '10+ years' — confirm.
emp_order = sorted(df['emp_length'].dropna().unique(), key=_emp_length_key)
plt.figure(figsize=(12, 4))
sns.countplot(x='emp_length', data=df, order=emp_order, hue='loan_status')

In [None]:
# Drop 'emp_length' from the feature set
df = df.drop('emp_length', axis=1)
print(df.isnull().sum())  # remaining missing values per column

# Drop 'title' (redundant with 'purpose')
df = df.drop('title', axis=1)

# Handle missing values in 'mort_acc':
# mean 'mort_acc' for each 'total_acc' value, used as the fill-in lookup table.
# Select the column before aggregating; calling .mean() on the whole grouped
# frame fails on the remaining string columns in pandas >= 2.0 and does
# needless work on every other column.
total_acc_avg = df.groupby('total_acc')['mort_acc'].mean()

def fill_mort_acc(total_acc_, mort_acc_, lookup=None):
    """Return ``mort_acc_``, or a looked-up replacement when it is missing.

    Parameters
    ----------
    total_acc_ : hashable
        Key used to look up the replacement value.
    mort_acc_ : float or None
        Observed value; treated as missing when it is NaN or None.
    lookup : mapping or pandas.Series, optional
        Table indexed by ``total_acc_``. Defaults to the module-level
        ``total_acc_avg`` so existing two-argument calls behave as before.

    Returns
    -------
    The value ``lookup[total_acc_]`` if ``mort_acc_`` is missing, otherwise
    ``mort_acc_`` unchanged.

    Raises
    ------
    KeyError
        If a replacement is needed and ``total_acc_`` is not in ``lookup``.
    """
    if lookup is None:
        # Resolved at call time so the function picks up the current table.
        lookup = total_acc_avg
    # pd.isna also handles None and object values, where np.isnan raises TypeError.
    if pd.isna(mort_acc_):
        return lookup[total_acc_]
    return mort_acc_

# Fill missing 'mort_acc' with the mean for the row's 'total_acc'.
# Vectorised equivalent of applying fill_mort_acc row by row: map each row's
# 'total_acc' to its group mean and use it only where 'mort_acc' is missing.
# This avoids a Python-level call per row.
df['mort_acc'] = df['mort_acc'].fillna(df['total_acc'].map(total_acc_avg))
print(df.isnull().sum())  # confirm what is still missing before dropping rows

# Drop the rows that still contain any missing value
df = df.dropna()

In [None]:
# Encode categorical variables
# 'term' is converted to an integer by parsing its first three characters.
df['term'] = df['term'].apply(lambda x: int(x[:3]))

# 'grade' is never encoded anywhere in this notebook, and 'sub_grade' is one-hot
# encoded below. Left in place, 'grade' would reach the scaler as a string
# column and make MinMaxScaler fail.
df = df.drop('grade', axis=1)
df = pd.get_dummies(df, columns=['sub_grade'], drop_first=True)
df = pd.get_dummies(df, columns=['verification_status', 'application_type', 'initial_list_status', 'purpose'], drop_first=True)

# Simplify 'home_ownership' categories by folding 'NONE' and 'ANY' into 'OTHER'
df['home_ownership'] = df['home_ownership'].replace(['NONE', 'ANY'], 'OTHER')
df = pd.get_dummies(df, columns=['home_ownership'], drop_first=True)

# Extract zip code from address (last five characters of the string)
df['zip_code'] = df['address'].apply(lambda x: x[-5:])
df = pd.get_dummies(df, columns=['zip_code'], drop_first=True)
df = df.drop('address', axis=1)

# Handle 'issue_d' and 'earliest_cr_line':
# drop 'issue_d' and keep only the year (last four characters) of 'earliest_cr_line'.
df = df.drop('issue_d', axis=1)
df['earliest_cr_line'] = df['earliest_cr_line'].apply(lambda x: int(x[-4:]))

In [None]:
# Prepare data for training
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Feature matrix: every column except the raw and the encoded target
feature_frame = df.drop(['loan_status', 'loan_repaid'], axis=1)
x = feature_frame.values
y = df['loan_repaid'].values

# Hold out 30% of the rows for testing
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.3, random_state=101
)

# Scale data: learn min/max on the training split only, then apply to both splits
scaler = MinMaxScaler()
scaler.fit(train_x)
train_x = scaler.transform(train_x)
test_x = scaler.transform(test_x)

In [None]:
# Build neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Three ReLU hidden layers of decreasing width, each followed by 20% dropout,
# and a single sigmoid output unit for the binary target.
model = Sequential([
    Dense(78, activation='relu'),
    Dropout(0.2),
    Dense(39, activation='relu'),
    Dropout(0.2),
    Dense(19, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(
    x=train_x,
    y=train_y,
    validation_data=(test_x, test_y),
    batch_size=256,
    epochs=25,
)

In [None]:
# Evaluate model
from sklearn.metrics import classification_report, confusion_matrix

# Per-epoch metrics recorded by the last fit() call
losses = pd.DataFrame(model.history.history)
losses.plot()

# Threshold the sigmoid output at 0.5 to obtain hard class labels
predicted_probs = model.predict(test_x)
predictions = (predicted_probs > 0.5).astype("int32")

report_text = classification_report(test_y, predictions)
print(report_text)
matrix = confusion_matrix(test_y, predictions)
print(matrix)

In [None]:
# Predict for a random customer
import random
random.seed(101)

# randrange excludes the upper bound; randint(0, len(df)) could return len(df),
# which is one past the last valid positional index.
random_ind = random.randrange(len(df))

# Drop the same two columns that were removed when the feature matrix was built.
# Leaving 'loan_status' in would pass a string and one extra feature to the scaler.
new_customer = df.drop(['loan_status', 'loan_repaid'], axis=1).iloc[random_ind]
# A single row of mixed column types has object dtype; cast to float for the scaler.
new_customer = scaler.transform(new_customer.values.astype(float).reshape(1, -1))

print("Prediction for new customer:", model.predict(new_customer))
print("Actual repayment status:", df.iloc[random_ind]['loan_repaid'])