Task 3: Credit risk analysis

In [1]:
# Upload the loan CSV through the browser when running in Google Colab.
# Outside Colab the google.colab package is not installed, so skip the upload;
# the CSV is then expected to already be in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = {}
else:
    uploaded = files.upload()

Saving Task 3 and 4_Loan_Data.csv to Task 3 and 4_Loan_Data.csv


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Read the loan book from the CSV in the working directory
DATA_FILE = 'Task 3 and 4_Loan_Data.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first rows to confirm the columns parsed as expected
preview = df.head()
print(preview)

   customer_id  credit_lines_outstanding  loan_amt_outstanding  \
0      8153374                         0           5221.545193   
1      7442532                         5           1958.928726   
2      2256073                         0           3363.009259   
3      4885975                         0           4766.648001   
4      4700614                         1           1345.827718   

   total_debt_outstanding       income  years_employed  fico_score  default  
0             3915.471226  78039.38546               5         605        0  
1             8228.752520  26648.43525               2         572        1  
2             2027.830850  65866.71246               4         602        0  
3             2501.730397  74356.88347               5         612        0  
4             1768.826187  23448.32631               6         631        0  


In [6]:
# Separate the label from the predictors; the customer_id column and the
# 'default' label are both excluded from the feature matrix
y = df['default']
X = df.drop(columns=['customer_id', 'default'])

# Hold out 20% of the rows for evaluation (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both the training and the hold-out rows
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit logistic regression on the standardized training features
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [19]:
# Score the hold-out set: column 1 of predict_proba feeds ROC AUC, while the
# hard class predictions feed accuracy
test_probabilities = model.predict_proba(X_test)
y_pred_prob = test_probabilities[:, 1]
y_pred = model.predict(X_test)

roc_auc = roc_auc_score(y_test, y_pred_prob)
accuracy = accuracy_score(y_test, y_pred)

# NOTE(review): scores that round to 1.00 are unusually high; worth confirming
# that no feature leaks the label before relying on this model
print(f"Accuracy: {accuracy:.2f}")
print(f"ROC AUC Score: {roc_auc:.2f}")

Accuracy: 1.00
ROC AUC Score: 1.00


In [20]:
# Function to predict the expected loss
def calculate_expected_loss(model, scaler, borrower_features, loan_amount_outstanding, recovery_rate=0.1):
    """
    Predict the expected loss on a loan as PD * (1 - recovery_rate) * exposure.

    Parameters:
    - model: A fitted classifier exposing predict_proba; column 1 of its output
      is used as the probability of default (PD).
    - scaler: The fitted scaler object. It is applied only when `model` is a
      LogisticRegression. If it exposes `feature_names_in_`, those names are
      used to label the feature columns; otherwise the notebook-level training
      frame `X` supplies them.
    - borrower_features: A list of features in training column order, such as
      [credit_lines_outstanding, loan_amt_outstanding, total_debt_outstanding,
      income, years_employed, fico_score].
    - loan_amount_outstanding: The amount of the loan outstanding (exposure).
      Must not be negative.
    - recovery_rate: The fraction of the exposure recovered on default, between
      0 and 1 inclusive (default is 10%).

    Returns:
    - expected_loss: The expected loss on the loan as a float rounded to 2 decimals.

    Raises:
    - ValueError: If recovery_rate is outside [0, 1] or loan_amount_outstanding
      is negative.
    """
    # Reject inputs that would silently yield a negative or inflated loss
    if not 0 <= recovery_rate <= 1:
        raise ValueError(f"recovery_rate must be between 0 and 1, got {recovery_rate}")
    if loan_amount_outstanding < 0:
        raise ValueError(
            f"loan_amount_outstanding must be non-negative, got {loan_amount_outstanding}"
        )

    # Prefer the column names recorded by the fitted scaler so the function does
    # not depend on notebook state; fall back to the global training frame when
    # the scaler carries no names (e.g. it was fitted on a plain array).
    feature_names = getattr(scaler, "feature_names_in_", None)
    if feature_names is None:
        feature_names = X.columns
    feature_names = list(feature_names)

    # Convert features to a one-row DataFrame for compatibility with the scaler
    borrower_features_df = pd.DataFrame([borrower_features], columns=feature_names)

    # Standardize the features if the model requires it (e.g., logistic regression)
    if isinstance(model, LogisticRegression):
        borrower_features_scaled = scaler.transform(borrower_features_df)
    else:
        borrower_features_scaled = borrower_features_df

    # Predict the probability of default (PD) for the single borrower row
    probability_of_default = model.predict_proba(borrower_features_scaled)[:, 1][0]

    # Expected loss = PD * loss given default * exposure; return a plain float
    expected_loss = probability_of_default * (1 - recovery_rate) * loan_amount_outstanding
    return round(float(expected_loss), 2)

In [21]:
# Example usage of the function with logistic regression model.
# The features are named explicitly, in training column order, so each value
# is tied to the column it feeds.
example_borrower = {
    'credit_lines_outstanding': 2,
    'loan_amt_outstanding': 50000,
    'total_debt_outstanding': 15000,
    'income': 70000,
    'years_employed': 5,
    'fico_score': 650,
}
borrower_features_lr = list(example_borrower.values())

# Take the exposure from the same figure the model sees so the two cannot drift apart
loan_amount_outstanding = example_borrower['loan_amt_outstanding']

borrower_expected_loss_lr = calculate_expected_loss(model, scaler, borrower_features_lr, loan_amount_outstanding)

# Format as currency with two decimals (previously printed e.g. "$4869.1")
print(f"Expected Loss (Logistic Regression): ${borrower_expected_loss_lr:,.2f}")

Expected Loss (Logistic Regression): $4869.1
