# Introduction

This Jupyter notebook provides an introduction to Machine Learning (ML) in finance, with a specific focus on risk forecasting. It covers the following topics:

1. A simple logistic regression model.
2. Application of the model in credit risk forecasting.


# Example of a Logistic Regression Model

In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


# Seed NumPy's global random number generator so the synthetic data
# generated below is the same on every fresh run of the notebook
np.random.seed(seed=42)

In [None]:
# Step 1: Data Preprocessing - Generate Synthetic Data
# ----------------------------------------------------
# Build a small synthetic customer table with three finance-related features.
# The columns are drawn in the order income -> age -> account_balance, so the
# values are reproducible for a given NumPy seed.

n_samples = 500
data = pd.DataFrame({
    # Normally distributed income: mean $50,000, standard deviation $15,000
    'income': np.random.normal(loc=50000, scale=15000, size=n_samples),
    # Integer ages from 18 to 64 (randint's upper bound is exclusive)
    'age': np.random.randint(low=18, high=65, size=n_samples),
    # Normally distributed balance: mean $10,000, standard deviation $5,000
    'account_balance': np.random.normal(loc=10000, scale=5000, size=n_samples),
})

# Target rule: a customer is labelled as subscribed (1) only when all three
# conditions hold at once: income above 60,000, age below 50 and account
# balance above 12,000. Everyone else is labelled 0.
data = data.assign(
    subscribed=lambda df: (
        df['income'].gt(60000) & df['age'].lt(50) & df['account_balance'].gt(12000)
    ).astype(int)
)

print("Sample Data:")
print(data.head())

In [None]:
# Step 2: Define Features, Assign Target and Split Data into Training and Testing Sets
# -------------------------------------------------------------------------------------
# Features: 'income', 'age', and 'account_balance'.
# Target: 'subscribed', which indicates whether the customer subscribed to the product (1) or not (0).

x = data[['income', 'age', 'account_balance']]
y = data['subscribed']

# Hold out 20% of the rows for testing (80% for training).
# A row is positive only when three conditions hold simultaneously, so the
# positive class is expected to be rare. Stratifying on y keeps the share of
# positives the same in the training and test sets; without it a purely random
# split can leave the test set with very few (or no) subscribers, which makes
# recall and ROC-AUC unstable or undefined.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Step 3: Initialize and Train the Logistic Regression Model
# ----------------------------------------------------------
# The features are on very different scales (income and account balance are in
# the thousands, age is in the tens). Fitting LogisticRegression directly on
# such data can stop the default solver before it converges, and it makes the
# default L2 penalty act unevenly across features. The model is therefore a
# pipeline: standardize the features, then fit the logistic regression.
# The pipeline exposes the same predict / predict_proba methods as the bare estimator.

model = make_pipeline(StandardScaler(), LogisticRegression())

# Training the model on the training data
# (the scaler's mean and variance are learned from the training rows only)
model.fit(x_train, y_train)

In [None]:
# Step 4: Make Predictions on the Test Set
# ----------------------------------------
# Two views of the model output are kept for the evaluation cells below.

# Probability of the positive class (subscribed = 1): predict_proba returns one
# column per class, and the second column is selected here
y_pred_prob = model.predict_proba(x_test)[:, 1]

# Hard class labels (0 or 1) for the test set
y_pred = model.predict(x_test)


In [None]:
# Step 5: Evaluate the Model
# --------------------------
# Accuracy: the fraction of test rows whose predicted label equals the true label

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall and F1-score
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Step 6: Confusion Matrix Visualization
# --------------------------------------
# Rows are the actual classes and columns are the predicted classes, so the
# four cells hold the true negatives, false positives, false negatives and true positives

conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Step 7: ROC Curve and AUC
# -------------------------
# Both metrics are computed from the predicted probabilities, not the hard labels

# False positive rate and true positive rate at each decision threshold
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

# Area under the ROC curve
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Draw the ROC curve together with the diagonal reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
plt.show()

# Logistic Regression

Here is an example of how to use logistic regression for credit risk forecasting.

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Re-seed NumPy's global random number generator so the synthetic credit data
# generated below is the same on every fresh run of the notebook
np.random.seed(seed=42)

In [None]:
# Step 1: Data Preprocessing - Generate Synthetic Data
# -----------------------------------------------------
# Build a synthetic loan book with four features relevant to credit risk.
# The columns are drawn in the order credit_score -> income -> debt_ratio ->
# loan_amount, so the values are reproducible for a given NumPy seed.
n_samples = 1000
credit_data = pd.DataFrame({
    # Normally distributed credit score: mean 700, standard deviation 50
    'credit_score': np.random.normal(loc=700, scale=50, size=n_samples),
    # Normally distributed income: mean $50,000, standard deviation $15,000
    'income': np.random.normal(loc=50000, scale=15000, size=n_samples),
    # Debt-to-income ratio drawn uniformly between 0 and 1
    'debt_ratio': np.random.uniform(low=0, high=1, size=n_samples),
    # Normally distributed loan amount: mean $20,000, standard deviation $5,000
    'loan_amount': np.random.normal(loc=20000, scale=5000, size=n_samples),
})

# Target rule: a borrower is labelled as a default (1) only when the debt ratio
# is above 0.4 AND the credit score is below 650. Everyone else is labelled 0.
credit_data = credit_data.assign(
    default=lambda df: (df['debt_ratio'].gt(0.4) & df['credit_score'].lt(650)).astype(int)
)

# Show the first rows for reference
print("Sample Credit Data:")
print(credit_data.head())

In [None]:
# Step 2: Define Features and Target
# ----------------------------------
# y is the 'default' label; X holds the four explanatory columns
y = credit_data.loc[:, 'default']
X = credit_data.loc[:, ['credit_score', 'income', 'debt_ratio', 'loan_amount']]

In [None]:
# Step 3: Split the Data into Training and Testing Sets
# -----------------------------------------------------
# Hold out 20% of the rows for testing (80% for training).
# A borrower is labelled as a default only when two conditions hold at once, so
# defaults are the minority class. Stratifying on y keeps the default rate the
# same in the training and test sets, so the test metrics are computed on a
# representative number of defaults rather than whatever a purely random split
# happens to produce.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Step 4: Feature Scaling
# -----------------------
# Standardize every feature to zero mean and unit variance.
# The scaler learns its statistics from the training rows only and the same
# fitted scaler is then applied to both sets, so nothing about the test set
# influences the transformation.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Step 5: Handle Class Imbalance Using SMOTE
# ------------------------------------------
# SMOTE (Synthetic Minority Over-sampling Technique) balances the classes by
# creating synthetic samples of the minority class, here 'default'.
# Only the training data is resampled; the test set is left untouched.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(
    X_train,
    y_train,
)

In [None]:
# Step 6: Initialize and Train the Logistic Regression Model
# ----------------------------------------------------------
# A logistic regression with scikit-learn's default settings
model = LogisticRegression()

# Fit on the scaled, SMOTE-resampled training data
model.fit(
    X_train,
    y_train,
)

In [None]:
# Step 7: Make Predictions on the Test Set
# ----------------------------------------
# Probability of the positive class (default = 1): predict_proba returns one
# column per class, and the second column is selected here
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Hard class labels (0 or 1) for the test set
y_pred = model.predict(X_test)

In [None]:
# Step 8: Evaluate the Model
# --------------------------
# Accuracy: the fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Per-class precision, recall and F1-score
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Step 9: Confusion Matrix Visualization
# --------------------------------------
# Rows are the actual classes and columns are the predicted classes, so the
# four cells hold the true negatives, false positives, false negatives and true positives
conf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# Step 10: ROC Curve and AUC
# --------------------------
# Both metrics are computed from the predicted probabilities, not the hard labels

# False positive rate and true positive rate at each decision threshold
fpr, tpr, _ = roc_curve(y_test, y_pred_prob)

# Area under the ROC curve
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Draw the ROC curve together with the diagonal reference line
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
plt.show()

In [None]:
# Step 11: Interpret Model Coefficients
# -------------------------------------
# Pair each feature name with its fitted coefficient. The sign shows whether the
# feature pushes the predicted probability of default up or down. Because the
# model was trained on standardized features, each coefficient is expressed per
# one standard deviation of the feature, not per raw unit.
coefficients = pd.DataFrame(
    list(zip(X.columns, model.coef_[0])),
    columns=["Feature", "Coefficient"],
)
print("\nLogistic Regression Coefficients:")
print(coefficients)