In [1]:
#Goal: predict if a company goes bankrupt or not
%pip install imbalanced-learn
%pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import imbalanced-learn as imblearn
from scipy.stats import kurtosis, skew
import xgboost as xgb
from sklearn.metrics import recall_score
from imblearn.combine import SMOTEENN

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, precision_recall_curve

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
df = pd.read_csv('datasets/data.csv')

In [3]:
y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


smote = SMOTE(random_state=42) #initialise SMOTE
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

pipeline = Pipeline([
    ('smote', smote),
    ('rf', RandomForestClassifier(random_state=42))
])


model = RandomForestClassifier(n_estimators=100, random_state=42) # Initialize Random Forest
model.fit(x_train_smote, y_train_smote) # fit model to trained data

y_pred = model.predict(x_test) # make predictions on the test set

# Accuracy of model
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.4f}')

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Accuracy: 0.9531

Confusion Matrix:
[[1272   41]
 [  23   28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1313
           1       0.41      0.55      0.47        51

    accuracy                           0.95      1364
   macro avg       0.69      0.76      0.72      1364
weighted avg       0.96      0.95      0.96      1364



Let's add a class_weight to the model so that we prioritise minority classes.

In [4]:
y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


smote = SMOTE(random_state=42) #initialise SMOTE
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())


model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight={0: 10, 1: 10}
) # Initialize Random Forest
model.fit(x_train_smote, y_train_smote) # fit model to trained data

y_pred = model.predict(x_test) # make predictions on the test set

# Accuracy of model
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.4f}')

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Accuracy: 0.9531

Confusion Matrix:
[[1272   41]
 [  23   28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1313
           1       0.41      0.55      0.47        51

    accuracy                           0.95      1364
   macro avg       0.69      0.76      0.72      1364
weighted avg       0.96      0.95      0.96      1364



Recall has icnreased for class 1 from 55% to 73% - which is great.

But ideally, we can increase the precision too since its quite low at only 37% of firms identified to be bankrupt are actually bankrupt. Ie. a lot of false positives. 

To find an optimal balance for class weights etc. lets do RandomizedSearchCV.

In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Define the parameter distribution
param_dist = {
    'n_estimators': np.arange(100, 1001, 100),  # Number of trees in the forest (from 100 to 1000)
    'max_depth': [None, 10, 20, 30, 40],  # Maximum depth of trees
    'min_samples_split': [2, 5, 10, 20],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 10],  # Minimum samples required at leaf node
    'class_weight': ['balanced', {0: 10, 1: 1}, {0: 8, 1: 2}]  # Class weights
}

# Initialize the model
model = RandomForestClassifier(random_state=42)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,  # Number of random combinations to test
    cv=3,  # Number of cross-validation splits
    n_jobs=-1,  # Use all available CPU cores
    scoring='recall',  # Optimize for recall
    verbose=2,
    random_state=42
)

# Fit the randomized search to your training data
random_search.fit(x_train_smote, y_train_smote)

# Print the best parameters found
print(f'Best parameters: {random_search.best_params_}')


Fitting 3 folds for each of 10 candidates, totalling 30 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'n_estimators': np.int64(100), 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 20, 'class_weight': 'balanced'}


In [6]:
# Prepare data
y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Initialize SMOTE
smote = SMOTE(random_state=42)

# Define the parameter grid for Random Forest
param_grid = {
    'rf__n_estimators': [100, 200, 300, 500],  # Number of trees
    'rf__min_samples_split': [2, 5, 10, 15],   # Minimum samples required to split a node
    'rf__min_samples_leaf': [1, 2, 4, 6],     # Minimum samples required at leaf node
    'rf__max_depth': [None, 10, 20, 30, 40],   # Maximum depth of trees
    'rf__class_weight': ['balanced', {0: 10, 1: 1}, {0: 8, 1: 2}, {0: 5, 1: 3}]  # Class weights
}

# Create a pipeline that first applies SMOTE, then trains Random Forest
pipeline = Pipeline([
    ('smote', smote),
    ('rf', RandomForestClassifier(random_state=42))
])

# Set up RandomizedSearchCV with recall as the scoring metric
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to test
    cv=3,  # Number of cross-validation splits
    n_jobs=-1,  # Use all available CPU cores
    scoring='recall',  # Optimize for recall
    verbose=2,
    random_state=42
)

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the model using RandomizedSearchCV
random_search.fit(x_train, y_train)

# Print the best parameters found by RandomizedSearchCV
print(f'Best parameters: {random_search.best_params_}')

# Get the best model found by RandomizedSearchCV
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(x_test)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

cv_scores = random_search.cv_results_['mean_test_score']
print(f'Cross-validation scores: {cv_scores}')


# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'rf__n_estimators': 300, 'rf__min_samples_split': 5, 'rf__min_samples_leaf': 2, 'rf__max_depth': None, 'rf__class_weight': {0: 10, 1: 1}}
Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64
Cross-validation scores: [0.55002089 0.57946951 0.60327903 0.59732665 0.59137427 0.60902256
 0.5914787  0.61518379 0.57967836 0.54991646 0.61507937 0.55012531
 0.62708855 0.59743108 0.54981203 0.58542189 0.54396408 0.5914787
 0.56182122 0.60338346 0.59732665 0.46094403 0.58552632 0.58542189
 0.58552632 0.61518379 0.58563074 0.64452799 0.4789056  0.61518379
 0.54417293 0.60923141 0.60912698 0.48475355 0.58552632 0.60327903
 0.61518379 0.60338346 0.55002089 0.57372598 0.54396408 0.61518379
 0.5380117  0.52621136 0.55576441 0.58552632 0.57967836 0.57967836
 0.54991646 0.59743108]

Confusion Matrix:
[[1261   52]
 [  14   37]]

Classification Report:
              precision    recall  f1-score   suppo

trying again while account for computational power. 