In [None]:
#Goal: predict if a company goes bankrupt or not
%pip install imbalanced-learn
%pip install xgboost

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [32]:
# Load the dataset from a path relative to the notebook's working directory.
# Later cells use the 'Bankrupt?' column as the target and all other columns as features.
df = pd.read_csv('datasets/data.csv')

In [33]:
# Baseline experiment: SMOTE-balanced training data fed to a soft-voting
# ensemble of Random Forest, Logistic Regression and XGBoost.

# 'Bankrupt?' is the target; every remaining column is a feature.
x = df.drop(columns='Bankrupt?')
y = df['Bankrupt?']

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Oversample the training split only; x_test / y_test are left untouched.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

# Base learners.
# - rf: class 0 is weighted ten times as heavily as class 1.
# - lr: 'balanced' class weights; the features are NOT scaled in this cell.
# - xgb: scale_pos_weight=10 up-weights the positive class (label 1).
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight={0: 10, 1: 1},
)
lr = LogisticRegression(random_state=42, class_weight='balanced')
xgb = XGBClassifier(random_state=42, scale_pos_weight=10)

# Soft voting combines the members' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('lr', lr), ('xgb', xgb)],
    voting='soft',
)
voting_clf.fit(x_train_smote, y_train_smote)

# Score the ensemble on the untouched hold-out set.
y_pred = voting_clf.predict(x_test)

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")


Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Confusion Matrix:
[[1269   44]
 [  21   30]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1313
           1       0.41      0.59      0.48        51

    accuracy                           0.95      1364
   macro avg       0.69      0.78      0.73      1364
weighted avg       0.96      0.95      0.96      1364



Time to make some edits to increase recall.

1. increase max number of iterations
2. scale the features
3. switch to a different solver
4. check for multicollinearity


In [34]:
# Second experiment: same SMOTE + soft-voting setup as the baseline, but the
# Logistic Regression member is wrapped in a pipeline that standardises the
# features and is allowed up to 1000 iterations to converge.
#
# Cleanup relative to the earlier version of this cell: the stand-alone
# StandardScaler / x_train_scaled / x_test_scaled and the extra
# LogisticRegression(solver='saga') were created but never passed to any model,
# so they have been removed. The fitted ensemble and its scores are unchanged.

y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Oversample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

# Scaling lives inside the pipeline, so the scaler is fitted when the ensemble
# is fitted and is re-applied automatically at predict time. Only the Logistic
# Regression member sees scaled features; the tree models get the raw values.
# The LogisticRegression here uses scikit-learn's default solver.
lr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000))
])

# Tree-based members
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 10, 1: 1})
xgb = XGBClassifier(random_state=42, scale_pos_weight=10)  # up-weights the positive class

# Create a Voting Classifier with soft voting (combines predicted probabilities)
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('lr', lr_pipeline),  # Use the pipeline with scaling
    ('xgb', xgb)
], voting='soft')

# Fit the model
voting_clf.fit(x_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = voting_clf.predict(x_test)

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Confusion Matrix:
[[1255   58]
 [  17   34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1313
           1       0.37      0.67      0.48        51

    accuracy                           0.95      1364
   macro avg       0.68      0.81      0.72      1364
weighted avg       0.96      0.95      0.95      1364



Recall for the bankrupt class is now higher (0.59 → 0.67), but at the expense of precision (0.41 → 0.37).

In [35]:
# Third experiment: drop Logistic Regression and keep a two-member soft-voting
# ensemble (Random Forest + XGBoost) trained on SMOTE-balanced data.
#
# Cleanup relative to the earlier version of this cell: the StandardScaler,
# x_train_scaled / x_test_scaled and the scaler-only `lr_pipeline` were built
# but never used by either model, so they have been removed. The fitted
# ensemble and its scores are unchanged.

y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Oversample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

# Initialize individual models (both are tree-based and are fed unscaled features)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 10, 1: 1})
xgb = XGBClassifier(random_state=42, scale_pos_weight=10)  # up-weights the positive class

# Create a Voting Classifier with soft voting (combines predicted probabilities)
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb)
], voting='soft')

# Fit the model
voting_clf.fit(x_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = voting_clf.predict(x_test)

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Confusion Matrix:
[[1271   42]
 [  19   32]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1313
           1       0.43      0.63      0.51        51

    accuracy                           0.96      1364
   macro avg       0.71      0.80      0.74      1364
weighted avg       0.96      0.96      0.96      1364



In [36]:
# Fourth experiment: Random Forest + XGBoost soft-voting ensemble with adjusted
# weights (rf class_weight={0: 5, 1: 3}, xgb scale_pos_weight=8).
#
# Cleanup relative to the earlier version of this cell: the StandardScaler,
# x_train_scaled / x_test_scaled and the scaler-only `lr_pipeline` were built
# but never used by either model, so they have been removed. The fitted
# ensemble and its scores are unchanged.

y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Oversample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

# Initialize individual models (both are tree-based and are fed unscaled features)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 5, 1: 3})
xgb = XGBClassifier(random_state=42, scale_pos_weight=8)  # up-weights the positive class

# Create a Voting Classifier with soft voting (combines predicted probabilities)
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb)
], voting='soft')

# Fit the model
voting_clf.fit(x_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = voting_clf.predict(x_test)

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Confusion Matrix:
[[1274   39]
 [  19   32]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1313
           1       0.45      0.63      0.52        51

    accuracy                           0.96      1364
   macro avg       0.72      0.80      0.75      1364
weighted avg       0.97      0.96      0.96      1364



Now it's time to tune the `class_weight` of the `RandomForestClassifier`. I'll test different weight ratios between the majority and minority classes, and stop once precision is above 50%.

In [37]:
# Fifth experiment: Random Forest + XGBoost soft-voting ensemble with
# rf class_weight={0: 5, 1: 1} and xgb scale_pos_weight=8.
#
# Cleanup relative to the earlier version of this cell: the StandardScaler,
# x_train_scaled / x_test_scaled and the scaler-only `lr_pipeline` were built
# but never used by either model, so they have been removed. The fitted
# ensemble and its scores are unchanged.

y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Oversample the training split only; the test split is left untouched
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

print("Before SMOTE:", y_train.value_counts())
print("After SMOTE:", pd.Series(y_train_smote).value_counts())

# Initialize individual models (both are tree-based and are fed unscaled features)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight={0: 5, 1: 1})
xgb = XGBClassifier(random_state=42, scale_pos_weight=8)  # up-weights the positive class

# Create a Voting Classifier with soft voting (combines predicted probabilities)
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('xgb', xgb)
], voting='soft')

# Fit the model
voting_clf.fit(x_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = voting_clf.predict(x_test)

# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Confusion Matrix:
[[1275   38]
 [  18   33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      1313
           1       0.46      0.65      0.54        51

    accuracy                           0.96      1364
   macro avg       0.73      0.81      0.76      1364
weighted avg       0.97      0.96      0.96      1364



Mild improvement in both precision (0.45 → 0.46) and recall (0.63 → 0.65). Let's try again.

In [38]:
# Prepare data
y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)


# Define the parameter grid for Random Forest
param_grid = {
    'rf__n_estimators': [100, 300, 500],  # Number of trees
    'rf__min_samples_split': [2, 5, 10],   # Minimum samples required to split a node
    'rf__min_samples_leaf': [1, 2, 4],     # Minimum samples required at leaf node
    'rf__max_depth': [None, 10, 30],   # Maximum depth of trees
    'rf__class_weight': ['balanced', {0: 10, 1: 1}, {0: 5, 1: 3}]  # Class weights
}

# Create a pipeline that first applies SMOTE, then trains Random Forest
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

# Set up RandomizedSearchCV with recall as the scoring metric
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    n_iter=30,  # Number of random combinations to test
    cv=3,  # Number of cross-validation splits
    n_jobs=-1,  # Use all available CPU cores
    scoring='recall', 
    verbose=1,
    random_state=42
)

# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Fit the model using RandomizedSearchCV
random_search.fit(x_train, y_train)

# Print the best parameters found by RandomizedSearchCV
print(f'Best parameters: {random_search.best_params_}')

# Get the best model found by RandomizedSearchCV
best_model = random_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(x_test)

cv_scores = random_search.cv_results_['mean_test_score']
print(f'Cross-validation scores: {cv_scores}')


# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 30 candidates, totalling 90 fits


  _data = np.array(data, dtype=dtype, copy=copy,


Best parameters: {'rf__n_estimators': 300, 'rf__min_samples_split': 2, 'rf__min_samples_leaf': 1, 'rf__max_depth': None, 'rf__class_weight': {0: 10, 1: 1}}
Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64
After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64
Cross-validation scores: [0.58563074 0.50250627 0.6031746  0.54991646 0.57362155 0.55576441
 0.61518379 0.60902256 0.53205931 0.60891813 0.63888889 0.62113617
 0.54991646 0.53205931 0.68013784 0.5734127  0.57957393 0.53811612
 0.50845865 0.5677736  0.61518379 0.57957393 0.63272765 0.62698413
 0.57957393 0.54396408 0.53226817 0.57957393 0.6031746  0.59116541]

Confusion Matrix:
[[1252   61]
 [  13   38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97      1313
           1       0.38      0.75      0.51        51

    accuracy                           0.95      1364
   macro avg       0.69      0.85      0.74      1364
we

In [39]:
# Pull the fitted Random Forest out of the best pipeline and rank the features
# by its impurity-based importance scores, highest first.
best_rf = best_model.named_steps['rf']
feature_importances = pd.Series(
    data=best_rf.feature_importances_,
    index=x.columns,
)
ranked_importances = feature_importances.sort_values(ascending=False)
print(ranked_importances)

Net Income to Stockholder's Equity         0.044497
Borrowing dependency                       0.041840
Total debt/Total net worth                 0.038789
Net Value Growth Rate                      0.032728
Persistent EPS in the Last Four Seasons    0.032077
                                             ...   
Working Capital to Total Assets            0.003941
Total Asset Turnover                       0.003872
Operating Gross Margin                     0.003788
Liability-Assets Flag                      0.000000
Net Income Flag                            0.000000
Length: 95, dtype: float64


OK, back to the initial setup above, which had a good balance between precision and recall; I'll try again starting from that.