In [12]:
#Goal: predict if a company goes bankrupt or not
%pip install imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import imbalanced-learn as imblearn
from scipy.stats import kurtosis, skew


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

Note: you may need to restart the kernel to use updated packages.


In [13]:
# Load the bankruptcy dataset (path is relative to the notebook's working directory).
# Later cells use the 'Bankrupt?' column of this frame as the prediction target.
df = pd.read_csv('datasets/data.csv')

A significant number of outliers was observed in most columns (via Data Wrangler). Since this is financial data, check for long-tailed distributions, i.e. distributions whose tails extend further than those of a normal distribution. If the features are not long-tailed, we can apply Random Forest without a log transformation to account for skewness.

In [14]:
# # #This isn't working so gonna try a diff method results unreliable according to vscode due to many identical values
# # Calculate skewness and kurtosis for each feature
# skewness = df.skew()
# kurtosis_values = df.apply(kurtosis)

# # Print skewness and kurtosis summary
# summary = pd.DataFrame({'Skewness': skewness, 'Kurtosis': kurtosis_values})
# print(summary)

# # Filter for skewed features (consider skewness > 1 or < -1 as skewed)
# skewed_features = summary[summary['Skewness'].abs() > 1]
# print("Skewed Features:")
# print(skewed_features)



# Method 2: rank features by skewness/kurtosis, accounting for the huge number of columns.

# Drop columns that hold a single unique value (strictly constant features) to
# avoid the precision-loss issue hit by the first attempt. Columns that are only
# *nearly* constant are still kept by this filter.
df_non_constant = df.loc[:, df.nunique() > 1]

# Per-feature shape statistics.
# NOTE: scipy.stats.kurtosis defaults to Fisher's definition (excess kurtosis),
# so a normal distribution scores ~0 here, not ~3.
skewness = df_non_constant.skew()
kurtosis_values = df_non_constant.apply(kurtosis)

# One row per feature, one column per statistic.
summary = pd.DataFrame({'Skewness': skewness, 'Kurtosis': kurtosis_values})

# Rank by the *magnitude* of the skewness so that strongly left-skewed (negative)
# features are surfaced too instead of sinking to the bottom of the table;
# kurtosis is only used to break ties between equal skewness magnitudes.
sorted_summary = summary.sort_values(
    by=['Skewness', 'Kurtosis'],
    key=lambda col: col.abs() if col.name == 'Skewness' else col,
    ascending=False,
)
print(sorted_summary.head(10))  # top 10 most problematic columns by |skewness|, then kurtosis

                                          Skewness     Kurtosis
Fixed Assets to Assets                   82.577237  6814.000147
Current Ratio                            82.577237  6814.000147
Total income/Total expense               82.332424  6786.903523
Net Value Growth Rate                    80.291844  6540.116467
Contingent liabilities/Net worth         79.670620  6487.125425
Realized Sales Gross Profit Growth Rate  77.925109  6291.000429
Continuous Net Profit Growth Rate        67.097534  5392.615103
Total Asset Return Growth Rate Ratio     62.499961  5071.235869
Revenue per person                       59.434480  3568.408258
Quick Assets/Current Liability           47.947300  2305.178322


So the kurtosis is in the 6000s and the skewness is in the 80s. For reference, `scipy.stats.kurtosis` reports *excess* kurtosis by default, which is around 0 for a normal distribution (the non-excess value would be around 3), and skewness is around 0 for a symmetric distribution, with an absolute value above 1 generally considered highly skewed. These values are very high even for financial data from a good source, because they suggest heavy-tailed distributions far from normality, with extreme outliers. They could simply be outliers, but I googled and saw that the years covered by the data include the 1997 Asian financial crisis, the 2001 recession and the 2008 global recession that Taiwan faced. To be fair, high skewness in financial data makes sense given how assets and liabilities behave in the real world: the distributions are right-skewed, since a small number of firms hold most of the market value/revenue. Also, bankruptcies are rare events that the model must capture.

So things to do:
1. Find the key features with extreme values/skewness by reviewing feature importance.
2. Apply a log transformation to features with high skewness.
3. Use a tree-based model (random forest), which handles outliers and skewed data better.
4. Keep in mind that the target variable (bankruptcy) is likely highly imbalanced (i.e., most companies won't go bankrupt). This imbalance can itself introduce skewness into the dataset. We may need to use techniques like SMOTE (Synthetic Minority Over-sampling Technique) or class weighting in the model to account for the imbalance.

In [15]:
# Baseline: random forest with default class weights.

# Separate the predictors from the 'Bankrupt?' label.
x = df.drop(columns='Bankrupt?')
y = df['Bankrupt?']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified, so the share of bankrupt firms in the
# test set is left to chance -- consider stratify=y for such an imbalanced target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Standardise every feature to zero mean / unit variance. The statistics are learned
# from the training split only and then reused on the test split to avoid leakage.
# This rescales the features; it does not make them normally distributed.
# NOTE(review): tree-based models are generally insensitive to feature scaling,
# so this step is probably not needed for a random forest -- confirm before removing.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Build and train the forest in one go (fit returns the estimator itself).
model = RandomForestClassifier(n_estimators=10, random_state=42).fit(x_train_scaled, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(x_test_scaled)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Confusion matrix (rows: true class, columns: predicted class).
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

# Per-class precision, recall and f1-score.
print('\nClassification Report:', classification_report(y_test, y_pred), sep='\n')

Accuracy: 0.9655
Confusion Matrix:
[[1310    3]
 [  44    7]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1313
           1       0.70      0.14      0.23        51

    accuracy                           0.97      1364
   macro avg       0.83      0.57      0.61      1364
weighted avg       0.96      0.97      0.95      1364



So the accuracy looks good, but it is a bit misleading because the classes are very imbalanced (i.e. there are far more safe firms than bankrupt firms). The model is mostly just predicting the majority (safe) class correctly.

Problems:
1. There are 44 firms that were predicted to be safe but were actually bankrupt.
2. Only 70% precision for class 1, meaning only 70% of the firms predicted to be bankrupt were actually bankrupt.
3. Only 14% recall for class 1, meaning only 14% of the actually bankrupt firms were correctly predicted as bankrupt. This is very bad because the model is failing to identify most of the bankrupt companies.

In [16]:
# Let's try again:
# add class_weight='balanced' to the model and see how the results change.

In [17]:
# Second attempt: same pipeline as the baseline, but with class_weight='balanced'.

# Separate the predictors from the 'Bankrupt?' label.
x = df.drop(columns='Bankrupt?')
y = df['Bankrupt?']

# Same 80/20 split and seed as the baseline run, so the two runs are comparable.
# NOTE(review): the split is not stratified -- consider stratify=y for this
# imbalanced target.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Standardise every feature to zero mean / unit variance using statistics learned
# from the training split only. This rescales the features; it does not make them
# normally distributed.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# class_weight='balanced' asks scikit-learn to weight the classes so that the rare
# bankrupt class counts for more during training.
model = RandomForestClassifier(
    n_estimators=10,
    class_weight='balanced',
    random_state=42,
).fit(x_train_scaled, y_train)

# Predicted labels for the held-out rows.
y_pred = model.predict(x_test_scaled)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Confusion matrix (rows: true class, columns: predicted class).
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')

# Per-class precision, recall and f1-score.
print('\nClassification Report:', classification_report(y_test, y_pred), sep='\n')

Accuracy: 0.9670
Confusion Matrix:
[[1309    4]
 [  41   10]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1313
           1       0.71      0.20      0.31        51

    accuracy                           0.97      1364
   macro avg       0.84      0.60      0.65      1364
weighted avg       0.96      0.97      0.96      1364



There are small improvements (class 1 recall went from 14% to 20%), which is nice. Let's test out SMOTE.

In [None]:
# Third attempt: oversample the minority class with SMOTE before training.
# This cell was previously commented out even though its output and the discussion
# below depend on it; it is restored so that Restart & Run All reproduces the results.

# Separate the 'Bankrupt?' label from the predictors.
y = df['Bankrupt?']
x = df.drop('Bankrupt?', axis=1)

# Same 80/20 split and seed as the earlier runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Standardise features using statistics learned from the training split only.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Resample the TRAINING data only; the test set keeps its original class balance
# so the evaluation stays representative.
smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train_scaled, y_train)
print("Before SMOTE:", y_train.value_counts())
print("\n\nAfter SMOTE:", pd.Series(y_train_smote).value_counts())

# NOTE(review): this run uses n_estimators=100 while the two earlier runs used 10,
# so the change in metrics is not attributable to SMOTE alone.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train_smote, y_train_smote)

# Predict on the (un-resampled) test set.
y_pred = model.predict(x_test_scaled)

# Overall accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'\nAccuracy: {accuracy:.4f}')

# Confusion matrix and per-class precision / recall / f1-score.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Before SMOTE: Bankrupt?
0    5286
1     169
Name: count, dtype: int64


After SMOTE: Bankrupt?
0    5286
1    5286
Name: count, dtype: int64

Accuracy: 0.9567

Confusion Matrix:
[[1277   36]
 [  23   28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1313
           1       0.44      0.55      0.49        51

    accuracy                           0.96      1364
   macro avg       0.71      0.76      0.73      1364
weighted avg       0.96      0.96      0.96      1364



OK, so when we applied SMOTE, a few things happened.

1. The precision of class 1 dropped: from 70%, now only 44% of the firms predicted to be bankrupt were actually bankrupt.
2. The recall of class 1 increased: from 14%, it's now 55% of the actually bankrupt firms being correctly predicted as bankrupt.