In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

# Load the bankruptcy dataset and discard the company-name column in one step
# (the notebook notes treat that column as irrelevant to the model).
df = pd.read_csv("american_bankruptcy.csv").drop(columns = ["company_name"])

# Quick look at the first ten rows.
print(df.head(10))

  status_label  year       X1        X2      X3       X4       X5      X6  \
0        alive  1999  511.267   833.107  18.373   89.031  336.018  35.163   
1        alive  2000  485.856   713.811  18.577   64.367  320.590  18.531   
2        alive  2001  436.656   526.477  22.496   27.207  286.588 -58.939   
3        alive  2002  396.412   496.747  27.172   30.745  259.954 -12.410   
4        alive  2003  432.204   523.302  26.680   47.491  247.245   3.504   
5        alive  2004  474.542   598.172  27.950   61.774  255.477  15.453   
6        alive  2005  624.454   704.081  29.222   91.877  323.592  35.163   
7        alive  2006  645.721   837.171  32.199  118.907  342.593  58.660   
8        alive  2007  783.431  1080.895  39.952  168.522  435.608  75.144   
9        alive  2008  851.312  1110.677  40.551  166.080  477.424  78.651   

        X7         X8        X9       X10      X11      X12      X13      X14  \
0  128.348   372.7519  1024.333   740.998  180.447   70.658  191.226  1

- Import the libraries used throughout the notebook.
- Read the CSV file containing the bankruptcy prediction data into a pandas DataFrame.
- Drop the `company_name` column, since it is irrelevant to our model.

In [2]:
# 1. DATA PREPROCESSING

# Data Cleaning
# Gather both data-quality figures first, then report them together:
#   - missing_values: count of null entries in each column
#   - duplicates: number of rows that exactly repeat an earlier row

missing_values = df.isna().sum()
duplicates = df.duplicated().sum()

print("\nMissing values per column:\n", missing_values)
print("\nDuplicates in the dataset:", duplicates)


Missing values per column:
 status_label    0
year            0
X1              0
X2              0
X3              0
X4              0
X5              0
X6              0
X7              0
X8              0
X9              0
X10             0
X11             0
X12             0
X13             0
X14             0
X15             0
X16             0
X17             0
X18             0
dtype: int64

Duplicates in the dataset: 0


In [3]:
# Encode the target column: "alive" -> 0, "failed" -> 1.
LABEL_ENCODING = {"alive": 0, "failed": 1}

print("\nLabels in the dataset:", df["status_label"].unique())

# Series.map() turns any label missing from the dict into NaN, which would only
# surface later as an opaque integer-cast error. Check up front and report the
# offending values instead.
unmapped_labels = df.loc[~df["status_label"].isin(list(LABEL_ENCODING)), "status_label"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected status_label values (no encoding defined): {unmapped_labels.tolist()}")

# Build the encoded frame in a single non-mutating chain so a failure cannot
# leave `df` half-updated. The new column is appended last, then the original
# text label is removed.
df = (
    df.assign(status_encoding = df["status_label"].map(LABEL_ENCODING).astype(int))
      .drop(columns = ["status_label"])
)
print(df.head(10))


Labels in the dataset: ['alive' 'failed']
   year       X1        X2      X3       X4       X5      X6       X7  \
0  1999  511.267   833.107  18.373   89.031  336.018  35.163  128.348   
1  2000  485.856   713.811  18.577   64.367  320.590  18.531  115.187   
2  2001  436.656   526.477  22.496   27.207  286.588 -58.939   77.528   
3  2002  396.412   496.747  27.172   30.745  259.954 -12.410   66.322   
4  2003  432.204   523.302  26.680   47.491  247.245   3.504  104.661   
5  2004  474.542   598.172  27.950   61.774  255.477  15.453  127.121   
6  2005  624.454   704.081  29.222   91.877  323.592  35.163  136.272   
7  2006  645.721   837.171  32.199  118.907  342.593  58.660  181.691   
8  2007  783.431  1080.895  39.952  168.522  435.608  75.144  202.472   
9  2008  851.312  1110.677  40.551  166.080  477.424  78.651  227.300   

          X8        X9       X10      X11      X12      X13      X14      X15  \
0   372.7519  1024.333   740.998  180.447   70.658  191.226  163.816  20

In [None]:
# Data Scaling/Standardization

# Toggle for the exploratory plots at the bottom of this cell. They are off by
# default, matching the previous state where the plotting code was disabled.
SHOW_DISTRIBUTION_PLOTS = False

# Select the features by name (everything except the target) rather than by
# position, so X and y stay consistent even if the column order changes.
X = df.drop(columns = ["status_encoding"])
y = df["status_encoding"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 21, test_size = 0.20, shuffle = True)

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


def _scaled_frame(features_scaled, feature_names, target):
    """Return the scaled feature array as a DataFrame with the target appended as the last column.

    Parameters
    ----------
    features_scaled : array-like of shape (n_rows, n_features)
        Output of the scaler for one split.
    feature_names : sequence of str
        Column names for the scaled features.
    target : pandas.Series
        Target values for the same rows, in the same order.
    """
    frame = pd.DataFrame(features_scaled, columns = feature_names)
    # Assign by position (to_numpy) because the split keeps the original row
    # labels on `target`, while `frame` has a fresh 0..n-1 index.
    frame["status_encoding"] = target.to_numpy()
    return frame


train_scaled = _scaled_frame(X_train_scaled, X.columns, y_train)
test_scaled = _scaled_frame(X_test_scaled, X.columns, y_test)

# Both frames are indexed 0..n-1, so renumber on concatenation; otherwise
# df_scaled ends up with duplicate index labels.
df_scaled = pd.concat([train_scaled, test_scaled], axis = 0, ignore_index = True)
print(df_scaled.head(10))


def plot_distributions_by_status(data, target = "status_encoding"):
    """Draw a box plot for every feature, then a violin plot for every feature, grouped by `target`."""
    feature_columns = [column for column in data.columns if column != target]

    for column in feature_columns:
        plt.figure(figsize = (10, 5))
        sns.boxplot(x = target, y = column, data = data)
        plt.title(f'Box plot of {column} by Bankruptcy Status')
        plt.show()

    for column in feature_columns:
        plt.figure(figsize = (10, 5))
        sns.violinplot(x = target, y = column, data = data)
        plt.title(f'Violin plot of {column} by Bankruptcy Status')
        plt.show()


# The plotting code used to sit in a bare triple-quoted string, which Jupyter
# echoes as the cell's output because it is the last expression. It is now a
# helper behind an explicit flag.
if SHOW_DISTRIBUTION_PLOTS:
    sns.pairplot(df_scaled, hue = "status_encoding")
    plt.show()
    plot_distributions_by_status(df_scaled)