# Supervised ML: Company Bankruptcy Prediction

In this project, I will complete a classification task. The dataset I will look at is [the Company Bankruptcy Prediction dataset from Kaggle](https://www.kaggle.com/fedesoriano/company-bankruptcy-prediction), which contains bankruptcy data collected by the Taiwan Economic Journal for the years 1999–2009. My goal is to predict whether a business will go bankrupt.

In [1]:
# import block
import matplotlib.pyplot as plt
%matplotlib widget

import pandas as pd
import numpy as np
from numpy import linalg as LA
import math
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.decomposition import PCA

from svm_plot import *
from coding_3 import ten_fold, split_val_data

## The Dataset

In [2]:
# Load the dataset. The CSV is parsed once, with pandas; according to the
# preview below, the first column is the `Bankrupt?` label and the remaining
# columns are the features.
data_pd = pd.read_csv("data.csv", sep=",")
# NumPy copy of the same table. Derived from the DataFrame instead of parsing
# the file a second time with np.genfromtxt.
# NOTE(review): this matches the genfromtxt result only while every column is
# numeric (as the preview suggests) — confirm if the file changes.
data = data_pd.to_numpy()

print(data.shape)
data_pd.head()

(6819, 96)


Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [3]:
# Separate the label (first column) from the predictors, then standardize each
# predictor column with preprocessing.scale (zero mean, unit variance).
# NOTE(review): the scaling statistics are computed over every row, i.e. before
# the validation split performed further down, so held-out rows influence them.
target = data_pd.iloc[:, 0]
features = data_pd.iloc[:, 1:]
features_std = preprocessing.scale(features)

print(features_std.shape)
# Keep the original feature names on the preview so each standardized column
# can still be identified (a bare ndarray would be labelled 0..94).
pd.DataFrame(features_std, columns=features.columns).head()

(6819, 95)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,-2.217909,-2.045798,-2.400361,-0.383334,-0.382638,0.016456,-0.023516,-0.020165,-0.087479,-0.031281,...,-2.254317,-0.049491,-0.084274,-0.383465,-0.861611,0.680171,-0.060031,-0.098978,0.0,-0.622067
1,-0.673828,-0.311068,-0.59845,0.135068,0.136307,0.014671,0.014799,0.015966,-0.005957,0.009823,...,-0.309033,-0.049491,-0.021388,0.135253,-0.029804,0.240685,15.129816,0.364559,0.0,-0.535573
2,-1.303672,-0.90842,-1.31991,-0.383759,-0.388177,0.007858,-0.061111,-0.051183,-0.142238,-0.086566,...,-0.82051,-0.049491,-0.005984,-0.383698,-0.24982,0.679247,-0.062961,-0.125015,0.0,-0.62196
3,-1.735886,-1.636209,-1.55634,-1.441418,-1.441832,-0.004257,-0.017313,-0.008676,-0.024491,-0.011077,...,-1.691222,-0.049491,-0.080226,-1.441526,-0.392864,0.093765,-0.053903,-0.052562,0.0,-0.471826
4,-0.661778,-0.307745,-0.50805,-0.541238,-0.540708,0.016758,0.013703,0.016186,-0.013254,0.0133,...,-0.316008,-0.049491,-0.032052,-0.541255,-0.029564,-0.128014,-0.178037,0.776395,0.0,-0.241715


## Dimension Reduction with PCA

The data has 6819 rows. Apart from the first column, `Bankrupt?`, which is the target, there are 95 features that I can use to predict it. That is a large number of features and may slow down the computation significantly. Therefore, I will first use PCA to reduce the dimension of this dataset.

In [4]:
# Project the standardized features onto their first three principal
# components; three dimensions are kept so the data can be drawn as a 3-D scatter.
# random_state pins the solver's random number generator: depending on the
# scikit-learn version and the data shape, PCA may pick the randomized SVD
# solver, and without a seed a fresh "Restart & Run All" is not guaranteed to
# reproduce exactly the same projection.
pca = PCA(n_components=3, random_state=0)
features_lowdim = pca.fit_transform(features_std)
features_lowdim_pd = pd.DataFrame(data=features_lowdim, columns=['pc1', 'pc2', 'pc3'])

print(features_lowdim_pd.shape)
features_lowdim_pd.head()

(6819, 3)


Unnamed: 0,pc1,pc2,pc3
0,-7.338295,0.373302,0.30918
1,-2.703713,-0.986342,2.155691
2,-4.307059,-0.404691,0.3094
3,-5.830736,-1.385625,1.30161
4,-2.343197,-0.201782,-0.806667


In [5]:
# 3-D scatter of the three principal components, coloured by the label.
# With the 'bwr' colormap the lowest label value is drawn blue and the highest red.
fig = plt.figure(figsize=(7, 7))
ax = plt.axes(projection='3d')
ax.scatter(features_lowdim[:, 0], features_lowdim[:, 1], features_lowdim[:, 2], c=target, cmap='bwr')
# Label the axes and title the figure so it can be read on its own.
ax.set_xlabel('pc1')
ax.set_ylabel('pc2')
ax.set_zlabel('pc3')
ax.set_title('Companies projected onto the first three principal components');

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## SVM & 10-fold Cross Validation

In [6]:
# Hold out a validation set; the remaining rows are the ones passed to
# ten_fold() in the cells below.

# Stack label and principal components side by side: column 0 is the label,
# columns 1-3 are pc1..pc3.
all_data = np.hstack([target.to_numpy().reshape(-1, 1), features_lowdim])

# NOTE(review): the meaning of the second argument (15) is defined by
# split_val_data in coding_3 and is not visible here — confirm.
# This also rebinds `data`, which previously held the raw array loaded above.
val, data = split_val_data(all_data, 15)

val_target, val_feature = val[:, 0], val[:, 1:]
data_target, data_feature = data[:, 0], data[:, 1:]

In [7]:
# Quick look at the labels and features left after the validation split,
# separated by one blank line.
print(data_target, data_feature, sep="\n\n")

[1. 1. 0. ... 0. 0. 0.]

[[-2.34319743 -0.20178248 -0.80666731]
 [-6.68185674 -0.77414995  0.92769037]
 [-5.85596797  0.08219254  0.62865305]
 ...
 [ 1.91030715 -1.69373744  1.41040138]
 [ 1.85426189 -0.39794146  0.48828528]
 [ 1.23594186 -2.32947265  0.46657912]]


In [8]:
# Apply ten_fold() (imported from coding_3) to get the cross-val error for RBF kernel SVM,
# using the non-validation rows produced by the split above.
# NOTE(review): how ten_fold aggregates the per-fold errors into the single
# number shown below is not visible in this notebook — confirm in coding_3.
kernel = "rbf"
rbf_crossval_error = ten_fold(data_feature, data_target, kernel)
rbf_crossval_error

0.036415321225839535

In [9]:
# Apply ten_fold() to get the cross-val error for polynomial (degree=2) kernel SVM
kernel = "poly"
# `poly_crossval_error` is reassigned by the other polynomial-degree cells, so
# the degree-2 result is also kept under its own name for later comparison.
poly_crossval_error = poly2_crossval_error = ten_fold(data_feature, data_target, kernel, degree=2)
poly_crossval_error

In [None]:
# Apply ten_fold() to get the cross-val error for polynomial (degree=3) kernel SVM
kernel = "poly"
# `poly_crossval_error` is reassigned by the other polynomial-degree cells, so
# the degree-3 result is also kept under its own name for later comparison.
poly_crossval_error = poly3_crossval_error = ten_fold(data_feature, data_target, kernel, degree=3)
poly_crossval_error

In [None]:
# Apply ten_fold() to get the cross-val error for polynomial (degree=4) kernel SVM
kernel = "poly"
# `poly_crossval_error` is also assigned by the other polynomial-degree cells,
# so the degree-4 result is additionally kept under its own name.
poly_crossval_error = poly4_crossval_error = ten_fold(data_feature, data_target, kernel, degree=4)
poly_crossval_error

In [None]:
# Apply ten_fold() (imported from coding_3) to get the cross-val error for linear kernel SVM,
# on the same non-validation rows used for the other kernels.
# NOTE(review): how ten_fold aggregates the per-fold errors is not visible in
# this notebook — confirm in coding_3.
kernel = "linear"
linear_crossval_error = ten_fold(data_feature, data_target, kernel)
linear_crossval_error