# Final AI Tools Project
by Vinny and Jawadul

In [1]:
#Imports go here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from scipy.stats import zscore

# Data Collection
The data that me and my partner are using is a dataset that shows different companies financial data and if they went bankrupt or not. This data came from kaggle and is mostly mostly numeric, with the status label being catagorical. There is also a column for the Company name, but this was not useful for the overall process.

# KNN

# Research Question
How well can a KNN accuratly predict if a company will go bankrupt using only the current assests and the net sales of the company.

# Hypothesis
I believe that a Knn with a K of 5 and a distance measurement of euclidean will achieve at least 90% accuracy when evaluating the overall accuracy.

# Data Preprocessing
Overall most of the data is numeric, and there are only 2 columns that are text based, and one that is catagorical data. The two text columns are the company name and year, and the catagorical feature is the status_label.

There are no null values, so there is no need no do anything to replace them. We will use a box plot on the dataframe to she the amount of outliers on either side, and to fix the outliers We will use the Z-score standardization approach.

We will also do One-Hot encoding on the status_label so that the KNN can read the status label

# Experiment Design
We will split the data for the KNN using the train test split method, which uses the holdout method of data splitting.
We will perform hyperparameter tuning by using a different range of k values and ploting them with their accuracies. We will do two graphs, one with euclidian distance, and one with cosine to decide the distance parameter. We will evaluate the accuracy of the model using the overall accuracy, since it is a good metric to judge the model, however The bigger class will have to be undersampled so that the class imbalance problem is fixed.

# Experiment begins here

In [2]:
#Load the data
df = pd.read_csv("american_bankruptcy.csv")
new_cols = {"X1":"Current_assets", 'X2': 'Cost_of_goods_sold', 'X3':'Depreciation_and_Amortization', 'X4': 'Financial_Performance', 'X5': 'Inventory', 'X6':'Net_Income','X7':'Total_Receivables','X8':'Market_Values','X9':'Net_Sales','X10':'Total_Assets','X11':'Total_longTerm_Debt','X12':'EBIT','X13':'Gross_profit','X14':'Current_Liabilities', 'X15':'Retained_Earnings','X16':'Total_Revenue','X17':'Total_Liabilities','X18':'Operation_Costs'}
df.rename(columns = new_cols, inplace=True)
df.head()

Unnamed: 0,company_name,status_label,year,Current_assets,Cost_of_goods_sold,Depreciation_and_Amortization,Financial_Performance,Inventory,Net_Income,Total_Receivables,...,Net_Sales,Total_Assets,Total_longTerm_Debt,EBIT,Gross_profit,Current_Liabilities,Retained_Earnings,Total_Revenue,Total_Liabilities,Operation_Costs
0,C_1,alive,1999,511.267,833.107,18.373,89.031,336.018,35.163,128.348,...,1024.333,740.998,180.447,70.658,191.226,163.816,201.026,1024.333,401.483,935.302
1,C_1,alive,2000,485.856,713.811,18.577,64.367,320.59,18.531,115.187,...,874.255,701.854,179.987,45.79,160.444,125.392,204.065,874.255,361.642,809.888
2,C_1,alive,2001,436.656,526.477,22.496,27.207,286.588,-58.939,77.528,...,638.721,710.199,217.699,4.711,112.244,150.464,139.603,638.721,399.964,611.514
3,C_1,alive,2002,396.412,496.747,27.172,30.745,259.954,-12.41,66.322,...,606.337,686.621,164.658,3.573,109.59,203.575,124.106,606.337,391.633,575.592
4,C_1,alive,2003,432.204,523.302,26.68,47.491,247.245,3.504,104.661,...,651.958,709.292,248.666,20.811,128.656,131.261,131.884,651.958,407.608,604.467


In [None]:
#Check for any null values
df.info()

In [None]:
# check for outliers in the selected data for experimentation
num_df = df[['Current_assets','Net_Sales']]
num_df.head()
num_df.boxplot()

In [None]:
#Standardize the data
num_df = num_df.apply(zscore)
num_df.head()

In [None]:
#Grab the labels and do one-hot-encoding
labels = df['status_label'].apply(lambda x: 1 if x == 'alive' else (0 if x =='failed' else x))
labels.value_counts()

In [None]:
num_df['labels'] = labels
num_df.head()

In [None]:
# grab training data with the alive label
alive_df = num_df[num_df['labels'] == 1]
print(alive_df.head())
failed_df = num_df[num_df['labels'] == 0]
failed_df.head()

In [None]:
print(alive_df.size)
print(failed_df.size)

In [None]:
alive_labels_df = num_df.sample(n = failed_df.size)
alive_labels_df.head()
final_df = pd.concat([alive_labels_df, failed_df], ignore_index = True)
final_df.head()

In [None]:
# Split the labels from the final df and get the training and testing datasets
labels = final_df['labels']
final_df.head()
final_df.drop(columns = ['labels'], inplace=True)
X_train, X_test, y_train, y_test = train_test_split(final_df, labels, random_state = 42)

In [None]:
#Run the model and do hyper parameter tuning.
def run_knn_metrics(X_train, y_train, X_test, y_test, k=5, dm = 'euclidean'):
    knn = KNeighborsClassifier(n_neighbors=k, metric = dm)
    knn.fit(X_train, y_train)
    training_acc = knn.score(X_train, y_train)
    testing_acc = knn.score(X_test, y_test)
    pred = knn.predict(X_test)
    print(confusion_matrix(y_test, pred))
    return training_acc, testing_acc

In [None]:
k_values = range(1, 50, 1)
training_accuracy_euclid = []
testing_accuracy_euclid = []

training_accuracy_cosine = []
testing_accuracy_cosine = []

for i in range(len(k_values)):
    accs = run_knn_metrics(X_train, y_train, X_test, y_test, k = k_values[i], dm = 'euclidean')
    training_accuracy_euclid.append(accs[0])
    testing_accuracy_euclid.append(accs[1])
    
for i in range(len(k_values)):
    accs = run_knn_metrics(X_train, y_train, X_test, y_test, k = k_values[i], dm = 'cosine')
    training_accuracy_cosine.append(accs[0])
    testing_accuracy_cosine.append(accs[1])

In [None]:
plt.plot(k_values, training_accuracy_euclid, testing_accuracy_euclid)
plt.xlabel('K values')
plt.ylabel('Training/Testing Accuracy')
plt.title('Accuracy using Euclidian Distance')
plt.show()

plt.plot(k_values, training_accuracy_cosine, testing_accuracy_cosine)
plt.xlabel('K values')
plt.ylabel('Training/Testing Accuracy')
plt.title('Accuracy using Cosine Distance')
plt.show()

In [None]:
#final run with Chosen Parameters
accs = run_knn_metrics(X_train, y_train, X_test, y_test, k = 45, dm = 'euclidean')
print('Accuracy of K-NN classifier on training set: {:.2f}'
         .format(accs[0]))
print('Accuracy of K-NN classifier on test set: {:.2f}'
         .format(accs[1]))

# Why We chose these parameters
We chose these parameters as they were the best in terms of distance between the training and testing accuracy. However the accuracy values themselves are not very good.

# Conclusions
It would seem that a KNN is not a good machine learning algorithm for predicting bankruptcy. The algorithm could not reach above a 70 on the training data. Some potential limitations are that there are many reasons behind why a company goes bankrupt, A potential change would be to use more of the features from the dataset instead of just two from the dataset. It may also be good to use a different type of accuracy such as sensitivity as even with undersampling, the overall accuracy is still not very good.

# K-Means 

# Research question
How well can a K means clustering algorithm group bankrupted and non bankrupted companies using the entire chosen data set.

# Hypothesis 
We believe that the K means with a K value of 2, which matches the number of labels, will achive a 89% using metrics such as the homogeneity score.

# Data Preprocessing
The data preprocessing will be very similar to the KNN preprocessing, just using all of the numerical data rather than two specific columns. The standardization method we will use for K-means will be min max normalization to see if the method of standardization will have an affect on the accuracy. We will also use one-hot encoding to convert the status labels to either 0 or 1. We also will do undersampling to make sure that the dataset is even.

# Experiment Design
We will split the training data using the train test split method, which uses the holdout method. The hyperparameter tuning will involve us using the Silhouette Coefficient to find the optimal value for K. We chose this method since it combines both Between clusters sum of squares and Within clusters sum of squares. The accuracy we will be testing is the homogenuity score since we are experimenting to see how well the model can classify if a business went bankrupt or not.

# Experiment starts here

In [3]:
# Since the outlier plot and the checking for null values was don for KNN, we will just more straight to the preprocessing
df.drop('company_name', axis = 'columns', inplace=True)
labels = df['status_label']
df.drop("status_label", axis = "columns",inplace = True)

In [4]:
min_max_df = df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))

In [5]:
labels = labels.apply(lambda x: 1 if x == 'alive' else (0 if x =='failed' else x))

In [6]:
labels.value_counts()

status_label
1    73462
0     5220
Name: count, dtype: int64

In [7]:
min_max_df['labels'] = labels

In [8]:
min_max_df.drop('year', axis = 'columns', inplace=True)

In [9]:
alive_min_max = min_max_df[min_max_df['labels'] == 1]
failed_min_max = min_max_df[min_max_df['labels'] == 0]

In [12]:
print(failed_min_max.size)
print(min_max_df.size)

99180
1494958


In [13]:
alive_min_max = min_max_df.sample(n = failed_min_max.size, replace = True)
final_min_max = pd.concat([alive_min_max, failed_min_max], ignore_index = True)
final_min_max.head()

Unnamed: 0,Current_assets,Cost_of_goods_sold,Depreciation_and_Amortization,Financial_Performance,Inventory,Net_Income,Total_Receivables,Market_Values,Net_Sales,Total_Assets,Total_longTerm_Debt,EBIT,Gross_profit,Current_Liabilities,Retained_Earnings,Total_Revenue,Total_Liabilities,Operation_Costs,labels
0,0.000328,0.001082,8.6e-05,0.211541,9.6e-05,0.484985,0.000154,1.3e-05,0.003957,0.000207,1.383458e-07,0.266847,0.135931,0.000102,0.203052,0.003957,3.5e-05,0.000774,1
1,0.017798,0.003892,0.003342,0.215808,0.000975,0.486112,0.003024,0.015548,0.00957,0.008246,1.383458e-07,0.270447,0.147464,0.007436,0.207525,0.00957,0.002858,0.00584,1
2,7.3e-05,0.000986,4e-06,0.211387,4.7e-05,0.484934,7e-06,1.2e-05,0.003831,9e-06,1.383458e-07,0.266707,0.135751,5.4e-05,0.20267,0.003831,2.6e-05,0.000673,0
3,0.040418,0.024357,0.016743,0.234748,0.029169,0.49528,0.042773,0.029283,0.033824,0.033059,0.02131743,0.286732,0.177626,0.041723,0.227463,0.033824,0.036446,0.02762,1
4,7.8e-05,0.000984,1.6e-05,0.211382,0.0,0.484927,1.4e-05,2e-06,0.003832,1.3e-05,1.383458e-07,0.266698,0.13576,2.3e-05,0.202878,0.003832,8e-06,0.000675,1


# TODO Model and hyperparameter tuning and final observations