In [1]:
import pandas as pd
import re
from collections import Counter
import math
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the Kaggle bankruptcy data from CSV files in the working directory.
# The values are already normalized and already split into test and training data.
# `train` carries the `class` label column; `test_x` presumably holds features only — TODO confirm.
test_x = pd.read_csv('bankruptcy_Test_X.csv')
train = pd.read_csv('bankruptcy_Train.csv')

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the training columns
train.describe()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.007954,-0.00714,-0.003544,0.005005,-0.004687,0.001455,-0.006963,0.009053,0.006763,0.007456,...,0.009804,0.005208,-0.009751,-0.002655,0.004548,0.014331,-0.006864,0.02075,-0.003984,0.0203
std,1.396405,1.40944,1.015494,0.90802,1.39475,1.286713,1.412509,1.064426,1.171199,1.407349,...,0.01737,0.718682,0.026124,0.467139,1.252571,1.17116,0.277288,1.115182,0.930875,0.141032
min,-9.474787,-140.604555,-25.597146,-0.381641,-138.720013,-26.249562,-141.176615,-0.966015,-1.29434,-2.528495,...,-1.657288,-42.381245,-0.037484,-3.21849,-0.048598,-0.412525,-0.022204,-0.743297,-0.050969,0.0
25%,-0.079776,-0.055604,-0.516971,-0.192306,0.006798,-0.016047,-0.008018,-0.445667,-0.388184,-0.068584,...,0.009315,0.003199,-0.010766,-0.02274,-0.043418,-0.221237,-0.016847,-0.371384,-0.041603,0.0
50%,-0.019204,0.000246,0.003186,-0.117972,0.007302,-0.016047,0.00428,-0.26015,-0.307842,0.000253,...,0.009838,0.010013,-0.009924,-0.020438,-0.039928,-0.139653,-0.013821,-0.180311,-0.034319,0.0
75%,0.081218,0.056957,0.585722,0.022705,0.007753,0.016026,0.024222,0.098874,0.156638,0.070036,...,0.010693,0.0199,-0.009416,-0.014401,-0.03182,-0.00221,-0.009476,0.133244,-0.020507,0.0
max,137.557872,2.004925,3.324327,60.82546,11.556238,121.354736,0.623095,47.504463,62.936396,140.123299,...,0.036727,37.405312,2.551674,27.969785,117.341069,59.139158,23.97025,62.52724,90.774695,1.0


In [4]:
# Another data source that unfortunately doesn't use the same columns, so we don't think we can use it.
# NOTE(review): `other` is not referenced by any later cell shown in this notebook — consider dropping this cell.
other = pd.read_csv('data.csv')

In [5]:
#Metrics function we can use to evaluate our models
def metrics(y, y_pred):
    '''
    Compute binary-classification metrics from actual and predicted labels.

    The confusion-matrix counts are tallied directly with numpy, so the function
    works even when only one class is present and needs no extra import.

    Parameters:
    y, y_pred (Pandas Series or any 1-D array-like): actual labels (y) and predicted
        outcomes (y_pred). A value equal to 1 is the positive class (bankrupt);
        every other value is counted as negative.

    Returns:
    dict with keys 'Accuracy', 'Sensitivity', 'Specificity', 'Precision' and 'f1 score',
    each rounded to 4 decimals. A ratio whose denominator is zero is reported as nan.

    Raises:
    ValueError: if y and y_pred do not have the same number of elements
    '''
    actual = np.asarray(y).ravel()
    predicted = np.asarray(y_pred).ravel()
    if actual.shape != predicted.shape:
        raise ValueError(
            f'y and y_pred must have the same length, got {actual.size} and {predicted.size}')

    # Tally the four confusion-matrix cells
    actual_pos = actual == 1
    predicted_pos = predicted == 1
    TP = int(np.sum(actual_pos & predicted_pos))
    TN = int(np.sum(~actual_pos & ~predicted_pos))
    FP = int(np.sum(~actual_pos & predicted_pos))
    FN = int(np.sum(actual_pos & ~predicted_pos))

    def ratio(numerator, denominator):
        # An undefined ratio (0 denominator) is reported as nan instead of warning or raising
        return numerator / denominator if denominator else float('nan')

    # Accuracy Calcs
    accuracy = ratio(TN + TP, TN + FP + FN + TP)

    # Sensitivity Calcs
    # TP (1,1)/ (TP (1,1) + FN (1,0)) <- wherever y actual = 1
    sensitivity = ratio(TP, TP + FN)

    # Specificity Calc
    specificity = ratio(TN, TN + FP)

    # Precision Calc
    precision = ratio(TP, TP + FP)

    # F1 Score Calc
    # Same value as 2 * precision * sensitivity / (precision + sensitivity), but written
    # in terms of the counts so it is 0 (not nan) when there are no true positives.
    f1_score = ratio(2 * TP, 2 * TP + FP + FN)

    return {'Accuracy': round(accuracy, 4), 'Sensitivity': round(sensitivity, 4),
            'Specificity': round(specificity, 4), 'Precision': round(precision, 4), 'f1 score': round(f1_score, 4)}

In [6]:
# Check how many companies in the training data went bankrupt
# 0 = did not go bankrupt, 1 = went bankrupt
# The saved output shows a heavy imbalance (203 bankrupt out of 10000), so accuracy alone is a weak score here
Counter(train['class'].tolist())

Counter({0: 9797, 1: 203})

## Naive Bayes (not going to use)
Although our project proposal suggested Naive Bayes as a way to classify our data, we decided against it. We wanted to use as many of the attributes in our data as possible, and with over 60 attributes it would be difficult to calculate the required probabilities. In theory, we could split the data into smaller subsets of attributes, but we did not think that would give an accurate enough classification.

## K-Nearest Neighbors

In [7]:
# Ethan's knn function from HW 3
from heapq import nlargest # library to get the keys of the largest values in a dict
def knn(vector_list, speakers, k, function):
    '''
    Leave-one-out k-nearest-neighbours classification.

    Every vector is compared against every other vector with `function`, the k
    vectors with the LARGEST scores are taken as its neighbours, and the label
    that occurs most often among those neighbours becomes the guess.

    Parameters:
    vector_list (iterable): the vectors to classify; any length is accepted
    speakers (indexable): speakers[i] is the label of the i-th vector
    k (int): number of neighbours that vote
    function (callable): function(a, b) -> score where a larger score means more
        similar (e.g. cosine similarity). For a distance such as euclidean, pass
        a function that returns the negated distance.

    Returns:
    list with one guessed label per vector, in input order. When several labels
    tie, the label of the closest tied neighbour wins.

    Raises:
    ValueError: if a vector has no neighbours to vote (k < 1 or only one vector)
    '''
    vectors = list(vector_list)
    guesses = list()
    # Loop through our vectors
    for idx, vector in enumerate(vectors):
        # Score against all other vectors, skipping idx so we don't compare against itself
        similarity = {
            other_idx: function(vector, other)
            for other_idx, other in enumerate(vectors)
            if other_idx != idx
        }

        # get the largest scores, which are the most similar for cosine similarity
        k_largest = nlargest(k, similarity, key=similarity.get)
        if not k_largest:
            raise ValueError('knn needs k >= 1 and at least two vectors to find a neighbour')

        s_list = [speakers[i] for i in k_largest] # find the label for each of the nearest neighbors
        # Majority vote; Counter keeps first-seen order for ties, so the result is deterministic
        guess = Counter(s_list).most_common(1)[0][0]
        guesses.append(guess) # append our guess to the list
    return guesses

## Neural Network

Another approach, inspired by https://www.kaggle.com/code/karthik7395/binary-classification-using-neural-networks/notebook

make sure to run these if not already installed:
- pip install keras
- pip install tensorflow --user

In [13]:
from keras.models import Sequential
from keras.layers import Dense
from keras import optimizers

In [29]:
# Hyperparameters for the neural network
hidden_units=100  # NOTE(review): not used by the layer-building cell, which hard-codes 64 units
learning_rate=0.01  # learning rate handed to the SGD optimizer
hidden_layer_act='tanh'  # activation used by both hidden layers
output_layer_act='sigmoid'  # activation used by the single output unit
no_epochs=100  # number of epochs handed to model.fit

# Start from an empty model; the layers are added in the next cell
model = Sequential()

In [30]:
# Two hidden layers of 64 units on top of 64 input features, then one output unit.
# NOTE(review): the 64s are hard-coded, so `hidden_units` has no effect on this model.
# model.add changes `model` in place: re-running this cell without re-creating `model` stacks three more layers.
model.add(Dense(64, input_shape=(64,), activation=hidden_layer_act))
model.add(Dense(64, activation=hidden_layer_act))
model.add(Dense(1, activation=output_layer_act))
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_15 (Dense)            (None, 64)                4160      
                                                                 
 dense_16 (Dense)            (None, 64)                4160      
                                                                 
 dense_17 (Dense)            (None, 1)                 65        
                                                                 
Total params: 8,385
Trainable params: 8,385
Non-trainable params: 0
_________________________________________________________________


In [31]:
# SGD optimizer using the `learning_rate` hyperparameter
sgd=optimizers.SGD(learning_rate=learning_rate)
# Binary cross-entropy loss for the 0/1 target; accuracy ('acc') is reported while training
model.compile(loss='binary_crossentropy',optimizer=sgd, metrics=['acc'])

In [32]:
# NOTE(review): train_x and train_y are not defined in any cell shown above — confirm the cell that creates them still exists.
# batch_size = len(train) requests one batch as large as the whole training frame, so each epoch is a single gradient step (the log shows 1/1).
model.fit(train_x, train_y, epochs = no_epochs, batch_size = len(train), verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6912 - acc: 0.6204 - 312ms/epoch - 312ms/step
Epoch 2/100
1/1 - 0s - loss: 0.6834 - acc: 0.6341 - 16ms/epoch - 16ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6757 - acc: 0.6474 - 15ms/epoch - 15ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6682 - acc: 0.6599 - 15ms/epoch - 15ms/step
Epoch 5/100
1/1 - 0s - loss: 0.6608 - acc: 0.6743 - 16ms/epoch - 16ms/step
Epoch 6/100
1/1 - 0s - loss: 0.6535 - acc: 0.6842 - 16ms/epoch - 16ms/step
Epoch 7/100
1/1 - 0s - loss: 0.6463 - acc: 0.6944 - 15ms/epoch - 15ms/step
Epoch 8/100
1/1 - 0s - loss: 0.6392 - acc: 0.7033 - 16ms/epoch - 16ms/step
Epoch 9/100
1/1 - 0s - loss: 0.6323 - acc: 0.7131 - 16ms/epoch - 16ms/step
Epoch 10/100
1/1 - 0s - loss: 0.6255 - acc: 0.7228 - 15ms/epoch - 15ms/step
Epoch 11/100
1/1 - 0s - loss: 0.6188 - acc: 0.7326 - 15ms/epoch - 15ms/step
Epoch 12/100
1/1 - 0s - loss: 0.6122 - acc: 0.7431 - 14ms/epoch - 14ms/step
Epoch 13/100
1/1 - 0s - loss: 0.6057 - acc: 0.7547 - 14ms/epoch - 14ms/step
Epoch 14/100
1/1 - 

<keras.callbacks.History at 0x23a78a089a0>

In [34]:
# Run the trained model on the test features; the single sigmoid output unit gives one value per row
predictions = model.predict(test_x)



In [36]:
# Turn each model output into a hard 0/1 label by rounding; x[0] is the single output value for that row
rounded = [int(round(x[0])) for x in predictions]

In [38]:
# Count how many test rows were predicted as each class (0 = not bankrupt, 1 = bankrupt)
Counter(rounded)

Counter({0: 4969, 1: 31})