## Create Functions to be Used throughout Analysis

In [277]:
def create_decile(df,column_name):
    '''
    Function Created to do a simple Decile Analysis, Comparing Bankrupt and Non Bankrupt instance for a specific 
    Financial Ratio
    
    '''
    
    deciles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    decile_values = df[column_name].quantile(deciles)
    plt.hist([df[column_name][df['BANKRUPTCY_FLAG']==1],
              df[column_name][df['BANKRUPTCY_FLAG']==0]],
                 bins=decile_values, range=(-1, 1), alpha=0.5, label=['Bankrupt', 'Not Bankrupt'])
        
    plt.title(f"Decile Analysis: {column_name}")
    plt.legend()
    plt.show()
    
    return pd.DataFrame(decile_values).T.reset_index().rename(columns={'index':'Financial Ratio'})
    
def review_dataset_dict(df):
    
    '''
    Function Created to Review Dataset for purposes of EDA. Specifically as these are ratios, and through review
    there appeared to be a number of outliers, wanted to be able to simply and easily review not only distribution
    of data, but also simple statistical information about values in columns
    
    
    '''
    
    
    # Create Blank Dict to Store
    blank_dict = {}
    
    # Create Thresholds to Count
    bins = [-np.inf, -1, 0, 1, np.inf]
    labels = ['Value Less than -1%', 'Value between -1 to 0', 'Value Between 0 to 1', 'Value Greater than 1']
    thresh_df = pd.DataFrame()
    
    
    # Loop through columns
    for i in df.columns:
        
        # remove text created variables which crash Python
        if i in ['dataset']:
            pass
        
        temp_df = df[i].copy()
        
        blank_dict[i] = {'null_records':len(temp_df[temp_df.isnull()]),
                         'zero_records':len(temp_df[temp_df==0]),
                         'max_value':temp_df.max(),
                         'min_value':temp_df.min(),
                         'mean_value':temp_df.mean(),
                         'std_dev':temp_df.std(),
                         'positive_values':len(temp_df[temp_df>0]),
                         'negative_values':len(temp_df[temp_df<0])}
        
        record_count = pd.cut(df[i], bins=bins, labels=labels, right=False)
        
        thresh_df = pd.concat([thresh_df,pd.DataFrame(record_count.value_counts().values,index=record_count.value_counts().index,columns=[i]).T])
    
    final_df = pd.DataFrame(blank_dict.values(),index=blank_dict.keys())

    return final_df.merge(thresh_df,left_index=True,right_index=True).reset_index().rename(columns={'index':'Financial Ratio'})

def clean_string(string,remove_chars=['+','-',"(",")",'/','*']):
    
    '''
    Function to clean some of the punctuation which appears in Column Headers
    
    '''
    new_string = ""
    for char in string:
        if char not in remove_chars:
            new_string +=char
    return new_string

def create_heatmap(df,column_name='',corr_value=.1,figsize=(20,15)):
    
    sns.set(style='white')
    
    # View column with Abbreviated title or full. Abbreviated displays nicer.
    corr = df.corr()
    
    if len(column_name)!=0:
        corr = corr[[column_name]]
        corr = corr[abs(corr[column_name])>corr_value]
    
    mask= np.zeros_like(corr,dtype=bool)
    mask[np.triu_indices_from(mask)]=True
    f,ax = plt.subplots(figsize=figsize)
    cmap = sns.diverging_palette(220,10,as_cmap=True)
    sns.heatmap(corr,mask=mask,cmap=cmap,vmax=.3,center=0,square=True,linewidths=.5)
    
    plt.title('Heat Map of Correlation')
    plt.show()
    
    
def create_column_inclusion_review(df,
                               columns,
                               column_type_df='',
                               decile_review_df=''):
    
    df = df[columns].copy()
    
    temp_df = review_dataset_dict(df)
    
    if len(column_type_df)!=0:
        temp_df = temp_df.merge(column_type_df,on='Financial Ratio',how='left')
    if len(decile_review_df)!=0:
        temp_df = temp_df.merge(decile_review_df,on='Financial Ratio',how='left')
    
    return temp_df


def variable_review(df,
                    column_name,
                    og_column,
                    column_inclusion_review_df,
                    corr_weight=.15):
        
    print(column_inclusion_review_df[column_inclusion_review_df['Financial Ratio']==column_name].T)
    print(f"\n")
    
    create_heatmap(df[og_columns],column_name)
    create_decile(df,column_name)
    
    print('Top 20 Records')
    print(df.sort_values(column_name)[[column_name,'BANKRUPTCY_FLAG']].tail(20))
    
    print('Bottom 20 Records')
    print(df.sort_values(column_name)[[column_name,'BANKRUPTCY_FLAG']].head(20))
    
    return df[df[column_name].isnull()].T

# Process for reviewing Non Tier 1 Elements.

def review_single_variable_manully(df,
                                   column_name,
                                   baseline_columns,
                                   column_inclusion_review_df,
                                   og_columns,
                                   export_to_excel=0):
        
    import datetime
    now = datetime.datetime.now()
    
    # Currently Included Columns for Simple Reference
    print("Columns Currently In Scope:")
    for included in baseline_columns:
        print(f"{included}\n")
    
    # EDA
    variable_review(df,column_name,og_columns,column_inclusion_review_df)    
    print(f"Number of Null Records with Bankrupcy Flag Yes: {df[df[column_name].isnull()]['BANKRUPTCY_FLAG'].sum()}")
    
    # Review Questions
    blank_or_remove = input('Remove Null Records/ Zero Null Records/ Exit Loop (remove/zero/exit)')
    include_in_model = input('Subjective Belief as to whether variable should be included in model (include/exclude)')
    negative_value = input('Remove, Zero or Leave Negative Values (remove/zero/ignore)')
    decision_logic = input('Please Provide Comment on Decision for Archival Reference')
        
    record_df =  pd.DataFrame([blank_or_remove.lower(),include_in_model.lower(),negative_value.lower(),decision_logic.lower(),now],index=['Null Record Approach','Baseline V2 Model Inclusion',"Negative Valuation",'Archival Decisioning','Extract Time'],columns=[column_name]).T.reset_index().rename(columns={'index':'Financial Ratio'})
    
    if export_to_excel==1:
        clean_column_name = clean_string(column_name,remove_chars=['+','-',"(",")",'/','*']).replace(" ","_")
        record_df.to_excel(f"manual_review/manual_review_{clean_column_name}_{now.strftime('%d%m%y%h%m%s')}.xlsx",index=False)
            
    return record_df

def read_files_in_folder(folder_location,file_type='*',import_df=0):
    
    files_ = os.listdir(folder_location)
    
    if file_type =='*':
        files_desired = files_.copy()
    else:
        files_desired = [x for x in files_ if x.find('xlsx')>-1]
        
    if import_df ==1:
        final_df = pd.DataFrame()
        if file_type =='xlsx':
            pd_read = pd.read_excel
        elif file_type =='csv':
            pd_read = pd.read_csv      
        else:
            print('Update Function')              
        for file in files_desired:
            final_df = pd.concat([final_df,pd_read(f"{folder_location}/{file}")])
            
        return final_df
         
    return files_desired


def apply_standard_scaler(df,list_of_columns_to_scale=[]):
    '''
    Apply Standard Scaler, creating distribution with Mean 0 and Standard Deviation 1
    
    If list_of_columns is blank, applied to entire DataFrame.
    
    Otherwise, include list of variables you wish to transform, will return DF with transformed variables
    
    '''
    
    if len(list_of_columns_to_scale)==0:
        list_of_columns_to_scale = df.columns
    
    df = df[list_of_columns_to_scale].copy()
    
    
    scaler = StandardScaler()
    
    for i in list_of_columns_to_scale:
        new_column_name = f"{i}_scaled"
        df[new_column_name] = scaler.fit_transform(df[[i]])
        
    return df.drop(list_of_columns_to_scale,axis=1)

def apply_min_max_scaler(df,list_of_columns_to_scale=[]):
    '''
    Apply Min Max Scale, with 
    
    If list_of_columns is blank, applied to entire DataFrame.
    
    Otherwise, include list of variables you wish to transform, will return DF with transformed variables
    
    '''
    
    if len(list_of_columns_to_scale)==0:
        list_of_columns_to_scale = df.columns
    
    df = df[list_of_columns_to_scale].copy()
    
    scaler = MinMaxScaler()
    
    for i in list_of_columns_to_scale:
        new_column_name = f"{i}_scaled"
        df[new_column_name] = scaler.fit_transform(df[[i]])
        
    return df.drop(list_of_columns_to_scale,axis=1)

def text_manipulation(layer_list):
    '''
    Simple function to apply text manipulation to display layer size in Dataframe in single column
    '''
    str_layer = ""
    for layer in test:
        str_layer +=str(layer)+' '
    return str_layer


def build_binary_classification_model(input_dim, 
                                      hidden_layer_sizes,
                                      activation, 
                                      optimizer,
                                      learning_rate,
                                      metrics):

    """Build a binary classification model using Keras.

      Args:
        input_dim: Number of features in the input data.
        hidden_layer_sizes: A list with the number of units in each hidden layer.
        activation: The activation function to use for the hidden layers.
        optimizer: The optimizer
        learning_rate: The desired learning rate for the optimizer.

      Returns:
        model: A tf.keras model.
    """
    # Instantiate Model
    model = keras.models.Sequential()

    # Add Input Layer
    model.add(layers.InputLayer(input_shape=(input_dim,)))

    # Add Hidden Layers
    for nodes in hidden_layer_sizes:
        model.add(layers.Dense(units=nodes, activation=activation))

    # Add Output Layer
    model.add(layers.Dense(units=1, activation='sigmoid'))

    # Configure optimizer and compile the model
    if optimizer == 'sgd':
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    elif optimizer == 'adam':
        optimizer = keras.optimizers.legacy.Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=metrics)

    return model


def train_model(X,
                y,
                input_dim,
                metrics,
                hidden_layer_sizes,
                activation, 
                optimizer,
                learning_rate,
                batch_size,
                num_epochs,
                validation_split,
                verbose=0):
                       

    # Build the model.
    model = build_binary_classification_model(input_dim=input_dim,
                                              hidden_layer_sizes=hidden_layer_sizes,
                                              activation=activation, 
                                              optimizer=optimizer,
                                              learning_rate=learning_rate,
                                              metrics=metrics)
    
    print(model.summary())     
                        
    # Train the model.
    history = model.fit(x=X,
                        y=y,
                        batch_size=batch_size,
                        epochs=num_epochs,
                        validation_split=validation_split,
                        verbose=verbose)

    # Retrieve the training metrics (after each train epoch) and the final test
    # accuracy.
    train_accuracy = history.history['accuracy']
    val_accuracy = history.history['val_accuracy']
    plt.figure(figsize=(15,5))
    plt.plot(train_accuracy, label='train_accuracy')
    plt.plot(val_accuracy, label='validation accuracy')
    plt.xticks(range(num_epochs))
    plt.xlabel('Train epochs')
    plt.ylim(0, 1)
    plt.legend()
    plt.show()
    
 
    return history,model

def create_balanced_dataset(X,y,observations=0,column_name='BANKRUPTCY_FLAG'):
    
    '''
    Function to take observations and labels, combine them together and then select a even numher of random examples
    
    X - X_Test or X_Training
    y - y_test or y_training
    observation - Number of records from both Binary On and Binary Off Column
    column_name - Name of Binary Column to filter
    
    
    Used Default Value for purposes of reducing typing in code and given function created for Project Exclusively.
    
    '''
    
    # If length of observations is not defined, create a even 50/ 50 dataset
    if observations == 0:
        observations = y[column_name].sum()
        
    if observations>y[column_name].sum():
        observations = y[column_name].sum()
        
    temp_df = pd.concat([X,y],axis=1).copy()
    
    df1 = temp_df[temp_df[column_name]==1].sample(observations).copy()
    df2 = temp_df[temp_df[column_name]==0].sample(observations).copy()
    
    final_df = pd.concat([df1,df2])
    final_df = final_df.sample(frac=1)
    
    X = final_df.drop(column_name,axis=1)
    y = final_df[[column_name]]
    
    return X,y

In [2]:
# Create Dictionary to Track Progress and serve as reference throughout workbook
project_reference_dictionary = {'Goal':'The goal of our project is to determine whether we can create a model which predicts whether a company will go Bankruptm based on Financial Ratio.',
                                'Step 1. Import Data':'Pending',
                                'Step 2. Review Data':'Pending',
                                'Step 3. Clean Data':'Pending',
                                'Step 4. Determine which Data to Include':'Pending',
                                'Step 5. Identify a Baseline':'Pending',
                                'Step 6. Generate Perspective Models':'Pending',
                                'Step 7. Compare Models':'Pending',
                                'Step 8. Tune Models':'Pending',
                                'Step 9. Select Model':'Pending',
                                'Step 10. Validate Against Test Data Set':'Pending'}

project_reference_dictionary['additional_items'] = ['How does company Size Impact',
                                                    'How does company Industry Impact',
                                                   'How is the amount of debt being given consideration',
                                                   'Can we demonstrate how this model would save money']

### Data Source

##### https://archive.ics.uci.edu/dataset/365/polish+companies+bankruptcy+data

## Import Libraries

In [3]:
import pandas as pd
import numpy as np
from scipy.io import arff
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

from tensorflow import keras
from tensorflow.keras import layers

import os
import math
import seaborn as sns

pd.set_option('display.float_format', '{:.2f}'.format)

## Import Data 


In [4]:
df = pd.DataFrame()


# Read Annual Files downloaded from https://archive.ics.uci.edu/dataset/365/polish+companies+bankruptcy+data
for i in [x for x in os.listdir() if x.find('.arff')!=-1]:
    temp_df = pd.DataFrame(arff.loadarff(i)[0])
    temp_df['dataset']=i
    df = pd.concat([df,temp_df])

# Change Column Header Names to respective Financial Ratio, Excel Manually Compiled from Read Me.
column_data = pd.read_excel('polish_company_data.xlsx')
column_data['COLUMN_NAME'] = column_data['COLUMN_NAME'].apply(lambda x:x.replace('X','Attr'))
column_data.loc[64,'COLUMN_NAME']='class'
column_data.loc[64,'FINANCIAL_RATIO']='BANKRUPTCY_FLAG'

df = df.rename(columns={column_data['COLUMN_NAME'].tolist()[x]:column_data['FINANCIAL_RATIO'].tolist()[x] for x in range(len(column_data['COLUMN_NAME']))})

# Convert Bankruptcy Flag into INT.
df['BANKRUPTCY_FLAG'] = df['BANKRUPTCY_FLAG'].apply(lambda x:int(x))

# Create a list of original column names, supports visualizations utilized below
og_columns = [x for x in df.columns if x not in ['dataset']]
og_predict_columns = og_columns.copy()
og_predict_columns.remove('BANKRUPTCY_FLAG')


In [5]:
# Update Dictionary
project_reference_dictionary['Step 1. Import Data'] = "Completed"

## Initial Data Review

In [6]:
column_inclusion_review_df = create_column_inclusion_review(df,og_columns)
column_inclusion_review_df

Unnamed: 0,Financial Ratio,null_records,zero_records,max_value,min_value,mean_value,std_dev,positive_values,negative_values,Value Between 0 to 1,Value between -1 to 0,Value Less than -1%,Value Greater than 1
0,net profit / total assets,8,240,94.28,-463.89,0.04,2.99,33865,9292,33999,9106,186,106
1,total liabilities / total assets,8,89,480.96,-430.87,0.59,5.84,43305,3,41116,0,3,2278
2,working capital / total assets,8,24,28.34,-479.96,0.11,5.44,33804,9569,33797,9134,435,31
3,current assets / short-term liabilities,134,2,53433.00,-0.40,6.31,295.43,43267,2,9569,2,0,33700
4,[(cash + short-term securities + receivables -...,89,43,1250100.00,-11903000.00,-385.35,61243.03,21356,21917,319,254,21663,21080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,sales / receivables,102,122,108000.00,-12.66,17.03,553.05,43175,6,670,4,2,42627
61,(short-term liabilities *365) / sales,127,121,25016000.00,-2336500.00,1502.33,139266.70,43153,4,157,0,4,43117
62,sales / short-term liabilities,134,118,23454.00,-1.54,9.34,124.18,43149,4,1539,3,1,41728
63,sales / fixed assets,812,88,294770.00,-10677.00,72.79,2369.34,42500,5,3437,3,2,39151


### Identify Words in Colums to Determine potential Similarity

In [7]:
# Create a Dictionary of Words from Column Name to Thematically see what type of data we have available.
# First Review Identified a number of undesired Strings, created function to Clean.

column_data['CLEAN'] = column_data['FINANCIAL_RATIO'].apply(lambda x:clean_string(x))


# Count Occurances of Every Word in Column

word_dictionary = {}

for i in column_data['CLEAN']:
    word = i.split()
    for i in word:
        try:
            word_dictionary[i] +=1
        except:
            word_dictionary[i] =1
            
wd_df = pd.DataFrame(word_dictionary.values(),index=word_dictionary.keys(),columns=['word_count'])
wd_df.head()

Unnamed: 0,word_count
net,4
profit,23
total,33
assets,30
liabilities,24


### Incorporate Classification of Column into Initial Review

In [8]:
# Apply knowledge of financial ratios to determine ratios are likely to be similiar, and thus in potential duplication
# and to assist with determining which metrics to apply

column_type_df = pd.DataFrame(df.columns,columns=['Financial Ratio'])

condition = [column_type_df['Financial Ratio'].str.contains('working capital'),
             column_type_df['Financial Ratio'].str.contains('profit|EBIT|expenses|cost of'),
             column_type_df['Financial Ratio'].str.contains('assets|equity|retained|capital'),
             column_type_df['Financial Ratio'].str.contains('sales|receivables|inventory'),
             ]

values = ['Liqudity Ratio',
          'Profitability Ratio',
          'Capitalization Ratio',
          'Turnover Ratio']

column_type_df['Ratio Classification'] = np.select(condition,values,'Other')

column_inclusion_review_df = create_column_inclusion_review(df,
                                                            og_columns,
                                                            column_type_df)

column_inclusion_review_df

Unnamed: 0,Financial Ratio,null_records,zero_records,max_value,min_value,mean_value,std_dev,positive_values,negative_values,Value Between 0 to 1,Value between -1 to 0,Value Less than -1%,Value Greater than 1,Ratio Classification
0,net profit / total assets,8,240,94.28,-463.89,0.04,2.99,33865,9292,33999,9106,186,106,Profitability Ratio
1,total liabilities / total assets,8,89,480.96,-430.87,0.59,5.84,43305,3,41116,0,3,2278,Capitalization Ratio
2,working capital / total assets,8,24,28.34,-479.96,0.11,5.44,33804,9569,33797,9134,435,31,Liqudity Ratio
3,current assets / short-term liabilities,134,2,53433.00,-0.40,6.31,295.43,43267,2,9569,2,0,33700,Capitalization Ratio
4,[(cash + short-term securities + receivables -...,89,43,1250100.00,-11903000.00,-385.35,61243.03,21356,21917,319,254,21663,21080,Profitability Ratio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,sales / receivables,102,122,108000.00,-12.66,17.03,553.05,43175,6,670,4,2,42627,Turnover Ratio
61,(short-term liabilities *365) / sales,127,121,25016000.00,-2336500.00,1502.33,139266.70,43153,4,157,0,4,43117,Turnover Ratio
62,sales / short-term liabilities,134,118,23454.00,-1.54,9.34,124.18,43149,4,1539,3,1,41728,Turnover Ratio
63,sales / fixed assets,812,88,294770.00,-10677.00,72.79,2369.34,42500,5,3437,3,2,39151,Capitalization Ratio


### Visual Review of Data

In [None]:
# Remove Unwanted Columns for review purposes.

df1 = df.drop('dataset',axis=1)

# Histogram View

fig = plt.figure(figsize=(50,50))

plt.suptitle('Histograms for Variables',fontsize='20')

l = math.ceil(len(df1.columns)/4)

for i in range(len(df1.columns)):
    plt.subplot(l,4,i+1)
    f = plt.gca()
    f.set_title(df1.columns[i])
    
    plt.hist(df1.iloc[:,i])
    
plt.tight_layout(rect=[0,0.03,1,.95])

#plt.savefig("eda.pdf")

## Review Deciles of Data.

In [None]:
#### Due to variance of data, initial review of histograms difficult to understand exactly which variables are of importance and impact of outlining values, given relatively small occurances of many

decile_value_df = pd.DataFrame()

for i in df1.columns:
    if i == "BANKRUPTCY_FLAG":
        pass
    else:    
        decile_value_df = pd.concat([decile_value_df,create_decile(df1,i)])

In [11]:
column_inclusion_review_df = create_column_inclusion_review(df,
                                                            og_columns,
                                                            column_type_df,
                                                            decile_value_df)
column_inclusion_review_df.to_excel('column_inclusion_review_df.xlsx',index=False)
column_inclusion_review_df

Unnamed: 0,Financial Ratio,null_records,zero_records,max_value,min_value,mean_value,std_dev,positive_values,negative_values,Value Between 0 to 1,...,Ratio Classification,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9
0,net profit / total assets,8,240,94.28,-463.89,0.04,2.99,33865,9292,33999,...,Profitability Ratio,-0.09,-0.01,0.01,0.03,0.05,0.08,0.11,0.15,0.24
1,total liabilities / total assets,8,89,480.96,-430.87,0.59,5.84,43305,3,41116,...,Capitalization Ratio,0.13,0.22,0.31,0.39,0.47,0.55,0.64,0.74,0.88
2,working capital / total assets,8,24,28.34,-479.96,0.11,5.44,33804,9569,33797,...,Liqudity Ratio,-0.15,-0.02,0.06,0.13,0.20,0.27,0.36,0.46,0.60
3,current assets / short-term liabilities,134,2,53433.00,-0.40,6.31,295.43,43267,2,9569,...,Capitalization Ratio,0.71,0.96,1.14,1.33,1.57,1.90,2.42,3.32,5.53
4,[(cash + short-term securities + receivables -...,89,43,1250100.00,-11903000.00,-385.35,61243.03,21356,21917,319,...,Profitability Ratio,-134.63,-65.60,-36.54,-16.94,-1.03,16.05,37.02,68.51,145.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,sales / receivables,102,122,108000.00,-12.66,17.03,553.05,43175,6,670,...,Turnover Ratio,3.08,4.08,4.92,5.72,6.64,7.77,9.34,11.83,18.06
61,(short-term liabilities *365) / sales,127,121,25016000.00,-2336500.00,1502.33,139266.70,43153,4,157,...,Turnover Ratio,24.40,36.56,47.50,58.83,71.33,86.46,104.77,132.46,188.74
62,sales / short-term liabilities,134,118,23454.00,-1.54,9.34,124.18,43149,4,1539,...,Turnover Ratio,1.90,2.73,3.46,4.20,5.09,6.16,7.63,9.89,14.74
63,sales / fixed assets,812,88,294770.00,-10677.00,72.79,2369.34,42500,5,3437,...,Capitalization Ratio,1.15,1.84,2.53,3.30,4.28,5.63,7.89,12.73,29.65


## Review Correlations with Data.

In [None]:
df1.drop(['BANKRUPTCY_FLAG'],axis=1).corrwith(df.BANKRUPTCY_FLAG).plot.bar(figsize=(20,10),title='Correlation of Variables with Bankrupcy',rot=90,grid=True)
plt.show()

In [None]:
create_heatmap(df1.rename(columns={column_data.iloc[x]['FINANCIAL_RATIO']:column_data.iloc[x]['COLUMN_NAME'] for x in range(0,len(column_data))})) 

### After reviewing the initial EDA Histogram, Decile Analysis, Column Names and Correlations what do we know.


1. Considerable blanks on certain variables.
2. Correlation amongst variables appears to be a concern, could be a problem. Specifically, multicolinearity.
3. Outliers are skewing meaningful interpretation of Histogram. Utilized Decile to partially overcome, how do these impact model?

In [14]:
project_reference_dictionary['Step 2. Review Data'] = 'Completed'

## Data Clean Up.


In [15]:
# There are 2 approaches we can take, we can look at the Record Level, or we can look at the Variable Level.
# Given some assumptions related to the approach we will be taking, we will start at the variable level. Specifically, 
# Based on the number of blanks for certain variables, such as Sales, Inventory and Debt, we will likely be 
# removing a number of variables from the dataset and thus might have fewer omissions on the individual company level

In [16]:
# We will start our review at the Individual Variable Level and determine relative importance and whether it should be 
# retained or removed.


## Tier 1 Variables

In [17]:
# Based on Correlation with Bankrupcy we have identified 7 Variables, which we will prioritize as Tier1. 
# These variables both meet a intutive reference with what is believed and appear to mathematically be relevant.

tier1 = ['net profit / total assets',
         'total liabilities / total assets',
         'working capital / total assets',
         'retained earnings / total assets',
         'logarithm of total assets',
         'working capital',
         'short-term liabilities / total assets']

column_inclusion_review_df[column_inclusion_review_df['Financial Ratio'].isin(tier1)][['Financial Ratio','Ratio Classification']]

# Review Each of these through process: 
# 1. How to treat NA
# 2. How to Outliers
# 3. How to treat Similiar Variables
# 

Unnamed: 0,Financial Ratio,Ratio Classification
0,net profit / total assets,Profitability Ratio
1,total liabilities / total assets,Capitalization Ratio
2,working capital / total assets,Liqudity Ratio
5,retained earnings / total assets,Capitalization Ratio
28,logarithm of total assets,Capitalization Ratio
50,short-term liabilities / total assets,Capitalization Ratio
54,working capital,Liqudity Ratio


## Data Cleaning Process

In [18]:
def data_cleaning(df,
                  remove_records=0,
                  clean_records=0):
    
    new_df = df.copy()
   
    # Remove Data
    if remove_records==1:
        
        # There are 8 Instances of Profit / Assets NA. 7 Not Bankrupt, 1 Bankrupt. These 8 instances are all blanks amongst tier 1, remove. Low Risk to information loss        
        # Single Company with no entry, not a bankrupt company, company also has many other blanks. Low Risk of Information loss
        
        
        conditions = [new_df['net profit / total assets'].isnull(),
                     new_df['working capital'].isnull()]
                     
        values = ['Net Profit/ Total Assets is Null',
                  'Working capital is Null']
        
        new_df['remove_reason'] = np.select(conditions,values,"")
        
        print(new_df['remove_reason'].value_counts())
            
    
    if clean_records==1:
    # Ratio meant to capture the Profitability of Assets. If company does not make a profit then this ratio 
    # doesn't necessarily have the same negative implication. Curtail at 0.
        new_df['net profit / total assets'] = np.where(new_df['net profit / total assets']<0,0,new_df['net profit / total assets'])

    # total liabilities / total assets meant to capture the amount of debt relative to the value of company.
    # A negative ratio does not have meaninful interpretation. 
    
        new_df['total liabilities / total assets'] = np.where(new_df['total liabilities / total assets']<0,0,new_df['total liabilities / total assets'])
        
    # Ratio Meant to Capture the amount of assets which are readily available in the short term to pay debt.
    # A negative Ratio does not have a meaningul interpretation, 0 means the company has no money in the immediate to pay debt
        new_df['working capital / total assets'] = np.where(new_df['working capital / total assets']<0,0,new_df['working capital / total assets'])
        
    # Ratio meant to Capture what Percentage of Assets are financed by Retained Earnings (Versus Debt).
    # This ratio is slightly problematic, because Negative Retained Earnings is technically possible, which is 
    # a material warning sign, meaning the company is financed completely by Debt. Will Cap at 0, which implies 
    # that the company is financed 100% by debt, which is a accurate technical interpretation
        
        new_df['retained earnings / total assets'] = np.where(new_df['retained earnings / total assets']<0,0,new_df['retained earnings / total assets'])
        #new_df[''] = np.where(new_df['']<0,0,new_df[''])
        
    # Negative Logarithm implies a value of Assets between 0 and 1, which tantamounts to having no assets. 
    # equivalent of 0, given low frequency of examples, capping value at 0,
    
        new_df['logarithm of total assets'] = np.where(new_df['logarithm of total assets']<0,0,new_df['logarithm of total assets'])
        #new_df[''] = np.where(new_df['']<0,0,new_df[''])
        
    # Variance in Working Capital Material. Negative working Capital, similiar in meaning to no working capital, in 
    # that not having the ability to pay bill is very bad, and technically, negative assets and negative liabilties 
    # shouldn't exist, they should simply become the opposite, as such a negative assets is a liability, which 
    # reduces working capital, thus net negatives to 0.
    
    # also, given Huge variance, create a new variable, Logarithim in attempt to build importance of high number,
    # while keeping scale consistent.
        
        new_df['logarithim of working capital'] = np.where(new_df['working capital']<1,1,new_df['working capital'])
        new_df['logarithim of working capital'] = new_df['logarithim of working capital'].apply(lambda x:np.log(x))
    
    # Similar Logic, shouldn't be negative values, Higher more risky, removing 1 instance, which was a noted Bankrupt, although differences
    # to value is nominal.
    
        new_df['short-term liabilities / total assets'] = np.where(new_df['short-term liabilities / total assets']<0,0,new_df['short-term liabilities / total assets'])
 
    return new_df[new_df['remove_reason']==""],new_df


In [None]:
# Review Variable - Function which utilizes previously identifed views of data, allowing to focus on one particular element
variable_review(df,'net profit / total assets',og_columns,column_inclusion_review_df)

# Utilize Function to clean Dataframe
clean_df,df1 = data_cleaning(df,remove_records=1,clean_records=1)


# Update column_inclusion_review_df with Cleaned Information
column_inclusion_review_df = create_column_inclusion_review(clean_df,
                                                            og_columns,
                                                            column_type_df,
                                                            decile_value_df)

# Review Other Tier 1, Given inclusion of above, these will be already Cleaned
# If looking to replicate original review, utilize df, opposed to clean_df

variable_review(clean_df,'total liabilities / total assets',og_columns,column_inclusion_review_df)
variable_review(clean_df,'working capital / total assets',og_columns,column_inclusion_review_df)
variable_review(clean_df,'retained earnings / total assets',og_columns,column_inclusion_review_df)
variable_review(clean_df,'logarithm of total assets',og_columns,column_inclusion_review_df)
variable_review(clean_df,'working capital',og_columns,column_inclusion_review_df)
variable_review(clean_df,'short-term liabilities / total assets',og_columns,column_inclusion_review_df)

# Utilize Cleaned Dataframe

## Finalize Column By Column Review of all other elements for potential Inclusion

In [None]:
included_variables = tier1.copy()
included_variables.append('EBIT / total assets')

for i in clean_df.columns:
    if i in included_variables:
        pass
    elif i in ['current assets / short-term liabilities',
               '[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365',
              'book value of equity / total liabilities',
               'equity / total assets',
               'sales / total assets']:
        pass
    else:
        test = review_single_variable_manully(clean_df,
                                              i,
                                              tier1,
                                              column_inclusion_review_df,
                                              og_columns,
                                              export_to_excel=1)
        
        if len(test[test['Baseline V2 Model Inclusion']=='include'])>0:
            included_variables.append(test['Financial Ratio'].item())
            
            
# test = review_single_variable_manully(clean_df,
#                                '[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365',
#                                tier1,
#                                column_inclusion_review_df,
#                                og_columns,
#                                export_to_excel=1)

# reviewed_variables = read_files_in_folder('/Users/derekdewald/Documents/School/DATASCI207/Project/manual_review','xlsx',1)
# reviewed_variables

In [21]:
def create_binary_baseline_reference(df):
    
    df = df.copy()
    
    df['NOT_PROFITABLE'] = np.where(df['net profit / total assets']==0,1,0)
    df['NO_LIQUIDITY'] = np.where(df['logarithim of working capital']==0,1,0)
    df['LIABILITIES_GT_ASSETS'] = np.where(df['total liabilities / total assets']>1,1,0)
    df['NO_EQUITY'] = np.where(df['retained earnings / total assets']==0,1,0)
    df['ST_OBLIGATIONS_GT_TOTAL_ASSETS'] =np.where(df['short-term liabilities / total assets']>1,1,0)
    df['TOTAL_BINARY_FLAGS'] = df[['NOT_PROFITABLE','NO_LIQUIDITY','LIABILITIES_GT_ASSETS','NO_EQUITY','ST_OBLIGATIONS_GT_TOTAL_ASSETS']].sum(axis=1)
    
    df['TWO_BINARY_FLAGS'] = np.where(df['TOTAL_BINARY_FLAGS']==2,1,0)
    df['THREE_BINARY_FLAGS'] = np.where(df['TOTAL_BINARY_FLAGS']==3,1,0)
    df['FOUR_BINARY_FLAGS'] = np.where(df['TOTAL_BINARY_FLAGS']==4,1,0)
    df['FIVE_BINARY_FLAGS'] = np.where(df['TOTAL_BINARY_FLAGS']==5,1,0)
    
    df['NEVER_BANKRUPT'] = 0
    df['ALWAYS_BANKRUPT'] = 1
    
    return df

clean_df = create_binary_baseline_reference(clean_df)
clean_df

Unnamed: 0,net profit / total assets,total liabilities / total assets,working capital / total assets,current assets / short-term liabilities,[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365,retained earnings / total assets,EBIT / total assets,book value of equity / total liabilities,sales / total assets,equity / total assets,...,LIABILITIES_GT_ASSETS,NO_EQUITY,ST_OBLIGATIONS_GT_TOTAL_ASSETS,TOTAL_BINARY_FLAGS,TWO_BINARY_FLAGS,THREE_BINARY_FLAGS,FOUR_BINARY_FLAGS,FIVE_BINARY_FLAGS,NEVER_BANKRUPT,ALWAYS_BANKRUPT
0,0.20,0.47,0.24,1.52,-14.55,0.51,0.25,0.92,1.15,0.43,...,0,0,0,0,0,0,0,0,0,1
1,0.03,0.60,0.19,1.34,-37.86,0.00,0.04,0.68,0.32,0.40,...,0,1,0,1,0,0,0,0,0,1
2,0.26,0.30,0.67,3.22,71.80,0.00,0.32,2.33,1.68,0.70,...,0,1,0,1,0,0,0,0,0,1
3,0.23,0.68,0.04,1.08,-88.21,0.00,0.29,0.47,1.32,0.32,...,0,1,0,1,0,0,0,0,0,1
4,0.09,0.38,0.36,1.94,21.73,0.19,0.11,1.37,1.11,0.52,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7022,0.02,0.47,0.00,0.61,-18.45,0.02,0.02,0.97,1.01,0.46,...,0,0,0,1,0,0,0,0,0,1
7023,0.00,0.58,0.00,0.93,-42.23,0.00,-0.02,0.56,0.99,0.33,...,0,1,0,3,0,1,0,0,0,1
7024,0.01,0.50,0.44,1.87,9.74,0.01,0.01,0.98,1.01,0.49,...,0,0,0,0,0,0,0,0,0,1
7025,0.00,0.85,0.00,0.57,-121.92,0.00,-0.04,0.18,0.42,0.15,...,0,1,0,3,0,1,0,0,0,1


In [296]:
binary_columns = ['NOT_PROFITABLE', 'NO_LIQUIDITY','LIABILITIES_GT_ASSETS','ST_OBLIGATIONS_GT_TOTAL_ASSETS','NO_EQUITY']

baseline_columns = ['NOT_PROFITABLE', 'NO_LIQUIDITY','LIABILITIES_GT_ASSETS','ST_OBLIGATIONS_GT_TOTAL_ASSETS',
                    'NO_EQUITY','TWO_BINARY_FLAGS','THREE_BINARY_FLAGS','FOUR_BINARY_FLAGS','FIVE_BINARY_FLAGS',
                    'ALWAYS_BANKRUPT','NEVER_BANKRUPT']

def generate_baseline_analysis(df,
                               prediction_columns
                               ,prediction_rationale):
    
    df=df.copy()
    new_df = df.set_index('BANKRUPTCY_FLAG')[prediction_columns].reset_index()
    
    results = pd.DataFrame()
    test_dict = {}
    
    for count,i in enumerate(prediction_columns):
        
        pred_name = f"Prediction {count}: {i}"
        
        test_dict[pred_name] = {'1) Bankrupties Predicted':len(new_df[new_df[i]==1]),
                                '2) True Positives':len(new_df[(new_df[i]==1)*(new_df['BANKRUPTCY_FLAG']==1)]),
                                '3) True Negatives':len(new_df[(new_df[i]==0)*(new_df['BANKRUPTCY_FLAG']==0)]),
                                '4) False Positives':len(new_df[(new_df[i]==1)*(new_df['BANKRUPTCY_FLAG']==0)]),
                                "5) False Negatives":len(new_df[(new_df[i]==0)*(new_df['BANKRUPTCY_FLAG']==1)])}
        try:
            test_dict[pred_name]['6) Precision']=test_dict[pred_name]['2) True Positives']/(test_dict[pred_name]['2) True Positives']+test_dict[pred_name]['3) True Negatives'])
        except:
            test_dict[pred_name]['6) Precision']=0
        
        try:
            test_dict[pred_name]['7) Recall']=test_dict[pred_name]['2) True Positives']/(test_dict[pred_name]['2) True Positives']+test_dict[pred_name]['4) False Positives'])
        except:
            test_dict[pred_name]['7) Recall']=0
        try:
            test_dict[pred_name]['8) Accuracy']=(test_dict[pred_name]['2) True Positives']+test_dict[pred_name]['3) True Negatives'])/len(df)
        except:
            test_dict[pred_name]['8) Accuracy']=0
  
    return pd.DataFrame(test_dict)
    
generate_baseline_analysis(clean_df,baseline_columns,[])


Unnamed: 0,Prediction 0: NOT_PROFITABLE,Prediction 1: NO_LIQUIDITY,Prediction 2: LIABILITIES_GT_ASSETS,Prediction 3: ST_OBLIGATIONS_GT_TOTAL_ASSETS,Prediction 4: NO_EQUITY,Prediction 5: TWO_BINARY_FLAGS,Prediction 6: THREE_BINARY_FLAGS,Prediction 7: FOUR_BINARY_FLAGS,Prediction 8: FIVE_BINARY_FLAGS,Prediction 9: ALWAYS_BANKRUPT,Prediction 10: NEVER_BANKRUPT
1) Bankrupties Predicted,9531.0,9612.0,2277.0,1409.0,27656.0,7533.0,2688.0,730.0,1013.0,43396.0,0.0
2) True Positives,958.0,903.0,307.0,228.0,1575.0,539.0,350.0,71.0,179.0,2090.0,0.0
3) True Negatives,32733.0,32597.0,39336.0,40125.0,15225.0,34312.0,38968.0,40647.0,40472.0,0.0,41306.0
4) False Positives,8573.0,8709.0,1970.0,1181.0,26081.0,6994.0,2338.0,659.0,834.0,41306.0,0.0
5) False Negatives,1132.0,1187.0,1783.0,1862.0,515.0,1551.0,1740.0,2019.0,1911.0,0.0,2090.0
6) Precision,0.03,0.03,0.01,0.01,0.09,0.02,0.01,0.0,0.0,1.0,0.0
7) Recall,0.1,0.09,0.13,0.16,0.06,0.07,0.13,0.1,0.18,0.05,0.0
8) Accuracy,0.78,0.77,0.91,0.93,0.39,0.8,0.91,0.94,0.94,0.05,0.95


## Create Baseline Models Using Tier 1 Variables

1. Do the variables identified in Tier 1 provide a sufficient basis to attempt initial model review and creation?<br>
Based on identified ratios, they cover 3 of the 4 categories (Turnover not covered) <br>
We will need to potentially replace Working Capital with created Log variable <br>
We will need to see if our variable clean up was effective <br>
We will need to establish a baseline comparison and use our simple model to see if it can outperform more complex and exhaustive variables




In [309]:

def generate_model(X,
                   y,
                   model_reference,
                   ml_model,
                   scaling,
                   learning_rate,
                   optimizer,
                   activation,
                   batch_size,
                   epochs,
                   validation_split,
                   bankrupcy_observations,
                   network_hidden_dimensions=[8,16,32],
                   metrics=['accuracy', keras.metrics.Precision(),keras.metrics.Recall()],
                   verbose=0):
    
    
    '''
    X - X_test Data Set
    y - Data set labels
    model_reference(str) - Name of model for reference purpose
    ml_model(neural_network,___,____) - Type of ML Model to Applied
    scaling (None, min_max,standard_scalar) - To Determine what type of scaling to use on Dataset
    metrics = 
    learning_rate - Learning rate of Model
    optimizer ('sgd','adam') - Optimization Model Utilized
    loss_function - Loss Function applied to model
    network_hidden_dimensions - number of layers and nodes per layer to be added to model (default is to add 3 hidden layers of 8,16,32 nodes)
    batch_size - batch size of training (default 1000, if data set is less than 1000 it will maximize to size of data)
    epochs - number of iterations to run through testing (defaul 20)
    validation_splilt - percentage of test data to include in validation set (default 30%)
    bankrupcy_observations ('entire_dataset','all_bankrupcies',n) - Number of Bankrupt Observations
    
    '''
    
    # Given our current dataset is not evenly distributed, we must give consideration to how to appropriately select
    # our Dataset below partions the dataset
    
    if bankrupcy_observations == 'entire_dataset':
        pass
    elif bankrupcy_observations == 'all_bankrupcies':
        X,y = create_balanced_dataset(X,y)
    else:
        X,y = create_balanced_dataset(X,y,bankrupcy_observations)
    
    print(len(X))
    print(len(y))
    
    # Apply Scaling to DataFrame After Trimming Dataset
    if scaling.lower() == 'standard_scalar':
        X = apply_standard_scaler(X,X.columns.values)
    elif scaling.lower()=='min_max':
        X = apply_min_max_scaler(X,X.columns.values)
        
    if ml_model=='neural_network':
        history,model = train_model(X=X,
                                    y=y,
                                    input_dim=len(X.columns),
                                    metrics=metrics,
                                    hidden_layer_sizes=network_hidden_dimensions,
                                    activation=activation, 
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    batch_size=batch_size,
                                    num_epochs=epochs,
                                    validation_split=validation_split,
                                    verbose=verbose)
        
        model_results_df = pd.DataFrame(history.history)
        model_results_df['Model'] = model_reference
        model_results_df['activation'] = activation
        model_results_df['optimizer'] = optimizer
        model_results_df['learning_rate'] = learning_rate
        model_results_df['batch_size'] = batch_size
        model_results_df['epochs'] = epochs
        model_results_df['validation_split'] = validation_split
        model_results_df['hidden_layer_sizes'] = text_manipulation(network_hidden_dimensions)
        model_results_df['bankrupt_observations'] = bankrupcy_observations
        model_results_df['total_observations_read'] = len(X)
    
    elif ml_model=='gradient_descent':
        pass
    elif ml_model=='decision_tree':
        pass
    elif ml_model=='knn':
        pass
    
    return history,model,model_results_df,model_results_df.tail(1)

In [334]:
og_columns                                            

['net profit / total assets',
 'total liabilities / total assets',
 'working capital / total assets',
 'current assets / short-term liabilities',
 '[(cash + short-term securities + receivables - short-term liabilities) / (operating expenses - depreciation)] * 365',
 'retained earnings / total assets',
 'EBIT / total assets',
 'book value of equity / total liabilities',
 'sales / total assets',
 'equity / total assets',
 '(gross profit + extraordinary items + financial expenses) / total assets',
 'gross profit / short-term liabilities',
 '(gross profit + depreciation) / sales',
 '(gross profit + interest) / total assets',
 '(total liabilities * 365) / (gross profit + depreciation)',
 '(gross profit + depreciation) / total liabilities',
 'total assets / total liabilities',
 'gross profit / total assets',
 'gross profit / sales',
 '(inventory * 365) / sales',
 'sales (n) / sales (n-1)',
 'profit on operating activities / total assets',
 'net profit / sales',
 'gross profit (in 3 years) / 

In [353]:
from itertools import product

# In order to generate, need to fillna for values which have not been cleaned as part of cleaned_df

dataset_dictionary= {'All Observations Cleaned':X_all_data[og_predict_columns].fillna(0),
                     'Tier 1 Observations, Actual Values':X_tier1_values,
                     'Tier 1 Observatiosn, Binary Approximations':X_tier1_binary}

model_selection_dict = {
    'dataset':['All Observations Cleaned','Tier 1 Observations, Actual Values','Tier 1 Observatiosn, Binary Approximations'],
    'data_standardizations':['none','min_max','standard_scalar'],
    'bankrupcy_observations':[1000,'all_bankrupcies','entire_dataset'],
    'ml_model':['neural_network'],
    'activation_function': ['relu', 'tanh', 'sigmoid'],
    'optimizer': ['adam', 'sgd'],
    'learning_rate': [0.01, 0.05],
    'batch_size': [100,1000],
    'epochs': [ 10, 20],
    'validation_split': [0.3],
    'verbose': [0]}

model_tuning_dict = {
    'data_standardizations':['none','min_max','standard_scalar'],
    'ml_model':['neural_network'],
    'bankrupcy_observations':[100,500,1000,'max','entire_dataset'],
   'activation_function': ['relu', 'tanh', 'sigmoid'],
    'optimizer': ['adam', 'sgd'],
    'learning_rate': [0.001, 0.01, 0.05, 0.1],
    'batch_size': [10,100,1000],
    'epochs': [5, 10, 20],
    'validation_split': [0.1, 0.2, 0.3],
    'verbose': [0]}

# Generate all combinations of options

model_selection_list = list(product(*model_selection_dict.values()))
model_tuning_list = list(product(*model_tuning_dict.values()))

print(f"Model Selection Records: {len(model_selection_list)}")
print(f"Model Tuning Records: {len(model_tuning_list)}")


# model_selection_dict = {
#     'data_standardizations':['none','min_max','standard_scalar'],
#     'bankrupcy_observations':[1000,'all_bankrupcies','entire_dataset'],
#     'ml_model':['neural_network'],
#     'activation_function': ['relu', 'tanh', 'sigmoid'],
#     'optimizer': ['adam', 'sgd'],
#     'learning_rate': [0.01, 0.05],
#     'batch_size': [100,1000],
#     'epochs': [ 10, 20],
#     'validation_split': [0.3],
#     'verbose': [0]}

# model_tuning_dict = {
#     'data_standardizations':['none','min_max','standard_scalar'],
#     'ml_model':['neural_network'],
#     'bankrupcy_observations':[100,500,1000,'max','entire_dataset'],
#    'activation_function': ['relu', 'tanh', 'sigmoid'],
#     'optimizer': ['adam', 'sgd'],
#     'learning_rate': [0.001, 0.01, 0.05, 0.1],
#     'batch_size': [10,100,1000],
#     'epochs': [5, 10, 20],
#     'validation_split': [0.1, 0.2, 0.3],
#     'verbose': [0]}

Model Selection Records: 1296
Model Tuning Records: 9720


In [2]:
all_models_restult_history = pd.DataFrame()
all_models_final_results = pd.DataFrame()
count = 0

run = input('Do you really want to run the below, it will take a minimum of 15 minutes')

if run.lower() != 'yes':
    for hyper_param in model_selection_list:
        history, model,model_results_df,model_final_result = generate_model(X=dataset_dictionary[hyper_param[0]],
                                                                            y=y_train,
                                                                            model_reference=dataset,
                                                                            scaling=hyper_param[1],
                                                                            bankrupcy_observations=hyper_param[2],
                                                                            ml_model=hyper_param[3],
                                                                            activation=hyper_param[4],
                                                                            optimizer=hyper_param[5],
                                                                            learning_rate=hyper_param[6],
                                                                            batch_size=hyper_param[7],
                                                                            epochs=hyper_param[8],
                                                                            validation_split=hyper_param[9],
                                                                            network_hidden_dimensions=[8,16,32,64])

        all_models_restult_history = pd.concat([all_models_restult_history,model_results_df])
        all_models_final_results = pd.concat([all_models_final_results,model_final_result])
        print(count)
        count+=1

        
# history, model,model_results_df,model_final_result = generate_model(X=X3,
#                                                                     y=y3,
#                                                                     model_reference='Test',
#                                                                     ml_model='neural_network',
#                                                                     network_hidden_dimensions=[8,16,32,64],
#                                                                     scaling='standard_scalar',
#                                                                     learning_rate=.0001,
#                                                                     optimizer='adam',
#                                                                     activation='relu',
#                                                                     batch_size=1000,
#                                                                     epochs=100,
#                                                                     validation_split=.3,
#                                                                     bankrupcy_observations=1000)

In [None]:
GD
Decision Tree
KNN
NN