# Start of mess around

In [1]:
from __future__ import print_function

import numpy as np 

import matplotlib.pyplot as plt   # use matplotlib for plotting with inline plots
plt.set_cmap('jet');
%matplotlib inline

import warnings
warnings.filterwarnings('ignore'); # for deprecated matplotlib functions

import math

import tensorflow as tf
from tensorflow.keras import Model, layers

from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 


# Interesting imports
In order to import mltools properly, we need to modify the module path that python thinks we have

Basically it adds everything 2 folders up onto our python module path, so referencing "mltools" python can see that it is two directories up

In [2]:
# We need to append the module path so we can work with
#     everything that is 2 folders up
import sys
import os
from pathlib import Path

path = Path(os.getcwd())
project_dir = str(path.parent.parent)
curr_path = sys.path
data_path = ""


In [3]:
if project_dir not in curr_path:
    print("Appending'", project_dir, "'to path")
    sys.path.append(project_dir)
    print("Appending'", project_dir + os.path.sep + "data '", "to path")
    sys.path.append(project_dir + os.path.sep + "data")
    data_path = project_dir + os.path.sep + "data" + os.path.sep
    
# Uncomment this if youd like to see all your python module paths
# print(sys.path)

import mltools as ml

Appending' /home/chris/git/ML-FinalProject 'to path
Appending' /home/chris/git/ML-FinalProject/data ' to path


# Import the data

In [4]:
# print(sys.path)
# print(data_path)
X_train = np.genfromtxt(data_path + "X_train.txt", delimiter=None, skip_header=1)
X_test = np.genfromtxt(data_path + "X_test.txt", delimiter=None, skip_header=1)
Y_train = np.genfromtxt(data_path + "Y_train.txt", delimiter=None, skip_header=1)
Data_Labels = np.genfromtxt(data_path + "X_train.txt", delimiter=None, dtype=str, max_rows=1)

print("Data Labels with Column index:\n")
for i in range(len(Data_Labels)):
    print(str(i) + ": " + str(Data_Labels[i]))

X,Y =(X_train[1:],Y_train[1:,1:])

# Tells numpy not to print everything in scientific notation 
np.set_printoptions(suppress=True)

# print out the number of each class
(y_class, counts) = np.unique(Y, return_counts=True)
class_frequencies = np.asarray((y_class, counts)).T

print("Class Frequencies:\n", class_frequencies)



Data Labels with Column index:

0: UniqueID
1: SIZE
2: BIG4
3: GAAP
4: CR
5: QR
6: AT
7: CashR
8: NWC
9: CFOCL
10: OM
11: TM
12: CFM
13: ROE
14: ROA
15: GRIE
16: CFONI
17: CFOTL
18: CFOTR
19: CFOTA
20: EF
21: FAF
22: LTDE
23: CAT
24: DCOH
25: ACP
26: AAP
27: DR
28: DSCR
29: CFOLTL
30: TAT
31: FAT
32: DEPR
33: CFOFA
34: APP
35: 2008
36: 2009
37: 2010
38: 2011
39: 2012
40: 2013
41: 2014
42: 2015
43: 2016
44: 2017
45: 2018
Class Frequencies:
 [[   0. 4542.]
 [   1.  233.]]


## Lets try TensorFlow

In [5]:
# Network Parameters
n_hidden_1 = 8 # 1st layer number of neurons
n_hidden_2 = 16 # 2nd layer number of neurons

# Split the data
cols_using = [1, 2, 3, 4, 5, 7, 8, 22, 32]

Xtrain,Xval,Ytrain,Yval = ml.splitData(X_train[:, cols_using],Y_train[:, 1], 0.75) # split data set 75/25

Xtrain, Xval = np.array(Xtrain, np.float32), np.array(Xval, np.float32)
Xtest = np.array(X_test[:, cols_using], np.float32)


# Data Parameters
num_features = len(cols_using)
num_classes = 2 # We have 2 classes, yes or no

tot_dist = 0
for i in range(len(Ytrain)):
    if Ytrain[i] == 1:
        tot_dist += 1

# print("Total distress:", tot_dist)

# neg = Ytrain.shape[0] - tot_dist
# initial_bias = np.log([tot_dist/neg])

# print("Bias:", initial_bias)
        
# Parameters
learning_rate = 0.01    # alpha for gradient descent
num_steps = 6000       # iterations for gradient descent
batch_size = 128       # number of inputs to look at simultaneously (good for large data!)
display_step = 100     # when to print out some feedback

print("Columns we are currently using:", Data_Labels[cols_using])


Columns we are currently using: ['SIZE' 'BIG4' 'GAAP' 'CR' 'QR' 'CashR' 'NWC' 'LTDE' 'DEPR']


## Create our NN model, this inherits from TF

In [6]:
# Create TF Model. Our NeuralNet inherits from the generic TF Model class
class NeuralNet(Model):
    # Set layers.
    def __init__(self):
        super(NeuralNet, self).__init__()
        curr_activation = tf.nn.relu
        # First fully-connected hidden layer. Activation function (threshold function) is 
        # rectified linear.
#         output_bias=tf.constant_initializer(output_bias[0])
#         output_bias = tf.ones_initializer()
#         output_bias=tf.keras.backend.variable(output_bias[0])
        self.h1 = layers.Dense(n_hidden_1, activation=curr_activation)
        
        self.h2 = layers.Dropout(0.5)
        
        # First fully-connected hidden layer. Activation function (threshold function) is 
        # rectified linear.
        self.h3 = layers.Dense(n_hidden_2, activation=curr_activation)
        self.h4 = layers.Dropout(0.5)
        # Second fully-connecter hidden layer. Activation function is using "softmax" to
        # normalize output as a probability distribution over the 2 classes (digits 0 or 1)
        self.out = layers.Dense(num_classes, activation=tf.nn.softmax)

    # Set forward pass--this defines the input layer (h1) and output layer (out) and then
    # formats the final results as a probability distribution over the classes using softmax
    def call(self, x, is_training=False):
        x = self.h1(x)
        x = self.out(x)
        if not is_training:
            # tf cross entropy expect logits (positions on the logistic regression sigmoid)
            # without softmax normalization, so only apply softmax when not training.
            x = tf.nn.softmax(x)
        return x
           

## Specify Loss

In [7]:
# Cross-Entropy Loss.
# Note that this will apply 'softmax' to the logits as part of the function, so don't do
# it before calling.
def cross_entropy_loss(x, y):
    # Convert labels to int 64 for tf cross-entropy function.
    y = tf.cast(y, tf.int64)
    # Apply softmax to logits and compute cross-entropy.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=x)
    # Average loss across the batch.
    return tf.reduce_mean(loss)

# Accuracy metric. This counts how many of our predictions we get right based on 
# choosing the prediction with the highest probability. 
def accuracy(y_pred, y_true):
    # Predicted class is the index of highest score in prediction vector (i.e. argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)

## Attempt to train?

In [10]:
# Optimization process. 
def run_optimization(x, y, nn):
    # Wrap computation inside a GradientTape for automatic differentiation 
    # (see Backprop.ipnb for an explanation of the GradientTape)
    with tf.GradientTape() as g:
        # Forward pass.
        pred = nn(x, is_training=True)
        # Compute loss.
        loss = cross_entropy_loss(pred, y)
        
    # Variables to update, i.e. trainable variables.
    trainable_variables = nn.trainable_variables

    # Compute gradients (backpropagation).
    gradients = g.gradient(loss, trainable_variables)
    
    # Update all of the weights W and biases (y-intercepts) b following gradients.
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    
def run_training(nn, training_data):
    # Run training for the given number of steps.
    for step, (batch_x, batch_y) in enumerate(training_data.take(num_steps), 1):
        # Run the optimization to update W and b values.
        run_optimization(batch_x, batch_y, nn)

        if step % display_step == 0 or step==1:
            pred = nn(batch_x, is_training=True)     
            loss = cross_entropy_loss(pred, batch_y)
            acc = accuracy(pred, batch_y)
            print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))
            print()

In [11]:
# Create a dataset where we over sample our data
sm = SMOTE(random_state = 2) 

Xtrain_res, Ytrain_res = sm.fit_sample(Xtrain, Ytrain.ravel()) 

# scale the data so our different X features and their ranges don't confuse the learner
scale = StandardScaler().fit(Xtrain_res)
Xtrain_res_scaled = scale.transform(Xtrain_res)
Xval_scaled = scale.transform(Xval)
Xtest_scaled = scale.transform(Xtest)

# print out the number of each class
(y_class, counts) = np.unique(Ytrain_res, return_counts=True)
class_frequencies = np.asarray((y_class, counts)).T

print("Class Frequencies:\n", class_frequencies)
# Use tf.data API to shuffle and batch data (batches will make it faster to queue up 
# sets of images for training all at one time--a convenient way to split up large data sets)
oversample_data = tf.data.Dataset.from_tensor_slices((Xtrain_res_scaled, Ytrain_res.ravel()))

oversample_data = oversample_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)

# Build neural network model.
neural_net_over_sample = NeuralNet()

# Stochastic gradient descent optimizer.
optimizer = tf.optimizers.SGD(learning_rate)
            
run_training(neural_net_over_sample, oversample_data)



Class Frequencies:
 [[   0. 3407.]
 [   1. 3407.]]
step: 1, loss: 0.609179, accuracy: 0.687500

step: 100, loss: 0.598505, accuracy: 0.687500

step: 200, loss: 0.606754, accuracy: 0.679688

step: 300, loss: 0.552282, accuracy: 0.835938

step: 400, loss: 0.540153, accuracy: 0.859375

step: 500, loss: 0.550187, accuracy: 0.835938

step: 600, loss: 0.497482, accuracy: 0.914062

step: 700, loss: 0.534170, accuracy: 0.828125

step: 800, loss: 0.499187, accuracy: 0.851562

step: 900, loss: 0.500504, accuracy: 0.867188

step: 1000, loss: 0.486761, accuracy: 0.851562

step: 1100, loss: 0.460393, accuracy: 0.906250

step: 1200, loss: 0.496599, accuracy: 0.835938

step: 1300, loss: 0.487220, accuracy: 0.851562

step: 1400, loss: 0.453038, accuracy: 0.906250

step: 1500, loss: 0.459609, accuracy: 0.882812

step: 1600, loss: 0.463901, accuracy: 0.898438

step: 1700, loss: 0.468835, accuracy: 0.867188

step: 1800, loss: 0.440306, accuracy: 0.882812

step: 1900, loss: 0.497278, accuracy: 0.820312

s

## Test

In [14]:
# Test model on validation set.
pred = neural_net_over_sample(Xval_scaled, is_training=False)
print("Test Accuracy: %f" % accuracy(pred, Yval))
pred2 = neural_net_over_sample(Xtest_scaled, is_training=False)

file = open("submission.txt","w+")
num_distress = 0
num_dist = 0
for i in range(len(Xtest)):
    prediction = np.argmax(pred2.numpy()[i])
    file.write(str(int(X_test[i, 0])) + "," + str(pred2.numpy()[i][prediction]) + "\n")
    num_distress += prediction

file.close()
print("Num in distresnum_distress)

Test Accuracy: 0.824121
0
