# Start of mess around

In [1]:
from __future__ import print_function

import numpy as np 

import matplotlib.pyplot as plt   # use matplotlib for plotting with inline plots
plt.set_cmap('jet');
%matplotlib inline

import warnings
warnings.filterwarnings('ignore'); # for deprecated matplotlib functions

import math

import datetime
import time

import tensorflow as tf
from tensorflow.keras import Model, layers

from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE 
from imblearn.under_sampling import NearMiss 


# Interesting imports
In order to import mltools properly, we need to modify the module path that python thinks we have

Basically it adds everything 2 folders up onto our python module path, so referencing "mltools" python can see that it is two directories up

In [2]:
# We need to append the module path so we can work with
#     everything that is 2 folders up
import sys
import os
from pathlib import Path

path = Path(os.getcwd())
project_dir = str(path.parent.parent)

picture_path = str(path) + os.path.sep + "Pictures" + os.path.sep
model_path = str(path) + os.path.sep + "Models" + os.path.sep

if not os.path.exists(picture_path):
    os.mkdir(picture_path)
    
if not os.path.exists(model_path):
    os.mkdir(model_path)

curr_path = sys.path
data_path = ""


In [3]:
if project_dir not in curr_path:
    print("Appending'", project_dir, "'to path")
    sys.path.append(project_dir)
    print("Appending'", project_dir + os.path.sep + "data '", "to path")
    sys.path.append(project_dir + os.path.sep + "data")
    data_path = project_dir + os.path.sep + "data" + os.path.sep
    
# Uncomment this if youd like to see all your python module paths
# print(sys.path)

import mltools as ml

Appending' /home/chris/git/ML-FinalProject 'to path
Appending' /home/chris/git/ML-FinalProject/data ' to path


# Import the data

In [4]:
# print(sys.path)
# print(data_path)
X_train = np.genfromtxt(data_path + "X_train.txt", delimiter=None, skip_header=1)
Xtest = np.genfromtxt(data_path + "X_test.txt", delimiter=None, skip_header=1)
Y_train = np.genfromtxt(data_path + "Y_train.txt", delimiter=None, skip_header=1)
Data_Labels = np.genfromtxt(data_path + "X_train.txt", delimiter=None, dtype=str, max_rows=1)

Xtrain,Xval,Ytrain,Yval = ml.splitData(X_train,Y_train[:, 1], 0.75) # split data set 75/25

print("Data Labels with Column index:\n")
for i in range(len(Data_Labels)):
    print(str(i) + ": " + str(Data_Labels[i]))

X,Y =(X_train[1:],Y_train[1:,1:])

# Tells numpy not to print everything in scientific notation 
np.set_printoptions(suppress=True)

# print out the number of each class
(y_class, counts) = np.unique(Y, return_counts=True)
class_frequencies = np.asarray((y_class, counts)).T

print("Class Frequencies:\n", class_frequencies)

tot_dist = 0
for i in range(len(Ytrain)):
    if Ytrain[i] == 1:
        tot_dist += 1

neg = Ytrain.shape[0] - tot_dist
initial_bias = np.log([tot_dist/neg])

print("Bias:", initial_bias)


Data Labels with Column index:

0: UniqueID
1: SIZE
2: BIG4
3: GAAP
4: CR
5: QR
6: AT
7: CashR
8: NWC
9: CFOCL
10: OM
11: TM
12: CFM
13: ROE
14: ROA
15: GRIE
16: CFONI
17: CFOTL
18: CFOTR
19: CFOTA
20: EF
21: FAF
22: LTDE
23: CAT
24: DCOH
25: ACP
26: AAP
27: DR
28: DSCR
29: CFOLTL
30: TAT
31: FAT
32: DEPR
33: CFOFA
34: APP
35: 2008
36: 2009
37: 2010
38: 2011
39: 2012
40: 2013
41: 2014
42: 2015
43: 2016
44: 2017
45: 2018
Class Frequencies:
 [[   0. 4542.]
 [   1.  233.]]
Bias: [-2.96880144]


## Lets try TensorFlow

In [5]:
# Network Parameters
n_hidden_1 = 8 # 1st layer number of neurons
n_hidden_2 = 16 # 2nd layer number of neurons
n_hidden_3 = 16
n_hidden_4 = 8
        
# Parameters
learning_rate = 0.01    # alpha for gradient descent
num_steps = 4000       # iterations for gradient descent
batch_size = 128       # number of inputs to look at simultaneously (good for large data!)
display_step = 100     # when to print out some feedback


## Create our NN model, this inherits from TF

In [6]:
# Create TF Model. Our NeuralNet inherits from the generic TF Model class
class NeuralNet(Model):
    # Set layers.
    def __init__(self, num_classes, output_bias):
        super(NeuralNet, self).__init__()
        curr_activation = tf.nn.relu
        # First fully-connected hidden layer. Activation function (threshold function) is 
        # rectified linear.
#         output_bias=tf.constant_initializer(output_bias[0])
        output_bias = tf.zeros_initializer()
#         output_bias=tf.keras.backend.variable(output_bias[0])
        self.h1 = layers.Dense(n_hidden_1, activation=curr_activation)
        
        # First fully-connected hidden layer. Activation function (threshold function) is 
        # rectified linear.
        self.h2 = layers.Dense(n_hidden_2, activation=curr_activation)
        self.h3 = layers.Dense(n_hidden_3, activation=curr_activation)
        self.h4 = layers.Dense(n_hidden_4, activation=curr_activation)
        
        # Second fully-connecter hidden layer. Activation function is using "softmax" to
        # normalize output as a probability distribution over the 2 classes (digits 0 or 1)
        self.out = layers.Dense(num_classes, activation=tf.nn.softmax)

    # Set forward pass--this defines the input layer (h1) and output layer (out) and then
    # formats the final results as a probability distribution over the classes using softmax
    def call(self, x, is_training=False):
        x = self.h1(x)
        x = self.out(x)
        if not is_training:
            # tf cross entropy expect logits (positions on the logistic regression sigmoid)
            # without softmax normalization, so only apply softmax when not training.
            x = tf.nn.softmax(x)
        return x
           

## Specify Loss

In [7]:
# Cross-Entropy Loss.
# Note that this will apply 'softmax' to the logits as part of the function, so don't do
# it before calling.
def cross_entropy_loss(x, y):
    # Convert labels to int 64 for tf cross-entropy function.
    y = tf.cast(y, tf.int64)
    # Apply softmax to logits and compute cross-entropy.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=x)
    # Average loss across the batch.
    return tf.reduce_mean(loss)

# Accuracy metric. This counts how many of our predictions we get right based on 
# choosing the prediction with the highest probability. 
def accuracy(y_pred, y_true):
    # Predicted class is the index of highest score in prediction vector (i.e. argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)

## Attempt to train?

In [8]:
# Optimization process. 
def run_optimization(x, y, nn, optimizer):
    # Wrap computation inside a GradientTape for automatic differentiation 
    # (see Backprop.ipnb for an explanation of the GradientTape)
    with tf.GradientTape() as g:
        # Forward pass.
        pred = nn(x, is_training=True)
        # Compute loss.
        loss = cross_entropy_loss(pred, y)
        
    # Variables to update, i.e. trainable variables.
    trainable_variables = nn.trainable_variables

    # Compute gradients (backpropagation).
    gradients = g.gradient(loss, trainable_variables)
    
    # Update all of the weights W and biases (y-intercepts) b following gradients.
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    
def run_training(nn, training_data, optimizer):
    # Run training for the given number of steps.
    for step, (batch_x, batch_y) in enumerate(training_data.take(num_steps), 1):
        # Run the optimization to update W and b values.
        run_optimization(batch_x, batch_y, nn, optimizer)

        if step % display_step == 0 or step==1:
            pred = nn(batch_x, is_training=True)     
            loss = cross_entropy_loss(pred, batch_y)
            acc = accuracy(pred, batch_y)
#             print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))
#             print()

In [10]:
# cat1 = np.linspace(4, 9, 9-4+1, dtype=int)
# cat2 = np.linspace(10, 19, 19-10+1, dtype=int)
# cat3 = np.linspace(20, 22, 22-20+1, dtype=int)
# cat4 = np.linspace(23, 26, 26-23+1, dtype=int)
# cat5 = np.linspace(27, 29, 29-27+1, dtype=int)
# cat6 = np.linspace(30, 33, 33-30+1, dtype=int)
# cat7 = np.linspace(34, 34, 1, dtype=int)

cat1 = [6, 7, 4, 8, 5]
cat2 = [17, 14, 11]
cat3 = [20, 21]
cat4 = [24]
cat5 = [27]
cat6 = [30, 31]
cat7 = [34]

all_cats = [cat1, cat2, cat3, cat4, cat5, cat6, cat7]

for x in range(len(all_cats)):
    new_cat = np.append(all_cats[x], -1)
    all_cats[x] = new_cat


In [11]:
from IPython.display import clear_output

def test_nn(cols_using, bias):
    
    tf.keras.backend.clear_session()
    
    X_train, X_val = np.array(Xtrain[:, cols_using], np.float32), np.array(Xval[:, cols_using], np.float32)
    X_test = np.array(Xtest[:, cols_using], np.float32)
    
    sm = SMOTE(random_state = 2)
    
    Xtr_res, Ytr_res = sm.fit_sample(X_train, Ytrain.ravel())
    
    scale = StandardScaler().fit(Xtr_res)
    Xtr_res_scaled = scale.transform(Xtr_res)
    Xval_scaled = scale.transform(X_val)
        
    # Use tf.data API to shuffle and batch data (batches will make it faster to queue up 
    # sets of images for training all at one time--a convenient way to split up large data sets)
    oversample_data = tf.data.Dataset.from_tensor_slices((Xtr_res_scaled, Ytr_res.ravel()))

    oversample_data = oversample_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)
    
#     print("Columns we are currently using:", Data_Labels[cols_using])

    # Data Parameters
    num_features = len(cols_using)
    num_classes = 2 # We have 2 classes, yes or no

    # Build neural network model.
    neural_net_over_sample = NeuralNet( num_classes=num_classes, output_bias=bias)

    # Stochastic gradient descent optimizer.
    optimizer = tf.optimizers.SGD(learning_rate)

    run_training(neural_net_over_sample, oversample_data, optimizer)
    
    # Test model on validation set.
    pred = neural_net_over_sample(Xval_scaled, is_training=False)
    curr_acc = accuracy(pred, Yval)
#     print("Test Accuracy: %f" % accuracy(pred, Yval))
#     print()
    
    return float(curr_acc), scale, neural_net_over_sample
    

In [13]:
def test_options():
    best_accuracy = 0.0
    best_cols = []
    test_num = 1

    for a in all_cats[0]:
        for b in all_cats[1]:
    #         for c in all_cats[2]:
    #             for d in all_cats[3]:
    #                 for e in all_cats[4]:
    #                     for f in all_cats[5]:
    #                         for g in all_cats[6]:
            cols_using = [1, 2, 3]
            if a != -1:
                cols_using.append(a)
            if b != -1:
                cols_using.append(b)
            cols_using.append(20)
            cols_using.append(25)
            cols_using.append(27)
            cols_using.append(34)
    #                             if c != -1:
    #                                 cols_using.append(c)
    #                             if d != -1:
    #                                 cols_using.append(d)
    #                             if e != -1:
    #                                 cols_using.append(e)
    #                             if f != -1:
    #                                 cols_using.append(f)
    #                             if g != -1:
    #                                 cols_using.append(g)

            test_num += 1
            acc = test_nn(cols_using, initial_bias)[0]

            if (acc > best_accuracy):
                best_accuracy = acc
                best_cols = cols_using
                print("Found a new best accuracy:", best_accuracy)
                print("On test number:", test_num)
                print("Used columns:", best_cols)
                print()


    print("Best accuracy was: %i" % best_accuracy)
    print("It used the columgs:", best_cols)

test_options()
    

Found a new best accuracy: 0.7546063661575317
On test number: 2
Used columns: [1, 2, 3, 6, 17, 20, 25, 27, 34]

Found a new best accuracy: 0.7604690194129944
On test number: 3
Used columns: [1, 2, 3, 6, 14, 20, 25, 27, 34]

Found a new best accuracy: 0.876884400844574
On test number: 4
Used columns: [1, 2, 3, 6, 11, 20, 25, 27, 34]

Found a new best accuracy: 0.8835846185684204
On test number: 8
Used columns: [1, 2, 3, 7, 11, 20, 25, 27, 34]

Found a new best accuracy: 0.8886097073554993
On test number: 12
Used columns: [1, 2, 3, 4, 11, 20, 25, 27, 34]

Found a new best accuracy: 0.8978224396705627
On test number: 16
Used columns: [1, 2, 3, 8, 11, 20, 25, 27, 34]

Best accuracy was: 0
It used the columgs: [1, 2, 3, 8, 11, 20, 25, 27, 34]


## Test

In [15]:
ts = time.time()

st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d_%H:%M:%S')

submission_path = model_path + str(st) + os.path.sep
os.mkdir(submission_path)

temp_cols = [1, 2, 3, 8, 11, 20, 25, 27, 34]
acc, scaler, nn = test_nn(temp_cols, initial_bias)

XTrain, XVal = np.array(Xtrain[:, temp_cols], np.float32), np.array(Xval[:, temp_cols], np.float32)
XTest = np.array(Xtest[:, temp_cols], np.float32)

Xval_scaled = scaler.transform(XVal)
Xtest_scaled = scaler.transform(XTest)

# Test model on validation set.
pred = nn(Xval_scaled, is_training=False)
# pred = nn.predict(Xval_scaled)
print("Test Accuracy: %f" % accuracy(pred, Yval))
pred2 = nn(Xtest_scaled, is_training=False)

file = open(submission_path + "submission.txt","w+")
file.write("Unique ID,Dist\n")
num_distress = 0
num_dist = 0
for i in range(len(Xtest_scaled)):
    prediction = np.argmax(pred2.numpy()[i])
    file.write(str(int(Xtest[i, 0])) + "," + str(pred2.numpy()[i][1]) + "\n")
    num_distress += prediction

file.close()

# nn.save(submission_path + "model")

# n.save_model(submission_path + "model")

print("Num in distres: %i" %num_distress)

Test Accuracy: 0.905360
Num in distres: 3
