# All Models on Subsets Dataset
 - We suspect that our modified dataset will yield better results than the 'raw' normalized dataset. In this notebook we will investigate this

In [1]:
#required imports and dependencies

import pandas as pd
import numpy as np 
import warnings 
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from keras import Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM, Flatten
from keras.metrics import *
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



#our own confusion matrix module that we developed
import cmatrix as cm

#load data into data frames
xlsPath = r'C:\Users\Daniel Patrick\Documents\uniwork\ada\ada2\Subsets.xlsx'
data = pd.read_excel(xlsPath)

#split into data and class variable
X = data.drop(['date','price increase tomorrow?'], axis =1)
class_var = data['price increase tomorrow?']


#perform normalization on data 
min_max_scaler = preprocessing.MinMaxScaler()
for column in X:
    X[column] = min_max_scaler.fit_transform(X[column].values.reshape(-1,1))

#implementation of the wavelet transform
import pywt
import statistics
for column in X:
    coeff = pywt.wavedec(X[column], "haar", level=10)
    sigma = statistics.median(coeff[-1])/0.6745
    threshold = sigma*np.sqrt(2*np.log(len(X[column])))
    coeff[1:] = (pywt.threshold(i, value=threshold) for i in coeff[1:])
    X[column] = pywt.waverec(coeff, "haar")

#select dimensions to reduce to 
inputDims = 3
attributes = SelectKBest(chi2, k=inputDims).fit_transform(X,class_var)


#Create a train test split
split_number = round(len(data)*0.97)
train_attributes,train_class_var,test_attributes,test_class_var = \
attributes[:split_number],class_var[:split_number], \
attributes[split_number:],class_var[split_number:]



Using TensorFlow backend.


In [2]:
def sequentialNN(train_attributes,train_class_var,test_attributes,test_class_var):
    
    model = Sequential()
    # The imput dim here is the number of cols in the df getting fed into the model
    model.add(Dense(64, input_dim=inputDims, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])


    model.summary()

    model.fit(train_attributes, train_class_var,
             epochs=250, verbose=0, batch_size=128)

    #the predict_classes method returns the binary variable we're looking for
    # for some reason it doesn't output the correct array type, made a 2D array
    pred = model.predict_classes(test_attributes)

    #the flatten method solves this and squashes to a 1D array for evaluation
    predictions = pred.flatten()

    #be sure to import our cmatrix module
    cmat = cm.cmatrix(test_class_var, predictions)
    
    model.reset_states()
    return cmat




In [3]:
def logRegression(train_attributes,train_class_var,test_attributes,
                  test_class_var):
    
    #Below is the list of hyper-parameters which produced
    #the best results
    logreg = LogisticRegression(penalty='l2', dual=False, tol=0.0001, 
                C=1.0, fit_intercept=True, solver='liblinear')
    logreg.fit(train_attributes, train_class_var)
    predictions = logreg.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [4]:
def randomForest(train_attributes,train_class_var,test_attributes,test_class_var):
    
    random_forest = RandomForestClassifier(n_estimators=20)
    random_forest.fit(train_attributes, train_class_var)
    predictions = random_forest.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [5]:
def svc(train_attributes,train_class_var,test_attributes,test_class_var):
    
    svc = SVC()
    svc.fit(train_attributes, train_class_var)
    predictions = svc.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat
    

In [6]:
def perceptron(train_attributes,train_class_var,test_attributes,test_class_var):
    
    perceptron = Perceptron()
    perceptron.fit(train_attributes, train_class_var)
    predictions = perceptron.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [7]:
sequentialNN(train_attributes,train_class_var,test_attributes,test_class_var)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                256       
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               16640     
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 25,153
Trainable params: 25,153
Non-trainable params: 0
_________________________________________________________________
accura

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,15,8
1,12,12


In [8]:
logRegression(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.6808510638297872 : How many did we get correct?
precision = 0.6285714285714286 : When we predict an increase, how often are we correct?
recall = 0.9166666666666666 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10,13
1,2,22


In [9]:
perceptron(train_attributes,train_class_var,test_attributes,test_class_var)

The model only predicted 1s
accuracy/precision = 0.5106382978723404


Predicted,1
Actual,Unnamed: 1_level_1
0,23
1,24


In [10]:
randomForest(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.425531914893617 : How many did we get correct?
precision = 0.42857142857142855 : When we predict an increase, how often are we correct?
recall = 0.375 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11,12
1,15,9


In [11]:
svc(train_attributes,train_class_var,test_attributes,test_class_var)

The model only predicted 1s
accuracy/precision = 0.5106382978723404


Predicted,1
Actual,Unnamed: 1_level_1
0,23
1,24
