# All Models on Subsets Dataset
 - We suspect that our modified dataset will yield better results than the 'raw' normalized dataset. In this notebook we will investigate this

In [2]:
#required imports and dependencies

import pandas as pd
import numpy as np 
import warnings 
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from keras import Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM, Flatten
from keras.metrics import *
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



#our own confusion matrix module that we developed
import cmatrix as cm

#load data into data frames
xlsPath = r'C:\Users\New\Desktop\UniWork\ADA\ada2\Subsets.xlsx'
data = pd.read_excel(xlsPath)

#split into data and class variable
X = data.drop(['date','price increase tomorrow?'], axis =1)
class_var = data['price increase tomorrow?']


#perform normalization on data 
min_max_scaler = preprocessing.MinMaxScaler()
for column in X:
    X[column] = min_max_scaler.fit_transform(X[column].values.reshape(-1,1))


#select dimensions to reduce to 
inputDims = 5
attributes = SelectKBest(chi2, k=inputDims).fit_transform(X,class_var)

#Create a train test split
split_number = round(len(data)*0.96)
train_attributes,train_class_var,test_attributes,test_class_var = \
attributes[:split_number],class_var[:split_number], \
attributes[split_number:],class_var[split_number:]



Using TensorFlow backend.


In [3]:
def sequentialNN(train_attributes,train_class_var,test_attributes,test_class_var):
    
    model = Sequential()
    # The imput dim here is the number of cols in the df getting fed into the model
    model.add(Dense(64, input_dim=inputDims, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])


    model.summary()

    model.fit(train_attributes, train_class_var,
             epochs=250, verbose=0, batch_size=128)

    #the predict_classes method returns the binary variable we're looking for
    # for some reason it doesn't output the correct array type, made a 2D array
    pred = model.predict_classes(test_attributes)

    #the flatten method solves this and squashes to a 1D array for evaluation
    predictions = pred.flatten()

    #be sure to import our cmatrix module
    cmat = cm.cmatrix(test_class_var, predictions)
    
    model.reset_states()
    return cmat




In [11]:
def logRegression(train_attributes,train_class_var,test_attributes,
                  test_class_var):
    
    #we found the the best hyper parameter settings are solver='liblinear', 
    #and intercept_scaling=0.95
    logreg = LogisticRegression(solver='liblinear', intercept_scaling=0.95)
    logreg.fit(train_attributes, train_class_var)
    predictions = logreg.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [63]:
def randomForest(train_attributes,train_class_var,test_attributes,test_class_var):
    
    random_forest = RandomForestClassifier(n_estimators=20)
    random_forest.fit(train_attributes, train_class_var)
    predictions = random_forest.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [13]:
def svc(train_attributes,train_class_var,test_attributes,test_class_var):
    
    svc = SVC()
    svc.fit(train_attributes, train_class_var)
    predictions = svc.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat
    

In [17]:
def perceptron(train_attributes,train_class_var,test_attributes,test_class_var):
    
    perceptron = Perceptron()
    perceptron.fit(train_attributes, train_class_var)
    predictions = perceptron.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [5]:
sequentialNN(train_attributes,train_class_var,test_attributes,test_class_var)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 64)                384       
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)               16640     
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 33        
Total params: 25,281
Trainable params: 25,281
Non-trainable params: 0
_________________________________________________________________
accura

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10,19
1,12,21


In [35]:
logRegression(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.617021276596 : How many did we get correct?
precision = 0.59375 : When we predict an increase, how often are we correct?
recall = 0.791666666667 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10,13
1,5,19


In [18]:
perceptron(train_attributes,train_class_var,test_attributes,test_class_var)

The model only predicted 1s
accuracy/precision = 0.510638297872


Predicted,1
Actual,Unnamed: 1_level_1
0,23
1,24


In [80]:
randomForest(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.532258064516 : How many did we get correct?
precision = 0.576923076923 : When we predict an increase, how often are we correct?
recall = 0.454545454545 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,18,11
1,18,15


In [14]:
svc(train_attributes,train_class_var,test_attributes,test_class_var)

The model only predicted 1s
accuracy/precision = 0.510638297872


Predicted,1
Actual,Unnamed: 1_level_1
0,23
1,24
