# All Models on Subsets Dataset
 - We suspect that our modified dataset will yield better results than the 'raw' normalized dataset. In this notebook we will investigate this

In [1]:
#required imports and dependencies

import pandas as pd
import numpy as np 
import warnings 
warnings.filterwarnings("ignore")
from sklearn import preprocessing
from keras import Sequential
from keras.layers import Embedding, Dense, Dropout, LSTM, Flatten
from keras.metrics import *
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC



#our own confusion matrix module that we developed
import cmatrix as cm

#load data into data frames
xlsPath = r'C:\Users\New\Desktop\UniWork\ADA\ada2\Subsets.xlsx'
data = pd.read_excel(xlsPath)

#split into data and class variable
X = data.drop(['date','price increase tomorrow?'], axis =1)



#perform normalization on data 
min_max_scaler = preprocessing.MinMaxScaler()
for column in X:
    X[column] = min_max_scaler.fit_transform(X[column].values.reshape(-1,1))
class_var = data['price increase tomorrow?']

#select dimensions to reduce to 
inputDims = 4
attributes = SelectKBest(chi2, k=inputDims).fit_transform(X,class_var)

#Create a train test split
split_number = 1406
train_attributes,train_class_var,test_attributes,test_class_var = \
attributes[:split_number],class_var[:split_number], \
attributes[split_number:],class_var[split_number:]



Using TensorFlow backend.


In [2]:
def sequentialNN(train_attributes,train_class_var,test_attributes,test_class_var):
    
    model = Sequential()
    # The imput dim here is the number of cols in the df getting fed into the model
    model.add(Dense(256, input_dim=inputDims, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])


    model.summary()

    model.fit(train_attributes, train_class_var,
             epochs=50, verbose=0, batch_size=128)

    #the predict_classes method returns the binary variable we're looking for
    # for some reason it doesn't output the correct array type, made a 2D array
    pred = model.predict_classes(test_attributes)

    #the flatten method solves this and squashes to a 1D array for evaluation
    predictions = pred.flatten()

    #be sure to import our cmatrix module
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat




In [3]:
def logRegression(train_attributes,train_class_var,test_attributes,test_class_var):
    
    logreg = LogisticRegression()
    logreg.fit(train_attributes, train_class_var)
    predictions = logreg.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [4]:
def randomForest(train_attributes,train_class_var,test_attributes,test_class_var):
    
    random_forest = RandomForestClassifier(n_estimators=7)
    random_forest.fit(train_attributes, train_class_var)
    predictions = random_forest.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [5]:
def svc(train_attributes,train_class_var,test_attributes,test_class_var):
    
    svc = SVC()
    svc.fit(train_attributes, train_class_var)
    predictions = svc.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat
    

In [6]:
def perceptron(train_attributes,train_class_var,test_attributes,test_class_var):
    
    perceptron = Perceptron()
    perceptron.fit(train_attributes, train_class_var)
    predictions = perceptron.predict(test_attributes)
    
    cmat = cm.cmatrix(test_class_var, predictions)
    return cmat

In [10]:
sequentialNN(train_attributes,train_class_var,test_attributes,test_class_var)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 256)               1280      
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_14 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_8 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 32)                8224      
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 33        
Total params: 75,329
Trainable params: 75,329
Non-trainable params: 0
_________________________________________________________________
accura

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,12,60
1,25,59


In [11]:
logRegression(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.5 : How many did we get correct?
precision = 0.529411764706 : When we predict an increase, how often are we correct?
recall = 0.642857142857 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,24,48
1,30,54


In [12]:
perceptron(train_attributes,train_class_var,test_attributes,test_class_var)

The model only predicted 1s
accuracy/precision = 0.538461538462


Predicted,1
Actual,Unnamed: 1_level_1
0,72
1,84


In [13]:
randomForest(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.487179487179 : How many did we get correct?
precision = 0.52380952381 : When we predict an increase, how often are we correct?
recall = 0.52380952381 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,32,40
1,40,44


In [14]:
svc(train_attributes,train_class_var,test_attributes,test_class_var)

accuracy = 0.480769230769 : How many did we get correct?
precision = 0.515789473684 : When we predict an increase, how often are we correct?
recall = 0.583333333333 : How many of the increases did we "detect"?


Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,26,46
1,35,49
