### The goal of this script is to find the best combination of parameters to increase the accuracy of the model
#### To do so, we will compare different sets of inputs passed to the multinomial naive Bayes classifier from sklearn
#### https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [1]:
#Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import balanced_accuracy_score

In [2]:
#Load the dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

In [3]:
#Final cleanup: drop every row whose Decision28 value equals -1, then display the result
data = data[data['Decision28'].ne(-1)]
data

Unnamed: 0,Date,Close,Var. (%),Open,Low,High,Volume,Support,Resistance,Hammer,...,Williams%R,12EMA,26 EMA,MACD LINE,SIGNAL LINE,HISTOGRAM,ZeroCross,SignalCross,Decision14,Decision28
33,2016-03-15,6.61,-10.68,6.61,6.56,6.96,698485643,0,0,0.0,...,0,6.848171,6.000103,0.848068,0.782849,0.065219,1.0,1.0,1,1
34,2016-03-16,7.23,9.38,6.51,6.41,7.23,584312229,1,0,0.0,...,0,6.906914,6.091207,0.815708,0.789421,0.026287,1.0,1.0,1,1
35,2016-03-17,8.10,12.03,8.15,7.71,8.19,943356446,0,0,0.0,...,1,7.090466,6.240006,0.850460,0.801629,0.048831,1.0,1.0,0,1
36,2016-03-18,8.12,0.25,8.09,7.72,8.17,608847049,0,0,0.0,...,1,7.248856,6.379265,0.869591,0.815221,0.054370,1.0,1.0,0,1
37,2016-03-21,8.06,-0.74,8.08,7.93,8.27,526247185,0,0,0.0,...,1,7.373647,6.503764,0.869883,0.826154,0.043730,1.0,1.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,2020-02-21,29.14,-2.61,29.51,29.03,29.68,"1,40B",0,0,0.0,...,0,29.537766,29.461325,0.076440,-0.009671,0.086111,1.0,1.0,0,0
1010,2020-02-26,26.21,-10.05,27.04,26.03,27.39,"2,38B",0,0,0.0,...,2,29.025802,29.220486,-0.194685,-0.046674,-0.148011,0.0,0.0,0,0
1011,2020-02-27,25.30,-3.47,25.72,24.89,26.53,"2,57B",0,0,0.0,...,2,28.452601,28.930080,-0.477479,-0.132835,-0.344644,0.0,0.0,0,0
1012,2020-02-28,25.34,0.16,25.16,24.62,25.56,"2,23B",1,0,0.0,...,2,27.973740,28.664148,-0.690409,-0.244349,-0.446059,0.0,0.0,0,0


In [4]:
class Tester:
    """Score a model on every fixed-size combination of candidate input columns."""

    def __init__(self,data, inputs,output,model):
        """
        data = data used (indexed by column name, e.g. a pandas DataFrame)
        inputs = inputs we want to test (column names)
        output = desired output (column name)
        efficiency = dictionary grouping the efficiency of each set of inputs
        model = model to be used (must provide fit, score and predict)
        """
        self.data = data
        self.inputs = inputs
        self.output = output
        self.efficiency = {}
        self.model = model

    def split_data(self,data,ratio,shuffle=True,random_state= 40):
        """Split data into train/test parts and separate the last column as target.

        data = data to be split; its last column is treated as the target
        ratio = cutting point between training data and testing data
                (forwarded as test_size to train_test_split)
        shuffle = indicator on whether or not the data should be shuffled
        random_state = state that determines the shuffling algorithm

        Returns the list [X_train, Y_train, X_test, Y_test].
        """
        train, test = train_test_split(
            data, test_size=ratio, random_state=random_state, shuffle=shuffle
        )
        # Every column but the last is a feature; the last column is the target.
        X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]
        X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]
        return [X_train, Y_train, X_test, Y_test]

    def partition_inputs(self, num):
        """Return every combination of `num` inputs as a list of tuples.

        num = number of elements per partition
        """
        return list(itertools.combinations(self.inputs, num))

    def test(self,ratio, num=2,shuffle=True,random_state=40):
        """Calculate efficiency of the model for every partition of the given inputs.

        ratio = cutting point between training data and testing data
        num = number of elements per partition
        shuffle = indicator on whether or not the data should be shuffled
        random_state = state that determines the shuffling algorithm

        Returns self.efficiency, a dictionary mapping each tuple of input
        names to [model.score on the test split, balanced accuracy on the
        test split].
        """
        # Read from the data given to the constructor, not from a module-level
        # global, so the instance works on whatever data it was built with.
        # The output column is placed last because split_data relies on that.
        columns = list(self.inputs) + [self.output]
        df = self.data[columns]
        X_train, Y_train, X_test, Y_test = self.split_data(df, ratio, shuffle, random_state)

        # The targets do not depend on the selected inputs: extract them once.
        y_train = Y_train.values
        y_test = Y_test.values

        # Build the combinations a single time instead of once per key.
        self.efficiency = {}
        for key in self.partition_inputs(num):
            selected = list(key)
            # Plain arrays for both fit and predict keep the two calls
            # consistent (no mix of named and unnamed feature matrices).
            x_train = X_train[selected].values
            x_test = X_test[selected].values
            self.model.fit(x_train, y_train)
            self.efficiency[key] = [
                self.model.score(x_test, y_test),
                balanced_accuracy_score(y_test, self.model.predict(x_test)),
            ]

        return self.efficiency
    
        

In [9]:
#Build the classifier and the tester over the six candidate input columns
#NOTE: despite its name, cnb holds a MultinomialNB instance, not a ComplementNB one
cnb = MultinomialNB()
tester = Tester(
    data=data,
    inputs=["Support", "Resistance", "Hammer", "Williams%R", "ZeroCross", "SignalCross"],
    output="Decision14",
    model=cnb,
)

In [10]:
#Run the tester once per combination size, from single inputs up to all inputs together.
#The upper bound is derived from the tester's own input list so it stays correct
#if the list of candidate inputs changes. No shuffling: the split keeps row order.
maxx = [
    tester.test(0.294, num=size, shuffle=False)
    for size in range(1, len(tester.inputs) + 1)
]

### Finding max 

In [11]:
#Display the collected results: one dictionary per combination size, in the order they were tested
maxx

[{('Support',): [0.5467128027681661, 0.5],
  ('Resistance',): [0.5467128027681661, 0.5],
  ('Hammer',): [0.5467128027681661, 0.5],
  ('Williams%R',): [0.5467128027681661, 0.5],
  ('ZeroCross',): [0.5467128027681661, 0.5],
  ('SignalCross',): [0.5467128027681661, 0.5]},
 {('Support', 'Resistance'): [0.5709342560553633, 0.5306309788385352],
  ('Support', 'Hammer'): [0.5397923875432526, 0.4949753599381583],
  ('Support', 'Williams%R'): [0.5467128027681661, 0.5],
  ('Support', 'ZeroCross'): [0.5467128027681661, 0.5],
  ('Support', 'SignalCross'): [0.5467128027681661, 0.5],
  ('Resistance', 'Hammer'): [0.5467128027681661, 0.5],
  ('Resistance', 'Williams%R'): [0.5467128027681661, 0.5],
  ('Resistance', 'ZeroCross'): [0.5467128027681661, 0.5],
  ('Resistance', 'SignalCross'): [0.5467128027681661, 0.5],
  ('Hammer', 'Williams%R'): [0.5397923875432526, 0.49432312300705383],
  ('Hammer', 'ZeroCross'): [0.5467128027681661, 0.5],
  ('Hammer', 'SignalCross'): [0.5397923875432526, 0.494975359938158

In [8]:
#Print the scores of one hand-picked combination for the sizes 2, 3 and 4
#Each pair is (position in maxx, position of the combination within that dictionary)
for size_idx, combo_idx in ((1, 6), (2, 10), (3, 11)):
    scores = list(maxx[size_idx].values())
    print(scores[combo_idx])

[0.5847750865051903, 0.5386385658914729]
[0.5847750865051903, 0.5386385658914729]
[0.5882352941176471, 0.5417635658914729]
