# Importing libraries

In [29]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import balanced_accuracy_score

from fastquant import backtest, get_stock_data
from fastquant.strategies import TernaryStrategy
import matplotlib.pyplot as plt


# Importing data and last cleanup

In [30]:
# Load the prepared dataset from the CSV file sitting next to the notebook.
df = pd.read_csv("cleaned_data.csv")
# NOTE(review): the rendered rows show 'Var. (%)' and 'Volume' in mixed formats
# (plain numbers in the first rows, '-1.10%' / '142,01M' strings in the last ones).
# Neither column is selected as a model input below, but confirm before reusing them.
df

Unnamed: 0,Date,Close,Var. (%),Open,Low,High,Volume,Support,Resistance,Hammer,...,MACD LINE,SIGNAL LINE,HISTOGRAM,ZeroCross,SignalCross,Decision14,Decision28,Decision10,Decision50,Decision5
0,2016-01-26,4.20,-4.76,4.23,4.12,4.31,302717532,0,0,0.0,...,0.000000,0.000000,0.000000,,,-1,-1,-1,-1,-1
1,2016-01-27,4.57,8.81,4.22,4.16,4.64,352368579,0,0,0.0,...,0.000000,0.000000,0.000000,,,-1,-1,-1,-1,-1
2,2016-01-28,4.60,0.66,4.67,4.35,5.13,618332894,0,0,0.0,...,0.000000,0.000000,0.000000,,,-1,-1,-1,-1,-1
3,2016-01-29,4.84,5.22,4.70,4.52,4.86,439826229,0,0,0.0,...,0.000000,0.000000,0.000000,,,-1,-1,-1,-1,-1
4,2016-02-01,4.72,-2.48,4.74,4.66,4.86,245882432,0,0,0.0,...,0.000000,0.000000,0.000000,,,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1037,2020-04-03,15.34,-1.10%,16.30,14.93,16.36,"142,01M",0,0,0.0,...,-2.297545,-3.066222,0.768677,0.0,1.0,-1,-1,-1,-1,-1
1038,2020-04-06,15.77,2.80%,16.05,15.18,16.10,"122,54M",0,0,0.0,...,-2.031713,-2.859320,0.827607,0.0,1.0,-1,-1,-1,-1,-1
1039,2020-04-07,16.40,3.99%,16.61,16.34,17.07,"139,72M",0,0,0.0,...,-1.750030,-2.637462,0.887432,0.0,1.0,-1,-1,-1,-1,-1
1040,2020-04-08,17.32,5.61%,16.50,16.40,17.53,"114,58M",0,0,0.0,...,-1.436005,-2.397171,0.961166,0.0,1.0,-1,-1,-1,-1,-1


In [31]:
def feature_decision_selection(df, features, decision):
    """Build a modelling frame: the chosen inputs followed by the target column.

    df = dataset
    features = list of column names used as input to the model
    decision = name of the target column

    Rows whose target equals -1 (decision missing) are left out. The returned
    frame keeps the original row labels and has the target as its last column.
    """
    has_decision = df[decision] != -1  # -1 marks rows where the decision is missing
    ordered_columns = features + [decision]
    return df.loc[has_decision, ordered_columns]

In [32]:
# Model inputs; the WilliamR indicator is deliberately left out.
features = [
    'Date', 'Close', 'Open', 'Low', 'High',
    'Support', 'Resistance', 'Hammer', 'ZeroCross', 'SignalCross',
]
# Candidate target columns; one dataset is built for each of them.
decisions = ['Decision5', 'Decision10', 'Decision14', 'Decision28', 'Decision50']
# One feature/target frame per entry of `decisions`, in the same order.
dataframes = [
    feature_decision_selection(df, features, decision)
    for decision in decisions
]

# Splitting the data

In [33]:
def data_split(dataframes, split_index=692, feature_start=5):
    """Split every dataframe into training and testing numpy arrays.

    dataframes = array of dataframe (inputs first, decision in the last column)
    split_index = positional row at which the testing set starts; rows before
                  it form the training set (default 692, the value used so far)
    feature_start = positional index of the first model-input column; the
                    default 5 skips the date column and the four OHLC columns

    Returns a list with one [x_train, x_test, y_train, y_test] entry per
    dataframe, in the same order as the input.

    NOTE(review): the split is positional, so the calendar date at which the
    testing set starts depends on the rows each dataframe contains. The original
    comments described row 692 as the 2018/2019 boundary; confirm that this
    holds for every dataframe passed in.
    """
    split_data = []
    for df in dataframes:
        train = df.iloc[:split_index]
        test = df.iloc[split_index:]
        # The last column is the target; everything from feature_start up to it is input.
        x_train = train.iloc[:, feature_start:-1].to_numpy()
        x_test = test.iloc[:, feature_start:-1].to_numpy()
        y_train = train.iloc[:, -1].to_numpy()
        y_test = test.iloc[:, -1].to_numpy()
        split_data.append([x_train, x_test, y_train, y_test])

    return split_data

In [34]:
# One [x_train, x_test, y_train, y_test] entry per frame in `dataframes`, in the same order.
model_data = data_split(dataframes)

# Getting the predictions

In [35]:
def predictors(data, model):
    """Fit the model on each training split and score it on the matching test split.

    data = list of [x_train, x_test, y_train, y_test] entries
    model = estimator exposing fit(x, y) and predict(x)

    Returns one [predictions, balanced accuracy, confusion matrix] entry per
    element of data. The same `model` object is fitted again for every entry.
    """
    outcomes = []
    for split in data:
        features_train, features_test = split[0], split[1]
        labels_train, labels_test = split[2], split[3]
        model.fit(features_train, labels_train)
        predicted = model.predict(features_test)
        score = balanced_accuracy_score(labels_test, predicted)
        matrix = confusion_matrix(labels_test, predicted)
        outcomes.append([predicted, score, matrix])

    return outcomes

In [36]:
# Fit ComplementNB on every split; each entry is [predictions, balanced accuracy, confusion matrix].
prediction_info = predictors(model_data, ComplementNB()) # index 0 of each entry holds the predictions

# Adding hold to the predictions
## repeated signals will be transformed to hold
## to be consistent with the fastquant libraries, sell = -1, hold = 0, buy = 1

In [37]:
def sell_transform(data):
    """Recode predicted 0s as -1 (sell), in place.

    data = list of entries whose index 0 holds a numpy array of predictions

    Only index 0 of each entry is replaced; nothing is returned.
    """
    for entry in data:
        signals = entry[0]
        entry[0] = np.where(signals == 0, -1, signals)

In [38]:
# Recode predicted 0s as -1 (sell) in place, so the signals follow the sell = -1 / buy = 1 convention.
sell_transform(prediction_info)

In [39]:
def add_hold(arr):
    """Turn repeated signals into hold (0).

    arr = sequence of signals (list or 1-D numpy array)

    The first signal is kept as is. Each later signal is compared with the last
    signal that was kept: if equal it becomes 0 (hold), otherwise it is kept
    and becomes the new reference. For example [1, 1, -1, -1, 1] gives
    [1, 0, -1, 0, 1].

    Returns a list with the same length as arr; an empty input gives [].
    """
    if len(arr) == 0:
        # Nothing to compare against; arr[0] below would raise IndexError.
        return []
    last_decision = arr[0]
    decision = [last_decision]
    for signal in arr[1:]:
        if signal == last_decision:
            decision.append(0)
        else:
            last_decision = signal
            decision.append(signal)
    return decision

In [40]:
def final_decision(data):
    """Apply add_hold to the predictions of every entry, in place.

    data = list of entries whose index 0 holds the sequence of signals

    Index 0 of each entry is replaced by the result of add_hold; nothing is returned.
    """
    for position in range(len(data)):
        entry = data[position]
        entry[0] = add_hold(entry[0])

In [41]:
# Replace repeated signals with 0 (hold) in place; index 0 of each entry becomes the list returned by add_hold.
final_decision(prediction_info)

# Pre processing the data according to fastquant requirements

In [69]:
def fastquant_preprocess(data, pred, split_index=692):
    """Build the frames handed to the backtest: date index, OHLC and the signal column.

    data = array of dataframe (must contain 'Date', 'Open', 'High', 'Low', 'Close')
    pred = multidimensional array of prediction, accuracy and confusion matrix;
           index 0 of each entry holds one signal per row of the testing period
    split_index = positional row at which the testing period starts; it has to
                  match the split used to produce the predictions (default 692)

    Returns one dataframe per entry of data, indexed by 'dt' (the values of
    'Date') with the columns Open, High, Low, Close and custom, where custom
    holds the signals. The input dataframes are not modified.
    """
    fq_data = []
    for i, frame in enumerate(data):
        # Work on an explicit copy: adding a column to a bare .iloc slice is the
        # pattern pandas reports with SettingWithCopyWarning.
        fq = frame.iloc[split_index:].copy()
        fq['custom'] = pred[i][0]
        fq = fq.rename(columns={'Date': 'dt'}).set_index('dt')
        # keeping only ohlc values and predictions
        fq_data.append(fq[['Open', 'High', 'Low', 'Close', 'custom']])
    return fq_data

In [70]:
# One OHLC + 'custom' signal frame per entry of `dataframes`, indexed by date, ready for the backtest.
fq_data = fastquant_preprocess(dataframes,prediction_info)

# Backtesting the strategy

In [89]:
def backtesting(fq_data, init_cash=2342):
    """Use the fastquant library to get the final portfolio value and the percentage change.

    fq_data = list of datasets, one backtest is run per dataset
    init_cash = starting cash handed to every backtest (default 2342, the
                amount used so far)

    Returns one [init_value, final_value, incr] entry per dataset, where each
    item is the array read from the backtest result table and incr is the
    percentage change between the initial and the final value.
    """
    results = []
    for frame in fq_data:
        # With return_history=True the call returns several objects; index 0 is
        # the result table holding the 'init_cash' and 'final_value' columns.
        res = backtest(
            TernaryStrategy,
            frame,
            init_cash=init_cash,
            plot=False,
            return_history=True,
            verbose=0,
        )[0]
        init_value = res['init_cash'].values
        final_value = res['final_value'].values
        incr = (final_value / init_value - 1) * 100
        results.append([init_value, final_value, incr])
    return results

In [90]:
# Each entry is [initial cash, final portfolio value, percentage change] for one frame of `fq_data`.
results = backtesting(fq_data)

In [94]:
# Show the raw backtest output; every item is a one-element numpy array.
results

[[array([2342], dtype=int64), array([3465.901925]), array([47.98898057])],
 [array([2342], dtype=int64), array([1873.655775]), array([-19.99761849])],
 [array([2342], dtype=int64), array([3240.2092]), array([38.35222886])],
 [array([2342], dtype=int64), array([3205.1788]), array([36.85648164])],
 [array([2342], dtype=int64), array([2018.421225]), array([-13.81634394])]]

# Display of the results and discussion

In [111]:
# Print, for every decision column, its confusion matrix followed by the backtest return.
for i, outcome in enumerate(results):
    print("confusion matrix for " + decisions[i])
    print(prediction_info[i][2])
    print("percentage of interest gained or lost for the decision")
    print(outcome[2][0])  # index 2 is the percentage change, stored as a one-element array
    print("")

confusion matrix for Decision5
[[132   6]
 [132  42]]
percentage of interest gained or lost for the decision
47.988980572160564

confusion matrix for Decision10
[[86 63]
 [77 81]]
percentage of interest gained or lost for the decision
-19.997618488471304

confusion matrix for Decision14
[[128  11]
 [132  32]]
percentage of interest gained or lost for the decision
38.35222886421865

confusion matrix for Decision28
[[117  12]
 [126  34]]
percentage of interest gained or lost for the decision
36.85648163962434

confusion matrix for Decision50
[[58 47]
 [69 93]]
percentage of interest gained or lost for the decision
-13.816343936806119



### As can be seen from the confusion matrices above and the returns obtained, there seems to be a relationship between how accurately the sell entries are predicted and the money made. Indeed, when the sell entries are predicted with high accuracy, the returns are 48%, 38% and 37%. When the sell accuracy is significantly lower, predicting the buy entries with a higher accuracy does not seem to improve the return, as shown by the returns of -20% and -14%.

### During this analysis, the 'hold' class was added after classification; including it beforehand may give different results and contradict the above hypothesis.

### The WilliamR indicator was dropped since it considerably lowered the accuracy. We believe it is due to the fact that it was a ternary value while the other inputs were binary values and changing often, therefore adding more noise than information

### The set of inputs was chosen after several trials and was kept because of the high return it yielded. Some sets of inputs gave higher accuracy but a lower return. With the aim of making a profit, return should be prioritized over accuracy.