# Create indicators and find signals
This code is designed to calculate the below indicators:
- EMA
- MACD
- MACD mins and maxs
- MACD positive turns
- Support price
- Target price
- Prev week volume Vs 8 week average volume

We will also bring in if the target price was hit within 8 weeks for ML training and validation.

Each share has it's own trading pattern so we will train a different model for each share but use the same set of features, just the hyper paramenters will be tuned differenty.

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import re

In [2]:
#Import the ftse list
path = "C:\\Users\\Robert\\Documents\\Python Scripts\\Stock trading - ML modelling\\Historical prices\\"
tick_ftse = pd.read_csv(path + "tick_ftse.csv")
tick_ftse = tick_ftse.iloc[:,1:]
tick_ftse.head()

Unnamed: 0,Company,Ticker
0,3i,III
1,Admiral Group,ADM
2,Anglo American plc,A
3,Antofagasta,ANTO
4,Ashtead Group,AHT


In [3]:
#Import and combine prices files
# df_prices_w = pd.read_csv(path + tick_ftse["Ticker"][1] + "_hist_prices_w.csv")
df_prices_w = pd.read_hdf(path + "all_hist_prices_w.h5")
#Drop unwanted columns
df_prices_w.drop(columns=["Unnamed: 0","Index"],inplace=True)
#Reformat columns where neccessary
df_prices_w["Date"] = df_prices_w["Date"].astype("datetime64")
print(df_prices_w.shape)
print(df_prices_w.dtypes)
df_prices_w.head(20)

(148627, 11)
Ticker            object
Date      datetime64[ns]
Open             float64
Close            float64
High             float64
Low              float64
Change           float64
Volume             int64
EMA12            float64
EMA26            float64
MACD             float64
dtype: object


Unnamed: 0,Ticker,Date,Open,Close,High,Low,Change,Volume,EMA12,EMA26,MACD
0,3IN,2009-03-16,0.81,0.81,0.84,0.8,0.0,2684963,,,
1,3IN,2009-03-23,0.81,0.83,0.85,0.8,0.02,6945456,,,
2,3IN,2009-03-30,0.83,0.83,1.53,0.83,0.0,3146138,,,
3,3IN,2009-04-06,0.85,0.81,0.85,0.81,-0.04,2240235,,,
4,3IN,2009-04-14,0.82,0.85,0.86,0.8,0.03,5015531,,,
5,3IN,2009-04-20,0.85,0.85,0.85,0.81,0.0,2271860,,,
6,3IN,2009-04-27,0.86,0.93,1.53,0.85,0.07,3138986,,,
7,3IN,2009-05-05,0.93,0.96,0.97,0.93,0.03,4137215,,,
8,3IN,2009-05-11,0.98,1.0,1.02,0.94,0.02,4485790,,,
9,3IN,2009-05-18,1.0,0.99,1.02,0.99,-0.01,3461152,,,


In [4]:
#Get in-row price change
def calc_changes(df_in,var_col,prev_col):
    df_in["change"] = df_in[var_col] - df_in[prev_col]
    df_in["per_change"] = df_in["change"] / df_in[prev_col]
    return (df_in["change"],df_in["per_change"])
    
df_prices_w["change_price"],df_prices_w["per_change_price"] = calc_changes(df_prices_w[["Close","Open"]].copy(),"Close","Open")
df_prices_w.drop(columns=["Change"],inplace=True)
df_prices_w.head(20)

Unnamed: 0,Ticker,Date,Open,Close,High,Low,Volume,EMA12,EMA26,MACD,change_price,per_change_price
0,3IN,2009-03-16,0.81,0.81,0.84,0.8,2684963,,,,0.0,0.0
1,3IN,2009-03-23,0.81,0.83,0.85,0.8,6945456,,,,0.02,0.024691
2,3IN,2009-03-30,0.83,0.83,1.53,0.83,3146138,,,,0.0,0.0
3,3IN,2009-04-06,0.85,0.81,0.85,0.81,2240235,,,,-0.04,-0.047059
4,3IN,2009-04-14,0.82,0.85,0.86,0.8,5015531,,,,0.03,0.036585
5,3IN,2009-04-20,0.85,0.85,0.85,0.81,2271860,,,,0.0,0.0
6,3IN,2009-04-27,0.86,0.93,1.53,0.85,3138986,,,,0.07,0.081395
7,3IN,2009-05-05,0.93,0.96,0.97,0.93,4137215,,,,0.03,0.032258
8,3IN,2009-05-11,0.98,1.0,1.02,0.94,4485790,,,,0.02,0.020408
9,3IN,2009-05-18,1.0,0.99,1.02,0.99,3461152,,,,-0.01,-0.01


# Create additional features

In [None]:
#Create a single function to run each stock through feature creation
def create_features(df_in):
    df_out = df_in.copy()
    
    return
#Then loop the tickers and combine these into one large dataset
#Export so as not to have to do this all again

In [5]:
#Calc vol as proportion of previous n-rows
def calc_prop_of_prev(s_in,periods = 4):
    s_cum = s_in.copy()
    for i in range(1,periods):
        s_cum += s_in.shift(i)
    return s_in / s_cum
df_prices_w["prop_vol"] = calc_prop_of_prev(df_prices_w["Volume"].copy().astype("float"),6)
df_prices_w.head(20)

Unnamed: 0,Ticker,Date,Open,Close,High,Low,Volume,EMA12,EMA26,MACD,change_price,per_change_price,prop_vol
0,3IN,2009-03-16,0.81,0.81,0.84,0.8,2684963,,,,0.0,0.0,
1,3IN,2009-03-23,0.81,0.83,0.85,0.8,6945456,,,,0.02,0.024691,
2,3IN,2009-03-30,0.83,0.83,1.53,0.83,3146138,,,,0.0,0.0,
3,3IN,2009-04-06,0.85,0.81,0.85,0.81,2240235,,,,-0.04,-0.047059,
4,3IN,2009-04-14,0.82,0.85,0.86,0.8,5015531,,,,0.03,0.036585,
5,3IN,2009-04-20,0.85,0.85,0.85,0.81,2271860,,,,0.0,0.0,0.101858
6,3IN,2009-04-27,0.86,0.93,1.53,0.85,3138986,,,,0.07,0.081395,0.137928
7,3IN,2009-05-05,0.93,0.96,0.97,0.93,4137215,,,,0.03,0.032258,0.20738
8,3IN,2009-05-11,0.98,1.0,1.02,0.94,4485790,,,,0.02,0.020408,0.210703
9,3IN,2009-05-18,1.0,0.99,1.02,0.99,3461152,,,,-0.01,-0.01,0.153757


In [6]:
#Get period-period changes
df_prices_w["close_shift1"] = df_prices_w["Close"].shift(1)
df_prices_w["change_close_shift1"],df_prices_w["per_change_close_shift1"] = calc_changes(df_prices_w[["Close","close_shift1"]].copy(),"Close","close_shift1")

df_prices_w["vol_shift1"] = df_prices_w["Volume"].shift(1)
df_prices_w["change_vol_shift1"],df_prices_w["per_change_vol_shift1"] = calc_changes(df_prices_w[["Volume","vol_shift1"]].copy(),"Volume","vol_shift1")

df_prices_w["MACD_shift1"] = df_prices_w["MACD"].shift(1)
df_prices_w["change_MACD_shift1"],df_prices_w["per_change_MACD_shift1"] = calc_changes(df_prices_w[["MACD","vol_shift1"]].copy(),"MACD","vol_shift1")

df_prices_w["EMA26_shift1"] = df_prices_w["EMA26"].shift(1)
df_prices_w["change_EMA26_shift1"],df_prices_w["per_change_EMA26_shift1"] = calc_changes(df_prices_w[["EMA26","EMA26_shift1"]].copy(),"EMA26","EMA26_shift1")

df_prices_w.head(20)

Unnamed: 0,Ticker,Date,Open,Close,High,Low,Volume,EMA12,EMA26,MACD,...,per_change_close_shift1,vol_shift1,change_vol_shift1,per_change_vol_shift1,MACD_shift1,change_MACD_shift1,per_change_MACD_shift1,EMA26_shift1,change_EMA26_shift1,per_change_EMA26_shift1
0,3IN,2009-03-16,0.81,0.81,0.84,0.8,2684963,,,,...,,,,,,,,,,
1,3IN,2009-03-23,0.81,0.83,0.85,0.8,6945456,,,,...,0.024691,2684963.0,4260493.0,1.586798,,,,,,
2,3IN,2009-03-30,0.83,0.83,1.53,0.83,3146138,,,,...,0.0,6945456.0,-3799318.0,-0.547022,,,,,,
3,3IN,2009-04-06,0.85,0.81,0.85,0.81,2240235,,,,...,-0.024096,3146138.0,-905903.0,-0.287941,,,,,,
4,3IN,2009-04-14,0.82,0.85,0.86,0.8,5015531,,,,...,0.049383,2240235.0,2775296.0,1.238841,,,,,,
5,3IN,2009-04-20,0.85,0.85,0.85,0.81,2271860,,,,...,0.0,5015531.0,-2743671.0,-0.547035,,,,,,
6,3IN,2009-04-27,0.86,0.93,1.53,0.85,3138986,,,,...,0.094118,2271860.0,867126.0,0.381681,,,,,,
7,3IN,2009-05-05,0.93,0.96,0.97,0.93,4137215,,,,...,0.032258,3138986.0,998229.0,0.31801,,,,,,
8,3IN,2009-05-11,0.98,1.0,1.02,0.94,4485790,,,,...,0.041667,4137215.0,348575.0,0.084254,,,,,,
9,3IN,2009-05-18,1.0,0.99,1.02,0.99,3461152,,,,...,-0.01,4485790.0,-1024638.0,-0.228419,,,,,,


In [7]:
#Mark minimums and maximums
def flag_mins(s_in,period):
    s_out = 0
    for i in range(1,period):
        s_out += (s_in > s_in.shift(i)) | (s_in.shift(i).isnull())
        s_out += (s_in > s_in.shift(-i)) | (s_in.shift(-i).isnull())
        s_out += s_in.isnull()
    s_out = s_out == 0
    return s_out
def flag_maxs(s_in,period):
    s_out = 0
    for i in range(1,period):
        s_out += s_in < s_in.shift(i)
        s_out += s_in < s_in.shift(-i)
        s_out += s_in.isnull()
    s_out = s_out == 0
    return s_out

In [8]:
#MACD max mins
view_range = 6 #This variable decides how far forward or back to look
df_prices_w["MACD_min"] = flag_mins(df_prices_w["MACD"],view_range)
df_prices_w["MACD_max"] = flag_maxs(df_prices_w["MACD"],view_range)
#Price max mins
view_range = 6 #This variable decides how far forward or back to look
df_prices_w["close_min"] = flag_mins(df_prices_w["Close"],view_range)
df_prices_w["close_max"] = flag_maxs(df_prices_w["Close"],view_range)
#EMA26 max mins
view_range = 6 #This variable decides how far forward or back to look
df_prices_w["EMA26_min"] = flag_mins(df_prices_w["EMA26"],view_range)
df_prices_w["EMA26_max"] = flag_maxs(df_prices_w["EMA26"],view_range)
df_prices_w.head(50)

Unnamed: 0,Ticker,Date,Open,Close,High,Low,Volume,EMA12,EMA26,MACD,...,per_change_MACD_shift1,EMA26_shift1,change_EMA26_shift1,per_change_EMA26_shift1,MACD_min,MACD_max,close_min,close_max,EMA26_min,EMA26_max
0,3IN,2009-03-16,0.81,0.81,0.84,0.8,2684963,,,,...,,,,,False,False,False,False,False,False
1,3IN,2009-03-23,0.81,0.83,0.85,0.8,6945456,,,,...,,,,,False,False,False,False,False,False
2,3IN,2009-03-30,0.83,0.83,1.53,0.83,3146138,,,,...,,,,,False,False,False,False,False,False
3,3IN,2009-04-06,0.85,0.81,0.85,0.81,2240235,,,,...,,,,,False,False,False,False,False,False
4,3IN,2009-04-14,0.82,0.85,0.86,0.8,5015531,,,,...,,,,,False,False,False,False,False,False
5,3IN,2009-04-20,0.85,0.85,0.85,0.81,2271860,,,,...,,,,,False,False,False,False,False,False
6,3IN,2009-04-27,0.86,0.93,1.53,0.85,3138986,,,,...,,,,,False,False,False,False,False,False
7,3IN,2009-05-05,0.93,0.96,0.97,0.93,4137215,,,,...,,,,,False,False,False,False,False,False
8,3IN,2009-05-11,0.98,1.0,1.02,0.94,4485790,,,,...,,,,,False,False,False,False,False,False
9,3IN,2009-05-18,1.0,0.99,1.02,0.99,3461152,,,,...,,,,,False,False,False,False,False,False


In [9]:
#Function to find last max and mins
def prev_mark_date(df_in,bool_col,this_i):
    return df_in.iloc[:this_i].loc[df_in[bool_col] == True,"Date"].max()
def prev_value(df_in,var_col,bool_col):
    df_in["prev_marker_date"] = [prev_mark_date(df_in,bool_col,x) for x in df_in.index]
    df_in = pd.merge(df_in.drop(columns=[var_col]),df_in[[var_col,"Date"]],left_on="prev_marker_date",right_on="Date",how="left")
    return (df_in[var_col],df_in["prev_marker_date"])
    
df_prices_w["prev_max_close"],df_prices_w["prev_max_close_date"] = prev_value(df_prices_w[["Date","Close","close_max"]].copy(),"Close","close_max")
df_prices_w["prev_min_close"],df_prices_w["prev_min_close_date"] = prev_value(df_prices_w[["Date","Close","close_min"]].copy(),"Close","close_min")
df_prices_w["prev_max_MACD"],df_prices_w["prev_max_MACD_date"] = prev_value(df_prices_w[["Date","MACD","MACD_max"]].copy(),"MACD","MACD_max")
df_prices_w["prev_min_MACD"],df_prices_w["prev_min_MACD_date"] = prev_value(df_prices_w[["Date","MACD","MACD_min"]].copy(),"MACD","MACD_min")
df_prices_w["prev_max_EMA26"],df_prices_w["prev_max_EMA26_date"] = prev_value(df_prices_w[["Date","EMA26","EMA26_max"]].copy(),"EMA26","EMA26_max")
df_prices_w["prev_min_EMA26"],df_prices_w["prev_min_EMA26_date"] = prev_value(df_prices_w[["Date","EMA26","EMA26_min"]].copy(),"EMA26","EMA26_min")
df_prices_w.head(200)

ValueError: cannot reindex from a duplicate axis

In [None]:
#Drop the columns which are no longer needed
df_prices_w.drop(columns=["MACD_min","MACD_max","close_min","close_max","EMA26_min","EMA26_max"],inplace=True)

In [None]:
#Calc the value changes and percentage changes of these movements
df_prices_w["max_change_close"],df_prices_w["max_per_change_close"] = calc_changes(df_prices_w[["Close","prev_max_close"]].copy(),"Close","prev_max_close")
df_prices_w["min_change_close"],df_prices_w["min_per_change_close"] = calc_changes(df_prices_w[["Close","prev_min_close"]].copy(),"Close","prev_min_close")
df_prices_w["max_change_MACD"],df_prices_w["max_per_change_MACD"] = calc_changes(df_prices_w[["MACD","prev_max_MACD"]].copy(),"MACD","prev_max_MACD")
df_prices_w["min_change_MACD"],df_prices_w["min_per_change_MACD"] = calc_changes(df_prices_w[["MACD","prev_min_MACD"]].copy(),"MACD","prev_min_MACD")
df_prices_w["max_change_EMA26"],df_prices_w["max_per_change_EMA26"] = calc_changes(df_prices_w[["EMA26","prev_max_EMA26"]].copy(),"EMA26","prev_max_EMA26")
df_prices_w["min_change_EMA26"],df_prices_w["min_per_change_EMA26"] = calc_changes(df_prices_w[["EMA26","prev_min_EMA26"]].copy(),"EMA26","prev_min_EMA26")
df_prices_w.head(200)

In [None]:
#Mark date change since max and mins
df_prices_w["prev_max_close_date_change"] = df_prices_w["Date"] - df_prices_w["prev_max_close_date"]
df_prices_w["prev_min_close_date_change"] = df_prices_w["Date"] - df_prices_w["prev_min_close_date"]

df_prices_w["prev_max_MACD_date_change"] = df_prices_w["Date"] - df_prices_w["prev_max_MACD_date"]
df_prices_w["prev_min_MACD_date_change"] = df_prices_w["Date"] - df_prices_w["prev_min_MACD_date"]

df_prices_w["prev_max_EMA26_date_change"] = df_prices_w["Date"] - df_prices_w["prev_max_EMA26_date"]
df_prices_w["prev_min_EMA26_date_change"] = df_prices_w["Date"] - df_prices_w["prev_min_EMA26_date"]

In [None]:
#Mark value change since max and mins
df_prices_w["prev_max_close_change"] = df_prices_w["Close"] - df_prices_w["prev_max_close"]
df_prices_w["prev_min_close_change"] = df_prices_w["Close"] - df_prices_w["prev_min_close"]

df_prices_w["prev_max_MACD_change"] = df_prices_w["MACD"] - df_prices_w["prev_max_MACD"]
df_prices_w["prev_min_MACD_change"] = df_prices_w["MACD"] - df_prices_w["prev_min_MACD"]

df_prices_w["prev_max_EMA26_change"] = df_prices_w["EMA26"] - df_prices_w["prev_max_EMA26"]
df_prices_w["prev_min_EMA26_change"] = df_prices_w["EMA26"] - df_prices_w["prev_min_EMA26"]

In [None]:
#Mark points of MACD positive entry
def macd_pos(s_in):
    return (s_in > s_in.shift(1)) & (s_in > 0) & (s_in.shift(1) < 0)
def macd_neg(s_in):
    return (s_in < s_in.shift(1)) & (s_in < 0) & (s_in.shift(1) > 0)
df_prices_w["MACD_pos"] = macd_pos(df_prices_w["MACD"])
df_prices_w["MACD_neg"] = macd_neg(df_prices_w["MACD"])

In [None]:
df_prices_w.dtypes

# Create the buy signal
I am classifying a stock worth buying if it meets all the below criteria:
- The target price (previous max) is hit within x (set as variable target_price_period) periods proceeding this period
- The target price is an increase of x% (set as variable min_gain) over the proceeding day's open price
- There is a drop in the closing price of less x% (set as variable max_drop) between this period and the x periods proceeding this period

In [None]:
#Define the variables
target_price_period = 8
min_gain = 0.1
max_drop = -0.05

In [None]:
#Check if the target price is hit within the target_price_period
def target_hit_check(x,var_s,target_s,periods):
    check_s = False
    for i in range(x+1,x+periods+1):
        if (i < len(target_s)) and (target_s[x] <= var_s[i]):
            check_s = True
            break
    return check_s
tmp_df = df_prices_w[["Date","Open","Close","prev_max_close"]].copy()
tmp_df["buy_target_check"] = [target_hit_check(x,tmp_df["Close"],tmp_df["prev_max_close"],target_price_period) for x in range(0,len(tmp_df))]
print("BUY_TARGET_CHECK COUNT:"+str(len(tmp_df[tmp_df["buy_target_check"]])))
tmp_df.head(50)

In [None]:
#min_gain check
tmp_df["gain"] = tmp_df["prev_max_close"] - tmp_df["Open"]
tmp_df["per_gain"] = tmp_df["gain"] / tmp_df["Open"]
tmp_df["buy_gain_check"] = tmp_df["per_gain"] >= min_gain
tmp_df.drop(columns=["gain","per_gain"],inplace=True)
print("BUY_GAIN_CHECK COUNT:"+str(len(tmp_df[tmp_df["buy_gain_check"]])))
tmp_df.head(50)

In [None]:
#Check if the target price is hit within the target_price_period
def max_drop_check(x,ref_s,var_s,periods):
    check_s = True #Start as True and then if it drops below the threshol turn it to False
    ref_val = ref_s[x]
    for i in range(x+1,x+periods+1):
        if (i < len(var_s)) and ((var_s[i] - ref_val) / ref_val <= max_drop):
            check_s = False
            break
    return check_s
tmp_df["buy_drop_check"] = [max_drop_check(x,tmp_df["Open"],tmp_df["Close"],target_price_period) for x in range(0,len(tmp_df))]
print("BUY_DROP_CHECK COUNT:"+str(len(tmp_df[tmp_df["buy_drop_check"]])))
tmp_df.head(50)

In [None]:
#Find the buy signals
tmp_df["buy"] = tmp_df["buy_target_check"] & tmp_df["buy_gain_check"] & tmp_df["buy_drop_check"]
print("BUY SIGNALS: "+str(len(tmp_df[tmp_df["buy"]])))
tmp_df.head(50)

In [None]:
#Join back into main table
df_prices_w["buy"] = tmp_df["buy"]
df_prices_w.head(20)

# Create the sell signal
I am classifying the sell signals as:
- The close price dips below the target price (previous max) x (set as variable target_price_period) periods proceeding this period
- There is an increase in the closing price of less x% (set as variable max_drop) over the next x periods

In [None]:
#Perform if the target is crossed again
def target_cross_check(x,var_s,target_s,periods):
    check_s = False
    for i in range(x+1,x+periods+1):
        if (i < len(target_s)) and (target_s[x] >= var_s[i]):
            check_s = True
            break
    return check_s
tmp_df = df_prices_w[["Date","Open","Close","prev_min_close"]].copy()
tmp_df["sell_target_check"] = [target_cross_check(x,tmp_df["Close"],tmp_df["prev_min_close"],target_price_period) for x in range(0,len(tmp_df))]
print("SELL_TARGET_CHECK COUNT:"+str(len(tmp_df[tmp_df["sell_target_check"]])))
tmp_df.head(50)

In [None]:
#Check if the target price is hit within the target_price_period
def max_drop_check(x,ref_s,var_s,periods):
    check_s = False #Start as False and then if it drops below the threshodl turn it to True
    ref_val = ref_s[x]
    for i in range(x+1,x+periods+1):
        if (i < len(var_s)) and ((var_s[i] - ref_val) / ref_val <= max_drop):
            check_s = True
            break
    return check_s
tmp_df["sell_drop_check"] = [max_drop_check(x,tmp_df["Open"],tmp_df["Close"],target_price_period) for x in range(0,len(tmp_df))]
print("SELL_DROP_CHECK COUNT:"+str(len(tmp_df[tmp_df["sell_drop_check"]])))
tmp_df.head(50)

In [None]:
#Find the sell signals
tmp_df["sell"] = tmp_df["sell_target_check"] & tmp_df["sell_drop_check"]
print("SELL SIGNALS: "+str(len(tmp_df[tmp_df["sell"]])))
tmp_df.head(50)

In [None]:
#Join back into main table
df_prices_w["sell"] = tmp_df["sell"]
df_prices_w.head(20)

# Mark as hold
Hold when not buy or sell

In [None]:
df_prices_w["hold"] = (df_prices_w["buy"] == False) & (df_prices_w["sell"] == False)
print("HOLD COUNT:",str(len(df_prices_w[df_prices_w["hold"]])))
df_prices_w.head(50)

In [None]:
#Check for undefined
df_prices_w["no_signal"] = (df_prices_w["buy"] == False) & (df_prices_w["hold"] == False) & (df_prices_w["sell"] == False)
print("NO_SIGNAL COUNT:",str(len(df_prices_w[df_prices_w["no_signal"]])))

In [None]:
#Composite all singals into one column
df_prices_w["signal"] = None
df_prices_w.loc[df_prices_w["buy"] == True,"signal"] = "buy"
df_prices_w.loc[df_prices_w["hold"] == True,"signal"] = "hold"
df_prices_w.loc[df_prices_w["sell"] == True,"signal"] = "sell"
df_prices_w.drop(columns=["buy","hold","sell"],inplace=True)
df_prices_w.head(50)

In [None]:
df_prices_w.columns

# Build the logistic regression model
This model is designed to predict if a week should be buy, hold or sell.


In [None]:
#Import the modules
from sklearn.linear_model import LogisticRegression as lr

In [None]:
feature_cols = [
    'Open'
    ,'Close'
    ,'High'
    ,'Low'
    ,'Volume'
    ,'EMA12'
    ,'EMA26'
    ,'MACD'
    ,'change_price'
    ,'per_change_price'
    ,'prop_vol'
    ,'change_close_shift1'
    ,'per_change_close_shift1'
    ,'change_vol_shift1'
    ,'per_change_vol_shift1'
    ,'change_MACD_shift1'
    ,'per_change_MACD_shift1'
    ,'change_EMA26_shift1'
    ,'per_change_EMA26_shift1'
    ,'min_change_close'
    ,'min_per_change_close'
    ,'max_change_MACD'
    ,'max_per_change_MACD'
    ,'min_change_MACD'
    ,'min_per_change_MACD'
    ,'max_change_EMA26'
    ,'max_per_change_EMA26'
    ,'min_change_EMA26'
    ,'min_per_change_EMA26'
    ,'prev_max_close_date_change'
    ,'prev_min_close_date_change'
    ,'prev_max_MACD_date_change'
    ,'prev_min_MACD_date_change'
    ,'prev_max_EMA26_date_change'
    ,'prev_min_EMA26_date_change'
    ,'prev_max_close_change'
    ,'prev_min_close_change'
    ,'prev_max_MACD_change'
    ,'prev_min_MACD_change'
    ,'prev_max_EMA26_change'
    ,'prev_min_EMA26_change'
    ,'MACD_pos'
    ,'MACD_neg']
cat_cols = [
    'signal']
df_model = df_prices_w[feature_cols+cat_cols].copy()
print("COLUMNS:",df_model.columns)
print("SHAPE:",df_model.shape)

In [None]:
#Remove rows with null values
df_model.dropna(inplace=True)
df_model.reset_index(inplace=True)
print("COLUMNS:",df_model.columns)
print("SHAPE:",df_model.shape)

In [None]:
#Shuffle the dataset
rand_index = np.random.permutation(df_model.index.values)
df_model_rand = df_model.iloc[rand_index]
df_model_rand.index.values

In [None]:
#Create the train and test dataset
ind_lim = int(np.round(len(df_model_rand.index)*0.7))
print("ind_lim:",ind_lim)

df_train = df_model_rand.iloc[:ind_lim].reset_index()
df_test = df_model_rand.iloc[ind_lim:].reset_index()

print("train len:",len(df_train))
print("test len:",len(df_test))

# Creating a multi-classification model
The model will take a one-vs-all approach (IE 1 if it is this value, 0 if it is anything else) using the variables of "buy", "hold", and "sell" individually and building a model to find the error rate on each one.

Error rate is determined by the four classifications:
- True positive - correct - model is 1, actual is 1
- True negative - correct - model is 0, actual is 0
- False positive - error - model is 1, actual is 0
- False negative - error - model is 0, actual is 1

In [None]:
#Function for building a model and outputting a dictionary of models created
def build_models(unique_classes,df_train,features):
    #Create a dictionary of models
    models = {}
    for cl in unique_classes:
        lm = lr() #create the model
        x_train = df_train[features]
        y_train = df_train["signal"] == cl
        lm.fit(x_train,y_train)
        models[cl] = lm
    return models

#Function for creating a dataframe with model probabilities and the most likely outcome
def calc_probs_df(models,unique_classes,df_test,features):
    modelled_probs = pd.DataFrame(columns=unique_classes)
    for cl in unique_classes:
        x_test = df_test[features]
        modelled_probs[cl] = models[cl].predict_proba(x_test)[:,1] 
        #Column index 1 as predict_proba outputs 2 columns,
            #the first is the probability that it is a negative result
            #the second is the probability that it is a positive result
            #that's why we want column index 1, we are looking at the positive result
    #Determine the most probable result
    modelled_probs["signal"] = modelled_probs.idxmax(axis=1)
    for cl in unique_classes:
        print(cl,"count:",str(len(modelled_probs.loc[modelled_probs["signal"] == cl,:])))
    return modelled_probs

#Function to calculate the True Positive Rate for each classifier
def calc_tpr(probs_df,features,unique_classes):
    tpr_li = []
    for cl in unique_classes:
        true_pos = ((probs_df["real_signal"] == cl) & (probs_df["signal"] == cl)).sum()
        false_neg = ((probs_df["real_signal"] != cl) & (probs_df["signal"] == cl)).sum()
        if sum([true_pos,false_neg]) != 0:
            tpr = true_pos / sum([true_pos,false_neg])
        else:
            tpr = None
        tpr_li.append({
            "features":features
            ,"feature_count":len(features)
            ,"signal":cl
            ,"true_pos":true_pos
            ,"false_neg":false_neg
            ,"tpr":tpr
        })
    tpr_df = pd.DataFrame(tpr_li,columns=["features","feature_count","signal","true_pos","false_neg","tpr"])
    return tpr_df

In [None]:
#Create a list of unique values
unique_classes = df_model_rand["signal"].unique()
print("unique_classes:",unique_classes)
#Create a list of features to train on
features = ["MACD","MACD_pos","change_MACD_shift1","per_change_MACD_shift1"]
#Build each model and add to the models dictionary
models = build_models(unique_classes,df_train,features)
models

In [None]:
#Calculate the probabilities of each
modelled_probs = calc_probs_df(models,unique_classes,df_test,features)

In [None]:
#Compare these results to the actual results
modelled_probs["real_signal"] = df_test["signal"]
modelled_probs.head()

In [None]:
#Calculate the True Positive Rate (TPR) for each of the unique classifiers
tpr_df = calc_tpr(modelled_probs,features,unique_classes)
tpr_df

# Comparing individual features
- Loop through the feaures to find the tpr of each one.
- Then eliminate features with a very low accuracy.
- Then order in descending order of combined buy,sell,hold accuracy and combine in that order to find the best feature combination.

In [None]:
tpr_df = pd.DataFrame([])
for ft in feature_cols:
    print("")
    print("TESTING:",ft)
    #Create a list of unique values
    unique_classes = df_model_rand["signal"].unique()
#     print("unique_classes:",unique_classes)
    #Build each model and add to the models dictionary
    models = build_models(unique_classes,df_train,[ft])
#     print(models)
    #Calculate the probabilities of each
    modelled_probs = calc_probs_df(models,unique_classes,df_test,[ft])
#     print(modelled_probs)
    #Compare these results to the actual results
    modelled_probs["real_signal"] = df_test["signal"]
    #Calculate the True Positive Rate (TPR) for each of the unique classifiers
    tpr_df = tpr_df.append(calc_tpr(modelled_probs,ft,unique_classes))
tpr_df