# Create indicators and find signals
This code is designed to calculate the features for model training.

We will also bring in if the target price was hit within 8 weeks for ML training and validation.

Each share has it's own trading pattern so we will train a different model for each share but use the same set of features, just the hyper paramenters will be tuned differenty.

V5 build - Normalise vs rolling max 5 year window

In [1]:
#Import libraries
import pandas as pd
import numpy as np
import re
import tables
import os
import datetime as dt
from rf_modules import *

In [2]:
#Import the ftse list
path = r"C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices"
tick_ftse = pd.read_csv(path + r"\tick_ftse.csv")
tick_ftse = tick_ftse.iloc[:,1:]
for col in tick_ftse:
    tick_ftse.rename(columns={col:clean_col_name(col)},inplace=True)
tick_ftse.head()
tick_ftse['ticker'] = [re.sub('(?=[0-9A-Z])*\.(?=[0-9A-Z]+)','-',tick) for tick in tick_ftse['ticker']]
tick_ftse['ticker'] = [re.sub('[^0-9A-Z\-]','',tick) for tick in tick_ftse['ticker']]

In [3]:
#Import and combine prices files
df_prices_w = pd.read_hdf(path + r"\all_hist_prices_w.h5")
#Rename columns
for col in df_prices_w:
    df_prices_w.rename(columns={col:clean_col_name(col)},inplace=True)
#Drop unwanted columns
try:
    df_prices_w.drop(columns=["unnamed_0","index"],inplace=True)
except Exception as e:
    print(e)
#Reformat columns where neccessary
df_prices_w["date"] = df_prices_w["date"].astype("datetime64")
print(df_prices_w.shape)
print(df_prices_w.dtypes)
df_prices_w.head()

"['unnamed_0' 'index'] not found in axis"
(280168, 13)
ticker                 object
date           datetime64[ns]
high                  float64
low                   float64
volume                float64
open                  float64
close                 float64
change                float64
ema12                 float64
ema26                 float64
macd_line             float64
signal_line           float64
macd                  float64
dtype: object


Unnamed: 0,ticker,date,high,low,volume,open,close,change,ema12,ema26,macd_line,signal_line,macd
0,III,2007-12-31,1023.0,964.0,4511565.0,995.0,965.0,-30.0,,,,,
1,III,2008-01-07,989.0,917.5,16056554.0,967.5,924.0,-43.5,,,,,
2,III,2008-01-14,936.0,881.0,21691287.0,917.0,901.0,-16.0,,,,,
3,III,2008-01-21,965.0,847.0,17850580.0,891.0,917.5,26.5,,,,,
4,III,2008-01-28,971.0,903.0,12079245.0,911.0,961.0,50.0,,,,,


In [4]:
#Remove tickers with fewer than 34 entries as this is where the MACD can be calculated
print('START ROW COUNT -> {}'.format(df_prices_w.shape[0]))
print('START TICK COUNT -> {}'.format(tick_ftse.shape[0]))
for tick in tick_ftse.ticker:
    print(tick,' ->',df_prices_w[df_prices_w.ticker == tick].shape[0])
    if df_prices_w[df_prices_w.ticker == tick].shape[0] < 34:
        #Remove from dataframe
        print('\tTOO FEW RECORDS FOR {}'.format(tick))
        df_prices_w = df_prices_w.loc[df_prices_w.ticker != tick]
        print('\tNEW ROW COUNT -> {}'.format(df_prices_w.shape[0]))
        #Remove from tick_ftse
        tick_ftse = tick_ftse.loc[tick_ftse.ticker != tick]
        print('\tNEW TICK COUNT -> {}'.format(tick_ftse.shape[0]))
print('\nEND ROW COUNT -> {}'.format(df_prices_w.shape[0]))
print('END TICK COUNT -> {}'.format(tick_ftse.shape[0]))

START ROW COUNT -> 280168
START TICK COUNT -> 351
III  -> 623
ABF  -> 1055
ADM  -> 363
AAL  -> 1018
ANTO  -> 1055
AHT  -> 1033
AZN  -> 1055
AUTO  -> 248
AVV  -> 1055
AV  -> 2788
BA  -> 1546
BARC  -> 1055
BDEV  -> 1055
BKG  -> 1055
BHP  -> 534
BP  -> 1575
BATS  -> 1055
BLND  -> 1055
BT-A  -> 1546
BNZL  -> 1052
BRBY  -> 910
CCL  -> 999
CNA  -> 1055
CCH  -> 346
CPG  -> 1041
CRH  -> 1041
CRDA  -> 1047
DCC  -> 1037
DGE  -> 1055
EVR  -> 424
EXPN  -> 688
FERG  -> 1056
FLTR  -> 28
	TOO FEW RECORDS FOR FLTR
	NEW ROW COUNT -> 280140
	NEW TICK COUNT -> 350
FRES  -> 606
GSK  -> 1055
GLEN  -> 448
HLMA  -> 1023
HL  -> 651
HIK  -> 718
HSX  -> 1022
HSBA  -> 1266
IMB  -> 1190
INF  -> 1076
IHG  -> 854
ITRK  -> 894
IAG  -> 866
ITV  -> 1022
JD  -> 92
JMAT  -> 1247
KGF  -> 1266
LAND  -> 1274
LGEN  -> 1266
LLOY  -> 1232
LSE  -> 942
MNG  -> 8
	TOO FEW RECORDS FOR MNG
	NEW ROW COUNT -> 280132
	NEW TICK COUNT -> 349
MGGT  -> 1485
MRO  -> 822
MNDI  -> 624
MRW  -> 1266
NG  -> 1253
NXT  -> 1533
NMC  -> 383
OCDO  

In [5]:
#Function for calculating ema
def calc_ema(_s_in,_periods):
    """Function used to create EMA for a series
    
    args:
    -----
    _s_in - pandas series - series of float values
    _periods - int - value describing how far to look at for EMA calc
    
    returns:
    ------
    pandas series    
    """
    #Calc mod val
    _mod = 2/(_periods+1)
    #Calc sma
    _sma_s = [0] * _s_in.shape[0]
    for _i in range(0,_periods):
        _sma_s += _s_in.shift(_i) / _periods
    #Calc ema
    _ema_s = _sma_s.copy()
    _ema_s[(_ema_s > 0) & (np.isnan(_ema_s) == False)] = _mod*(_s_in - _ema_s.shift(1)) + _ema_s.shift(1)
    return _ema_s.copy()

In [6]:
#Function for calculating the MACD
def calc_macd(_ema_lng_s,_ema_sht_s,_sig_period:int):
    """Function used to create MACD for a series
    
    args:
    -----
    _ema_lng_s - pandas series - series of float values for the long term EMA 
    _ema_sht_s - pandas series - series of float values for the short term EMA
    _sig_period - int - value describing how far to look at for MACD calc
    
    returns:
    ------
    tuple of pandas series,pandas series,pandas series - MACD line, signal line, macd histogram  
    """
    #Make a df
    _tmp_df = pd.DataFrame([])
    _tmp_df['ema_lng'] = _ema_lng_s
    _tmp_df['ema_sht'] = _ema_sht_s
    #Calc the signal line
    _tmp_df['macd_line'] = _tmp_df['ema_sht'] - _tmp_df['ema_lng']
    _tmp_df['signal_line'] = calc_ema(_tmp_df['macd_line'],_sig_period)
    _tmp_df['macd_hist'] = _tmp_df['macd_line'] - _tmp_df['signal_line']
    return (_tmp_df['macd_line'].copy(),_tmp_df['signal_line'].copy(),_tmp_df['macd_hist'].copy())

In [7]:
#Calc the ema and macds for the data
def calc_ema_macd(_df_in):
    """Function used to call EMA and MACD functions
    
    args:
    -----
    _df_in - pandas dataframe - must have columns 'close' and 'date' 
    
    returns:
    ------
    pandas dataframe - with new columns for ema12,ema26,macd_line,signal_line,macd
    """
    _tick_df = _df_in.copy()
    try:
        #Add in the ema and macd
        _tick_df = _tick_df.sort_values(by='date')
#         _tick_df = _tick_df.reset_index(drop=True)
        _tick_df['ema12'] = calc_ema(_tick_df['close'],12)
        _tick_df['ema26'] = calc_ema(_tick_df['close'],26)
        _tick_df['macd_line'],_tick_df['signal_line'],_tick_df['macd'] = calc_macd(_tick_df['ema26'],_tick_df['ema12'],9)
        return _tick_df
    except Exception as e:
        print('ERROR:{}'.format(e))
        return _tick_df

In [8]:
#Create a function which normalises a feature based only on the values which have come before it - avoids time series bias
def norm_time_s(_ind,_s_in,_window):
    """Function used to call EMA and MACD functions
    
    args:
    -----
    _ind - int - the index of this value in the series
    _s_in - pandas series - a series of values to be normalised
    _window - int - the number of values to look over
    
    returns:
    ------
    float - normalised value in the window period
    """
    _this_ind = _ind - _s_in.index.min()
    if _this_ind < _window:
        _st_ind = 0
    else:
        _st_ind = _this_ind - _window
    _min = np.nanmin(_s_in[_st_ind:_this_ind+1].values)
    _max = np.nanmax(_s_in[_st_ind:_this_ind+1].values)
    _norm_val = (_s_in[_ind] - _min) / (_max - _min)
    return _norm_val

In [9]:
#Run the functions
def norm_prices(_df_in):
    """Function used to normalise all prices in the dataframe
    
    args:
    -----
    _df_in - pandas dataframe - must contain values 'open','close','high','low','volume'
    
    returns:
    ------
    pandas dataframe - with normalised values for 'open','close','high','low','volume'
    """
    _df_out = _df_in.copy()
    #Normalise the columns which need it
    _norm_cols = [
        #Standard features
        "open"
        ,"close"
        ,"high"
        ,"low"
        ,"volume"
    ]
    #Reset the index
    _df_out.sort_values(['date'],ascending=True,inplace=True)
    #Calc the 5yr window
    _5yr = 5*52
    #Normalise
    for _col in _norm_cols:
        _df_out["{}_orig".format(_col)] = _df_out[_col].copy() #Take a copy so as the values are changed this does not affect following calculations
        _df_out[_col] = [norm_time_s(_x,_df_out["{}_orig".format(_col)],_5yr) for _x in _df_out.index]
    return _df_out

In [10]:
#Set records into the correct order
df_prices_w = df_prices_w.sort_values(['ticker','date'],ascending=[True,True])
df_prices_w.reset_index(inplace=True,drop=True)

In [11]:
# Normalize the prices by ticker and time then create emas and macds for each ticker
print('NORALISING AND CALCULATING EMA & MACD VALUES')
error_li = []
run_time = process_time()
run_time.lap()
new_df = pd.DataFrame([])
for tick in tick_ftse.ticker:
    print('\nRUN FOR {} - {}'.format(tick,len(run_time.lap_li)))
    try:
        this_tick_df = df_prices_w[df_prices_w.ticker == tick]
        this_tick_df = norm_prices(this_tick_df.copy())
        #Calculate the ema and macd
        this_tick_df = calc_ema_macd(this_tick_df)
        #Append back on to the dataframe
        new_df = new_df.append(this_tick_df)
        print('\tSUCCESS')
        run_time.lap()
        run_time.show_latest_lap_time()
    except Exception as e:
        print('\tERROR -> {}'.format(e))
        error_li.append(e)
df_prices_w = new_df
run_time.end()
print('\n\nCOMPLETED - ERRORS ENCOUNTERED -> {}'.format(len(error_li)))
if len(error_li) > 0:
    print(error_li)

NORALISING AND CALCULATING EMA & MACD VALUES

RUN FOR III - 1




	SUCCESS
LAP 2 TIME -> 0:0:1

RUN FOR ABF - 2
	SUCCESS
LAP 3 TIME -> 0:0:2

RUN FOR ADM - 3
	SUCCESS
LAP 4 TIME -> 0:0:0

RUN FOR AAL - 4
	SUCCESS
LAP 5 TIME -> 0:0:2

RUN FOR ANTO - 5
	SUCCESS
LAP 6 TIME -> 0:0:2

RUN FOR AHT - 6
	SUCCESS
LAP 7 TIME -> 0:0:2

RUN FOR AZN - 7
	SUCCESS
LAP 8 TIME -> 0:0:2

RUN FOR AUTO - 8
	SUCCESS
LAP 9 TIME -> 0:0:0

RUN FOR AVV - 9
	SUCCESS
LAP 10 TIME -> 0:0:2

RUN FOR AV - 10
	SUCCESS
LAP 11 TIME -> 0:0:7

RUN FOR BA - 11
	SUCCESS
LAP 12 TIME -> 0:0:3

RUN FOR BARC - 12
	SUCCESS
LAP 13 TIME -> 0:0:2

RUN FOR BDEV - 13
	SUCCESS
LAP 14 TIME -> 0:0:2

RUN FOR BKG - 14
	SUCCESS
LAP 15 TIME -> 0:0:2

RUN FOR BHP - 15
	SUCCESS
LAP 16 TIME -> 0:0:1

RUN FOR BP - 16
	SUCCESS
LAP 17 TIME -> 0:0:3

RUN FOR BATS - 17
	SUCCESS
LAP 18 TIME -> 0:0:2

RUN FOR BLND - 18
	SUCCESS
LAP 19 TIME -> 0:0:2

RUN FOR BT-A - 19
	SUCCESS
LAP 20 TIME -> 0:0:3

RUN FOR BNZL - 20
	SUCCESS
LAP 21 TIME -> 0:0:2

RUN FOR BRBY - 21
	SUCCESS
LAP 22 TIME -> 0:0:2

RUN FOR CCL - 22
	S

	SUCCESS
LAP 170 TIME -> 0:0:1

RUN FOR ERM - 170
	SUCCESS
LAP 171 TIME -> 0:0:2

RUN FOR FCIT - 171
	SUCCESS
LAP 172 TIME -> 0:0:1

RUN FOR FDM - 172
	SUCCESS
LAP 173 TIME -> 0:0:0

RUN FOR FXPO - 173
	SUCCESS
LAP 174 TIME -> 0:0:1

RUN FOR FEV - 174
	SUCCESS
LAP 175 TIME -> 0:0:1

RUN FOR FSV - 175
	SUCCESS
LAP 176 TIME -> 0:0:1

RUN FOR FCSS - 176
	SUCCESS
LAP 177 TIME -> 0:0:1

RUN FOR FGT - 177
	SUCCESS
LAP 178 TIME -> 0:0:1

RUN FOR FGP - 178
	SUCCESS
LAP 179 TIME -> 0:0:2

RUN FOR FSJ - 179
	SUCCESS
LAP 180 TIME -> 0:0:2

RUN FOR FSFL - 180
	SUCCESS
LAP 181 TIME -> 0:0:0

RUN FOR FUTR - 181
	SUCCESS
LAP 182 TIME -> 0:0:2

RUN FOR GFS - 182
	SUCCESS
LAP 183 TIME -> 0:0:1

RUN FOR GFRD - 183
	SUCCESS
LAP 184 TIME -> 0:0:3

RUN FOR GAW - 184
	SUCCESS
LAP 185 TIME -> 0:0:3

RUN FOR GCP - 185
	SUCCESS
LAP 186 TIME -> 0:0:1

RUN FOR DIGS - 186
	SUCCESS
LAP 187 TIME -> 0:0:0

RUN FOR GSS - 187
	SUCCESS
LAP 188 TIME -> 0:0:1

RUN FOR GNS - 188
	SUCCESS
LAP 189 TIME -> 0:0:2

RUN FOR GOG

	SUCCESS
LAP 333 TIME -> 0:0:3

RUN FOR JDW - 333
	SUCCESS
LAP 334 TIME -> 0:0:3

RUN FOR SMWH - 334
	SUCCESS
LAP 335 TIME -> 0:0:1

RUN FOR WMH - 335
	SUCCESS
LAP 336 TIME -> 0:0:2

RUN FOR WTAN - 336
	SUCCESS
LAP 337 TIME -> 0:0:1

RUN FOR WIZZ - 337
	SUCCESS
LAP 338 TIME -> 0:0:0

RUN FOR WG - 338
	SUCCESS
LAP 339 TIME -> 0:0:2

RUN FOR WKP - 339
	SUCCESS
LAP 340 TIME -> 0:0:2

RUN FOR WWH - 340
	SUCCESS
LAP 341 TIME -> 0:0:1
TOTAL ELAPSED TIME -> 0:11:19


COMPLETED - ERRORS ENCOUNTERED -> 0


In [12]:
#Get in-row price change
def calc_changes(_s_in,_prev_s_in):
    """Function used to calculate the change between two values, absolute and percentage
    
    args:
    -----
    _s_in - pandas series - the current value to be compared
    _prev_s_in - pandas series - the base value to compare the values to
    
    returns:
    ------
    tuple - pandas series, pandas series - absolute change, percentage change
    """
    _s_change = _s_in - _prev_s_in
    _s_per_change = _s_change / _s_in
    return (_s_change,_s_per_change)

#Relabel col names
for col in df_prices_w:
    df_prices_w.rename(columns={col:col.lower()},inplace=True)
    
df_prices_w["change_price"],df_prices_w["per_change_price"] = calc_changes(df_prices_w.close.copy(),df_prices_w.open.copy())
df_prices_w.head()

Unnamed: 0,ticker,date,high,low,volume,open,close,change,ema12,ema26,macd_line,signal_line,macd,open_orig,close_orig,high_orig,low_orig,volume_orig,change_price,per_change_price
128355,III,2007-12-31,,,,,,-30.0,,,,,,995.0,965.0,1023.0,964.0,4511565.0,,
128356,III,2008-01-07,0.0,0.0,1.0,0.0,0.0,-43.5,,,,,,967.5,924.0,989.0,917.5,16056554.0,0.0,
128357,III,2008-01-14,0.0,0.0,1.0,0.0,0.0,-16.0,,,,,,917.0,901.0,936.0,881.0,21691287.0,0.0,
128358,III,2008-01-21,0.333333,0.0,0.77644,0.0,0.257812,26.5,,,,,,891.0,917.5,965.0,847.0,17850580.0,0.257812,1.0
128359,III,2008-01-28,0.402299,0.478632,0.440501,0.192308,0.9375,50.0,,,,,,911.0,961.0,971.0,903.0,12079245.0,0.745192,0.794872


In [13]:
df_prices_w = df_prices_w.sort_values(['ticker','date'],ascending=[True,True])
df_prices_w.reset_index(inplace=True,drop=True)
print(df_prices_w.ticker.unique())
df_prices_w.head()

['3IN' '888' 'AAL' 'ABF' 'ADM' 'AGK' 'AGR' 'AHT' 'AJB' 'AML' 'ANTO' 'APAX'
 'ASCL' 'ASHM' 'ASL' 'ATST' 'AUTO' 'AV' 'AVST' 'AVV' 'AZN' 'BA' 'BAB'
 'BAG' 'BAKK' 'BARC' 'BATS' 'BBGI' 'BBOX' 'BBY' 'BDEV' 'BEZ' 'BGEO' 'BGFD'
 'BGSC' 'BHP' 'BKG' 'BLND' 'BME' 'BNKR' 'BNZL' 'BOY' 'BP' 'BRBY' 'BRSC'
 'BRW' 'BRWM' 'BT-A' 'BVIC' 'BVS' 'BWY' 'BYG' 'CAPC' 'CARD' 'CBG' 'CCC'
 'CCH' 'CCL' 'CEY' 'CINE' 'CKN' 'CLDN' 'CLI' 'CNA' 'CNE' 'COA' 'COB' 'CPG'
 'CPI' 'CRDA' 'CRH' 'CRST' 'CSP' 'CTEC' 'CTY' 'CWK' 'DC' 'DCC' 'DGE'
 'DIGS' 'DJAN' 'DLG' 'DLN' 'DNLM' 'DOM' 'DPH' 'DPLM' 'DRX' 'ECM' 'EDIN'
 'EIG' 'ELM' 'EMG' 'ENOG' 'EQN' 'ERM' 'ESNT' 'ETO' 'EVR' 'EXPN' 'EZJ'
 'FCIT' 'FCSS' 'FDM' 'FERG' 'FEV' 'FGP' 'FGT' 'FOUR' 'FRES' 'FSFL' 'FSJ'
 'FSV' 'FUTR' 'FXPO' 'GAW' 'GCP' 'GFRD' 'GFS' 'GFTU' 'GLEN' 'GLO' 'GNC'
 'GNS' 'GOG' 'GPOR' 'GRG' 'GRI' 'GSK' 'GSS' 'GVC' 'HAS' 'HFG' 'HGT' 'HICL'
 'HIK' 'HILS' 'HL' 'HLMA' 'HMSO' 'HOC' 'HRI' 'HSBA' 'HSL' 'HSTG' 'HSV'
 'HSX' 'HTG' 'HVPE' 'HWDN' 'IAG' 'IBST' 'ICGT' 'ICP' 'IGG' 

Unnamed: 0,ticker,date,high,low,volume,open,close,change,ema12,ema26,macd_line,signal_line,macd,open_orig,close_orig,high_orig,low_orig,volume_orig,change_price,per_change_price
0,3IN,2007-12-31,,,,,,-1.75,,,,,,149.18,147.43,149.89,147.07,1373801.0,,
1,3IN,2008-01-07,0.0,,1.0,0.0,1.0,1.76,,,,,,147.07,148.83,149.54,147.07,2345191.0,1.0,1.0
2,3IN,2008-01-14,1.0,1.0,0.799111,0.668246,1.0,0.7,,,,,,148.48,149.18,150.59,147.43,2150049.0,0.331754,0.331754
3,3IN,2008-01-21,1.0,0.0,1.0,0.336493,1.0,4.22,,,,,,147.78,152.0,154.82,145.32,3070968.0,0.663507,0.663507
4,3IN,2008-01-28,1.0,1.0,0.670041,1.0,1.0,2.11,,,,,,149.89,152.0,154.82,148.83,2510972.0,0.0,0.0


# Create buy signals, and sell signals
I am classifying a stock worth buying if it meets all the below criteria:
- The target price (previous max) is hit within x (set as variable target_price_period) periods proceeding this period
- The target price is an increase of x% (set as variable min_gain) over the proceeding day's open price
- There is a drop in the closing price of less x% (set as variable max_drop) between this period and the x periods proceeding this period

I am classifying the sell signals as:
- The close price dips below the target price (previous max) x (set as variable target_price_period) periods proceeding this period
- There is an increase in the closing price of less x% (set as variable max_drop) over the next x periods

In [14]:
#Define the variables for buy sell signals
target_price_period = 12
min_gain = 0.1
max_drop = -0.05

In [15]:
#Programming note
#df.shift(1) looks 1 period into the past
#df.shift(-1) looks 1 period into the future

In [16]:
#Check if the target price is hit within the target_price_period
def min_gain_check(_var_s,_target_s,_periods:int=12):
    """Function used to check if the value meets the min gain criteria
    
    args:
    -----
    _var_s - pandas series - value to be compared
    _target_s - pandas series - the target value to be hit
    _periods - int - time period to check for gain over
    
    returns:
    ------
    pandas series - bools
    """
    _check_s = [False] * _var_s.shape[0]
    for _i in range(1,_periods+1):
        _tmp_check_s = _var_s.shift(-_i) > _target_s #True if price is >= limit
        _check_s = _check_s | _tmp_check_s
    return _check_s

In [17]:
def max_drop_check(_var_s,_target_s,_periods:int=12):
    """Function used to check if the value meets the max drop criteria
    
    args:
    -----
    _var_s - pandas series - value to be compared
    _target_s - pandas series - the target value to be hit
    _periods - int - time period to check for gain over
    
    returns:
    ------
    pandas series - bools
    """
    _check_s = [False] * _var_s.shape[0]
    for _i in range(1,_periods+1):
        _tmp_check_s = _var_s.shift(-_i) < _target_s #True if price is <= limit
        _check_s = _check_s | _tmp_check_s
    return _check_s

In [18]:
def close_vs_close(_var_s,_shift:int=1):
    """Function used to calculate the change over a given period
    
    args:
    -----
    _var_s - pandas series - values to be compared
    _shift - int - time period to shift _var_s over for comparison
    
    returns:
    ------
    pandas series - floats
    """
    _check_s = _var_s.shift(_shift) - _var_s
    return _check_s

In [19]:
#Create a function for finding buy signals
def get_buys(var_s):
    """Function used to find if a value meets the requirements for a buy signal
    
    args:
    -----
    _var_s - pandas series - values to be checked
    
    returns:
    ------
    pandas series - bools
    """
    
    #Check if the target price is hit within the target_price_period
    target_s = var_s * (1+min_gain)
    min_gain_s = min_gain_check(var_s,target_s,target_price_period) == True #Function returns True when min_gain is hit
    print('BUY min_gain_s -> {}'.format(min_gain_s[min_gain_s == True].shape))
    
    #Check if the sell price is hit within the target_price_period
    target_s = var_s * (1+max_drop)
    max_drop_s = max_drop_check(var_s,target_s,target_price_period) == False #Function returns False when does not go below target
    print('BUY max_drop_s -> {}'.format(max_drop_s[max_drop_s == True].shape))
    
    #Check if the following day is a positive change on today's close price
    close_vs_close_pos_s = close_vs_close(var_s,-1) > 0
    print('BUY close_vs_close_pos_s -> {}'.format(close_vs_close_pos_s[max_drop_s == True].shape))
    
    #Find the buy signals
    s_out = min_gain_s & max_drop_s & close_vs_close_pos_s
    print('BUY ALL -> {}'.format(s_out[s_out == True].shape))
    
    return s_out

In [20]:
#Function for finding sell signals
def get_sells(var_s):
    """Function used to find if a value meets the requirements for a sell signal
    
    args:
    -----
    _var_s - pandas series - values to be compared
    
    returns:
    ------
    pandas series - bools
    """
    
    #Check if the target price is hit within the target_price_period
    target_s = var_s * (1+max_drop)
    max_drop_s = max_drop_check(var_s,target_s,target_price_period) == True #Function returns True when max_drop is hit
    print('SELL max_drop_s -> {}'.format(max_drop_s[max_drop_s == True].shape))
    
    #Perform if the target is crossed again
    target_s = var_s * (1+min_gain)
    min_gain_s = min_gain_check(var_s,target_s,target_price_period) == False #Function returns False when min_gain is not hit
    print('SELL min_gain_s -> {}'.format(min_gain_s[min_gain_s == True].shape))
    
    #Check if the following day is a negative change on today's close price
    close_vs_close_neg_s = close_vs_close(var_s,-1) < 0
    print('SELL close_vs_close_pos_s -> {}'.format(close_vs_close_neg_s[max_drop_s == True].shape))
    
    #Find the sell signals
    s_out = max_drop_s & min_gain_s & close_vs_close_neg_s
    print('SELL ALL -> {}'.format(s_out[s_out == True].shape))
    
    return s_out

In [21]:
#Get buy signals
df_prices_w['buy'] = get_buys(df_prices_w['close'])

#Get sell signals
df_prices_w['sell'] = get_sells(df_prices_w['close'])

#Get hold signals
df_prices_w["hold"] = (df_prices_w["buy"] == False) & (df_prices_w["sell"] == False)

print('BUY PERCENTAGE -> {:.2f}%'.format(df_prices_w[df_prices_w['buy'] == True].shape[0]*100/df_prices_w.shape[0]))
print('SELL PERCENTAGE -> {:.2f}%'.format(df_prices_w[df_prices_w['sell'] == True].shape[0]*100/df_prices_w.shape[0]))
print('HOLD PERCENTAGE -> {:.2f}%'.format(df_prices_w[df_prices_w['hold'] == True].shape[0]*100/df_prices_w.shape[0]))

BUY min_gain_s -> (144669,)
BUY max_drop_s -> (88005,)
BUY close_vs_close_pos_s -> (88005,)
BUY ALL -> (51959,)
SELL max_drop_s -> (191962,)
SELL min_gain_s -> (135298,)
SELL close_vs_close_pos_s -> (191962,)
SELL ALL -> (69302,)
BUY PERCENTAGE -> 18.56%
SELL PERCENTAGE -> 24.75%
HOLD PERCENTAGE -> 56.69%


# Create additional features

In [22]:
#Define the variables for additional features
period_high_volatility = 5
period_low_volatility = 3
gap_high_volatility = 3
gap_low_volatility = 2

In [23]:
#Mark minimums and maximums
def flag_mins(_s_in,_period:int=3,_gap:int=3,_cur:bool=False):
    """Function used to identify values in a series as mins
    
    args:
    -----
    _s_in - pandas series - values to be compared
    _period - int:3 - window to check values over
    _gap - int:3 - the number of period which must have elapsed before a min is 
        identified (prevents changing of min_flags on current week vs same week next week)
    _cur - bool:False - is this looking at current or past values
    
    returns:
    ------
    pandas series - bools
    """
    _s_out = 0
    #Create a benchmark series
    _bench_s = _s_in.shift(_gap)
    #Check within window
    for i in range(1,_period+1):
        _s_out += (_bench_s > _bench_s.shift(i)) | (_bench_s.shift(i).isnull())
    #Check within gap
    for i in range(1,_gap+1):
        _s_out += (_bench_s > _bench_s.shift(-i))
    _s_out = _s_out == 0
    return _s_out
def flag_maxs(_s_in,_period:int=3,_gap:int=3,_cur:bool=False):
    """Function used to identify values in a series as maxs
    
    args:
    -----
    _s_in - pandas series - values to be compared
    _period - int:3 - window to check values over
    _gap - int:3 - the number of period which must have elapsed before a min is 
        identified (prevents changing of min_flags on current week vs same week next week)
    _cur - bool:False - is this looking at current or past values
    
    returns:
    ------
    pandas series - bools
    """
    _s_out = 0
    #Create a benchmark series
    _bench_s = _s_in.shift(_gap)
    #Check within window
    for i in range(1,_period+1):
        _s_out += (_bench_s < _bench_s.shift(i)) | (_bench_s.shift(i).isnull())
    #Check within gap
    for i in range(1,_gap+1):
        _s_out += (_bench_s < _bench_s.shift(-i))
    _s_out = _s_out == 0
    return _s_out

In [24]:
#Function to find last max and mins
def prev_max_min(_df_in,_var_col,_bool_col,_gap:int=0):
    """Function to find last max and mins
    
    args:
    -----
    _df_in - pandas dataframe - must contain 'date', _var_col and _bool_col as column names
    _var_col - str - the name of the column containing the current variables
    _bool_col - str - the name of the column containing the bool values defining max and min vlaues
    _gap - int:0 - the number of period which must have elapsed before a min is 
        identified (prevents changing of min_flags on current week vs same week next week)
    
    returns:
    ------
    tuple - pandas series,pandas series - last max/min value, last max/min date
    """
    _df_in["prev_val"] = _df_in.loc[_df_in[_bool_col].shift(-_gap).fillna(False),_var_col]
    _df_in["prev_val"] = _df_in["prev_val"].fillna(method='ffill')
    _df_in["prev_marker_date"] = _df_in.loc[_df_in[_bool_col].shift(-_gap).fillna(False),"date"]
    _df_in["prev_marker_date"] = _df_in["prev_marker_date"].fillna(method='ffill')
    return (_df_in["prev_val"],_df_in["prev_marker_date"])

In [25]:
#Function for finding the gradient of a variable overa set period
def gradient(_s_in,_period:int=1):
    """Function for finding the gradient of a variable over a set period
    
    args:
    -----
    _s_in - pandas series - the series from which the gradient will be found
    _period - int:1 - the period over which the gradient will be found
    
    returns:
    ------
    pandas series
    """
    _s_out = _s_in - _s_in.shift(_period)
    return _s_out

In [26]:
#Calc vol as proportion of previous n-rows
def calc_prop_of_prev(_s_in,_periods:int = 4):
    """Function to this value as a proportion of the cum previous values
    
    args:
    -----
    _s_in - pandas series - values to be looked at
    _period - int - window to sum values over
    
    returns:
    ------
    pandas series - floats
    """
    _s_cum = _s_in.copy()
    for i in range(1,_periods):
        _s_cum += _s_in.shift(i)
    return _s_in / _s_cum

In [27]:
#Mark points of macd positive entry
def pos_entry(_s_in):
    """Function to check if this value is a new positive after a negative value
    
    args:
    -----
    _s_in - pandas series - values to be looked at
    
    returns:
    ------
    pandas series - bools
    """
    return (_s_in > _s_in.shift(1)) & (_s_in > 0) & (_s_in.shift(1) < 0)
def neg_entry(_s_in):
    """Function to check if this value is a new negative after a positive value
    
    args:
    -----
    _s_in - pandas series - values to be looked at
    
    returns:
    ------
    pandas series - bools
    """
    return (_s_in < _s_in.shift(1)) & (_s_in < 0) & (_s_in.shift(1) > 0)

In [28]:
#Function to normalise current price compared to another
def norm_s(_s_in):
    """Function to normalise current price compared to another
    
    args:
    -----
    _s_in - pandas series - values to be looked at
    
    returns:
    ------
    pandas series - bools
    """
    _s_out = (_s_in - _s_in.min()) / (_max_in - _s_in.max())
    return _s_out

In [29]:
#Create separate columns for pos and neg values - allows for normalisation
def pos_neg_cols(_s_in,_gt_lt = "GT"):
    """Function to separate columns for pos and neg values - allows for normalisation
    
    args:
    -----
    _s_in - pandas series - the vlaues ot be looked at
    _gt_lt - str:'GT' - defines if looking for positive or negative values
    
    returns:
    ------
    tuple - pandas series,pandas series - bools,floats
    """
    if _gt_lt.upper() == "GT":
        _bool_s = _s_in >= 0
    elif _gt_lt.upper() == "LT":
        _bool_s = _s_in <= 0
    _df_out = _s_in.to_frame()
    _df_out["_s_in"] = _s_in
    _df_out["_val"] = abs(_s_in[_bool_s])
    _val_s = _df_out["_val"].fillna(0,method=None)
    return (_bool_s,_val_s)

In [30]:
#Function for finding the max within a given time period using indexes
def max_min_period(_s_in,_period:int=1,_normalise:bool=False,_max_min:str='max'):
    """Function for calculating the max and mins within a period
    
    args:
    -----
    _s_in - pandas series - the vlaues to be looked at
    _period - int:1 - the time window to look over
    _max_min - str:max - looking for the max or min
    _normalise - bool:False - should the returned value be normalised?
    
    returns:
    ------
    pandas series - floats
    """
    #Find the min index
    _min_i = _s_in.index.min()
    if _normalise:
        _s_max = pd.Series([_s_in.loc[x-_period if x-_period >= _min_i else _min_i:x].max() for x in _s_in.index])
        _s_min = pd.Series([_s_in.loc[x-_period if x-_period >= _min_i else _min_i:x].min() for x in _s_in.index])
        _s_out = (_s_in - _s_min) / (_s_max - _s_min)
    else:
        #Get the max or min within a time period, ensuring not to go into negative indexes
        if _max_min == 'max':
            _s_out = pd.Series([_s_in.loc[x-_period if x-_period >= _min_i else _min_i:x].max() for x in _s_in.index])
        elif _max_min == 'min':
            _s_out = pd.Series([_s_in.loc[x-_period if x-_period >= _min_i else _min_i:x].min() for x in _s_in.index])
        else:
            raise ValueError('_max_min must be either \'max\' or \'min\'')
    return pd.Series(_s_out)

In [31]:
#Function for calculating the percentage change within a range
def per_change_in_range(_s_in,_period:int=1,**kwargs):
    """Function for calculating the percentage change of a value from it's max or min within a range
    
    args:
    -----
    _s_in - pandas series - the vlaues to be looked at
    _period - int:1 - the time window to look over
    
    returns:
    ------
    pandas series - floats
    """
    return ((_s_in - max_min_period(_s_in,_period,_normalise=False,**kwargs)) / max_min_period(_s_in,_period,_normalise=False,**kwargs))

In [None]:
def mk_prev_move_float(_s_in):
    """Function to find the the magnitude of the most recent value change.
    
    args:
    ------
    _s_in - pandas series - float values
    
    returns:
    ------
    pandas series - float values
    """
    _s_out = _s_in - _s_in.shift(1)
    _s_out[_s_out == 0] = np.nan
    _s_out = _s_out.fillna(method='ffill')
    return _s_out

def mk_prev_move_date(_s_in,_periods:int=7):
    """Function to find the time elapsed between two different changes.
    
    args:
    ------
    _s_in - pandas dataframe - datetime values
    _periods - int:7 - used to modify days of datetime into the period required
    
    
    returns:
    ------
    pandas series - int values
    """
    _s_out = _s_in - _s_in.shift(1)
    _s_check = pd.Series([np.floor(_x.days) for _x in _s_out])
    _s_check[_s_check == 0] = np.nan
    _s_check = _s_check.fillna(method='ffill')
    _s_check = [np.floor(_x/_periods) for _x in _s_check]
    return _s_check
#Create features for the cumulative sequential count of max/mins in a certain direction
def mk_move_cum(_s_in):
    """Function for counting the number of changes of the same sign sequentially.
    EG how many positive moves have there been in a row.
    
    args:
    ------
    _s_in - pandas series - floats
    
    returns:
    pandas series - floats
    """
    _li_out = []
    _prev_x = None
    #Loop through each value in _s_in
    for _i,_x in _s_in.iteritems():
        if np.isnan(_x) or _prev_x == None: #If this is the first value add it to the list
            _li_out.append(0)
        else:
            _prev_x = _prev_x if not np.isnan(_prev_x) else 0
            if ((_x < 0) & (_prev_x > 0)) or ((_x > 0) & (_prev_x < 0)): #If a sign change then reset to 0
                _li_out.append(0)
            else:
                if _prev_x != _x: #if there has been a change in value from this and the previous value increment it by 1
                    if _x > 0: #for positive value increment by 1
                        _li_out.append(_li_out[-1] + 1)                                    
                    else: #for negative values increment by -1
                        _li_out.append(_li_out[-1] - 1)
                else: #Otherwise just use the last added value
                    _li_out.append(_li_out[-1])
        _prev_x = _x
    return _li_out
#Create features showing the value change since the first min/max
def mk_long_prev_move_float(_ref_s,_val_s):
    """Function to find the value change since the first max/min move in the current sequential series.
    
    args:
    ------
    _ref_s - pandas series - the reference series from which changes will be detected
    _val_s - pandas series - the values series from which outputs will be created
    
    returns:
    ------
    pandas series - float values
    """
    _li_out = []
    _st_x = None
    _prev_x = None
    #Loop through each value in _s_in
    for _i,_x in _ref_s.iteritems():
        if np.isnan(_x) or _prev_x == None: #If this is the first value add it to the list
            _li_out.append(0)
        else:
            _prev_x = _prev_x if not np.isnan(_prev_x) else 0
            if ((_x < 0) & (_prev_x > 0)) or ((_x > 0) & (_prev_x < 0)): #If a sign change then reset to 0
                _li_out.append(0)
                _st_x = None
            else:
                if _st_x == None: #If _st_x has not been set yet set it to this value
                    _st_x = _val_s[_i]
                _li_out.append(_val_s[_i] - _st_x) #Now calculate the difference and add it to the list
        _prev_x = _x
    return _li_out
def mk_long_prev_move_date(_ref_s,_val_s,_periods:int=7):
    """Function to find the date change since the first max/min move in the current sequential series.
    
    args:
    ------
    _ref_s - pandas series - the reference series from which changes will be detected
    _val_s - pandas series - the values series from which outputs will be created
    _periods - int:7 - used to modify days of datetime into the period required
    
    returns:
    ------
    pandas series - int values
    """
    _li_out = []
    _st_x = None
    _prev_x = None
    #Loop through each value in _s_in
    for _i,_x in _ref_s.iteritems():
        if np.isnan(_x) or _prev_x == None: #If this is the first value add it to the list
            _li_out.append(0)
        else:
            _prev_x = _prev_x if not np.isnan(_prev_x) else 0
            if ((_x < 0) & (_prev_x > 0)) or ((_x > 0) & (_prev_x < 0)): #If a sign change then reset to 0
                _li_out.append(0)
                _st_x = None
            else:
                if _st_x == None: #If _st_x has not been set yet set it to this value
                    _st_x = _val_s[_i]
                _li_out.append(np.floor((_val_s[_i] - _st_x).days/_periods)) #Now calculate the difference and add it to the list
        _prev_x = _x
    return _li_out

In [1]:
#Create a dictionary of max character lengths of fields for use later in h5 file appending
def get_col_len_s(_s_in):
    """Get the max length of value in the series
    
    args:
    -----
    _s_in - pandas series - series holding values to look at for max field lengths
    
    returns:
    ------
    float
    """
    _tmp_s = pd.Series([len(str(x)) for x in _s_in])
    return _tmp_s.max()
    
def get_col_len_df(_df_in):
    """Create a dictionary of max character lengths of fields for use later in h5 file appending
    
    args:
    -----
    _df_in - pandas dataframe - dataframe holding values to look at for max field lengths
    
    returns:
    ------
    pandas series - floats
    """
    _col_lens = {}
    for _c in _df_in:
        _col_lens[_c] = get_col_len_s(_df_in[_c])
    return _col_lens
col_lens = get_col_len_df(df_prices_w)
col_lens

NameError: name 'df_prices_w' is not defined

In [33]:
#Create a single function to run each stock through feature creation
def create_features(_df_in):
    """A single function to run each stock through feature creation
    
    args:
    -----
    _df_in - pandas dataframe
    
    returns:
    ------
    pandas dataframe
    """
    
    _df_out = _df_in.copy() 
    
    #Calc vol as proportion of previous n-rows
    _df_out["prop_vol"] = calc_prop_of_prev(_df_out["volume"].copy().astype("float"),6)

    #Get period-period changes
    for col in ['close','volume','macd','ema26','signal_line','macd_line']:
        _df_out["change_{}_shift1".format(col)] = gradient(_df_out[col],1)
        
    #Compare close to the max/mins within 4,13,26,52 periods
    for col in ['close_orig','macd_line']:
        for max_min in ['max','min']:
            for period in [4,13,26,52]:
                _df_out["{}_per_change_{}_{}".format(col,max_min,period)] = per_change_in_range(_df_out[col],period,_max_min=max_min)
            
    #Mark points of macd positive entry
    _df_out["macd_pos_ent"] = pos_entry(_df_out["macd"])
    _df_out["macd_neg_ent"] = neg_entry(_df_out["macd"])
    
    #Create max min columns
    def mk_cols_max_min(tmp_df,col,period:int=4,gap:int=2):
        #Historic max mins
        tmp_df["{}_min".format(col)] = flag_mins(tmp_df[col],period,gap)
        tmp_df["{}_max".format(col)] = flag_maxs(tmp_df[col],period,gap)
        
    #Find previous max and mins, then look at:
        # - how many positive or negative moves in a row there has been
        # - what the move since the last (n-1) max/min was
        # - what the gradient is since the last (n-1) max/min
        # - what the move since the first max/min was
        # - what the gradient since the first max/min was
    def mk_cols_prev_max_min(tmp_df,col,period:int=4):
        #GETTING THE MAX/MINS - includes "gap" to account for time lag before declaring something as mn/max
        tmp_df["prev_max_{}".format(col)],tmp_df["prev_max_{}_date".format(col)] = prev_max_min(tmp_df[["date",col,"{}_max".format(col)]].copy(),col,"{}_max".format(col))
        tmp_df["prev_min_{}".format(col)],tmp_df["prev_min_{}_date".format(col)] = prev_max_min(tmp_df[["date",col,"{}_min".format(col)]].copy(),col,"{}_min".format(col))
#         #Shift the max min columns by n periods to not leak future information
#         tmp_df["prev_max_{}".format(col)] = tmp_df["prev_max_{}".format(col)].shift(period)
#         tmp_df["prev_min_{}".format(col)] = tmp_df["prev_min_{}".format(col)].shift(period)
#         tmp_df["prev_max_{}_date".format(col)] = tmp_df["prev_max_{}_date".format(col)].shift(period)
#         tmp_df["prev_min_{}_date".format(col)] = tmp_df["prev_min_{}_date".format(col)].shift(period)
        #WHAT WAS THE MOVE SINCE THE LAST (N-1) MAX/MIN
        tmp_df['prev_max_move_{}'.format(col)] = mk_prev_move_float(tmp_df["prev_max_{}".format(col)])
        tmp_df['prev_max_date_move_{}'.format(col)] = mk_prev_move_date(tmp_df["prev_max_{}_date".format(col)])        
        tmp_df['prev_min_move_{}'.format(col)] = mk_prev_move_float(tmp_df["prev_min_{}".format(col)])
        tmp_df['prev_min_date_move_{}'.format(col)] = mk_prev_move_date(tmp_df["prev_min_{}_date".format(col)])
        #WHAT IS THE GRADIENT SINCE THE LAST (N-1) MAX/MIN
        tmp_df['prev_max_grad_{}'.format(col)] = tmp_df['prev_max_move_{}'.format(col)] / tmp_df['prev_max_date_move_{}'.format(col)]
        tmp_df['prev_min_grad_{}'.format(col)] = tmp_df['prev_min_move_{}'.format(col)] / tmp_df['prev_min_date_move_{}'.format(col)]
        #HOW MANY PROGRESSIVE MAX/MINS IN A ROW HAVE THERE BEEN - UP OR DOWN FOR BOTH OPTIONS
        tmp_df['max_move_cum_{}'.format(col)] = mk_move_cum(tmp_df['prev_max_move_{}'.format(col)])
        tmp_df['min_move_cum_{}'.format(col)] = mk_move_cum(tmp_df['prev_min_move_{}'.format(col)])
        #WHAT WAS THE MOVE SINCE THE FIRST (N=0) MAX/MIN
        tmp_df['long_prev_max_move_{}'.format(col)] = mk_long_prev_move_float(tmp_df['prev_max_move_{}'.format(col)],tmp_df['prev_max_{}'.format(col)])
        tmp_df['long_prev_min_move_{}'.format(col)] = mk_long_prev_move_float(tmp_df['prev_min_move_{}'.format(col)],tmp_df['prev_min_{}'.format(col)])
        #WHAT WAS THE TIMEDELTA SINCE THE FIRST (N=0) MAX/MIN
        tmp_df['long_prev_max_move_date_{}'.format(col)] = mk_long_prev_move_date(tmp_df['prev_max_move_{}'.format(col)],tmp_df['prev_max_{}_date'.format(col)])
        tmp_df['long_prev_min_move_date_{}'.format(col)] = mk_long_prev_move_date(tmp_df['prev_min_move_{}'.format(col)],tmp_df['prev_min_{}_date'.format(col)])
        #WHAT IS THE GRADIENT SINCE THE FIRST (N=0) MAX/MIN
        tmp_df['long_max_grad_{}'.format(col)] = tmp_df['long_prev_max_move_{}'.format(col)] / tmp_df['long_prev_max_move_date_{}'.format(col)]
        tmp_df['long_min_grad_{}'.format(col)] = tmp_df['long_prev_min_move_{}'.format(col)] / tmp_df['long_prev_min_move_date_{}'.format(col)]
        #WHAT IS THE MAX MIN CONVERGENCE/DIVERGENCE
        tmp_df['prev_grad_conv_{}'.format(col)] = tmp_df['prev_min_grad_{}'.format(col)] - tmp_df['prev_max_grad_{}'.format(col)]
        tmp_df['long_grad_conv_{}'.format(col)] = tmp_df['long_min_grad_{}'.format(col)] - tmp_df['long_max_grad_{}'.format(col)]
        
    #Calc the value changes and percentage changes of these movements
    def mk_cols_prev_max_min_change(tmp_df,col):
        tmp_df["max_change_{}".format(col)],tmp_df["max_per_change_{}".format(col)] = calc_changes(tmp_df[col].copy(),tmp_df["prev_max_{}".format(col)].copy())
        tmp_df["min_change_{}".format(col)],tmp_df["min_per_change_{}".format(col)] = calc_changes(tmp_df[col].copy(),tmp_df["prev_min_{}".format(col)].copy())
        
    #Mark date change since max and mins and convert to periods
    def mk_cols_prev_max_min_date_change(tmp_df,col,period:int=7):
        tmp_df["prev_max_{}_date_change".format(col)] = tmp_df["date"] - tmp_df["prev_max_{}_date".format(col)]
        tmp_df["prev_min_{}_date_change".format(col)] = tmp_df["date"] - tmp_df["prev_min_{}_date".format(col)]
        #Convert all to period changes
        tmp_df["prev_max_{}_date_change".format(col)] = [np.floor(x.days/period) for x in tmp_df["prev_max_{}_date_change".format(col)]]
        tmp_df["prev_min_{}_date_change".format(col)] = [np.floor(x.days/period) for x in tmp_df["prev_min_{}_date_change".format(col)]]
    
    #Run function for columns - high volatility
    for col in ['close','signal_line','volume']:
        mk_cols_max_min(_df_out,col,period_high_volatility,gap_high_volatility)
        mk_cols_prev_max_min(_df_out,col,period_high_volatility)
        mk_cols_prev_max_min_change(_df_out,col) 
        mk_cols_prev_max_min_date_change(_df_out,col,7)
    #Run function for columns - low volatility
    for col in ['macd','ema26','macd_line']:
        mk_cols_max_min(_df_out,col,period_low_volatility,gap_low_volatility)
        mk_cols_prev_max_min(_df_out,col,period_low_volatility)
        mk_cols_prev_max_min_change(_df_out,col) 
        mk_cols_prev_max_min_date_change(_df_out,col,7)
    
    #Check for undefined
    _df_out["no_signal"] = (_df_out["buy"] == False) & (_df_out["hold"] == False) & (_df_out["sell"] == False)
    
    #Composite all singals into one column
    _df_out["signal"] = None
    _df_out.loc[_df_out["buy"] == True,"signal"] = "buy"
    _df_out.loc[_df_out["hold"] == True,"signal"] = "hold"
    _df_out.loc[_df_out["sell"] == True,"signal"] = "sell"
    _df_out.drop(columns=["buy","hold","sell"],inplace=True)
    print("SIGNAL COUNTS: \n{}".format(_df_out["signal"].value_counts()))
    
    return _df_out

In [34]:
#Define the columns for the output
out_cols = [
    #NON-NORMALISED COLS
    "ticker"
    ,"date"
    #NORMALISED COLS
    #Standard features
    ,"open"
    ,"close"
    ,"high"
    ,"low"
    ,"volume"
    ,"change_price"
    ,"per_change_price"
    ,"ema26"
    ,"macd"
    ,"signal_line"
    ,"macd_line"
]
#Append additional columns for key areas
for col in ['close_orig','macd_line']:
    for max_min in ['max','min']:
        for period in [4,13,26,52]:
            out_cols.append("{}_per_change_{}_{}".format(col,max_min,period))
for col in ['close','macd','ema26','signal_line','macd_line','volume']:
    #Shifted features
    out_cols.append("change_{}_shift1".format(col))
    #Max/min flags
    out_cols.append("{}_max".format(col))    
    out_cols.append("{}_min".format(col))    
    #Prev max/min features
    out_cols.append("prev_max_{}".format(col))
    out_cols.append("prev_min_{}".format(col))
    #date changes
    out_cols.append("prev_max_{}_date_change".format(col))
    out_cols.append("prev_min_{}_date_change".format(col))
    #Min max change features
    out_cols.append("max_change_{}".format(col))
    out_cols.append("min_change_{}".format(col))
    #prev max/mins (n-1) - compared to previous
    out_cols.append('prev_max_grad_{}'.format(col))
    out_cols.append('prev_min_grad_{}'.format(col))
    out_cols.append('prev_grad_conv_{}'.format(col))
    #prev max/mins (n=0) - compared to first in this run
    out_cols.append('max_move_cum_{}'.format(col))
    out_cols.append('min_move_cum_{}'.format(col))
    out_cols.append('long_prev_max_move_date_{}'.format(col))
    out_cols.append('long_prev_min_move_date_{}'.format(col))
    out_cols.append('long_max_grad_{}'.format(col))
    out_cols.append('long_min_grad_{}'.format(col))
    out_cols.append('long_grad_conv_{}'.format(col))
#Append signal
out_cols.append("signal")

In [35]:
#A conversion for all variables ot the correct dtype
conv_di = {
    #NON-NORMALISED COLS
    "ticker":'object'
    ,"date":'datetime64'
    #NORMALISED COLS
    #Standard features
    ,"open":'float64'
    ,"close":'float64'
    ,"high":'float64'
    ,"low":'float64'
    ,"volume":'float64'
    ,"change_price":'float64'
    ,"per_change_price":'float64'
    ,"ema26":'float64'
    ,"macd":'float64'
    ,"signal_line":'float64'
    ,"macd_line":'float64'
}
#Append additional columns for key areas
for col in ['close_orig','macd_line']:
    for max_min in ['max','min']:
        for period in [4,13,26,52]:
            conv_di["{}_per_change_{}_{}".format(col,max_min,period)] = 'float64'
for col in ['close','macd','ema26','signal_line','macd_line','volume']:
    #Shifted features
    conv_di["change_{}_shift1".format(col)] = 'float64'
    #Max/min flags
    conv_di["{}_max".format(col)] = 'bool'
    conv_di["{}_min".format(col)] = 'bool'
    #Prev max/min features
    conv_di["prev_max_{}".format(col)] = 'float64'
    conv_di["prev_min_{}".format(col)] = 'float64'
    #date changes
    conv_di["prev_max_{}_date_change".format(col)] = 'float64'
    conv_di["prev_min_{}_date_change".format(col)] = 'float64'
    #Min max change features
    conv_di["max_change_{}".format(col)] = 'float64'
    conv_di["min_change_{}".format(col)] = 'float64'
    #prev max/mins (n-1) - compared to previous
    conv_di['prev_max_grad_{}'.format(col)] = 'float64'
    conv_di['prev_min_grad_{}'.format(col)] = 'float64'
    conv_di['prev_grad_conv_{}'.format(col)] = 'float64'
    #prev max/mins (n=0) - compared to first in this run
    conv_di['max_move_cum_{}'.format(col)] = 'int64'
    conv_di['min_move_cum_{}'.format(col)] = 'int64'
    conv_di['long_prev_max_move_date_{}'.format(col)] = 'float64'
    conv_di['long_prev_min_move_date_{}'.format(col)] = 'float64'
    conv_di['long_max_grad_{}'.format(col)] = 'float64'
    conv_di['long_min_grad_{}'.format(col)] = 'float64'
    conv_di['long_grad_conv_{}'.format(col)] = 'float64'
#Append signal
conv_di["signal"] = 'object'

In [36]:
#Then loop the tickers and combine these into one large dataset
hf_store_name = path+r'\all_hist_prices_w_ft_eng2_TMP.h5'
h_store = pd.HDFStore(hf_store_name)
errors = []
run_time = process_time()
for tick in tick_ftse["ticker"]:
# for tick in ['SBRY','AJB']: #TEMP
    try:
        print("\n{}".format(len(run_time.lap_li)))
        print("RUN FOR {}".format(tick))
        #Isolate this ticker
        this_tick_df = df_prices_w[df_prices_w["ticker"] == re.sub('[^a-zA-Z0-9\-]','',tick)].copy()
        print("shape before -> {}".format(this_tick_df.shape))
        #Create the features
        this_tick_df = create_features(this_tick_df)
        #Clarify col_lens with cur cols in data
        this_col_lens = get_col_len_df(this_tick_df)
        min_itemsize_di = {}
        for col in out_cols:
            if col in col_lens:
                if this_col_lens[col] > col_lens[col]:
                    col_lens[col] = this_col_lens[col]
            else:
                col_lens[col] = this_col_lens[col]
            min_itemsize_di = col_lens[col]
        print("shape after -> {}".format(this_tick_df.shape))
        #Create function for appending to hdf file
        def append_to_hdf(df_in):
            df_in[out_cols].to_hdf(hf_store_name,key='weekly_data',append=True,min_itemsize=min_itemsize_di)
        #Append this data to the group
        try:
            append_to_hdf(this_tick_df)
            print('ADDED TO {}'.format(hf_store_name))
        except ValueError:
            print('WARNING -> Attempting to change dtypes')
            #Try changing the dtypes
            try:
                for col in out_cols:
                    # print(r'CONVERT {} FROM {} TO {}'.format(col,this_tick_df[col].dtype,conv_di[col]))
                    this_tick_df[col] = this_tick_df[col].astype(conv_di[col])
                append_to_hdf(this_tick_df)
                print('ADDED TO {}'.format(hf_store_name))
            except Exception as e:
                errors.append({"ticker":tick,"Error":e})
                print('ERROR READING TO FILE {}'.format(e))
        except Exception as e:
            errors.append({"ticker":tick,"Error":e})
            print('ERROR READING TO FILE {}'.format(e))
        #Lap
        run_time.lap()
        run_time.show_latest_lap_time()
    except Exception as e:
        h_store.close()
        errors.append({"ticker":tick,"Error":e})
        print('ERROR PROCESSING DATA {}'.format(e))
h_store.close()
print('\n\n')
run_time.end()
print('\nERROR COUNT -> {}'.format(len(errors)))
if len(errors) > 0:
    print('\tERRORS -> ')
    display(pd.DataFrame(errors))


0
RUN FOR III
shape before -> (623, 23)
SIGNAL COUNTS: 
hold    360
sell    141
buy     122
Name: signal, dtype: int64
shape after -> (623, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 1 TIME -> 0:0:6

1
RUN FOR ABF
shape before -> (1055, 23)
SIGNAL COUNTS: 
hold    584
sell    290
buy     181
Name: signal, dtype: int64
shape after -> (1055, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 2 TIME -> 0:0:9

2
RUN FOR ADM
shape before -> (363, 23)
SIGNAL COUNTS: 
hold    236
sell     91
buy      36
Name: signal, dtype: int64
shape after -> (363, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 3 TIME -> 0:0:3

3
RUN FOR AAL
shape before -> (1018, 23)
SIGNAL COUNTS: 
hold    579
buy     224
sell    215
Name: signal, dtype: i

SIGNAL COUNTS: 
hold    572
sell    295
buy     188
Name: signal, dtype: int64
shape after -> (1055, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 29 TIME -> 0:0:9

29
RUN FOR EVR
shape before -> (424, 23)
SIGNAL COUNTS: 
hold    232
sell    103
buy      89
Name: signal, dtype: int64
shape after -> (424, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 30 TIME -> 0:0:4

30
RUN FOR EXPN
shape before -> (688, 23)
SIGNAL COUNTS: 
hold    408
sell    167
buy     113
Name: signal, dtype: int64
shape after -> (688, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 31 TIME -> 0:0:6

31
RUN FOR FERG
shape before -> (1056, 23)
SIGNAL COUNTS: 
hold    572
sell    283
buy     201
Name: signal, dtype: int64
shape after -> (1056, 215)
AD

SIGNAL COUNTS: 
hold    736
sell    283
buy     247
Name: signal, dtype: int64
shape after -> (1266, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 57 TIME -> 0:0:11

57
RUN FOR NG
shape before -> (1253, 23)
SIGNAL COUNTS: 
hold    678
sell    322
buy     253
Name: signal, dtype: int64
shape after -> (1253, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 58 TIME -> 0:0:11

58
RUN FOR NXT
shape before -> (1533, 23)
SIGNAL COUNTS: 
hold    919
sell    378
buy     236
Name: signal, dtype: int64
shape after -> (1533, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 59 TIME -> 0:0:13

59
RUN FOR NMC
shape before -> (383, 23)
SIGNAL COUNTS: 
hold    232
sell     98
buy      53
Name: signal, dtype: int64
shape after -> (383, 215)


SIGNAL COUNTS: 
hold    849
sell    334
buy     313
Name: signal, dtype: int64
shape after -> (1496, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 85 TIME -> 0:0:14

85
RUN FOR SKG
shape before -> (647, 23)
SIGNAL COUNTS: 
hold    348
sell    153
buy     146
Name: signal, dtype: int64
shape after -> (647, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 86 TIME -> 0:0:6

86
RUN FOR SPX
shape before -> (1464, 23)
SIGNAL COUNTS: 
hold    879
sell    331
buy     254
Name: signal, dtype: int64
shape after -> (1464, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 87 TIME -> 0:0:13

87
RUN FOR SSE
shape before -> (1283, 23)
SIGNAL COUNTS: 
hold    731
sell    338
buy     214
Name: signal, dtype: int64
shape after -> (1283, 215)


ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 112 TIME -> 0:0:1

112
RUN FOR BME
shape before -> (288, 23)
SIGNAL COUNTS: 
hold    178
sell     63
buy      47
Name: signal, dtype: int64
shape after -> (288, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 113 TIME -> 0:0:3

113
RUN FOR BAB
shape before -> (1035, 23)
SIGNAL COUNTS: 
hold    627
sell    253
buy     155
Name: signal, dtype: int64
shape after -> (1035, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 114 TIME -> 0:0:9

114
RUN FOR BGFD
shape before -> (615, 23)
SIGNAL COUNTS: 
hold    340
sell    150
buy     125
Name: signal, dtype: int64
shape after -> (615, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist

SIGNAL COUNTS: 
hold    750
sell    294
buy     270
Name: signal, dtype: int64
shape after -> (1314, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 140 TIME -> 0:0:11

140
RUN FOR CBG
shape before -> (1249, 23)
SIGNAL COUNTS: 
hold    678
sell    316
buy     255
Name: signal, dtype: int64
shape after -> (1249, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 141 TIME -> 0:0:11

141
RUN FOR CLI
shape before -> (1285, 23)
SIGNAL COUNTS: 
hold    727
sell    343
buy     215
Name: signal, dtype: int64
shape after -> (1285, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 142 TIME -> 0:0:11

142
RUN FOR COA
shape before -> (1382, 23)
SIGNAL COUNTS: 
hold    839
sell    302
buy     241
Name: signal, dtype: int64
shape after -> (13

SIGNAL COUNTS: 
hold    441
sell    172
buy     126
Name: signal, dtype: int64
shape after -> (739, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 168 TIME -> 0:0:7

168
RUN FOR JEO
shape before -> (595, 23)
SIGNAL COUNTS: 
hold    369
sell    135
buy      91
Name: signal, dtype: int64
shape after -> (595, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 169 TIME -> 0:0:6

169
RUN FOR ERM
shape before -> (1122, 23)
SIGNAL COUNTS: 
hold    611
sell    276
buy     235
Name: signal, dtype: int64
shape after -> (1122, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 170 TIME -> 0:0:10

170
RUN FOR FCIT
shape before -> (595, 23)
SIGNAL COUNTS: 
hold    371
sell    128
buy      96
Name: signal, dtype: int64
shape after -> (595, 21

SIGNAL COUNTS: 
hold    423
sell    199
buy     124
Name: signal, dtype: int64
shape after -> (746, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 196 TIME -> 0:0:6

196
RUN FOR HMSO
shape before -> (1054, 23)
SIGNAL COUNTS: 
hold    546
sell    287
buy     221
Name: signal, dtype: int64
shape after -> (1054, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 197 TIME -> 0:0:9

197
RUN FOR HVPE
shape before -> (403, 23)
SIGNAL COUNTS: 
hold    249
buy      79
sell     75
Name: signal, dtype: int64
shape after -> (403, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 198 TIME -> 0:0:4

198
RUN FOR HSTG
shape before -> (218, 23)
SIGNAL COUNTS: 
hold    135
sell     55
buy      28
Name: signal, dtype: int64
shape after -> (218, 2

SIGNAL COUNTS: 
hold    361
sell    148
buy      85
Name: signal, dtype: int64
shape after -> (594, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 224 TIME -> 0:0:7

224
RUN FOR JMG
shape before -> (594, 23)
SIGNAL COUNTS: 
hold    331
sell    148
buy     115
Name: signal, dtype: int64
shape after -> (594, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 225 TIME -> 0:0:6

225
RUN FOR JFJ
shape before -> (605, 23)
SIGNAL COUNTS: 
hold    349
sell    141
buy     115
Name: signal, dtype: int64
shape after -> (605, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 226 TIME -> 0:0:8

226
RUN FOR JUP
shape before -> (477, 23)
SIGNAL COUNTS: 
hold    265
sell    134
buy      78
Name: signal, dtype: int64
shape after -> (477, 215)
A

ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 251 TIME -> 0:0:3

251
RUN FOR OSB
shape before -> (289, 23)
SIGNAL COUNTS: 
hold    159
sell     77
buy      53
Name: signal, dtype: int64
shape after -> (289, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 252 TIME -> 0:0:3

252
RUN FOR OXIG
shape before -> (1469, 23)
SIGNAL COUNTS: 
hold    882
sell    318
buy     269
Name: signal, dtype: int64
shape after -> (1469, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 253 TIME -> 0:0:13

253
RUN FOR PAGE
shape before -> (926, 23)
SIGNAL COUNTS: 
hold    507
sell    213
buy     206
Name: signal, dtype: int64
shape after -> (926, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hi

SIGNAL COUNTS: 
hold    189
sell     71
buy      45
Name: signal, dtype: int64
shape after -> (305, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 279 TIME -> 0:0:3

279
RUN FOR RSW
shape before -> (1030, 23)
SIGNAL COUNTS: 
hold    552
sell    270
buy     208
Name: signal, dtype: int64
shape after -> (1030, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 280 TIME -> 0:0:9

280
RUN FOR RTN
shape before -> (1036, 23)
SIGNAL COUNTS: 
hold    614
sell    244
buy     178
Name: signal, dtype: int64
shape after -> (1036, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 281 TIME -> 0:0:9

281
RUN FOR RHIM
shape before -> (112, 23)
SIGNAL COUNTS: 
hold    66
sell    33
buy     13
Name: signal, dtype: int64
shape after -> (112, 215)

ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 306 TIME -> 0:0:3

306
RUN FOR SMP
shape before -> (1433, 23)
SIGNAL COUNTS: 
hold    822
sell    358
buy     253
Name: signal, dtype: int64
shape after -> (1433, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 307 TIME -> 0:0:13

307
RUN FOR SGC
shape before -> (1161, 23)
SIGNAL COUNTS: 
hold    671
sell    249
buy     241
Name: signal, dtype: int64
shape after -> (1161, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 308 TIME -> 0:0:10

308
RUN FOR SYNC
shape before -> (345, 23)
SIGNAL COUNTS: 
hold    212
sell     89
buy      44
Name: signal, dtype: int64
shape after -> (345, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_

SIGNAL COUNTS: 
hold    404
sell    178
buy      96
Name: signal, dtype: int64
shape after -> (678, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 334 TIME -> 0:0:6

334
RUN FOR WMH
shape before -> (894, 23)
SIGNAL COUNTS: 
hold    488
sell    233
buy     173
Name: signal, dtype: int64
shape after -> (894, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 335 TIME -> 0:0:8

335
RUN FOR WTAN
shape before -> (603, 23)
SIGNAL COUNTS: 
hold    371
sell    131
buy     101
Name: signal, dtype: int64
shape after -> (603, 215)
ADDED TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5
LAP 336 TIME -> 0:0:5

336
RUN FOR WIZZ
shape before -> (251, 23)
SIGNAL COUNTS: 
hold    144
sell     61
buy      46
Name: signal, dtype: int64
shape after -> (251, 215)

In [37]:
#close any open h5 files
tables.file._open_files.close_all()

In [38]:
tmp_df = pd.read_hdf(hf_store_name,key='weekly_data',mode='r')
print("")
print("FINAL HDFSTORE SIZE: {}".format(tmp_df.shape))
print("FINAL BUY COUNT: {}".format(len(tmp_df[tmp_df["signal"] == "buy"])))
print("FINAL SELL COUNT: {}".format(len(tmp_df[tmp_df["signal"] == "sell"])))
h_store.close()
tmp_df.head(50)
# tmp_df[(tmp_df["ticker"] == 'ADM') & (tmp_df["date"] > '2013-12-01') & (tmp_df["date"] < '2014-02-01')].head(200)


FINAL HDFSTORE SIZE: (281287, 144)
FINAL BUY COUNT: 52218
FINAL SELL COUNT: 69568


Unnamed: 0,ticker,date,open,close,high,low,volume,change_price,per_change_price,ema26,...,prev_min_grad_volume,prev_grad_conv_volume,max_move_cum_volume,min_move_cum_volume,long_prev_max_move_date_volume,long_prev_min_move_date_volume,long_max_grad_volume,long_min_grad_volume,long_grad_conv_volume,signal
214063,SBRY,1995-01-02,,,,,,,,,...,,,0,0,0.0,0.0,,,,hold
214064,SBRY,1995-01-09,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,...,,,0,0,0.0,0.0,,,,hold
214065,SBRY,1995-01-16,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,...,,,0,0,0.0,0.0,,,,hold
214066,SBRY,1995-01-23,0.125278,1.0,1.0,0.599644,0.861339,0.874722,0.874722,,...,,,0,0,0.0,0.0,,,,sell
214067,SBRY,1995-01-30,1.0,0.833416,0.861097,1.0,0.54686,-0.166584,-0.199881,,...,,,0,0,0.0,0.0,,,,hold
214068,SBRY,1995-02-06,0.924822,0.333169,0.944637,0.666337,0.328915,-0.591654,-1.775838,,...,,,0,0,0.0,0.0,,,,hold
214069,SBRY,1995-02-13,0.524911,0.055858,0.333169,0.55561,0.226638,-0.469053,-8.397301,,...,,,0,0,0.0,0.0,,,,hold
214070,SBRY,1995-02-20,0.125,0.0,0.0,0.277805,0.245291,-0.125,-inf,,...,,,0,0,0.0,0.0,,,,hold
214071,SBRY,1995-02-27,0.050267,0.0,0.0,0.166584,0.019576,-0.050267,-inf,,...,,,0,0,0.0,0.0,,,,buy
214072,SBRY,1995-03-06,0.0,0.136327,0.0,0.0,0.126197,0.136327,1.0,,...,,,0,0,0.0,0.0,,,,buy


In [39]:
#close any open h5 files
tables.file._open_files.close_all()

In [40]:
#Delete the old h5 file and rename the TMP
try:
    os.remove(path+r'\all_hist_prices_w_ft_eng2.h5')
    print('\nSUCCESSFULLY REMOVED {}'.format(path+r'\all_hist_prices_w_ft_eng2.h5'))
except Exception as e:
    print('\nERROR - REMOVING:{}'.format(e))
try:
    os.rename(path+r'\all_hist_prices_w_ft_eng2_TMP.h5',path+r'\all_hist_prices_w_ft_eng2.h5')
    print('\nSUCCESSFULLY RENAMED {} TO {}'.format(path+r'\all_hist_prices_w_ft_eng2_TMP.h5',path+r'\all_hist_prices_w_ft_eng2.h5'))
except Exception as e:
    print('\nERROR - RENAMING:{}'.format(e))


SUCCESSFULLY REMOVED C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2.h5

SUCCESSFULLY RENAMED C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2_TMP.h5 TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\all_hist_prices_w_ft_eng2.h5


In [41]:
#Remove 'date' from out_cols
out_cols.remove('date')

In [42]:
#Export a list of the features for this model
file_object = open(path+r'\feature_engineering_feature_list.txt','w')
feature_str = ''
for i in out_cols:
    feature_str += '{},'.format(i)
feature_str = feature_str[:-1]
file_object.write(feature_str)
file_object.close()
feature_str

'ticker,open,close,high,low,volume,change_price,per_change_price,ema26,macd,signal_line,macd_line,close_orig_per_change_max_4,close_orig_per_change_max_13,close_orig_per_change_max_26,close_orig_per_change_max_52,close_orig_per_change_min_4,close_orig_per_change_min_13,close_orig_per_change_min_26,close_orig_per_change_min_52,macd_line_per_change_max_4,macd_line_per_change_max_13,macd_line_per_change_max_26,macd_line_per_change_max_52,macd_line_per_change_min_4,macd_line_per_change_min_13,macd_line_per_change_min_26,macd_line_per_change_min_52,change_close_shift1,close_max,close_min,prev_max_close,prev_min_close,prev_max_close_date_change,prev_min_close_date_change,max_change_close,min_change_close,prev_max_grad_close,prev_min_grad_close,prev_grad_conv_close,max_move_cum_close,min_move_cum_close,long_prev_max_move_date_close,long_prev_min_move_date_close,long_max_grad_close,long_min_grad_close,long_grad_conv_close,change_macd_shift1,macd_max,macd_min,prev_max_macd,prev_min_macd,prev_ma