# Trialing the Light Gradient Boost model
The code will run the LightGBM (light gradient boosting) model on the FTSE 350 stock data and see what the profit would have been from investing with the below criteria:
- Starting with £10,000
- Never investing more than 10% of the value into a single stock
- Tracking balance so once the account is empty no more can be invested until shares are liquidated
- Shares are bought at the open of the day following the first buy signal, if shares are not already held
- Shares are sold at the open of the day following the first close signal after a hold period, if shares are held

Trading variables:
- Trades cost £2.50 to execute
- Spread is 1%

In [1]:
#Import modules
import numpy as np
import pandas as pd
import math
import lightgbm as lgb
from sklearn.externals import joblib as jl
import os
import tables
from rf_modules import *

In [2]:
#Load the feature-engineered price history from its h5 file
path = r'C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices'
ft_file = path + r'\all_hist_prices_w_ft_eng2.h5'
df_ft = pd.read_hdf(ft_file)
#Quick look at the size, the column names and the first few rows
print('SHAPE:', df_ft.shape)
print(df_ft.columns)
df_ft.head()

SHAPE: (258764, 106)
Index(['ticker', 'date', 'open', 'close', 'high', 'low', 'volume',
       'change_price', 'per_change_price', 'ema26',
       ...
       'min_change_macd_line', 'prev_max_grad_macd_line',
       'prev_min_grad_macd_line', 'max_move_cum_macd_line',
       'min_move_cum_macd_line', 'long_prev_max_move_date_macd_line',
       'long_prev_min_move_date_macd_line', 'long_max_grad_macd_line',
       'long_min_grad_macd_line', 'signal'],
      dtype='object', length=106)


Unnamed: 0,ticker,date,open,close,high,low,volume,change_price,per_change_price,ema26,...,min_change_macd_line,prev_max_grad_macd_line,prev_min_grad_macd_line,max_move_cum_macd_line,min_move_cum_macd_line,long_prev_max_move_date_macd_line,long_prev_min_move_date_macd_line,long_max_grad_macd_line,long_min_grad_macd_line,signal
119903,III,2007-12-31,995.0,965.0,1023.0,964.0,4511565.0,-30.0,-0.031088,,...,,,,0,0,0.0,0.0,0.0,0.0,sell
119904,III,2008-01-07,967.5,924.0,989.0,917.5,16056554.0,-43.5,-0.047078,,...,,,,0,0,0.0,0.0,0.0,0.0,sell
119905,III,2008-01-14,917.0,901.0,936.0,881.0,21691287.0,-16.0,-0.017758,,...,,,,0,0,0.0,0.0,0.0,0.0,hold
119906,III,2008-01-21,891.0,917.5,965.0,847.0,17850580.0,26.5,0.028883,,...,,,,0,0,0.0,0.0,0.0,0.0,hold
119907,III,2008-01-28,911.0,961.0,971.0,903.0,12079245.0,50.0,0.052029,,...,,,,0,0,0.0,0.0,0.0,0.0,sell


In [3]:
#Load the previously saved LightGBM classifier
model_file = path + r'\lgb_model.joblib'
lgb_mod = jl.load(model_file)
lgb_mod

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=22,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=515, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [4]:
#Treat +/-inf as missing, then keep only rows with no missing values
data_df = (
    df_ft
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis=0)
)
print('SHAPE:', data_df.shape)
data_df.head()

SHAPE: (228656, 106)


Unnamed: 0,ticker,date,open,close,high,low,volume,change_price,per_change_price,ema26,...,min_change_macd_line,prev_max_grad_macd_line,prev_min_grad_macd_line,max_move_cum_macd_line,min_move_cum_macd_line,long_prev_max_move_date_macd_line,long_prev_min_move_date_macd_line,long_max_grad_macd_line,long_min_grad_macd_line,signal
120016,III,2010-03-01,265.1,272.5,274.6,257.7,15386401.0,7.4,0.027156,276.458857,...,1.457656,0.684626,-0.006563,0,0,0.0,0.0,0.0,0.0,hold
120017,III,2010-03-08,274.3,281.2,282.9,270.5,13990561.0,6.9,0.024538,276.810053,...,2.918708,0.684626,-0.006563,0,0,0.0,0.0,0.0,0.0,hold
120018,III,2010-03-15,281.4,292.6,293.8,280.6,18997573.0,11.2,0.038278,277.979678,...,5.036369,0.684626,-0.006563,0,0,0.0,0.0,0.0,0.0,hold
120019,III,2010-03-22,291.5,295.3,298.3,287.3,16919837.0,3.8,0.012868,279.262665,...,6.950317,0.684626,-0.006563,0,0,0.0,0.0,0.0,0.0,sell
120020,III,2010-03-29,294.4,293.4,296.7,287.9,11508731.0,-1.0,-0.003408,280.309875,...,8.315898,0.684626,-0.006563,0,0,0.0,0.0,0.0,0.0,sell


In [5]:
#Import feature_list
#The list is stored as one comma-separated line of column names
#Read it inside a context manager so the file handle is always released
with open(path + r'\lgb_model_feature_list.txt','r') as f:
    feature_li = f.read().split(',')
feature_li

['open',
 'close',
 'high',
 'low',
 'volume',
 'change_price',
 'per_change_price',
 'ema26',
 'macd',
 'signal_line',
 'macd_line',
 'close_shift1',
 'change_close_shift1',
 'volume_shift1',
 'change_volume_shift1',
 'ema26_shift1',
 'change_ema26_shift1',
 'close_13_norm',
 'close_26_norm',
 'close_52_norm',
 'macd_line_13_norm',
 'macd_line_26_norm',
 'macd_line_52_norm',
 'close_max',
 'close_min',
 'prev_max_close',
 'prev_min_close',
 'prev_max_close_date_change',
 'prev_min_close_date_change',
 'max_change_close',
 'min_change_close',
 'prev_max_grad_close',
 'prev_min_grad_close',
 'max_move_cum_close',
 'min_move_cum_close',
 'long_prev_max_move_date_close',
 'long_prev_min_move_date_close',
 'long_max_grad_close',
 'long_min_grad_close',
 'macd_max',
 'macd_min',
 'prev_max_macd',
 'prev_min_macd',
 'prev_max_macd_date_change',
 'prev_min_macd_date_change',
 'max_change_macd',
 'min_change_macd',
 'prev_max_grad_macd',
 'prev_min_grad_macd',
 'max_move_cum_macd',
 'min_move_

In [6]:
#Run the LightGBM model to get signals (this overwrites the existing 'signal' column)
data_df['signal'] = lgb_mod.predict(data_df[feature_li])
#signal_prob is the largest class probability on each row
data_df['signal_prob'] = lgb_mod.predict_proba(data_df[feature_li]).max(axis=1)

  if diff:


In [7]:
#Show current buy ratings
#Filter once: rows on the most recent date whose signal is 'buy'
latest_buys = data_df.loc[
    (data_df['signal'] == 'buy') & (data_df['date'] == data_df['date'].max()),
    ['ticker','signal','signal_prob']
]
print('BUY COUNT: {:,}'.format(len(latest_buys)))
#Highest-probability rows first
latest_buys.sort_values(['signal','signal_prob'],ascending=[True,False])

BUY COUNT: 2


Unnamed: 0,ticker,signal,signal_prob
90327,GFRD,buy,0.633357
147327,MCS,buy,0.49595


In [37]:
#Inspect the signal history of a single ticker (SBRY) in date order
data_df.loc[data_df['ticker'].eq('SBRY'),['ticker','date','signal']].sort_values(by='date')

Unnamed: 0,ticker,date,signal
200429,SBRY,1996-06-03,hold
200430,SBRY,1996-06-10,sell
200431,SBRY,1996-06-17,sell
200432,SBRY,1996-06-24,hold
200433,SBRY,1996-07-01,hold
200434,SBRY,1996-07-08,buy
200435,SBRY,1996-07-15,hold
200436,SBRY,1996-07-22,buy
200437,SBRY,1996-07-29,hold
200438,SBRY,1996-08-05,hold


In [8]:
#Show current sell ratings
#Filter once: rows on the most recent date whose signal is 'sell'
latest_sells = data_df.loc[
    (data_df['signal'] == 'sell') & (data_df['date'] == data_df['date'].max()),
    ['ticker','signal','signal_prob']
]
print('SELL COUNT: {:,}'.format(len(latest_sells)))
#Highest-probability rows first
latest_sells.sort_values(['signal','signal_prob'],ascending=[True,False])

SELL COUNT: 15


Unnamed: 0,ticker,signal,signal_prob
112723,HSV,sell,0.581982
126201,INVP,sell,0.579487
254322,VSVS,sell,0.570383
38989,CCC,sell,0.541271
167605,PAY,sell,0.540497
182677,PSON,sell,0.531114
28721,BRBY,sell,0.515938
241768,TLW,sell,0.513616
144871,LWDB,sell,0.512595
152385,MNDI,sell,0.510671


In [9]:
#Show current hold ratings
#Filter once: rows on the most recent date whose signal is 'hold'
latest_holds = data_df.loc[
    (data_df['signal'] == 'hold') & (data_df['date'] == data_df['date'].max()),
    ['ticker','signal','signal_prob']
]
print('HOLD COUNT: {:,}'.format(len(latest_holds)))
#Highest-probability rows first
latest_holds.sort_values(['signal','signal_prob'],ascending=[True,False])

HOLD COUNT: 302


Unnamed: 0,ticker,signal,signal_prob
204479,SEQI,hold,0.943388
11732,AZN,hold,0.897132
258618,WPP,hold,0.873956
226333,SSE,hold,0.859161
57395,CYBG,hold,0.855899
159883,NEX,hold,0.855206
17434,BBGI,hold,0.841636
120515,III,hold,0.840313
196526,RSE,hold,0.839522
20826,BGEO,hold,0.839228


# Combine with price data and create ledger

In [10]:
#Load the historical prices file that the signals are joined onto
prices_file = path + r'\all_hist_prices_w.h5'
df_prices = pd.read_hdf(prices_file)

In [11]:
#Sort by ticker and date then add the open_shift_neg1 field
#These allow the buying and selling to be done at a realistic price
#Parse the dates first so the sort is chronological whatever dtype was stored
#(pd.to_datetime is used because a unit-less astype('datetime64') is rejected by pandas 2.x)
df_prices['date'] = pd.to_datetime(df_prices['date'])
df_prices.sort_values(['ticker','date'],ascending=[True,True],inplace=True)
#Shift within each ticker: an ungrouped shift(-1) gives the last row of one
#ticker the first open of the NEXT ticker; grouped, that row is NaN instead
df_prices['open_shift_neg1'] = df_prices.groupby('ticker')['open'].shift(-1)
print('SHAPE: {}'.format(df_prices.shape))
print(df_prices.columns)
df_prices.iloc[90:95]

SHAPE: (260985, 14)
Index(['ticker', 'date', 'high', 'low', 'volume', 'open', 'close', 'change',
       'ema12', 'ema26', 'macd_line', 'signal_line', 'macd',
       'open_shift_neg1'],
      dtype='object')


Unnamed: 0,ticker,date,high,low,volume,open,close,change,ema12,ema26,macd_line,signal_line,macd,open_shift_neg1
90,3IN,2009-09-21,142.85,137.15,7932371.0,139.26,142.57,3.31,136.97158,134.290336,2.681244,1.988463,0.692781,140.39
91,3IN,2009-09-28,145.67,139.62,5390930.0,140.39,144.96,4.57,138.200568,135.080682,3.119886,2.214748,0.905138,144.26
92,3IN,2009-10-05,146.09,142.99,3520386.0,144.26,144.68,0.42,139.197403,135.791742,3.405661,2.45293,0.952731,143.84
93,3IN,2009-10-12,146.37,142.99,2676531.0,143.84,144.82,0.98,140.062418,136.460502,3.601916,2.682727,0.919189,146.37
94,3IN,2009-10-19,147.07,142.29,3350132.0,146.37,143.56,-2.81,140.600508,136.986391,3.614117,2.869005,0.745112,144.26


In [12]:
#Attach each row's signal and signal_prob to the price data (inner join on ticker + date)
#NOTE(review): the recorded output has more merged rows than data_df has rows,
#which suggests duplicate (ticker, date) keys on one side - verify
df_prices = df_prices.merge(
    data_df[['ticker','date','signal','signal_prob']],
    on=['ticker','date'],
    how='inner'
)
print('SHAPE:', df_prices.shape)
print(df_prices.columns)
print(df_prices['signal'].value_counts())
df_prices.head()

SHAPE: (228690, 16)
Index(['ticker', 'date', 'high', 'low', 'volume', 'open', 'close', 'change',
       'ema12', 'ema26', 'macd_line', 'signal_line', 'macd', 'open_shift_neg1',
       'signal', 'signal_prob'],
      dtype='object')
hold    138750
sell     49284
buy      40656
Name: signal, dtype: int64


Unnamed: 0,ticker,date,high,low,volume,open,close,change,ema12,ema26,macd_line,signal_line,macd,open_shift_neg1,signal,signal_prob
0,3IN,2009-06-22,132.3,123.15,3940925.0,127.37,130.18,2.81,130.016341,129.427995,0.588346,-0.957456,1.545802,130.89,hold,0.703602
1,3IN,2009-06-29,131.59,128.07,2504822.0,130.89,130.89,0.0,130.15075,129.536291,0.614459,-0.643073,1.257532,130.18,hold,0.871537
2,3IN,2009-07-06,131.94,128.78,1968945.0,130.18,129.83,-0.35,130.101404,129.558047,0.543356,-0.405787,0.949144,130.54,buy,0.72759
3,3IN,2009-07-13,136.52,129.48,4008860.0,130.54,135.46,4.92,130.925803,129.995229,0.930574,-0.138515,1.069089,135.46,hold,0.827442
4,3IN,2009-07-20,137.57,134.76,2907196.0,135.46,135.11,-0.35,131.569526,130.374101,1.195425,0.128273,1.067152,135.11,hold,0.765109


In [13]:
#Keep only the test period: rows dated 2014-01-01 or later
df_prices = df_prices.loc[df_prices['date'] >= '2014-01-01']
print('SHAPE:', df_prices.shape)

SHAPE: (81921, 16)


In [14]:
#Create a dictionary of max character lengths of fields for use later in h5 file appending
def get_col_lens(_df_in):
    """Return a dict mapping each column name to its longest value length.

    The length of a value is the number of characters in ``str(value)``,
    so non-string columns are measured by their string rendering.

    args:
        _df_in - pandas dataframe - the frame whose columns are measured

    returns:
        dict - {column name: maximum character length found in that column}
    """
    def _longest(_values):
        #Longest str() rendering among the values of one column
        return pd.Series([len(str(_v)) for _v in _values]).max()
    return {_name: _longest(_df_in[_name]) for _name in _df_in}
#Measure every column of df_prices; the result is passed as min_itemsize when the h5 file is written
col_lens = get_col_lens(df_prices)
col_lens

{'ticker': 4,
 'date': 19,
 'high': 7,
 'low': 7,
 'volume': 12,
 'open': 7,
 'close': 7,
 'change': 21,
 'ema12': 18,
 'ema26': 18,
 'macd_line': 23,
 'signal_line': 23,
 'macd': 23,
 'open_shift_neg1': 8,
 'signal': 4,
 'signal_prob': 19}

In [15]:
#Write df_prices to a .h5 file
hf_store_name = path + r'\historic_lgb_bsh_signals_TMP.h5'
group_name = r'bsh_signals'
#mode='w' starts from an empty TMP file, so a file left behind by an earlier
#failed run cannot be appended to (which would duplicate every row)
#The context manager closes the store even if the write raises
with pd.HDFStore(hf_store_name,mode='w') as hf:
    #append() writes in table format, the same as to_hdf(..., append=True)
    hf.append(group_name,df_prices,min_itemsize=col_lens)

In [16]:
#Close every h5 file handle still registered with PyTables
#NOTE(review): _open_files is a private PyTables attribute - confirm it exists in the installed version
open_h5_registry = tables.file._open_files
open_h5_registry.close_all()

In [17]:
#Delete the old h5 file and rename the TMP
final_h5 = path + r'\historic_lgb_bsh_signals.h5'
tmp_h5 = path + r'\historic_lgb_bsh_signals_TMP.h5'
#Step 1: remove the previous output (a failure is printed, not raised)
try:
    os.remove(final_h5)
    print('\nSUCCESSFULLY REMOVED {}'.format(final_h5))
except Exception as err:
    print('\nERROR - REMOVING:{}'.format(err))
#Step 2: give the freshly written TMP file the final name (a failure is printed, not raised)
try:
    os.rename(tmp_h5,final_h5)
    print('\nSUCCESSFULLY RENAMED {} TO {}'.format(tmp_h5,final_h5))
except Exception as err:
    print('\nERROR - RENAMING:{}'.format(err))


SUCCESSFULLY REMOVED C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\historic_lgb_bsh_signals.h5

SUCCESSFULLY RENAMED C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\historic_lgb_bsh_signals_TMP.h5 TO C:\Users\Robert\Documents\python_scripts\stock_trading_ml_modelling\historical_prices\historic_lgb_bsh_signals.h5


# Getting the status of a specific ticker

In [21]:
def get_signal():
    """Prompt for a ticker until one with a current signal is entered, then print it.

    The ticker is read with input(), stripped of surrounding whitespace and
    upper-cased. It is looked up in the global data_df on the most recent
    date in data_df['date'].

    The prompt repeats (in a loop rather than by recursion, so repeated bad
    entries cannot exhaust the call stack) when:
        - the ticker does not appear in data_df at all
        - the ticker appears in data_df but has no row on the most recent date

    returns:
        None - the status line is printed, not returned
    """
    known_tickers = set(data_df['ticker'].unique())
    latest_date = data_df['date'].max()
    while True:
        tick = input('\nEnter the ticker: ').strip().upper()
        if tick not in known_tickers:
            print('THERE IS NO SIGNAL FOR {}'.format(tick))
            continue
        latest_signals = data_df.loc[(data_df['date'] == latest_date) & (data_df['ticker'] == tick),'signal']
        #A known ticker can still lack a row on the latest date;
        #indexing .values[0] on the empty result would raise IndexError
        if latest_signals.empty:
            print('THERE IS NO SIGNAL FOR {} ON {}'.format(tick,latest_date))
            continue
        print('{} CURRENT STATUS -> {} - {}'.format(tick,latest_signals.values[0],latest_date))
        return None
#Ask for a ticker interactively and print its signal on the latest date in data_df
get_signal()


Enter the ticker: sbry
SBRY CURRENT STATUS -> hold - 2019-09-30 00:00:00
