# Trialing the random forest model,
The code will run the logistic regression model on the FTSE 350 stock data and see what the profit would have been from investing with the below criteria:
- Starting with £10,000
- Never investing more than 10% of the value into a single stock
- Tracking balance so once the account is empty no more can be invested until shares are liqidated
- Shares are bought at the open of the day following the first buy signal, if shares are not already held
- Shares are sold at the open of the day following the first close signal after a hold period, if shares are held

Trading variables:
- Trades cost £2.50 to execute
- Spread is 1%

In [1]:
#Import models
import numpy as np
import pandas as pd
import math
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.externals import joblib as jl
import os
import tables

  from numpy.core.umath_tests import inner1d


In [2]:
#Import and combine prices files
path = r'C:\\Users\\Robert\\Documents\\python_scripts\\stock_trading_ml_modelling\\historical_prices'
df_ft = pd.read_hdf(path +'\\all_hist_prices_w_ft_eng2.h5')
print('SHAPE: {}'.format(df_ft.shape))
print(df_ft.columns)
df_ft.head()

SHAPE: (258339, 140)
Index(['ticker', 'date', 'open', 'close', 'high', 'low', 'volume',
       'change_price', 'per_change_price', 'close_shift1',
       ...
       'min_move_cum_neg_macd_line', 'long_prev_max_move_pos_date_macd_line',
       'long_prev_max_move_neg_date_macd_line',
       'long_prev_min_move_pos_date_macd_line',
       'long_prev_min_move_neg_date_macd_line', 'long_max_grad_pos_macd_line',
       'long_max_grad_neg_macd_line', 'long_min_grad_pos_macd_line',
       'long_min_grad_neg_macd_line', 'signal'],
      dtype='object', length=140)


Unnamed: 0,ticker,date,open,close,high,low,volume,change_price,per_change_price,close_shift1,...,min_move_cum_neg_macd_line,long_prev_max_move_pos_date_macd_line,long_prev_max_move_neg_date_macd_line,long_prev_min_move_pos_date_macd_line,long_prev_min_move_neg_date_macd_line,long_max_grad_pos_macd_line,long_max_grad_neg_macd_line,long_min_grad_pos_macd_line,long_min_grad_neg_macd_line,signal
119990,III,2007-12-31,0.837127,0.838986,0.855672,0.85312,0.053557,-30.0,-0.030151,,...,1,,,,,0.0,0.0,0.0,0.0,sell
119991,III,2008-01-07,0.809307,0.802912,0.821356,0.811475,0.23639,-43.5,-0.044961,0.838986,...,2,,,,,0.0,0.0,0.0,0.0,sell
119992,III,2008-01-14,0.75822,0.782676,0.767864,0.778785,0.325624,-16.0,-0.017448,0.802912,...,3,,,,,0.0,0.0,0.0,0.0,sell
119993,III,2008-01-21,0.731917,0.797193,0.797134,0.748334,0.264801,26.5,0.029742,0.782676,...,4,,,,,0.0,0.0,0.0,0.0,sell
119994,III,2008-01-28,0.75215,0.835467,0.803189,0.798488,0.173403,50.0,0.054885,0.797193,...,5,,,,,0.0,0.0,0.0,0.0,sell


In [3]:
#Import to lr model
rf_mod = jl.load(path+r'\\random_forest_model.joblib')
rf_mod

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=150,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [4]:
rf_mod_df = df_ft.replace([np.inf,-np.inf],np.nan).dropna(axis=0)
print('SHAPE: {}'.format(rf_mod_df.shape))
rf_mod_df.head()

SHAPE: (212631, 140)


Unnamed: 0,ticker,date,open,close,high,low,volume,change_price,per_change_price,close_shift1,...,min_move_cum_neg_macd_line,long_prev_max_move_pos_date_macd_line,long_prev_max_move_neg_date_macd_line,long_prev_min_move_pos_date_macd_line,long_prev_min_move_neg_date_macd_line,long_max_grad_pos_macd_line,long_max_grad_neg_macd_line,long_min_grad_pos_macd_line,long_min_grad_neg_macd_line,signal
120114,III,2010-05-17,0.104097,0.235845,0.114251,0.231568,0.394831,9.1,0.033654,0.231182,...,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,sell
120115,III,2010-05-24,0.117046,0.240597,0.11627,0.229419,0.373991,1.7,0.006003,0.235845,...,0,1.0,0.0,1.0,0.0,0.000632,0.0,0.000632,0.0,sell
120116,III,2010-05-31,0.118867,0.238749,0.124647,0.241241,0.344081,-2.2,-0.007719,0.240597,...,0,2.0,0.0,2.0,0.0,0.000953,0.0,0.000953,0.0,sell
120117,III,2010-06-07,0.111583,0.243236,0.117077,0.234524,0.199496,10.1,0.036357,0.238749,...,0,3.0,0.0,3.0,0.0,0.000728,0.0,0.000728,0.0,sell
120118,III,2010-06-14,0.125139,0.253003,0.128078,0.248137,0.209032,7.8,0.026786,0.243236,...,0,4.0,0.0,4.0,0.0,0.000612,0.0,0.000612,0.0,sell


In [10]:
#Import feature_list
f = open(path + r'\\random_forest_model_feature_list.txt','r')
feature_li = f.read().split(',')
feature_li

['open',
 'close',
 'high',
 'low',
 'volume',
 'change_price',
 'per_change_price',
 'close_shift1',
 'change_close_shift1',
 'vol_shift1',
 'change_vol_shift1',
 'ema26_shift1',
 'change_ema26_shift1',
 'close_13_norm',
 'close_26_norm',
 'close_52_norm',
 'macd_line_13_norm',
 'macd_line_26_norm',
 'macd_line_52_norm',
 'macd_pos_val',
 'macd_neg_val',
 'signal_line_pos_val',
 'signal_line_neg_val',
 'change_price_pos_val',
 'change_price_neg_val',
 'per_change_price_pos_val',
 'per_change_price_neg_val',
 'prev_max_close',
 'prev_min_close',
 'prev_max_close_date_change',
 'prev_min_close_date_change',
 'max_change_close_pos_val',
 'max_change_close_neg_val',
 'min_change_close_pos_val',
 'min_change_close_neg_val',
 'prev_max_grad_close',
 'prev_min_grad_close',
 'max_move_cum_pos_close',
 'max_move_cum_neg_close',
 'min_move_cum_pos_close',
 'min_move_cum_neg_close',
 'long_prev_max_move_pos_date_close',
 'long_prev_max_move_neg_date_close',
 'long_prev_min_move_pos_date_close',


In [11]:
#Run the rf_mod to get signals
rf_mod_df['signal'] = rf_mod.predict(rf_mod_df[feature_li])
rf_mod_df['signal_prob'] = [x.max() for x in rf_mod.predict_proba(rf_mod_df[feature_li])]

In [12]:
#Show current buy ratings
print('BUY COUNT: {:,}'.format(rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['signal'] == 'buy')].shape[0]))
rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['signal'] == 'buy'),['ticker','signal','signal_prob']].sort_values(['signal','signal_prob'],ascending=[True,False])

BUY COUNT: 25


Unnamed: 0,ticker,signal,signal_prob
53382,CPI,buy,0.363336
80995,FGP,buy,0.363336
86604,FUTR,buy,0.363336
72135,EMG,buy,0.363336
155333,MRC,buy,0.363336
220932,SRP,buy,0.363336
49290,CNE,buy,0.359199
209306,SHI,buy,0.359199
230122,TALK,buy,0.356406
17671,BARC,buy,0.352268


In [13]:
#Show current sell ratings
print('SELL COUNT: {:,}'.format(rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['signal'] == 'sell')].shape[0]))
rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['signal'] == 'sell'),['ticker','signal','signal_prob']].sort_values(['signal','signal_prob'],ascending=[True,False])

SELL COUNT: 295


Unnamed: 0,ticker,signal,signal_prob
246145,UTG,sell,0.687770
118286,ICP,sell,0.686206
189841,REL,sell,0.685377
245095,ULVR,sell,0.685377
75900,EXPN,sell,0.683307
108030,HLMA,sell,0.683307
131332,IWG,sell,0.683307
134040,JEO,sell,0.682510
9934,ATST,sell,0.681538
60799,DGE,sell,0.679145


In [14]:
#Show current hold ratings
print('HOLD COUNT: {:,}'.format(rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['signal'] == 'hold')].shape[0]))
rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['signal'] == 'hold'),['ticker','signal','signal_prob']].sort_values(['signal','signal_prob'],ascending=[True,False])

HOLD COUNT: 0


Unnamed: 0,ticker,signal,signal_prob


# Combine with price data and create ledger

In [15]:
#Import and combine prices files
df_prices = pd.read_hdf(path +'\\all_hist_prices_w.h5')

In [16]:
#Sort by ticker and date then add the open_shift_neg1 field
#These allow the buying and selling to be done at a realistic price
df_prices.sort_values(['ticker','date'],ascending=[True,True],inplace=True)
df_prices['open_shift_neg1'] = df_prices['open'].shift(-1)
df_prices['date'] = df_prices['date'].astype('datetime64')
print('SHAPE: {}'.format(df_prices.shape))
print(df_prices.columns)
df_prices.head()

SHAPE: (258389, 14)
Index(['ticker', 'date', 'high', 'low', 'volume', 'open', 'close', 'change',
       'ema12', 'ema26', 'macd_line', 'signal', 'macd', 'open_shift_neg1'],
      dtype='object')


Unnamed: 0,ticker,date,high,low,volume,open,close,change,ema12,ema26,macd_line,signal,macd,open_shift_neg1
0,3IN,2007-12-31,149.89,147.07,1373801.0,149.18,147.43,-1.75,,,,,,147.07
1,3IN,2008-01-07,149.54,147.07,2345191.0,147.07,148.83,1.76,,,,,,148.48
2,3IN,2008-01-14,150.59,147.43,2150049.0,148.48,149.18,0.7,,,,,,147.78
3,3IN,2008-01-21,154.82,145.32,3070968.0,147.78,152.0,4.22,,,,,,149.89
4,3IN,2008-01-28,154.82,148.83,2510972.0,149.89,152.0,2.11,,,,,,154.46


In [17]:
#Join on the buy and sell signals
df_prices = pd.merge(df_prices[['ticker','date','open','close','open_shift_neg1']],rf_mod_df[['ticker','date','signal','signal_prob']],left_on=['ticker','date'],right_on=['ticker','date'],how='inner')
print('SHAPE: {}'.format(df_prices.shape))
print(df_prices.columns)
df_prices.head()

SHAPE: (212723, 7)
Index(['ticker', 'date', 'open', 'close', 'open_shift_neg1', 'signal',
       'signal_prob'],
      dtype='object')


Unnamed: 0,ticker,date,open,close,open_shift_neg1,signal,signal_prob
0,3IN,2010-03-01,150.31,152.56,152.99,sell,0.41001
1,3IN,2010-03-08,152.99,150.87,150.17,sell,0.402958
2,3IN,2010-03-15,150.17,146.51,147.78,sell,0.399317
3,3IN,2010-03-22,147.78,153.27,153.27,sell,0.41001
4,3IN,2010-03-29,153.27,155.1,154.39,sell,0.42938


In [18]:
#Limit to a test period
df_prices = df_prices[df_prices['date'] >= '2014-01-01']
print('SHAPE: {}'.format(df_prices.shape))

SHAPE: (88362, 7)


In [19]:
#Create a dictionary of max character lengths of fields for use later in h5 file appending
def get_col_lens(_df_in):
    _col_lens = {}
    for c in _df_in:
        _tmp_s = pd.Series([len(str(x)) for x in _df_in[c]])
        _col_lens[c] = _tmp_s.max()
    return _col_lens
col_lens = get_col_lens(df_prices)
col_lens

{'ticker': 4,
 'date': 19,
 'open': 7,
 'close': 7,
 'open_shift_neg1': 7,
 'signal': 4,
 'signal_prob': 19}

In [46]:
#Write df_prices to a .h5 file
hf_store_name = path + r'\\historic_rf_bsh_signals_TMP.h5'
hf = pd.HDFStore(hf_store_name)
group_name = r'bsh_signals'
df_prices.to_hdf(hf_store_name,key=group_name,append=True,min_itemsize=col_lens)
hf.close()

In [47]:
#close any open h5 files
tables.file._open_files.close_all()

In [48]:
#Delete the old h5 file and rename the TMP
src_fldr_pth = path + '\\\\'
try:
    os.remove(src_fldr_pth + r'historic_rf_bsh_signals.h5')
    print('\nSUCCESSFULLY REMOVED {}'.format(src_fldr_pth + r'historic_rf_bsh_signals.h5'))
except Exception as e:
    print('\nERROR - REMOVING:{}'.format(e))
try:
    os.rename(src_fldr_pth + r'historic_rf_bsh_signals_TMP.h5',src_fldr_pth + r'historic_rf_bsh_signals.h5')
    print('\nSUCCESSFULLY RENAMED {} TO {}'.format(src_fldr_pth + r'historic_rf_bsh_signals_TMP.h5',src_fldr_pth + r'historic_rf_bsh_signals.h5'))
except Exception as e:
    print('\nERROR - RENAMING:{}'.format(e))


ERROR - REMOVING:[WinError 2] The system cannot find the file specified: 'C:\\\\Users\\\\Robert\\\\Documents\\\\python_scripts\\\\stock_trading_ml_modelling\\\\historical_prices\\\\historic_rf_bsh_signals.h5'

SUCCESSFULLY RENAMED C:\\Users\\Robert\\Documents\\python_scripts\\stock_trading_ml_modelling\\historical_prices\\historic_rf_bsh_signals_TMP.h5 TO C:\\Users\\Robert\\Documents\\python_scripts\\stock_trading_ml_modelling\\historical_prices\\historic_rf_bsh_signals.h5


# Getting the status of a specific ticker

In [23]:
def get_signal():
    tick = input('\nEnter the ticker: ').upper()
    if tick in rf_mod_df.ticker.unique():
        print('{} CURRENT STATUS -> {}'.format(tick.upper(),rf_mod_df.loc[(rf_mod_df['date'] == rf_mod_df['date'].max()) & (rf_mod_df['ticker'] == tick.upper())].signal.values[0]))
        return True
    else:
        print('THERE IS NO SIGNAL FOR {}'.format(tick.upper()))
        get_signal()
get_signal()


Enter the ticker: sbry
SBRY CURRENT STATUS -> sell


True