In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from talib.abstract import *
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from finlab.data import Data

In [2]:
# Open the finlab data store and pull every raw table used by the notebook.
# The table keys are the store's own (Chinese) names and are passed verbatim.
database = Data()
(close, open_, high, low, vol,
 accu, rev, com_rev, d_yield, pb) = (
    database.get(table_key)
    for table_key in (
        "收盤價",          # closing price
        "開盤價",          # opening price
        "最高價",          # daily high
        "最低價",          # daily low
        "成交股數",        # shares traded
        "成交金額",        # traded value
        "當月營收",        # monthly revenue
        "上月比較增減(%)",  # change versus previous month (%)
        "殖利率(%)",       # dividend yield (%)
        "股價淨值比",      # price-to-book ratio
    )
)

In [3]:
# Study window; both bounds are exclusive.
start_date = '2016-12-01'
end_date = '2019-02-01'


def _within_window(frame):
    """Return the rows of ``frame`` whose index is strictly between start_date and end_date."""
    in_window = (frame.index > start_date) & (frame.index < end_date)
    return frame[in_window]


# Restrict every raw table to the study window.
close = _within_window(close)
open_ = _within_window(open_)
high = _within_window(high)
low = _within_window(low)
vol = _within_window(vol)
accu = _within_window(accu)
rev = _within_window(rev)
com_rev = _within_window(com_rev)
d_yield = _within_window(d_yield)
pb = _within_window(pb)

In [4]:
def MA(close, n):
    """Simple moving average of ``close`` over a trailing window of ``n`` rows.

    The first ``n - 1`` entries are NaN because the window is not yet full.
    """
    trailing_window = close.rolling(window=n)
    return trailing_window.mean()

def bias(close, n):
    """Ratio of ``close`` to its trailing ``n``-row mean.

    ``min_periods=1`` means the mean is taken over however many rows are
    available, so the result has no leading NaNs.
    """
    trailing_mean = close.rolling(n, min_periods=1).mean()
    return close / trailing_mean

def acc(close, n):
    """Value ``n`` rows back divided by the mean of the value ``2*n`` rows back and the current value.

    The first ``2*n`` entries are NaN because the longer lag is unavailable.
    """
    lagged_n = close.shift(n)
    lagged_2n = close.shift(2 * n)
    return lagged_n / (lagged_2n + close) * 2

def mom(rev, n):
    """Row-over-row ratio of ``rev`` (current / previous), lagged by ``n`` rows."""
    step_ratio = rev / rev.shift(1)
    return step_ratio.shift(n)

In [7]:
# Build the benchmark frame for sid '0050' and derive its "b_"-prefixed features.
sid = '0050'
benchmark = pd.DataFrame({
    'close': close[sid],
    'high': high[sid],
    'low': low[sid],
    'volume': vol[sid],
})

# Volume-based TA-Lib indicators.
benchmark['b_OBV'] = OBV(benchmark.close, benchmark.volume)
benchmark['b_AD'] = AD(benchmark.high, benchmark.low, benchmark.close, benchmark.volume)
benchmark['b_ADOSC'] = ADOSC(benchmark.high, benchmark.low, benchmark.close, benchmark.volume, fastperiod=3, slowperiod=10)

b_close = benchmark['close']

# Distance of the moving average from the current close.
for span in (5, 20, 60):
    benchmark[f'b_MA{span}'] = MA(b_close, span) - b_close

# Close relative to its trailing mean.
for span in (5, 10, 20, 60):
    benchmark[f'b_bias{span}'] = bias(b_close, span)

# Lagged-close ratio from acc().
for span in (5, 10, 20, 60):
    benchmark[f'b_acc{span}'] = acc(b_close, span)

# Rolling standard deviation of daily log returns over a full 50-row window.
window_stdev = 50
benchmark['b_log_ret'] = np.log(b_close).diff()
benchmark['b_volatility'] = (
    benchmark['b_log_ret']
    .rolling(window=window_stdev, min_periods=window_stdev, center=False)
    .std()
)

In [8]:
benchmark.tail()

Unnamed: 0_level_0,close,high,low,volume,b_OBV,b_AD,b_ADOSC,b_MA5,b_MA20,b_MA60,b_bias5,b_bias10,b_bias20,b_bias60,b_acc5,b_acc10,b_acc20,b_acc60,b_log_ret,b_volatility
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2019-01-24,74.05,74.15,73.75,3500506.0,-85052728.0,260245300.0,2201552.0,0.8,0.58,1.83,0.989312,0.985363,0.992228,0.975883,1.015116,1.012179,0.977145,0.970892,0.004738,0.011676
2019-01-25,75.0,75.1,74.55,7320778.0,-77731950.0,264904000.0,3227441.0,-0.34,-0.2975,0.823333,1.004554,0.998668,1.003982,0.989141,1.009302,1.016493,0.967445,0.976947,0.012748,0.011821
2019-01-28,75.15,75.45,75.05,3825143.0,-73906807.0,262991400.0,2745174.0,-0.76,-0.4425,0.615,1.010216,1.000466,1.005923,0.991883,1.018981,0.998668,0.98815,0.975201,0.001998,0.011774
2019-01-29,74.5,74.55,74.2,4419790.0,-78326597.0,266148400.0,3302819.0,-0.02,0.1575,1.2025,1.000269,0.993333,0.99789,0.984115,0.986347,1.008667,0.984997,0.974167,-0.008687,0.011825
2019-01-30,74.35,74.65,74.2,5255229.0,-83581826.0,264396700.0,2673318.0,0.26,0.3225,1.291667,0.996515,0.992922,0.995681,0.982924,0.983322,1.018194,0.970511,0.974695,-0.002015,0.011762


In [24]:
# Number of rows ahead at which get_3_barriers() places the vertical barrier.
# A later cell reassigns this to 10 before get_3_barriers() is actually called.
t_final = 5
def get_Daily_Volatility(close,span0=20):
    """Exponentially weighted standard deviation of simple returns.

    Parameters
    ----------
    close : pandas Series of prices.
    span0 : span of the exponential weighting (default 20 rows).

    Returns
    -------
    Series of EWM standard deviations with the leading NaN rows removed, so
    its index is a subset of ``close.index``.
    """
    simple_returns = close.pct_change()
    ewm_std = simple_returns.ewm(span=span0).std()
    return ewm_std.dropna()

def get_3_barriers():
    """Build the triple-barrier table for every row of ``daily_volatility``.

    Reads these module-level names (they must be set before the call):
      * ``daily_volatility`` - Series of per-day volatility; its index drives the loop.
      * ``prices`` - Series of prices, indexable by the same labels.
      * ``t_final`` - number of rows ahead at which the vertical barrier is placed.
      * ``upper_lower_multipliers`` - ``[upper, lower]`` volatility multipliers;
        a multiplier <= 0 disables that horizontal barrier (stored as NaN).

    Returns
    -------
    DataFrame indexed like ``daily_volatility`` with columns ``days_passed``,
    ``price``, ``vert_barrier``, ``top_barrier``, ``bottom_barrier``, plus the
    still-empty ``long_ret`` and ``short_ret`` columns filled by get_labels().
    """
    barrier_columns = ['days_passed', 'price', 'vert_barrier',
                       'top_barrier', 'bottom_barrier']
    barriers = pd.DataFrame(columns=barrier_columns + ['long_ret', 'short_ret'],
                            index=daily_volatility.index)
    n_days = len(daily_volatility.index)

    # days_passed is the 1-based position of `day`; enumerate() gives it
    # directly instead of re-slicing the series on every iteration.
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for days_passed, (day, day_vol) in enumerate(daily_volatility.items(), start=1):
        # Vertical barrier: the label t_final rows past days_passed, when it exists.
        if days_passed + t_final < n_days and t_final != 0:
            vert_barrier = daily_volatility.index[days_passed + t_final]
        else:
            vert_barrier = np.nan

        day_price = prices.loc[day]

        # Top barrier: price plus (multiplier * volatility) fraction of price.
        if upper_lower_multipliers[0] > 0:
            top_barrier = day_price + day_price * upper_lower_multipliers[0] * day_vol
        else:
            # Disabled barrier: a scalar NaN (a whole Series cannot be
            # stored in a single cell of the table).
            top_barrier = np.nan

        # Bottom barrier: price minus (multiplier * volatility) fraction of price.
        if upper_lower_multipliers[1] > 0:
            bottom_barrier = day_price - day_price * upper_lower_multipliers[1] * day_vol
        else:
            bottom_barrier = np.nan

        barriers.loc[day, barrier_columns] = (
            days_passed, day_price, vert_barrier, top_barrier, bottom_barrier)
    return barriers

def get_labels():
    """Fill ``out``, ``long_ret`` and ``short_ret`` in the module-level ``barriers`` table.

    For every row that has a vertical barrier, the price path from that row
    up to and including the vertical barrier is inspected:
      * any price >= top_barrier    -> out = 1,  return measured to the top barrier
      * any price <= bottom_barrier -> out = -1, return measured to the bottom barrier
      * otherwise                   -> out = 0,  return measured to the final price
    ``long_ret`` is that return minus 0.001 and ``short_ret`` is the negated
    return minus 0.001. Rows without a vertical barrier are left untouched.

    Mutates ``barriers`` in place and returns None.

    NOTE(review): when both horizontal barriers are touched inside the window
    the top barrier always wins, regardless of which was touched first -
    confirm this is intended.
    """
    # NOTE(review): presumably a per-trade cost allowance - confirm.
    cost = 0.001
    price_series = barriers['price']

    for pos in range(len(barriers.index)):
        start = barriers.index[pos]
        # Explicit positional access: plain [] with an integer is ambiguous
        # on a label-indexed Series.
        end = barriers['vert_barrier'].iloc[pos]
        if pd.isna(end):
            continue

        price_initial = price_series.loc[start]
        price_final = price_series.loc[end]
        top_barrier = barriers['top_barrier'].iloc[pos]
        bottom_barrier = barriers['bottom_barrier'].iloc[pos]

        # Label-based slice: includes both the start row and the vertical barrier row.
        path = price_series.loc[start:end]
        condition_pt = (path >= top_barrier).any()
        condition_sl = (path <= bottom_barrier).any()

        if condition_pt:      # profit-taking: top barrier touched
            label = 1
            move = (top_barrier - price_initial) / price_initial
        elif condition_sl:    # stop-loss: bottom barrier touched
            label = -1
            move = -(price_initial - bottom_barrier) / price_initial
        else:                 # neither touched: vertical barrier reached
            label = 0
            move = (price_final - price_initial) / price_initial

        # Single-step .loc writes: chained assignment (barriers['out'][i] = ...)
        # may write to a temporary copy and is a no-op under Copy-on-Write.
        barriers.loc[start, 'out'] = label
        barriers.loc[start, 'long_ret'] = move - cost
        barriers.loc[start, 'short_ret'] = -move - cost
    return


In [10]:
# Assemble the daily price/volume frame for the stock under study.
sid = '2330'
price_tables = {
    'close': close,
    'open': open_,
    'high': high,
    'low': low,
    'volume': vol,
}
data = pd.DataFrame({column: table[sid] for column, table in price_tables.items()})

In [11]:
# Join the fundamentals and the benchmark features onto the daily price frame.
# NOTE(review): this cell rebinds `data`, `rev`, `com_rev`, `d_yield`, `pb`
# and `benchmark`, so it is meant to run exactly once per kernel session.


def _merge_ffill(frame, table, source_col, target_col):
    """Outer-join ``table[source_col]`` onto ``frame`` by 'date' and forward-fill it.

    The joined column is renamed to ``target_col``. Rows are sorted by date
    before filling so each gap takes the most recent earlier value. The fill
    is assigned back to the column: a chained
    ``frame[col].fillna(..., inplace=True)`` may act on a temporary copy.
    """
    merged = pd.merge(frame, table[['date', source_col]], on="date", how='outer')
    merged = merged.sort_values(by=['date']).rename(columns={source_col: target_col})
    merged[target_col] = merged[target_col].ffill()
    return merged


data = data.reset_index()
data = data.dropna(axis=0, how='any')

# The fundamentals arrive on different dates than the daily prices, hence the
# outer join plus forward fill.
rev = rev.reset_index()
data = _merge_ffill(data, rev, sid, "rev")

com_rev = com_rev.reset_index()
data = _merge_ffill(data, com_rev, sid, "com_rev")

d_yield = d_yield.reset_index()
data = _merge_ffill(data, d_yield, sid, "d_yield")

pb = pb.reset_index()
data = _merge_ffill(data, pb, sid, "pb")

# Benchmark features, joined and forward-filled the same way.
benchmark = benchmark.reset_index()
benchmark_list = ['date', 'b_OBV', 'b_AD', 'b_ADOSC', 'b_MA5', 'b_MA20', 'b_MA60', 'b_bias5', 'b_bias10', 'b_bias20', 'b_bias60'
, 'b_acc5', 'b_acc10', 'b_acc20', 'b_acc60', 'b_volatility']
data = pd.merge(data, benchmark[benchmark_list], on="date", how='outer')
data = data.sort_values(by=['date'])
for features in benchmark_list:
    data[features] = data[features].ffill()

# Back to a date index; drop every row that still has a gap.
data = data.set_index('date')
data = data.dropna(axis=0, how='any')

In [12]:
data.tail()

Unnamed: 0_level_0,close,open,high,low,volume,rev,com_rev,d_yield,pb,b_OBV,...,b_MA60,b_bias5,b_bias10,b_bias20,b_bias60,b_acc5,b_acc10,b_acc20,b_acc60,b_volatility
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-24,222.5,222.5,222.5,220.5,31348924.0,89830598.0,-8.69,3.6,3.66,-85052728.0,...,1.83,0.989312,0.985363,0.992228,0.975883,1.015116,1.012179,0.977145,0.970892,0.011676
2019-01-25,226.0,226.5,227.0,225.0,48039764.0,89830598.0,-8.69,3.54,3.72,-77731950.0,...,0.823333,1.004554,0.998668,1.003982,0.989141,1.009302,1.016493,0.967445,0.976947,0.011821
2019-01-28,229.0,229.5,229.5,228.0,29134257.0,89830598.0,-8.69,3.49,3.77,-73906807.0,...,0.615,1.010216,1.000466,1.005923,0.991883,1.018981,0.998668,0.98815,0.975201,0.011774
2019-01-29,222.5,222.5,225.0,222.0,42566520.0,89830598.0,-8.69,3.6,3.66,-78326597.0,...,1.2025,1.000269,0.993333,0.99789,0.984115,0.986347,1.008667,0.984997,0.974167,0.011825
2019-01-30,221.0,220.5,221.5,220.0,51889945.0,89830598.0,-8.69,3.62,3.64,-83581826.0,...,1.291667,0.996515,0.992922,0.995681,0.982924,0.983322,1.018194,0.970511,0.974695,0.011762


In [20]:
# Technical indicators for the stock itself (TA-Lib abstract API).
data['upperband'], data['middleband'], data['lowerband'] = BBANDS(data.close, 20, 2., 2. ,0)
data['OBV'] = OBV(data.close, data.volume)
data['AD'] = AD(data.high, data.low, data.close, data.volume)
data['ADOSC'] = ADOSC(data.high, data.low, data.close, data.volume, fastperiod=3, slowperiod=10)
data['K'], data['D'] = STOCH(data.high, data.low, data.close, fastk_period=9, slowk_period=3,slowd_period=3)

# Distance of the moving average from the current close.
for span in (5, 20, 60):
    data[f'MA{span}'] = MA(data['close'], span) - data['close']

# Close relative to its trailing mean.
for span in (5, 10, 20, 60):
    data[f'bias{span}'] = bias(data['close'], span)

# Lagged-close ratio from acc().
for span in (5, 10, 20, 60):
    data[f'acc{span}'] = acc(data['close'], span)

# TA-Lib names the RSI look-back `timeperiod`; `window` is not a TA-Lib
# parameter, so the period is now stated with the keyword TA-Lib defines.
data['rsi'] = RSI(data['close'], timeperiod=14)
data['MACD'], data['signal'], data['hist'] = MACD(data['close'], fastperiod=12,  slowperiod=26,  signalperiod=9)

# Placeholder for the trade side plus previous-row values used to compute it.
data['side'] = np.nan
data['high1'] = data['high'].shift(1)
data['low1'] = data['low'].shift(1)
data['close1'] = data['close'].shift(1)

In [22]:
# Bollinger-band side signal.
# Long: close is at or above the lower band, the previous row's low was at or
# below it, and the bar closed above its open.
long_signals = (
    (data['close'] >= data['lowerband'])
    & (data['low1'] <= data['lowerband'])
    & (data['close'] > data['open'])
)
# Short: close is at or below the upper band, the previous row's close was at
# or above it, and the bar closed below its open.
short_signals = (
    (data['close'] <= data['upperband'])
    & (data['close1'] >= data['upperband'])
    & (data['open'] > data['close'])
)

data.loc[long_signals, 'side'] = 1
data.loc[short_signals, 'side'] = -1

# Rows with no signal become 0. Assign the result back: a chained
# data['side'].fillna(..., inplace=True) may act on a temporary copy.
data['side'] = data['side'].fillna(value=0)

In [25]:
data['log_ret'] = np.log(data['close']).diff()

# Momentum: simple returns over 1..5 rows.
for lag in range(1, 6):
    data[f'mom{lag}'] = data['close'].pct_change(periods=lag)

# Volatility: rolling std of log returns over a full 50-row window.
window_stdev = 50
data['volatility'] = data['log_ret'].rolling(window=window_stdev, min_periods=window_stdev, center=False).std()

# Serial correlation of log returns at lags 1..5 (takes about 4 minutes).
window_autocorr = 50
for lag in range(1, 6):
    # `lag=lag` binds the current lag into the lambda.
    data[f'autocorr_{lag}'] = (
        data['log_ret']
        .rolling(window=window_autocorr, min_periods=window_autocorr, center=False)
        .apply(lambda x, lag=lag: x.autocorr(lag=lag), raw=False)
    )

# Lagged log returns t-1 .. t-5.
for lag in range(1, 6):
    data[f'log_t{lag}'] = data['log_ret'].shift(lag)

# Fast / slow moving-average crossover signal.
fast_window = 7
slow_window = 15

data['fast_mavg'] = data['close'].rolling(window=fast_window, min_periods=fast_window, center=False).mean()
data['slow_mavg'] = data['close'].rolling(window=slow_window, min_periods=slow_window, center=False).mean()

data['sma'] = np.nan

long_signals = data['fast_mavg'] >= data['slow_mavg']
short_signals = data['fast_mavg'] < data['slow_mavg']
data.loc[long_signals, 'sma'] = 1
data.loc[short_signals, 'sma'] = -1
# Rows where either average is still NaN match neither mask and become 0.
# Assign the result back: a chained fillna(..., inplace=True) on a column
# selection may act on a temporary copy.
data['sma'] = data['sma'].fillna(value=0)

# Triple-barrier labelling. get_3_barriers() and get_labels() read the
# module-level names set here.
price = data['close']
daily_volatility = get_Daily_Volatility(price)
# how many days we hold the stock, which sets the vertical barrier
t_final = 10
# the upper and lower boundary multipliers
upper_lower_multipliers = [1, 1]
prices = price[daily_volatility.index]
barriers = get_3_barriers()
barriers['out'] = None
get_labels()

# Attach labels, returns and barrier levels; keep only fully populated rows.
data = data.reset_index()
barriers = barriers.reset_index()
data = pd.merge(data,barriers[['date','out','long_ret', 'short_ret', 'top_barrier', 'bottom_barrier']], on="date")
data = data.dropna(axis=0, how='any')
data = data.reset_index()

In [29]:
# Columns rescaled to [-1, 1]; every other column keeps its raw values.
feature_list = [
    'com_rev', 'd_yield', 'pb',
    'AD', 'OBV', 'ADOSC',
    'bias5', 'bias10', 'bias20', 'bias60',
    'acc5', 'acc10', 'acc20', 'acc60',
    'rsi', 'log_ret',
    'mom1', 'mom2', 'mom3', 'mom4', 'mom5',
    'volatility',
    'MACD', 'signal', 'hist',
    'autocorr_1', 'autocorr_2', 'autocorr_3', 'autocorr_4', 'autocorr_5',
    'log_t1', 'log_t2', 'log_t3', 'log_t4', 'log_t5',
    'b_OBV', 'b_AD', 'b_ADOSC',
    'b_MA5', 'b_MA20', 'b_MA60',
    'b_bias5', 'b_bias10', 'b_bias20', 'b_bias60',
    'b_acc5', 'b_acc10', 'b_acc20', 'b_acc60',
    'b_volatility',
]
# Min-max scaler object (despite the earlier "z-scaler" wording); it is
# re-fitted on each column independently inside the loop.
scale = MinMaxScaler(feature_range = (-1, 1))
for item in feature_list:
    column_values = np.array(data[item].to_list()).reshape(-1, 1)
    data[item] = scale.fit_transform(column_values)

In [30]:
data.tail()

Unnamed: 0,index,date,close,open,high,low,volume,rev,com_rev,d_yield,...,log_t4,log_t5,fast_mavg,slow_mavg,sma,out,long_ret,short_ret,top_barrier,bottom_barrier
279,397,2019-01-09,215.5,212.0,216.5,211.0,51255446.0,98389414.0,-0.559784,0.770492,...,-0.18254,-0.318342,215.428571,218.433333,-1.0,1,0.018927,-0.020927,219.794,211.206
280,398,2019-01-10,216.0,216.0,216.5,214.5,20832593.0,89830598.0,-0.696784,0.754098,...,-0.452218,-0.18254,214.071429,217.8,-1.0,1,0.0179858,-0.0199858,220.101,211.899
281,399,2019-01-11,220.5,219.0,220.5,218.0,28658288.0,89830598.0,-0.696784,0.639344,...,0.484794,-0.452218,214.214286,217.766667,-1.0,1,0.0182147,-0.0202147,224.737,216.263
282,400,2019-01-14,218.5,218.5,220.0,217.0,17612296.0,89830598.0,-0.696784,0.688525,...,-0.040709,0.484794,214.642857,217.433333,-1.0,1,0.0175326,-0.0195326,222.549,214.451
283,401,2019-01-15,221.0,216.5,221.0,215.5,42990923.0,89830598.0,-0.696784,0.622951,...,0.442813,-0.040709,216.5,217.4,-1.0,1,0.0169532,-0.0189532,224.968,217.032


In [None]:
data.head()

Unnamed: 0_level_0,index,close,open,high,low,volume,rev,rev,com_rev,d_yield,...,autocorr_4,autocorr_5,log_t1,log_t2,log_t3,log_t4,log_t5,fast_mavg,slow_mavg,sma
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-10,704.0,58.4,59.0,59.2,58.2,53809831.0,29156271.0,29156271.0,-4.3,5.17,...,,,,,,,,,,
2010-03-10,717.0,61.4,61.3,61.4,60.8,30588530.0,29195234.0,29195234.0,0.13,4.92,...,,,,,,,,,,
2010-05-10,759.0,59.8,59.4,60.0,59.0,56310864.0,32683232.0,32683232.0,6.03,5.02,...,,,0.050094,,,,,,,
2010-06-10,782.0,59.4,59.8,59.8,58.8,28817238.0,33839484.0,33839484.0,3.53,5.05,...,,,-0.026404,0.050094,,,,,,
2010-08-10,824.0,61.0,60.9,61.4,60.8,63934598.0,36156392.0,36156392.0,2.97,4.9,...,,,-0.006711,-0.026404,0.050094,,,,,


In [None]:
# Re-run the triple-barrier labelling with wider horizontal barriers.
# how many days we hold the stock, which sets the vertical barrier
t_final = 10
# the upper and lower boundary multipliers
upper_lower_multipliers = [2, 2]

price = data['close']
daily_volatility = get_Daily_Volatility(price)
prices = price[daily_volatility.index]

# get_labels() expects the 'out' column to exist before it fills it in.
barriers = get_3_barriers()
barriers['out'] = None
get_labels()

# Expose the index as a column on both frames so the next cell can merge them.
data = data.reset_index()
barriers = barriers.reset_index()

In [None]:
# Attach the 'out' label by date, drop rows with any missing value, then
# renumber the rows (the old index is kept as a column).
labels = barriers[['date', 'out']]
data = (
    pd.merge(data, labels, on="date")
    .dropna(axis=0, how='any')
    .reset_index()
)

In [None]:
data.head()

Unnamed: 0,level_0,date,index,close,open,high,low,volume,rev,rev.1,...,autocorr_5,log_t1,log_t2,log_t3,log_t4,log_t5,fast_mavg,slow_mavg,sma,out


In [None]:
# NOTE(review): this cell always raises ZeroDivisionError - presumably a manual
# stop so that "Run All" halts before the scratch cells below. Confirm, then
# delete it or replace it with an explicit, clearly named stop.
1/0

ZeroDivisionError: division by zero

In [None]:
# Normalize: z-score every listed feature column (zero mean, unit variance).
# StandardScaler comes from sklearn.preprocessing and must be imported in the
# notebook's import cell.
feature_list = ['com_rev', 'd_yield', 'pb', 'AD', 'OBV', 'ADOSC', 'bias5',
    'bias10', 'bias20', 'bias60', 'acc5', 'acc10', 'acc20', 'acc60', 'rsi',
    'log_ret', 'mom1', 'mom2', 'mom3', 'mom4', 'mom5', 'volatility',
    'autocorr_1', 'autocorr_2', 'autocorr_3', 'autocorr_4', 'autocorr_5',
    'log_t1', 'log_t2', 'log_t3', 'log_t4', 'log_t5']
scale = StandardScaler()  # z-score scaler object, re-fitted per column below
for item in feature_list:
    data[item] = scale.fit_transform(np.array(data[item].to_list()).reshape(-1, 1))

# Keep only the rows where the primary signal is long.
X = data[data.side == 1].copy()
# Raw price/helper columns that are not model inputs.
# NOTE(review): 'cs1' and 'vol1' are not created by any visible cell, so
# errors='ignore' drops only the columns that actually exist instead of
# raising KeyError - confirm whether those two names are stale.
columns_to_drop = ['date', 'upperband', 'lowerband', 'middleband', 'open', 'high', 'low','rev','volume', 'cs1', 'close','MA5','MA60',
    'vol1', 'fast_mavg', 'slow_mavg']
X = X.drop(columns=columns_to_drop, errors='ignore')
feature_list = feature_list + ['side', 'sma', 'out']