In this notebook, I used Ta-lib's abstract API to add all the financial indicators that Ta-lib has to offer. When dealing with a dataset that contains multiple instruments or stocks, one needs to avoid applying the indicators to the entire dataset all-in-once; the correct way is to calculate each stock's indicators individually. This notebook works for datasets that has only one stock or multiple stocks

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import talib
from talib import abstract
from finta import TA

In [2]:
file_dir = "Raw Data\\"
files_1 = os.listdir(file_dir)
train_df = pd.DataFrame()
for e in files_1:
    if 'train' in e:
        tmp_df = pd.DataFrame(pd.read_csv(file_dir+e))
        tmp_df = tmp_df.drop(columns=['KQ.m@SHFE.rb.open_oi', 'KQ.m@SHFE.rb.close_oi', 'datetime_nano'])
        tmp_df['symbol'] = ['KQ.m@SHFE.rb']*len(tmp_df)
        tmp_df = tmp_df.rename(columns={"symbol": "tic", 'datetime': 'date', 'KQ.m@SHFE.rb.high':'high',
                                       'KQ.m@SHFE.rb.low':'low', 'KQ.m@SHFE.rb.open':'open', 
                                        'KQ.m@SHFE.rb.close':'close', 'KQ.m@SHFE.rb.volume':'volume',})
        order = ['date', 'tic', 'open', 'close', 'high', 'low', 'volume']
        tmp_df = tmp_df[order]
        train_df = train_df.append(tmp_df)
    else:
        pass
train_df = train_df.reset_index(drop=True)
tickers = train_df.tic.unique()
train_df['pct_change'] = train_df.close.pct_change().fillna(0)

np.sum((train_df==0).sum().values), train_df.isna().sum(), tickers, train_df.columns, train_df.shape, train_df.head(5), train_df.tail(5)

(246,
 date          0
 tic           0
 open          0
 close         0
 high          0
 low           0
 volume        0
 pct_change    0
 dtype: int64,
 array(['KQ.m@SHFE.rb'], dtype=object),
 Index(['date', 'tic', 'open', 'close', 'high', 'low', 'volume', 'pct_change'], dtype='object'),
 (8261, 8),
                             date           tic    open   close    high  \
 0  2021-04-15 09:00:00.000000000  KQ.m@SHFE.rb  5147.0  5130.0  5150.0   
 1  2021-04-15 09:15:00.000000000  KQ.m@SHFE.rb  5130.0  5132.0  5139.0   
 2  2021-04-15 09:30:00.000000000  KQ.m@SHFE.rb  5132.0  5126.0  5136.0   
 3  2021-04-15 09:45:00.000000000  KQ.m@SHFE.rb  5126.0  5112.0  5126.0   
 4  2021-04-15 10:00:00.000000000  KQ.m@SHFE.rb  5112.0  5137.0  5137.0   
 
       low  volume  pct_change  
 0  5122.0  106792    0.000000  
 1  5129.0   46088    0.000390  
 2  5125.0   49828   -0.001169  
 3  5108.0  114306   -0.002731  
 4  5112.0   92887    0.004890  ,
                                date       

# Get All Features
The abstract API of Ta-lib libary is used.

In [3]:
columns = ['open', 'close', 'high', 'low', 'volume']
for a in columns:
    train_df[a] = train_df[a].values.astype('float64')

window_size = 125
individual_length = train_df[train_df.tic==tickers[0]].shape[0]
ta_list = talib.get_functions()
pre_processed_train_df = pd.DataFrame()
for i in tqdm(range(len(tickers))):
    tmp_df = train_df[train_df.tic==tickers[i]]
    for x in ta_list:
        try:
            output = eval('abstract.'+x+'(tmp_df)')
            if len(output.shape)>1 and len(output.shape)<10:
                for a in range(output.shape[-1]):
                    tmp_df[str(x)+str(a)] = output.iloc[:,a]
            else:
                tmp_df[str(x)] = output
        except:
            print('Error: ', x)

    pre_processed_train_df = pre_processed_train_df.append(tmp_df.drop(tmp_df.head(200).index))   
pre_processed_train_df = pre_processed_train_df.replace([np.inf, -np.inf], np.nan)
pre_processed_train_df = pre_processed_train_df.fillna(0)
pre_processed_train_df = pre_processed_train_df.reset_index(drop=True)

pre_processed_train_df.shape, pre_processed_train_df.head(1), pre_processed_train_df.tail(1), len(pre_processed_train_df.tic.unique()), pre_processed_train_df.isna().sum()

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.41it/s]

Error:  MAVP





((8061, 182),
                             date           tic    open   close    high  \
 0  2021-04-27 21:15:00.000000000  KQ.m@SHFE.rb  5390.0  5362.0  5394.0   
 
       low    volume  pct_change  HT_DCPERIOD  HT_DCPHASE  ...  \
 0  5361.0  119462.0   -0.005195    21.982462  211.580373  ...   
 
    LINEARREG_SLOPE    STDDEV          TSF     VAR       ATR      NATR  TRANGE  \
 0        -1.002198  20.55821  5394.340659  422.64  19.60218  0.365576    33.0   
 
              AD         ADOSC        OBV  
 0  2.684909e+06 -50754.445123  2396522.0  
 
 [1 rows x 182 columns],
                                date           tic    open   close    high  \
 8060  2022-10-14 22:45:00.000000000  KQ.m@SHFE.rb  3698.0  3685.0  3699.0   
 
          low    volume  pct_change  HT_DCPERIOD  HT_DCPHASE  ...  \
 8060  3683.0  126543.0   -0.003515     30.60919  260.302071  ...   
 
       LINEARREG_SLOPE     STDDEV          TSF     VAR        ATR      NATR  \
 8060        -5.953846  12.073111  3690.70

In [4]:
all_length = []
pre_processed_train_df_2 = pre_processed_train_df.copy()
for i in range(len(tickers)):
    tmp_df = pre_processed_train_df_2[pre_processed_train_df_2.tic==tickers[i]]
    all_length.append(tmp_df.shape[0])
    
min(all_length), max(all_length)

(8061, 8061)

# Drop Features with Too Many Zeros
If data set contains too many zeros, the data quality will be too poor to train machine-learning or reinforcement learning models.

In [5]:
columns = pre_processed_train_df_2.columns
zeros = ((pre_processed_train_df_2==0).sum()>pre_processed_train_df_2.shape[0]*0.1)
columns_to_drop = np.where(zeros==True)[0]

columns_to_drop.shape

(70,)

In [6]:
for i in range(columns_to_drop.shape[0]):
    tmp_index = columns_to_drop[i]
    tmp_column = columns[tmp_index]
    pre_processed_train_df_2 = pre_processed_train_df_2.drop(columns=tmp_column)

pre_processed_train_df_2.isna().sum(), pre_processed_train_df_2.shape, pre_processed_train_df_2.columns, pre_processed_train_df_2.head(5), pre_processed_train_df_2.tail(5)

(date      0
 tic       0
 open      0
 close     0
 high      0
          ..
 NATR      0
 TRANGE    0
 AD        0
 ADOSC     0
 OBV       0
 Length: 112, dtype: int64,
 (8061, 112),
 Index(['date', 'tic', 'open', 'close', 'high', 'low', 'volume', 'pct_change',
        'HT_DCPERIOD', 'HT_DCPHASE',
        ...
        'LINEARREG_SLOPE', 'STDDEV', 'TSF', 'VAR', 'ATR', 'NATR', 'TRANGE',
        'AD', 'ADOSC', 'OBV'],
       dtype='object', length=112),
                             date           tic    open   close    high  \
 0  2021-04-27 21:15:00.000000000  KQ.m@SHFE.rb  5390.0  5362.0  5394.0   
 1  2021-04-27 21:30:00.000000000  KQ.m@SHFE.rb  5362.0  5344.0  5362.0   
 2  2021-04-27 21:45:00.000000000  KQ.m@SHFE.rb  5344.0  5359.0  5363.0   
 3  2021-04-27 22:00:00.000000000  KQ.m@SHFE.rb  5359.0  5359.0  5362.0   
 4  2021-04-27 22:15:00.000000000  KQ.m@SHFE.rb  5359.0  5352.0  5360.0   
 
       low    volume  pct_change  HT_DCPERIOD  HT_DCPHASE  ...  \
 0  5361.0  119462.0   -0.

# Only Keep Valid Features
 The indicator is filtered by finding out the corrolation between this term's indicator result and the next term's return rate. The more positive the corrolation is the stronger the indicator's ability to perdict the next term's return is.

In [7]:
factors = pre_processed_train_df_2.columns[8:]
tickers = pre_processed_train_df_2.tic.unique()
corr_list = []
corr_df = pd.DataFrame()
for i in tqdm(range(len(factors))):
    tmp_corr_2 = []
    for a in tickers:
        tmp_df = pre_processed_train_df_2[pre_processed_train_df_2.tic==a]
        tmp_df['pct_change'] = tmp_df.close.pct_change(-1).fillna(0)
        tmp_corr = np.corrcoef(tmp_df['pct_change'].values, tmp_df[factors[i]].values)[0][-1]
        tmp_corr_2.append(np.nan_to_num(tmp_corr))
    corr_list.append(np.mean(tmp_corr))
        
corr_df['correlation'] = corr_list
corr_df['factors'] = factors
corr_df = corr_df.dropna()
corr_df = corr_df.sort_values(by="correlation",ascending=False).reset_index(drop=True)

corr_df.head(30), corr_df.head(30)['factors'].to_list()

100%|███████████████████████████████████████████████████████████████████████████████| 104/104 [00:00<00:00, 166.82it/s]


(    correlation              factors
 0      0.039332                   DX
 1      0.026986               CORREL
 2      0.024708           HT_PHASOR1
 3      0.024447                  SUB
 4      0.023742             MINUS_DM
 5      0.021331               TRANGE
 6      0.020823             MINUS_DI
 7      0.020666                  DIV
 8      0.020386                  ATR
 9      0.017454                   AD
 10     0.014793                  MAX
 11     0.014793              MINMAX1
 12     0.014399  LINEARREG_INTERCEPT
 13     0.014075                  ADX
 14     0.014033                TRIMA
 15     0.013878                 NATR
 16     0.013877               STDDEV
 17     0.013828         HT_TRENDLINE
 18     0.013669                 KAMA
 19     0.013573                  SUM
 20     0.013573                  SMA
 21     0.013573                   MA
 22     0.013515                  SAR
 23     0.013259                  WMA
 24     0.013172                  EMA
 25     0.01