In this notebook, I used Ta-lib's abstract API to add all the financial indicators that Ta-lib has to offer. When dealing with a dataset that contains multiple instruments or stocks, one needs to avoid applying the indicators to the entire dataset all-in-once; the correct way is to calculate each stock's indicators individually. This notebook works for datasets that has only one stock or multiple stocks

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import talib
from talib import abstract
from finta import TA

In [2]:
train_df = pd.read_csv("datasets\\processed_full.csv").drop(columns=['Unnamed: 0', 'day', 'macd', 'boll_ub', 
                                                                     'boll_lb', 'rsi_30', 'cci_30', 'dx_30',
                                                                     'close_30_sma', 'close_60_sma', 'vix',
                                                                     'turbulence'])
train_df = train_df.reset_index(drop=True)
tickers = train_df.tic.unique()
train_df['pct_change'] = train_df.close.pct_change().fillna(0)

np.sum((train_df==0).sum().values), train_df.isna().sum(), tickers, train_df.columns, train_df.shape, train_df.head(5), train_df.tail(5)

(1,
 date          0
 tic           0
 open          0
 high          0
 low           0
 close         0
 volume        0
 pct_change    0
 dtype: int64,
 array(['AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CRM', 'CSCO', 'CVX', 'DIS',
        'GS', 'HD', 'HON', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM',
        'MRK', 'MSFT', 'NKE', 'PG', 'TRV', 'UNH', 'V', 'VZ', 'WBA', 'WMT'],
       dtype=object),
 Index(['date', 'tic', 'open', 'high', 'low', 'close', 'volume', 'pct_change'], dtype='object'),
 (103124, 8),
          date   tic       open       high        low      close       volume  \
 0  2009-01-02  AAPL   3.067143   3.251429   3.041429   2.758535  746015200.0   
 1  2009-01-02  AMGN  58.590000  59.080002  57.750000  43.832630    6547900.0   
 2  2009-01-02   AXP  18.570000  19.520000  18.400000  15.365303   10955700.0   
 3  2009-01-02    BA  42.799999  45.560001  42.779999  33.941109    7010200.0   
 4  2009-01-02   CAT  44.910000  46.980000  44.709999  31.579325    7117200.0   
 

# Get All Features
The abstract API of Ta-lib libary is used.

In [3]:
columns = ['open', 'close', 'high', 'low', 'volume']
for a in columns:
    train_df[a] = train_df[a].values.astype('float64')

window_size = 125
individual_length = train_df[train_df.tic==tickers[0]].shape[0]
ta_list = talib.get_functions()
pre_processed_train_df = pd.DataFrame()
for i in tqdm(range(len(tickers))):
    tmp_df = train_df[train_df.tic==tickers[i]]
    for x in ta_list:
        try:
            output = eval('abstract.'+x+'(tmp_df)')
            if len(output.shape)>1 and len(output.shape)<10:
                for a in range(output.shape[-1]):
                    tmp_df[str(x)+str(a)] = output.iloc[:,a]
            else:
                tmp_df[str(x)] = output
        except:
            print('Error: ', x)

    pre_processed_train_df = pre_processed_train_df.append(tmp_df.drop(tmp_df.head(200).index))   
pre_processed_train_df = pre_processed_train_df.replace([np.inf, -np.inf], np.nan)
pre_processed_train_df = pre_processed_train_df.fillna(0)
pre_processed_train_df = pre_processed_train_df.reset_index(drop=True)

pre_processed_train_df.shape, pre_processed_train_df.head(1), pre_processed_train_df.tail(1), len(pre_processed_train_df.tic.unique()), pre_processed_train_df.isna().sum()

  7%|█████▋                                                                             | 2/29 [00:00<00:02, 12.49it/s]

Error:  MAVP
Error:  MAVP
Error:  MAVP


 21%|█████████████████▏                                                                 | 6/29 [00:00<00:01, 11.98it/s]

Error:  MAVP
Error:  MAVP
Error:  MAVP


 28%|██████████████████████▉                                                            | 8/29 [00:00<00:01, 11.86it/s]

Error:  MAVP
Error:  MAVP
Error:  MAVP


 34%|████████████████████████████▎                                                     | 10/29 [00:00<00:01, 11.58it/s]

Error:  MAVP
Error:  MAVP


 41%|█████████████████████████████████▉                                                | 12/29 [00:01<00:01, 10.51it/s]

Error:  MAVP
Error:  MAVP
Error:  MAVP


 55%|█████████████████████████████████████████████▏                                    | 16/29 [00:01<00:01, 10.26it/s]

Error:  MAVP
Error:  MAVP


 62%|██████████████████████████████████████████████████▉                               | 18/29 [00:01<00:01, 10.01it/s]

Error:  MAVP
Error:  MAVP
Error:  MAVP


 72%|███████████████████████████████████████████████████████████▍                      | 21/29 [00:02<00:00,  9.51it/s]

Error:  MAVP
Error:  MAVP


 79%|█████████████████████████████████████████████████████████████████                 | 23/29 [00:02<00:00,  9.24it/s]

Error:  MAVP
Error:  MAVP


 83%|███████████████████████████████████████████████████████████████████▊              | 24/29 [00:02<00:00,  9.08it/s]

Error:  MAVP
Error:  MAVP


 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [00:02<00:00,  8.74it/s]

Error:  MAVP
Error:  MAVP


 97%|███████████████████████████████████████████████████████████████████████████████▏  | 28/29 [00:02<00:00,  8.33it/s]

Error:  MAVP
Error:  MAVP


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00,  9.72it/s]


((97324, 182),
          date   tic      open      high       low    close       volume  \
 0  2009-10-19  AAPL  6.708929  6.785714  6.626786  5.77119  942230800.0   
 
    pct_change  HT_DCPERIOD  HT_DCPHASE  ...  LINEARREG_SLOPE    STDDEV  \
 0   -0.847726    23.522297  182.902662  ...         0.015264  0.032712   
 
         TSF      VAR       ATR       NATR    TRANGE            AD  \
 0  5.841551  0.00107  1.039467  18.011316  1.069544 -1.412814e+12   
 
           ADOSC           OBV  
 0 -3.764301e+10  1.432607e+10  
 
 [1 rows x 182 columns],
              date  tic        open        high         low       close  \
 97323  2023-02-16  WMT  145.479996  145.990005  144.179993  144.270004   
 
           volume  pct_change  HT_DCPERIOD  HT_DCPHASE  ...  LINEARREG_SLOPE  \
 97323  5437400.0    2.972192    16.038365   97.752962  ...         0.203254   
 
          STDDEV         TSF       VAR       ATR      NATR    TRANGE  \
 97323  1.050723  144.765831  1.104019  2.643739  1.832494

In [4]:
all_length = []
pre_processed_train_df_2 = pre_processed_train_df.copy()
for i in range(len(tickers)):
    tmp_df = pre_processed_train_df_2[pre_processed_train_df_2.tic==tickers[i]]
    all_length.append(tmp_df.shape[0])
    
min(all_length), max(all_length)

(3356, 3356)

# Drop Features with Too Many Zeros
If data set contains too many zeros, the data quality will be too poor to train machine-learning or reinforcement learning models.

In [5]:
columns = pre_processed_train_df_2.columns
zeros = ((pre_processed_train_df_2==0).sum()>pre_processed_train_df_2.shape[0]*0.1)
columns_to_drop = np.where(zeros==True)[0]

columns_to_drop.shape

(67,)

In [6]:
for i in range(columns_to_drop.shape[0]):
    tmp_index = columns_to_drop[i]
    tmp_column = columns[tmp_index]
    pre_processed_train_df_2 = pre_processed_train_df_2.drop(columns=tmp_column)

pre_processed_train_df_2.isna().sum(), pre_processed_train_df_2.shape, pre_processed_train_df_2.columns, pre_processed_train_df_2.head(5), pre_processed_train_df_2.tail(5)

(date      0
 tic       0
 open      0
 high      0
 low       0
          ..
 NATR      0
 TRANGE    0
 AD        0
 ADOSC     0
 OBV       0
 Length: 115, dtype: int64,
 (97324, 115),
 Index(['date', 'tic', 'open', 'high', 'low', 'close', 'volume', 'pct_change',
        'HT_DCPERIOD', 'HT_DCPHASE',
        ...
        'LINEARREG_SLOPE', 'STDDEV', 'TSF', 'VAR', 'ATR', 'NATR', 'TRANGE',
        'AD', 'ADOSC', 'OBV'],
       dtype='object', length=115),
          date   tic      open      high       low     close        volume  \
 0  2009-10-19  AAPL  6.708929  6.785714  6.626786  5.771190  9.422308e+08   
 1  2009-10-20  AAPL  7.164286  7.205357  7.066071  6.041723  1.141039e+09   
 2  2009-10-21  AAPL  7.125714  7.453929  7.115357  6.228971  1.193727e+09   
 3  2009-10-22  AAPL  7.310714  7.423214  7.232500  6.237481  7.913920e+08   
 4  2009-10-23  AAPL  7.346429  7.350000  7.258214  6.199183  4.207868e+08   
 
    pct_change  HT_DCPERIOD  HT_DCPHASE  ...  LINEARREG_SLOPE    STDDEV  

# Only Keep Valid Features
 The indicator is filtered by finding out the corrolation between this term's indicator result and the next term's return rate. The more positive the corrolation is the stronger the indicator's ability to perdict the next term's return is.

In [7]:
factors = pre_processed_train_df_2.columns[8:]
tickers = pre_processed_train_df_2.tic.unique()
corr_list = []
corr_df = pd.DataFrame()
for i in tqdm(range(len(factors))):
    tmp_corr_2 = []
    for a in tickers:
        tmp_df = pre_processed_train_df_2[pre_processed_train_df_2.tic==a]
        tmp_df['pct_change'] = tmp_df.close.pct_change(-1).fillna(0)
        tmp_corr = np.corrcoef(tmp_df['pct_change'].values, tmp_df[factors[i]].values)[0][-1]
        tmp_corr_2.append(np.nan_to_num(tmp_corr))
    corr_list.append(np.mean(tmp_corr))
        
corr_df['correlation'] = corr_list
corr_df['factors'] = factors
corr_df = corr_df.dropna()
corr_df = corr_df.sort_values(by="correlation",ascending=False).reset_index(drop=True)

corr_df.head(30)

100%|████████████████████████████████████████████████████████████████████████████████| 107/107 [00:22<00:00,  4.77it/s]


Unnamed: 0,correlation,factors
0,0.042606,SUB
1,0.03705,ROCR
2,0.03705,ROCP
3,0.03705,ROCR100
4,0.03705,ROC
5,0.036987,DIV
6,0.036394,MOM
7,0.02825,CMO
8,0.02825,RSI
9,0.027857,MACD0
