In [1]:
import pandas as pd
import numpy as np
import datetime
import yfinance as yf

from finrl.meta.preprocessor.yahoodownloader import YahooDownloader
from finrl.meta.preprocessor.preprocessors import FeatureEngineer, data_split
from finrl import config_tickers
from finrl.config import INDICATORS

import itertools

2023-08-21 23:41:39.812496: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Part 2: Fetch data

In [2]:
# Download one month of daily AAPL bars straight from Yahoo Finance via yfinance.
# NOTE(review): yfinance documents `end` as exclusive, so 2020-01-31 itself is
# presumably not part of the result — confirm if the last day matters.
aapl_df_yf = yf.download(
    tickers="aapl",
    start='2020-01-01',
    end='2020-01-31',
)

[*********************100%***********************]  1 of 1 completed


In [3]:
# Preview the raw yfinance frame: it is indexed by Date and carries the
# Open/High/Low/Close/Adj Close/Volume columns shown in the output.
aapl_df_yf.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-02,74.059998,75.150002,73.797501,75.087502,73.249031,135480400
2020-01-03,74.287498,75.144997,74.125,74.357498,72.536888,146322800
2020-01-06,73.447502,74.989998,73.1875,74.949997,73.114876,118387200
2020-01-07,74.959999,75.224998,74.370003,74.597504,72.771019,108872000
2020-01-08,74.290001,76.110001,74.290001,75.797501,73.941635,132079200


In [4]:
# Fetch the same ticker and date window through FinRL's YahooDownloader so the
# result can be compared side by side with the raw yfinance frame.
aapl_downloader = YahooDownloader(
    start_date='2020-01-01',
    end_date='2020-01-31',
    ticker_list=['aapl'],
)
aapl_df_finrl = aapl_downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
Shape of DataFrame:  (20, 8)


In [5]:
# Preview the FinRL-formatted frame. Compared with the raw yfinance output, the
# columns are lowercase, `tic` and `day` columns are added, and `close` tracks
# yfinance's `Adj Close` (73.249031 on 2020-01-02) rather than `Close`.
# NOTE(review): `day` appears to be the weekday with Monday = 0 — confirm.
aapl_df_finrl.head()

Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2020-01-02,74.059998,75.150002,73.797501,73.249031,135480400,aapl,3
1,2020-01-03,74.287498,75.144997,74.125,72.536903,146322800,aapl,4
2,2020-01-06,73.447502,74.989998,73.1875,73.114891,118387200,aapl,0
3,2020-01-07,74.959999,75.224998,74.370003,72.771027,108872000,aapl,1
4,2020-01-08,74.290001,76.110001,74.290001,73.941628,132079200,aapl,2


## Data for the chosen tickers

In [6]:
# Show the ticker universe requested for the rest of the notebook
# (30 symbols in the output below).
config_tickers.DOW_30_TICKER

['AXP',
 'AMGN',
 'AAPL',
 'BA',
 'CAT',
 'CSCO',
 'CVX',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'KO',
 'JPM',
 'MCD',
 'MMM',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'TRV',
 'UNH',
 'CRM',
 'VZ',
 'V',
 'WBA',
 'WMT',
 'DIS',
 'DOW']

In [7]:
# Date boundaries (ISO 'YYYY-MM-DD' strings) for the two data windows.
# The training window ends on the same date the trading window starts.
TRAIN_START_DATE, TRAIN_END_DATE = '2009-01-01', '2020-07-01'
TRADE_START_DATE, TRADE_END_DATE = '2020-07-01', '2021-10-29'

In [8]:
# Download daily bars for every Dow 30 ticker across the combined
# train + trade window (TRAIN_START_DATE through TRADE_END_DATE).
# The recorded output shows one "1 of 1 completed" progress bar per ticker.
dow_downloader = YahooDownloader(
    start_date=TRAIN_START_DATE,
    end_date=TRADE_END_DATE,
    ticker_list=config_tickers.DOW_30_TICKER,
)
df_raw = dow_downloader.fetch_data()

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%********

In [9]:
# Sanity-check the combined download: the frame is in long format, with one
# row per (date, tic) pair as seen in the output.
df_raw.head()

Unnamed: 0,date,open,high,low,close,volume,tic,day
0,2009-01-02,3.067143,3.251429,3.041429,2.751011,746015200,AAPL,4
1,2009-01-02,58.59,59.080002,57.75,43.073936,6547900,AMGN,4
2,2009-01-02,18.57,19.52,18.4,15.256273,10955700,AXP,4
3,2009-01-02,42.799999,45.560001,42.779999,33.94109,7010200,BA,4
4,2009-01-02,44.91,46.98,44.709999,31.254063,7117200,CAT,4


# Part 3: Preprocess Data

In [10]:
# Configure the feature pipeline: technical indicators from finrl.config.INDICATORS,
# plus the VIX and turbulence columns; no user-defined features.
fe = FeatureEngineer(
    use_technical_indicator=True,
    tech_indicator_list=INDICATORS,
    use_vix=True,
    use_turbulence=True,
    user_defined_feature=False,
)

# Run the pipeline on the raw download. The log output reports each stage
# (technical indicators, vix, turbulence index) as it is added.
processed = fe.preprocess_data(df_raw)

Successfully added technical indicators
[*********************100%***********************]  1 of 1 completed
Shape of DataFrame:  (3228, 8)
Successfully added vix
Successfully added turbulence index


In [11]:
# Build a complete (date x ticker) grid so every ticker has a row for every
# trading date, then attach the engineered features to it.
list_ticker = processed["tic"].unique().tolist()

# Every calendar day between the first and last date in `processed`.
# NOTE(review): assumes processed['date'] holds 'YYYY-MM-DD' strings so that it
# matches the stringified date_range in the merge below — TODO confirm.
date_span = pd.date_range(processed['date'].min(), processed['date'].max())
list_date = list(date_span.astype(str))
combination = list(itertools.product(list_date, list_ticker))

# Left-join keeps every grid cell; cells with no data get NaN features.
processed_full = (
    pd.DataFrame(combination, columns=["date", "tic"])
    .merge(processed, on=["date", "tic"], how="left")
)
# Drop calendar days that never occur in `processed` (non-trading days).
processed_full = processed_full[processed_full['date'].isin(processed['date'])]
# Order by date then ticker, and replace any remaining gaps with 0.
processed_full = processed_full.sort_values(['date', 'tic']).fillna(0)

In [12]:
# Preview the feature-complete frame (price columns, indicators, vix, turbulence).
# NOTE(review): in the recorded output, boll_ub / boll_lb carry the same values
# for every ticker on the first date (2.973257 / 2.644867) even though prices
# differ — worth checking how the indicator warm-up rows are filled.
processed_full.head()

Unnamed: 0,date,tic,open,high,low,close,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,AAPL,3.067143,3.251429,3.041429,2.751011,746015200.0,4.0,0.0,2.973257,2.644867,100.0,66.666667,100.0,2.751011,2.751011,39.189999,0.0
1,2009-01-02,AMGN,58.59,59.080002,57.75,43.073936,6547900.0,4.0,0.0,2.973257,2.644867,100.0,66.666667,100.0,43.073936,43.073936,39.189999,0.0
2,2009-01-02,AXP,18.57,19.52,18.4,15.256273,10955700.0,4.0,0.0,2.973257,2.644867,100.0,66.666667,100.0,15.256273,15.256273,39.189999,0.0
3,2009-01-02,BA,42.799999,45.560001,42.779999,33.94109,7010200.0,4.0,0.0,2.973257,2.644867,100.0,66.666667,100.0,33.94109,33.94109,39.189999,0.0
4,2009-01-02,CAT,44.91,46.98,44.709999,31.254063,7117200.0,4.0,0.0,2.973257,2.644867,100.0,66.666667,100.0,31.254063,31.254063,39.189999,0.0


# Part 4: Save the Data

### Split the data for training and trading

In [13]:
# Slice the processed frame into the training and trading windows.
# Both windows share the boundary date 2020-07-01; in the recorded output the
# sizes (83897 + 9715 = 93612) equal 29 tickers x 3228 rows, so no row lands
# in both splits.
train = data_split(processed_full, TRAIN_START_DATE, TRAIN_END_DATE)
trade = data_split(processed_full, TRADE_START_DATE, TRADE_END_DATE)

# Report the row count of each split, one per line.
print(len(train), len(trade), sep="\n")

83897
9715


### Save data to CSV files

In [14]:
# Persist both splits as CSV files in the current working directory.
# `to_csv` is called with its defaults, so the DataFrame index is written as
# the first (unnamed) column.
for split_frame, csv_path in ((train, 'train_data.csv'), (trade, 'trade_data.csv')):
    split_frame.to_csv(csv_path)

In [15]:
# Display the final frame; the rendered output elides the middle rows and
# shows only the first and last few.
processed_full

Unnamed: 0,date,tic,open,high,low,close,volume,day,macd,boll_ub,boll_lb,rsi_30,cci_30,dx_30,close_30_sma,close_60_sma,vix,turbulence
0,2009-01-02,AAPL,3.067143,3.251429,3.041429,2.751011,746015200.0,4.0,0.000000,2.973257,2.644867,100.000000,66.666667,100.000000,2.751011,2.751011,39.189999,0.000000
1,2009-01-02,AMGN,58.590000,59.080002,57.750000,43.073936,6547900.0,4.0,0.000000,2.973257,2.644867,100.000000,66.666667,100.000000,43.073936,43.073936,39.189999,0.000000
2,2009-01-02,AXP,18.570000,19.520000,18.400000,15.256273,10955700.0,4.0,0.000000,2.973257,2.644867,100.000000,66.666667,100.000000,15.256273,15.256273,39.189999,0.000000
3,2009-01-02,BA,42.799999,45.560001,42.779999,33.941090,7010200.0,4.0,0.000000,2.973257,2.644867,100.000000,66.666667,100.000000,33.941090,33.941090,39.189999,0.000000
4,2009-01-02,CAT,44.910000,46.980000,44.709999,31.254063,7117200.0,4.0,0.000000,2.973257,2.644867,100.000000,66.666667,100.000000,31.254063,31.254063,39.189999,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135773,2021-10-27,UNH,454.640015,460.440002,453.480011,442.888275,3520400.0,2.0,11.389334,453.317599,363.583100,65.085948,174.334935,51.571330,405.454169,405.739810,16.980000,145.285221
135774,2021-10-27,V,224.750000,224.750000,215.660004,212.550888,22958100.0,2.0,0.013828,233.490236,215.279985,41.737207,-153.711177,30.337021,223.493653,225.707213,16.980000,145.285221
135775,2021-10-27,VZ,53.169998,53.200001,52.470001,47.476547,15007400.0,2.0,-0.220521,49.223191,46.156535,44.604910,-50.018824,8.504850,47.925124,48.541796,16.980000,145.285221
135776,2021-10-27,WBA,48.450001,48.459999,47.090000,42.596809,5652000.0,2.0,-0.014852,45.261754,41.446075,46.167780,-62.274121,5.045608,43.558371,43.880977,16.980000,145.285221


In [26]:
# Verify that every ticker has the same number of rows after the grid fill.
# `bazar` keeps the tickers in order of first appearance in the frame.
bazar = list(processed_full['tic'].unique())

# Count all tickers in a single pass over the column, instead of building a
# boolean mask and a filtered copy of the whole frame once per ticker.
rows_per_ticker = processed_full['tic'].value_counts()
for ticker in bazar:
    # .get(..., 0) mirrors the original mask-based count for a value that
    # value_counts does not report (e.g. a missing ticker label).
    print(ticker, rows_per_ticker.get(ticker, 0))
 

AAPL 3228
AMGN 3228
AXP 3228
BA 3228
CAT 3228
CRM 3228
CSCO 3228
CVX 3228
DIS 3228
GS 3228
HD 3228
HON 3228
IBM 3228
INTC 3228
JNJ 3228
JPM 3228
KO 3228
MCD 3228
MMM 3228
MRK 3228
MSFT 3228
NKE 3228
PG 3228
TRV 3228
UNH 3228
V 3228
VZ 3228
WBA 3228
WMT 3228


In [27]:
# List the tickers present in the processed data. The output shows 29 symbols:
# 'DOW' from the 30-ticker config list is absent.
# NOTE(review): presumably it was dropped during preprocessing because it lacks
# history over the full date range — confirm this is intended.
bazar

['AAPL',
 'AMGN',
 'AXP',
 'BA',
 'CAT',
 'CRM',
 'CSCO',
 'CVX',
 'DIS',
 'GS',
 'HD',
 'HON',
 'IBM',
 'INTC',
 'JNJ',
 'JPM',
 'KO',
 'MCD',
 'MMM',
 'MRK',
 'MSFT',
 'NKE',
 'PG',
 'TRV',
 'UNH',
 'V',
 'VZ',
 'WBA',
 'WMT']