In [2]:
%matplotlib inline
%matplotlib notebook

import vectorbtpro as vbt
import numpy as np
import pandas as pd

from numba import njit
import talib

# 管道优化
from vectorbtpro.returns import nb as ret_nb
from vectorbtpro.portfolio import nb as pf_nb
from vectorbtpro.portfolio.enums import Direction

vbt.settings.set_theme('dark')

# 数据下载

### `EURUSD` 各频段数据下载,  工具来源 [`dukascopy-node`](https://github.com/Leo4815162342/dukascopy-node).
<br>`m1`, `m5`, `h1`, `h4`</br>

<br>`安装包安装脚本`</br>
```shell
npm install dukascopy-node --save
```

<br>`Docker以及Conda环境`</br>
在my_packt 的 base 环境下操作即可

<br>`数据下载脚本`</br>
```shell
npx dukascopy-node -i eurusd -p ask -from 2023-01-01 -to 2023-05-08 -t m1 -v true -f csv
npx dukascopy-node -i eurusd -p bid -from 2023-01-01 -to 2023-05-08 -t m1 -v true -f csv

npx dukascopy-node -i eurusd -p ask -from 2021-01-01 -to 2023-04-20 -t m5 -v true -f csv
npx dukascopy-node -i eurusd -p bid -from 2021-01-01 -to 2023-04-20 -t m5 -v true -f csv

npx dukascopy-node -i eurusd -p ask -from 2021-01-01 -to 2023-04-20 -t h1 -v true -f csv
npx dukascopy-node -i eurusd -p bid -from 2021-01-01 -to 2023-04-20 -t h1 -v true -f csv

npx dukascopy-node -i eurusd -p ask -from 2021-01-01 -to 2023-04-20 -t h4 -v true -f csv
npx dukascopy-node -i eurusd -p bid -from 2021-01-01 -to 2023-04-20 -t h4 -v true -f csv
```

# UDF定义 (组合 bid_ask price)

In [2]:
def read_bid_ask_data(ask_file : str, bid_file : str, lowercase_columns = False, set_time_index = False) -> pd.DataFrame:
    """Read Dukascopy bid and ask CSV files and combine them into one averaged OHLCV dataframe.

    Column names of both files are title-cased and the OHLCV columns are
    averaged as ``(bid + ask) / 2``. Rows whose averaged ``Volume`` is not
    strictly positive are dropped.

    Two source layouts are recognised from the raw column names:

    * ``timestamp`` (dukascopy-node export): epoch milliseconds, converted to
      datetimes and stored in a ``time`` column.
    * ``Local time`` (website export): strings whose trailing milliseconds and
      ``GMT+hhmm`` / ``GMT-hhmm`` suffix are stripped and stored in ``time``.

    Parameters
    ----------
    ask_file, bid_file : str
        Paths of the ask and bid CSV files.
    lowercase_columns : bool, default False
        Lower-case all column names of the result.
    set_time_index : bool, default False
        Convert ``time`` with ``pd.to_datetime`` (string format
        ``%d.%m.%Y %H:%M:%S``) and use it as the index.

    Returns
    -------
    pd.DataFrame
        Averaged OHLCV data with a ``time`` column, or a ``time`` index when
        ``set_time_index`` is true.

    Notes
    -----
    Bid and ask rows are paired by row position in the files, not by time, so
    both files are expected to cover exactly the same bars.
    """
    # `infer_datetime_format` is deprecated and has no effect without
    # `parse_dates`, so the files are read with the defaults.
    df_ask = pd.read_csv(ask_file)
    df_bid = pd.read_csv(bid_file)
    # Remember the raw headers: layout detection below relies on their original casing.
    df_ask_columns = list(df_ask.columns)
    df_bid_columns = list(df_bid.columns)
    df_ask.columns = df_ask.columns.str.title()
    df_bid.columns = df_bid.columns.str.title()

    cols_avg = ["Open", "High", "Low", "Close", "Volume"]
    if 'timestamp' in df_ask_columns:
        cols_avg = cols_avg + ['Timestamp']

    ## Average OHLCV columns for bid and ask data
    df_avg = (df_bid[cols_avg] + df_ask[cols_avg]) / 2.0
    # reset_index() keeps the original row labels in an "index" column; they are
    # needed below to look up the time of each surviving row.
    df_avg = df_avg[df_avg["Volume"] > 0.0].reset_index()

    ## Case when we downloaded Dukascopy historical market data from node package: dukascopy-node
    if ('timestamp' in df_ask_columns) or ('timestamp' in df_bid_columns):
        df_avg['time'] = pd.to_datetime(df_avg['Timestamp'], unit = 'ms')
        df_avg = df_avg.drop(columns = ["Timestamp"])

    ## Case when we downloaded Dukascopy historical market data from website
    if ("Local time" in df_ask_columns) or ("Local time" in df_bid_columns):
        print(f"Columns in df_avg:{df_avg.columns}")
        time_source = df_ask if "Local time" in df_ask_columns else df_bid
        # Select the times by ORIGINAL row label. Assigning the raw column directly
        # would align on the reset 0..k-1 index and attach the wrong timestamps as
        # soon as a zero-volume row has been filtered out.
        local_time = time_source["Local Time"].loc[df_avg["index"].to_numpy()]
        ## Strip ms and GMT TZ in time column
        df_avg["time"] = local_time.str.replace(r"\.\d{3} GMT[+-]\d{4}", '', regex = True).to_numpy()

    if "index" in list(df_avg.columns):
        df_avg = df_avg.drop(columns = "index")

    if lowercase_columns:
        df_avg.columns = df_avg.columns.str.lower()

    if set_time_index:
        df_avg["time"] = pd.to_datetime(df_avg["time"], format = '%d.%m.%Y %H:%M:%S')
        df_avg = df_avg.set_index("time")
    return df_avg

In [3]:
### DataFrame Slicing based on nr. of rows on 1m dataframe
def slice_df_by_1m_rows(df : pd.DataFrame, nr_days_to_slice : int) -> pd.DataFrame:
    """Return the most recent ``nr_days_to_slice`` days of a 1-minute dataframe.

    The slice is row-count based: one day is taken to be ``24 * 60`` rows, so
    the last ``nr_days_to_slice * 1440`` rows are kept. If the frame holds
    fewer rows than requested, the whole frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe with one row per minute, oldest row first.
    nr_days_to_slice : int
        Number of days to keep, counted back from the last row.

    Returns
    -------
    pd.DataFrame
        The trailing rows with a fresh 0-based index.

    Raises
    ------
    ValueError
        If ``nr_days_to_slice`` is negative.
    """
    if nr_days_to_slice < 0:
        raise ValueError("nr_days_to_slice must be non-negative")
    mins_per_day = 24 * 60
    # Use the caller's value; it used to be overwritten with a fixed 365 days.
    nr_rows = nr_days_to_slice * mins_per_day
    # An explicit start position avoids `iloc[-0:]`, which would select every row.
    start = max(len(df) - nr_rows, 0)
    return df.iloc[start:].reset_index(drop = True)

# 处理数据 + 存储为HDF

In [6]:
# What is being exported: instrument, timeframes and the covered date range.
symbol = "EURUSD"
time_frames = ["M1", "M5", "H1", "H4"]
data_range = slice("2023-01-01", "2023-05-08")

# Where the raw CSV downloads live and where the HDF5 output goes.
folder_path = "./download/"
to_path = "./data"

# The output file name encodes the symbol, every timeframe and the date range.
tf_strs = "_".join(time_frames)
output_file_path = f"{to_path}/{symbol}_{tf_strs}_OHLCV_{data_range.start}_{data_range.stop}.h5"

In [9]:
## File names of the Bid / Ask CSVs downloaded from Dukascopy, keyed by "EURUSD_<TIMEFRAME>".
## Only the M1 pair is listed for now; add "M5", "H1" or "H4" to the tuple below
## to include the other timeframes (same naming scheme, lower-cased timeframe).
bid_ask_files = {
    f"EURUSD_{tf}": {
        side: f"eurusd-{tf.lower()}-{side.lower()}-{data_range.start}-{data_range.stop}.csv"
        for side in ("Bid", "Ask")
    }
    for tf in ("M1",)
}

In [10]:
## Write everything into one single HDF5 file indexed by keys for the various symbols
import os

# Start from a clean file so keys from an earlier run cannot linger in it.
if os.path.exists(output_file_path):
    # If file exists, delete it
    os.remove(output_file_path)
    print(f"{output_file_path} has been deleted.")
else:
    print(f"{output_file_path} does not exist.")

# Make sure the target directory exists before anything is written to it.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# A dedicated loop name is used so the config variable `symbol` is not
# overwritten with a dataset key such as "EURUSD_M1".
for dataset_key, file_names in bid_ask_files.items():
    print(f'\n{dataset_key}')
    ask_csv_file = folder_path + file_names["Ask"]
    bid_csv_file = folder_path + file_names["Bid"]
    print("ASK File PATH:",ask_csv_file,'\nBID File PATH:',bid_csv_file)
    df = read_bid_ask_data(ask_csv_file, bid_csv_file, set_time_index = True)
    # One HDF5 key per dataset, all in the same output file.
    vbt.Data.from_data(df).to_hdf(output_file_path, key=dataset_key)

./data/EURUSD_M1_M5_H1_H4_OHLCV_2023-01-01_2023-05-08.h5 has been deleted.

EURUSD_M1
ASK File PATH: ./download/eurusd-m1-ask-2023-01-01-2023-05-08.csv 
BID File PATH: ./download/eurusd-m1-bid-2023-01-01-2023-05-08.csv


# 从mt5客户端搜集来的数据

In [13]:
# Load the XAUUSD bars collected from the MT5 client, using the "time" column as the index.
# NOTE(review): no `parse_dates` is passed, so the index is presumably left as
# strings at this point — confirm before relying on datetime operations.
xau_df = pd.read_csv("data/XAUUSD_20181219_20230516.csv", index_col=["time"])
xau_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-12-19 12:00:00,1248.16,1249.16,1246.83,1247.93,2351
2018-12-19 13:00:00,1247.92,1248.70,1247.80,1248.37,2073
2018-12-19 14:00:00,1248.34,1248.45,1247.36,1247.87,2013
2018-12-19 15:00:00,1247.89,1248.65,1247.23,1248.42,2521
2018-12-19 16:00:00,1248.43,1257.06,1248.15,1255.81,3303
...,...,...,...,...,...
2023-05-16 14:00:00,2008.21,2009.32,2005.15,2005.51,4706
2023-05-16 15:00:00,2005.51,2014.86,2001.18,2011.99,11212
2023-05-16 16:00:00,2011.98,2012.34,2007.07,2007.59,12721
2023-05-16 17:00:00,2007.62,2008.40,2001.94,2003.25,11419


In [14]:
# Keep only the rows from 2020-01-01 onwards (label-based slice on the "time" index).
# The name `xau_df` is rebound here, so the unsliced frame is no longer available afterwards.
xau_df = xau_df.loc["2020-01-01 00:00:00":]
xau_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-02 01:00:00,1517.97,1521.37,1517.95,1518.41,3598
2020-01-02 02:00:00,1518.45,1519.37,1518.14,1518.69,2403
2020-01-02 03:00:00,1518.69,1520.91,1517.23,1519.44,6110
2020-01-02 04:00:00,1519.44,1520.95,1518.44,1520.63,3475
2020-01-02 05:00:00,1520.63,1521.21,1519.86,1520.37,1345
...,...,...,...,...,...
2023-05-16 14:00:00,2008.21,2009.32,2005.15,2005.51,4706
2023-05-16 15:00:00,2005.51,2014.86,2001.18,2011.99,11212
2023-05-16 16:00:00,2011.98,2012.34,2007.07,2007.59,12721
2023-05-16 17:00:00,2007.62,2008.40,2001.94,2003.25,11419


In [15]:
# What is being exported: instrument, timeframe and the covered date range.
symbol = "XAUUSD"
time_frames = ["H1"]
date_range = slice("2020-01-01", "2023-05-16")

# Input / output locations.
folder_path = "./download/"
to_path = "./data"

# The output file name encodes the symbol, timeframe and date range.
tf_strs = "_".join(time_frames)
output_file_path = f"{to_path}/{symbol}_{tf_strs}_OHLCV_{date_range.start}_{date_range.stop}.h5"

# Store the sliced frame under a key named after the symbol.
vbt.Data.from_data(xau_df).to_hdf(output_file_path, key=symbol)

In [16]:
# Wrap the sliced XAUUSD frame in a vectorbtpro Data object so it can be inspected below.
vbt_data = vbt.Data.from_data(xau_df)
vbt_data

<vectorbtpro.data.base.Data at 0x7f547cb37fa0>

In [17]:
# Show the index held by the Data wrapper (a DatetimeIndex named "time" in the recorded output).
vbt_data.wrapper.index

DatetimeIndex(['2020-01-02 01:00:00', '2020-01-02 02:00:00',
               '2020-01-02 03:00:00', '2020-01-02 04:00:00',
               '2020-01-02 05:00:00', '2020-01-02 06:00:00',
               '2020-01-02 07:00:00', '2020-01-02 08:00:00',
               '2020-01-02 09:00:00', '2020-01-02 10:00:00',
               ...
               '2023-05-16 09:00:00', '2023-05-16 10:00:00',
               '2023-05-16 11:00:00', '2023-05-16 12:00:00',
               '2023-05-16 13:00:00', '2023-05-16 14:00:00',
               '2023-05-16 15:00:00', '2023-05-16 16:00:00',
               '2023-05-16 17:00:00', '2023-05-16 18:00:00'],
              dtype='datetime64[ns]', name='time', length=19924, freq=None)

In [18]:
# Show the column labels held by the Data wrapper.
vbt_data.wrapper.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume'], dtype='object')