## 获取与处理数据

#### 导入包

In [1]:
import numpy as np
import pandas as pd
import asyncio

from datetime import time

from Utility.Preprocessing import Preprocessor
from Utility.DataRequest import fetch_tick_data, fetch_candle_data, fetch_nbbo_data

### 必要设置：

In [2]:
import os

# Instrument and date range used by the request cells below.
SYMBOL = 'SPY'
START_DATE = '2021-07-01'
END_DATE = '2021-07-02'

# Read the Finnhub API key from the environment instead of committing it to
# the notebook. Export FINNHUB_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hard-coded here was stored in plain
# text with the notebook and should be rotated.
FINHUB_TOKEN = os.environ.get('FINNHUB_API_KEY', '')
if not FINHUB_TOKEN:
    print("FINNHUB_API_KEY is not set; the data requests below need a valid token.")

### 获取tick数据：

In [None]:
# Gather the tick-download parameters in one mapping, then hand them to the
# project helper as keyword arguments.
tick_request = {
    'symbol': SYMBOL,
    'start_date': START_DATE,
    'end_date': END_DATE,
    'api_key': FINHUB_TOKEN,
    'page_workers': 10,
    'day_workers': 10,
    'limit': 25000,
    'sleep_sec': 0.2,
}
df_ticks = fetch_tick_data(**tick_request)


In [None]:
# (Optional) Save the fetched ticks to a CSV file named after the symbol and
# date range; the index is not written.
ticks_csv_path = f"{SYMBOL}_{START_DATE}_{END_DATE}_ticks.csv"
df_ticks.to_csv(ticks_csv_path, index=False)

In [None]:
# Preview the last five rows of df_ticks.
df_ticks.tail(5)

In [None]:
# Show the shape of df_ticks as a quick check on how much data was fetched.
df_ticks.shape

### 获取 NBBO 数据

In [36]:
# Gather the NBBO-download parameters in one mapping, then hand them to the
# project helper as keyword arguments.
nbbo_request = {
    'symbol': SYMBOL,
    'start_date': START_DATE,
    'end_date': END_DATE,
    'api_key': FINHUB_TOKEN,
    'page_workers': 10,
    'day_workers': 5,
    'limit': 10000,
    'sleep_sec': 0.2,
}
df_nbbo = fetch_nbbo_data(**nbbo_request)

Days:   0%|          | 0/2 [01:40<?, ?it/s]

Failed to fetch 2021-07-01: HTTPSConnectionPool(host='tick.finnhub.io', port=443): Read timed out. (read timeout=10)


Days:  50%|█████     | 1/2 [01:41<01:41, 101.01s/it]

Failed to fetch 2021-07-02: HTTPSConnectionPool(host='tick.finnhub.io', port=443): Read timed out. (read timeout=10)


Days: 100%|██████████| 2/2 [01:41<00:00, 50.61s/it] 


In [4]:
import finnhub

# Reuse the token from the settings cell instead of repeating the key inline,
# so the credential lives in exactly one place.
finnhub_client = finnhub.Client(api_key=FINHUB_TOKEN)

# One-off probe of the NBBO endpoint through the official client, printed raw.
# NOTE(review): the positional arguments appear to be (symbol, date, limit,
# skip) -- confirm against the finnhub-python client documentation.
print(finnhub_client.stock_nbbo('AAPL', '2020-07-02', 500, 0))

{'a': [], 'av': [], 'ax': [], 'b': [], 'bv': [], 'bx': [], 'c': [], 's': 'AAPL', 'skip': 0, 't': []}


In [34]:
# Preview the last five rows of df_nbbo.
df_nbbo.tail(5)

### 处理数据：
对数据做聚合与过滤

In [None]:
# Run the project's tick-to-minute aggregation and drop rows with missing
# values in a single chained expression.
df_minute = Preprocessor.aggregate_tick_to_minute(df_ticks).dropna()
# Display the column labels of the result.
df_minute.columns

In [None]:
# (Optional) Save the aggregated minute data to a CSV file named after the
# symbol and date range; the index is not written.
minute_csv_path = f"{SYMBOL}_{START_DATE}_{END_DATE}_minute.csv"
df_minute.to_csv(minute_csv_path, index=False)

### 请求分钟数据（可选）
可以用来做验证：看看由 tick 数据聚合得到的分钟数据行数是否对得上


In [None]:
min_interval = 1  # candle interval, in minutes

# Request the candle data: collect the arguments in one mapping and pass them
# to the project helper as keyword arguments.
candle_request = {
    'symbol': SYMBOL,
    'start_date': START_DATE,
    'end_date': END_DATE,
    'interval': min_interval,
    'token': FINHUB_TOKEN,
    'chunk_days': 25,
    'max_workers': 10,
}
df_finhub_agg = fetch_candle_data(**candle_request)


### 验证数据行数（可选）
可以在这里比较数据的准确性等

In [None]:
# Define the trading-hours window as times of day.
start_time = time(9, 30)
end_time   = time(16, 0)

# Build the masks for 09:30-16:00. Series.between includes both endpoints by
# default, so rows stamped exactly 09:30 and 16:00 are kept.
df_minute_masked = df_minute[df_minute['timestamp'].dt.time.between(start_time, end_time)]
# Compare on the time of day here as well, matching the line above. Calling
# .between(start_time, end_time) directly on the timestamp column would compare
# full timestamps against datetime.time objects.
# NOTE(review): assumes df_finhub_agg['timestamp'] is a datetime column like
# df_minute['timestamp'] -- confirm against fetch_candle_data.
df_finhub_agg_masked = df_finhub_agg[df_finhub_agg['timestamp'].dt.time.between(start_time, end_time)]

# Row/column counts of both filtered frames, side by side for comparison.
df_minute_masked.shape, df_finhub_agg_masked.shape

#### 保存低频 / 高频原始数据

In [None]:
# Save the raw low- and high-frequency frames to CSV without the index.
# NOTE(review): LOFREQ, HIFRREQ, df_loFreq and df_hiFreq are not defined in the
# visible part of this notebook, and "HIFRREQ" looks like a misspelling of
# "HIFREQ" -- confirm against the cell that defines them.
file_stem = f"{SYMBOL}_{START_DATE}_{END_DATE}"
path_lo = f"{file_stem}_{LOFREQ}m_raw.csv"
path_hi = f"{file_stem}_{HIFRREQ}m_raw.csv"
df_loFreq.to_csv(path_or_buf=path_lo, index=False)
df_hiFreq.to_csv(path_or_buf=path_hi, index=False)

In [None]:
# Inspect the data: show the shape of the low-frequency frame.
df_loFreq.shape
# df_hiFreq.tail(2)

In [None]:
# Aggregate the high-frequency frame down to the low frequency ('5min').
df_hiFreq_agg = aggregate_high_freq_to_low(df_hiFreq, freq='5min')

# Compute the factors on the low-frequency frame.
df_loFreq_factored = add_factors(df_loFreq)

# Join: keep every low-frequency row and attach the aggregated high-frequency
# columns that share the same 'timestamp'.
df_merged = pd.merge(
    left=df_loFreq_factored,
    right=df_hiFreq_agg,
    on='timestamp',
    how='left',
)
# Pass the merged frame through the project's outlier cleaner.
df_merged = clean_outliers(df_merged, z_thresh=10, show_msg=True)

In [None]:
# Inspect values: the commented-out line below selects rows whose
# 'close_volatility' equals 0.
# df_merged[df_merged['close_volatility'] == 0]

# Show the shape of the merged frame.
df_merged.shape

In [None]:
# NOTE(review): within this cell `re` is referenced only by the commented-out
# line below.
import re
# Output file name built from the symbol, date range and LOFREQ.
path = f"{SYMBOL}_{START_DATE}_{END_DATE}_{LOFREQ}m.csv"
# NOTE(review): if re-enabled, this pattern removes every character that is not
# a word character or '-', which includes the '.' of the ".csv" extension.
# path = re.sub(r'[^\w\-]', '', path)
# Write the merged frame to that file without the index.
df_merged.to_csv(path, index=False)