## 处理 WebSocket 获取的单笔交易，统计为 Aggregated Data

In [1]:
import json
import threading
import websocket
import time
import csv
import os
import pytz
import logging
from datetime import datetime, timedelta
from collections import defaultdict

### Customize Fields

In [2]:
# Symbols to subscribe to on the trade stream.
SYMBOLS = ["AAPL", "GOOGL", "TSLA"]

# Aggregation windows: label -> window length in minutes.
INTERVALS = {
    "1min": 1,
    "5min": 5,
    "15min": 15,
    "30min": 30,
    "1h": 60,
}

# Finnhub API key. The FINNHUB_TOKEN environment variable, when set and
# non-empty, takes precedence so the credential need not live in the notebook.
# NOTE(review): the literal fallback is a credential committed in source; it is
# kept only so existing runs behave the same. Consider rotating it and removing
# the fallback.
FINNHUB_TOKEN = os.environ.get("FINNHUB_TOKEN") or "cvop3lhr01qihjtq3uvgcvop3lhr01qihjtq3v00"

### Prepare

In [3]:

# Finnhub WebSocket endpoint; the API token is passed as a query parameter.
WS_URL = f"wss://ws.finnhub.io?token={FINNHUB_TOKEN}"

# Make sure the output directories exist: the base directory (needed by the log
# file even if INTERVALS is empty) and one sub-directory per interval.
os.makedirs("./data/raw", exist_ok=True)
for interval in INTERVALS:
    directory = f"./data/raw/{interval}"
    os.makedirs(directory, exist_ok=True)

# Configure logging to both a file and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(threadName)s - %(levelname)s - \"%(message)s\"',
    handlers=[
        # Log messages contain non-ASCII text, so pin the file encoding
        # instead of relying on the platform default.
        logging.FileHandler("./data/raw/stock_data.log", encoding="utf-8"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Finished aggregates, one store per interval.
# Layout: interval_data[interval][symbol][timestamp] = {stats}
interval_data = {interval: defaultdict(lambda: defaultdict(dict)) for interval in INTERVALS}

# Accumulators for the period currently in progress, one store per interval.
# Layout: current_period_data[interval][symbol] = {trades, stats...}
current_period_data = {}
for interval in INTERVALS:
    current_period_data[interval] = defaultdict(lambda: {
        'trades': [],
        'first_trade_price': None,
        'high': float('-inf'),
        'low': float('inf'),
        'volume': 0,
        'value': 0,  # running sum of price * volume, used to compute VWAP
        'count': 0,
    })
    

### Helper Methods

In [4]:
def get_eastern_time():
    """Return the current time as a timezone-aware datetime in America/New_York."""
    utc_now = datetime.now(pytz.UTC)
    new_york = pytz.timezone('America/New_York')
    return utc_now.astimezone(new_york)

def get_interval_timestamp(ts_millis, interval_minutes):
    """Format an epoch-millisecond timestamp as the start of its interval.

    The instant is converted to America/New_York time and floored to a multiple
    of ``interval_minutes`` counted from local midnight.

    Args:
        ts_millis: Epoch timestamp in milliseconds.
        interval_minutes: Interval length in minutes.

    Returns:
        The floored local time as a ``'YYYY-MM-DD HH:MM:00'`` string.
    """
    eastern = pytz.timezone('America/New_York')
    local_dt = datetime.fromtimestamp(ts_millis / 1000, tz=pytz.UTC).astimezone(eastern)

    # Work in minutes since local midnight, then split back into hour/minute.
    elapsed = local_dt.hour * 60 + local_dt.minute
    bucket_start = (elapsed // interval_minutes) * interval_minutes
    hour, minute = divmod(bucket_start, 60)

    floored = local_dt.replace(hour=hour, minute=minute, second=0, microsecond=0)
    return floored.strftime('%Y-%m-%d %H:%M:00')

def get_next_interval_time(interval_minutes, now=None):
    """Return the start of the next interval boundary strictly after ``now``.

    Boundaries are multiples of ``interval_minutes`` counted from midnight of
    ``now``'s calendar day. A boundary past the end of the day rolls over to the
    following date.

    Args:
        interval_minutes: Interval length in minutes.
        now: Reference datetime. Defaults to the current Eastern time from
            ``get_eastern_time()``.

    Returns:
        A datetime with seconds and microseconds zeroed, carrying the same
        tzinfo as ``now``.
    """
    if now is None:
        now = get_eastern_time()

    minutes_since_midnight = now.hour * 60 + now.minute
    next_boundary = ((minutes_since_midnight // interval_minutes) + 1) * interval_minutes

    # Offset from midnight with a timedelta rather than replace(day=day + 1):
    # the latter raises ValueError on the last day of a month, whereas timedelta
    # arithmetic rolls over month and year ends correctly.
    midnight = now.replace(hour=0, minute=0, second=0, microsecond=0)
    return midnight + timedelta(minutes=next_boundary)

def aggregate_interval_data(interval_key, interval_minutes):
    """Turn the trades accumulated for one interval into per-symbol stats.

    For every symbol with at least one trade in the current period, compute
    open/high/low/close, VWAP, volume and trade count, store the row in
    ``interval_data``, append it to the symbol's CSV, and start a fresh
    accumulator for the next period.

    The row's timestamp label is derived from the wall-clock time at which this
    function runs (floored to the interval), not from the trades' timestamps.

    Args:
        interval_key: Key into ``current_period_data`` / ``interval_data``.
        interval_minutes: Interval length in minutes, used for the label.
    """
    now = get_eastern_time()
    interval_timestamp = get_interval_timestamp(int(now.timestamp() * 1000), interval_minutes)

    period_buckets = current_period_data[interval_key]

    # Iterate over a snapshot: the WebSocket callback runs on another thread and
    # inserts new symbols into this defaultdict, which would otherwise raise
    # "dictionary changed size during iteration".
    for symbol, data in list(period_buckets.items()):
        if not data['trades']:
            continue  # no trades for this symbol in the period

        # Swap in a fresh accumulator *before* the CSV write, so trades that
        # arrive while this row is being computed and written count towards the
        # next period instead of being thrown away by a late reset.
        # NOTE(review): a trade being recorded at this exact moment can still be
        # split across the two accumulators; closing that gap needs a lock
        # shared with on_message.
        period_buckets[symbol] = {
            'trades': [],
            'first_trade_price': None,
            'high': float('-inf'),
            'low': float('inf'),
            'volume': 0,
            'value': 0,
            'count': 0,
        }

        volume = data['volume']
        stats = {
            'timestamp': interval_timestamp,
            'open': data['first_trade_price'],
            'high': data['high'],
            'low': data['low'],
            'close': data['trades'][-1]['price'],
            # VWAP = sum(price * volume) / sum(volume); 0 when no volume traded.
            'vwap': data['value'] / volume if volume > 0 else 0,
            'volume': volume,
            'num_transactions': data['count'],
        }

        interval_data[interval_key][symbol][interval_timestamp] = stats
        write_to_csv(interval_key, symbol, interval_timestamp, stats)

def write_to_csv(interval_key, symbol, timestamp, stats):
    """Append one aggregated row to the symbol's CSV file for this interval.

    The file is ``./data/raw/<interval_key>/<symbol>_<interval_key>.csv``; a
    header row is written first when the file does not exist yet.

    Args:
        interval_key: Interval label, used in the directory and file name.
        symbol: Ticker symbol, used in the file name.
        timestamp: Unused here; the row's timestamp comes from ``stats``.
        stats: Mapping with the keys listed in ``columns`` below.
    """
    columns = ['timestamp', 'open', 'high', 'low', 'close', 'vwap', 'volume', 'num_transactions']
    directory = f"./data/raw/{interval_key}"
    filename = f"{directory}/{symbol}_{interval_key}.csv"

    # Must be decided before opening: append mode creates the file.
    needs_header = not os.path.isfile(filename)

    with open(filename, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        if needs_header:
            writer.writeheader()
        writer.writerow(stats)

    logger.info(f"数据已写入 {filename}：{stats['timestamp']}")

def on_message(ws, message):
    """WebSocket callback: fold each received trade into every interval's accumulator.

    Messages whose ``type`` is not ``"trade"`` are ignored.

    Args:
        ws: The WebSocket app delivering the message (unused).
        message: Raw JSON text of the message.
    """
    payload = json.loads(message)
    if payload.get("type") != "trade":
        return

    for tick in payload["data"]:
        symbol, price, volume, ts_millis = tick["s"], tick["p"], tick["v"], tick["t"]

        # Raw trade record; the same object is appended to every interval's list.
        trade_info = {
            'price': price,
            'volume': volume,
            'timestamp': ts_millis
        }

        # Update the in-progress period of every configured interval.
        for interval_key in INTERVALS:
            bucket = current_period_data[interval_key][symbol]

            if bucket['first_trade_price'] is None:
                bucket['first_trade_price'] = price

            bucket['trades'].append(trade_info)
            bucket['high'] = max(bucket['high'], price)
            bucket['low'] = min(bucket['low'], price)
            bucket['volume'] += volume
            bucket['value'] += price * volume
            bucket['count'] += 1

        # Echo the trade (optional; consider disabling when volume is high).
        print(f"[{symbol}] Price={price} Volume={volume} Timestamp={ts_millis}")

def on_error(ws, error):
    """WebSocket callback: report an error on stdout and in the log.

    Args:
        ws: The WebSocket app reporting the error (unused).
        error: The error object or message; formatted with ``str()``.
    """
    text = f"WebSocket错误: {error}"
    print(text)
    logging.error(text)

def on_close(ws, close_status_code, close_msg):
    """WebSocket callback: report the close and reconnect after 5 seconds.

    The reconnect runs on a new thread. Calling ``start_websocket()`` directly
    from this callback would block inside a new ``run_forever()`` while the
    previous one is still on the call stack, so every disconnect would nest one
    level deeper and the earlier connections would never finish tearing down.

    Args:
        ws: The WebSocket app that closed (unused).
        close_status_code: Close status code, or ``None``.
        close_msg: Close message, or ``None``.
    """
    print(f"WebSocket关闭: 状态码={close_status_code}, 消息={close_msg}，5秒后尝试重新连接...")
    logging.error(f"WebSocket关闭: 状态码={close_status_code}, 消息={close_msg}，5秒后尝试重新连接...")

    def reconnect():
        time.sleep(5)
        start_websocket()  # blocks this thread until the new connection closes

    # Non-daemon on purpose: the process must stay alive on this connection
    # after the thread that owned the previous one returns.
    threading.Thread(target=reconnect).start()

def start_websocket():
    """Open a WebSocket connection to ``WS_URL`` and block until it ends.

    Kept as a separate function so it can be called again to reconnect.
    """
    callbacks = {
        "on_open": on_open,
        "on_message": on_message,
        "on_error": on_error,
        "on_close": on_close,
    }
    app = websocket.WebSocketApp(WS_URL, **callbacks)

    # A ping every 30 seconds keeps the connection active.
    app.run_forever(ping_interval=30)

def on_open(ws):
    """WebSocket callback: subscribe to every symbol in ``SYMBOLS``.

    Subscriptions are sent from a separate thread, one message per symbol with
    a 0.1 second pause after each.

    Args:
        ws: The connected WebSocket app used to send the subscribe messages.
    """
    def run():
        # The name `run` is kept: it appears in the thread names of the
        # captured log output (e.g. "Thread-8 (run)").
        for ticker in SYMBOLS:
            ws.send(json.dumps({"type": "subscribe", "symbol": ticker}))
            logger.info(f"Subscribed to {ticker}")
            time.sleep(0.1)

    subscriber = threading.Thread(target=run)
    subscriber.start()

def start_aggregation_timers():
    """Start one daemon thread per entry in ``INTERVALS``.

    Each thread sleeps until the next boundary of its interval, aggregates the
    data collected for that interval, and repeats indefinitely.
    """
    def run_timer(interval_key, minutes):
        while True:
            # Work out how long to wait until the next interval boundary.
            next_time = get_next_interval_time(minutes)
            now = get_eastern_time()
            wait_seconds = (next_time - now).total_seconds()

            logger.info(f"等待 {interval_key} 下一次聚合，将在 {next_time.strftime('%Y-%m-%d %H:%M:%S')} 进行，等待 {wait_seconds:.2f} 秒")

            # Clamp at 0 in case the boundary has already passed.
            time.sleep(max(0, wait_seconds))

            # One failed pass (e.g. a CSV write error) must not end this thread,
            # or the interval would silently stop being aggregated for the rest
            # of the run. Log the traceback and carry on to the next boundary.
            try:
                aggregate_interval_data(interval_key, minutes)
            except Exception:
                logger.exception(f"Aggregation for {interval_key} failed; retrying at the next boundary")

    # One independent thread per interval.
    for interval_key, minutes in INTERVALS.items():
        threading.Thread(target=run_timer, args=(interval_key, minutes), daemon=True).start()

def start_market_hours_check():
    """Start a daemon thread that logs the market status every 30 minutes.

    The market counts as open Monday to Friday from 09:30 (inclusive) to 16:00
    (exclusive) Eastern time. The thread only writes a log line; it does not
    itself pause or resume data collection.
    """
    def check_market_hours():
        open_minute = 9 * 60 + 30   # 09:30
        close_minute = 16 * 60      # 16:00
        while True:
            now = get_eastern_time()
            minute_of_day = now.hour * 60 + now.minute

            # weekday() is 0 for Monday, so values below 5 are Monday-Friday.
            market_open = now.weekday() < 5 and open_minute <= minute_of_day < close_minute

            if market_open:
                logger.info("市场开盘中，数据收集活跃")
            else:
                logger.info("市场已关闭，等待开盘")

            # Check again in 30 minutes.
            time.sleep(1800)

    threading.Thread(target=check_market_hours, daemon=True).start()

### 开始统计

In [None]:
if __name__ == "__main__":
    # Start the per-interval aggregation threads.
    start_aggregation_timers()

    # ----------- Optional: start the periodic market-hours status check -----------
    # start_market_hours_check()

    # Open the WebSocket connection; this call blocks until the connection ends.
    ws_app = websocket.WebSocketApp(
        WS_URL,
        on_open=on_open,
        on_message=on_message,
        on_error=on_error,
        on_close=on_close
    )
    # Use the same 30-second keepalive ping as start_websocket(), so the first
    # connection is configured like every reconnect.
    ws_app.run_forever(ping_interval=30)

2025-05-08 14:14:58,736 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:15:00 进行，等待 1.26 秒"
2025-05-08 14:14:58,736 - Thread-4 (run_timer) - INFO - "等待 5min 下一次聚合，将在 2025-05-08 17:15:00 进行，等待 1.26 秒"
2025-05-08 14:14:58,738 - Thread-7 (run_timer) - INFO - "等待 1h 下一次聚合，将在 2025-05-08 18:00:00 进行，等待 2701.26 秒"
2025-05-08 14:14:58,738 - Thread-5 (run_timer) - INFO - "等待 15min 下一次聚合，将在 2025-05-08 17:15:00 进行，等待 1.26 秒"
2025-05-08 14:14:58,738 - Thread-6 (run_timer) - INFO - "等待 30min 下一次聚合，将在 2025-05-08 17:30:00 进行，等待 901.26 秒"
2025-05-08 14:14:58,960 - MainThread - INFO - "Websocket connected"
2025-05-08 14:14:58,962 - Thread-8 (run) - INFO - "Subscribed to AAPL"
2025-05-08 14:14:59,069 - Thread-8 (run) - INFO - "Subscribed to GOOGL"
2025-05-08 14:14:59,177 - Thread-8 (run) - INFO - "Subscribed to TSLA"
2025-05-08 14:15:00,013 - Thread-4 (run_timer) - INFO - "等待 5min 下一次聚合，将在 2025-05-08 17:20:00 进行，等待 299.99 秒"
2025-05-08 14:15:00,013 - Thread-3 (run_timer) - INFO - "等待 1mi

[GOOGL] Price=153.9 Volume=1 Timestamp=1746738920394
[GOOGL] Price=153.87 Volume=1 Timestamp=1746738930490
[GOOGL] Price=153.87 Volume=3 Timestamp=1746738930490


2025-05-08 14:16:00,004 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:16:00"
2025-05-08 14:16:00,006 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:17:00 进行，等待 59.99 秒"


[TSLA] Price=284.68 Volume=33 Timestamp=1746738959522
[AAPL] Price=197.3 Volume=1 Timestamp=1746738960513
[TSLA] Price=284.65 Volume=33 Timestamp=1746738971312
[TSLA] Price=284.65 Volume=17 Timestamp=1746738971312
[GOOGL] Price=153.89 Volume=6 Timestamp=1746738981297
[GOOGL] Price=153.9 Volume=30 Timestamp=1746738982409
[TSLA] Price=284.66 Volume=2 Timestamp=1746738996287
[AAPL] Price=197.29 Volume=1 Timestamp=1746739005640


2025-05-08 14:17:00,007 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:17:00"
2025-05-08 14:17:00,008 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/TSLA_1min.csv：2025-05-08 17:17:00"
2025-05-08 14:17:00,010 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/AAPL_1min.csv：2025-05-08 17:17:00"
2025-05-08 14:17:00,011 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:18:00 进行，等待 59.99 秒"


[TSLA] Price=284.72 Volume=1 Timestamp=1746739020002
[TSLA] Price=284.64 Volume=51 Timestamp=1746739020091
[GOOGL] Price=153.87 Volume=50 Timestamp=1746739050577
[GOOGL] Price=153.87 Volume=5 Timestamp=1746739050577
[GOOGL] Price=153.87 Volume=3 Timestamp=1746739050577
[GOOGL] Price=153.87 Volume=20 Timestamp=1746739050577
[GOOGL] Price=153.87 Volume=8 Timestamp=1746739050577
[AAPL] Price=197.29 Volume=13 Timestamp=1746739058426
[AAPL] Price=197.29 Volume=87 Timestamp=1746739058426
[AAPL] Price=197.29 Volume=100 Timestamp=1746739061235
[AAPL] Price=197.28 Volume=2 Timestamp=1746739061235
[AAPL] Price=197.27 Volume=13 Timestamp=1746739061235
[AAPL] Price=197.26 Volume=2 Timestamp=1746739061235
[AAPL] Price=197.25 Volume=8 Timestamp=1746739061237
[AAPL] Price=197.25 Volume=1 Timestamp=1746739061237
[AAPL] Price=197.25 Volume=34 Timestamp=1746739061237
[AAPL] Price=197.25 Volume=1 Timestamp=1746739061237
[AAPL] Price=197.25 Volume=1 Timestamp=1746739061238
[AAPL] Price=197.25 Volume=2 Tim

2025-05-08 14:18:00,011 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:18:00"
2025-05-08 14:18:00,012 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/TSLA_1min.csv：2025-05-08 17:18:00"
2025-05-08 14:18:00,013 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/AAPL_1min.csv：2025-05-08 17:18:00"
2025-05-08 14:18:00,014 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:19:00 进行，等待 59.99 秒"


[AAPL] Price=197.18 Volume=4 Timestamp=1746739081635
[AAPL] Price=197.18 Volume=20 Timestamp=1746739081635
[GOOGL] Price=153.88 Volume=1 Timestamp=1746739111536
[TSLA] Price=284.7 Volume=33 Timestamp=1746739124982


2025-05-08 14:19:00,002 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:19:00"
2025-05-08 14:19:00,004 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/TSLA_1min.csv：2025-05-08 17:19:00"
2025-05-08 14:19:00,004 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/AAPL_1min.csv：2025-05-08 17:19:00"
2025-05-08 14:19:00,005 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:20:00 进行，等待 59.99 秒"


[TSLA] Price=284.74 Volume=15 Timestamp=1746739156131
[TSLA] Price=284.66 Volume=15 Timestamp=1746739181035


2025-05-08 14:20:00,004 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/TSLA_1min.csv：2025-05-08 17:20:00"
2025-05-08 14:20:00,005 - Thread-4 (run_timer) - INFO - "数据已写入 ./data/raw/5min/GOOGL_5min.csv：2025-05-08 17:20:00"
2025-05-08 14:20:00,005 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:21:00 进行，等待 59.99 秒"
2025-05-08 14:20:00,007 - Thread-4 (run_timer) - INFO - "数据已写入 ./data/raw/5min/TSLA_5min.csv：2025-05-08 17:20:00"
2025-05-08 14:20:00,008 - Thread-4 (run_timer) - INFO - "数据已写入 ./data/raw/5min/AAPL_5min.csv：2025-05-08 17:20:00"
2025-05-08 14:20:00,009 - Thread-4 (run_timer) - INFO - "等待 5min 下一次聚合，将在 2025-05-08 17:25:00 进行，等待 299.99 秒"


[TSLA] Price=284.65 Volume=33 Timestamp=1746739205461
[TSLA] Price=284.66 Volume=15 Timestamp=1746739208565
[AAPL] Price=197.23 Volume=1 Timestamp=1746739214945
[GOOGL] Price=153.93 Volume=5 Timestamp=1746739232339
[GOOGL] Price=153.92 Volume=1 Timestamp=1746739232340
[GOOGL] Price=153.92 Volume=2 Timestamp=1746739232340
[GOOGL] Price=153.91 Volume=3 Timestamp=1746739232342
[GOOGL] Price=153.9 Volume=1 Timestamp=1746739232344
[GOOGL] Price=153.9 Volume=4 Timestamp=1746739232344
[GOOGL] Price=153.9 Volume=5 Timestamp=1746739232344
[AAPL] Price=197.27 Volume=1 Timestamp=1746739235392
[GOOGL] Price=153.89 Volume=1 Timestamp=1746739238704
[GOOGL] Price=153.88 Volume=1 Timestamp=1746739238705


2025-05-08 14:21:00,010 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:21:00"
2025-05-08 14:21:00,012 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/TSLA_1min.csv：2025-05-08 17:21:00"
2025-05-08 14:21:00,013 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/AAPL_1min.csv：2025-05-08 17:21:00"
2025-05-08 14:21:00,013 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:22:00 进行，等待 59.99 秒"


[TSLA] Price=284.68 Volume=33 Timestamp=1746739269745


2025-05-08 14:22:00,015 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/TSLA_1min.csv：2025-05-08 17:22:00"
2025-05-08 14:22:00,016 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:23:00 进行，等待 59.98 秒"


[GOOGL] Price=153.88 Volume=1 Timestamp=1746739344372
[GOOGL] Price=153.89 Volume=69 Timestamp=1746739372641
[GOOGL] Price=153.89 Volume=17 Timestamp=1746739372641


2025-05-08 14:23:00,016 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:23:00"
2025-05-08 14:23:00,017 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:24:00 进行，等待 59.98 秒"


[AAPL] Price=197.3 Volume=1 Timestamp=1746739404314
[AAPL] Price=197.3 Volume=16 Timestamp=1746739406586


2025-05-08 14:24:00,016 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/AAPL_1min.csv：2025-05-08 17:24:00"
2025-05-08 14:24:00,017 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:25:00 进行，等待 59.98 秒"


[GOOGL] Price=153.92 Volume=6 Timestamp=1746739451412
[GOOGL] Price=153.92 Volume=4 Timestamp=1746739451423
[AAPL] Price=197.37 Volume=1 Timestamp=1746739463846


2025-05-08 14:25:00,005 - Thread-4 (run_timer) - INFO - "数据已写入 ./data/raw/5min/GOOGL_5min.csv：2025-05-08 17:25:00"
2025-05-08 14:25:00,005 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/GOOGL_1min.csv：2025-05-08 17:25:00"
2025-05-08 14:25:00,007 - Thread-4 (run_timer) - INFO - "数据已写入 ./data/raw/5min/TSLA_5min.csv：2025-05-08 17:25:00"
2025-05-08 14:25:00,008 - Thread-3 (run_timer) - INFO - "数据已写入 ./data/raw/1min/AAPL_1min.csv：2025-05-08 17:25:00"
2025-05-08 14:25:00,008 - Thread-3 (run_timer) - INFO - "等待 1min 下一次聚合，将在 2025-05-08 17:26:00 进行，等待 59.99 秒"
2025-05-08 14:25:00,009 - Thread-4 (run_timer) - INFO - "数据已写入 ./data/raw/5min/AAPL_5min.csv：2025-05-08 17:25:00"
2025-05-08 14:25:00,010 - Thread-4 (run_timer) - INFO - "等待 5min 下一次聚合，将在 2025-05-08 17:30:00 进行，等待 299.99 秒"


[GOOGL] Price=153.93 Volume=6 Timestamp=1746739500009


2025-05-08 14:25:08,596 - MainThread - ERROR - "WebSocket错误: "
2025-05-08 14:25:08,667 - MainThread - ERROR - "WebSocket关闭: 状态码=None, 消息=None，5秒后尝试重新连接..."


WebSocket错误: 
WebSocket关闭: 状态码=None, 消息=None，5秒后尝试重新连接...


2025-05-08 14:25:13,985 - MainThread - INFO - "Websocket connected"
2025-05-08 14:25:13,986 - Thread-10 (run) - INFO - "Subscribed to AAPL"
2025-05-08 14:25:14,089 - Thread-10 (run) - INFO - "Subscribed to GOOGL"
2025-05-08 14:25:14,197 - Thread-10 (run) - INFO - "Subscribed to TSLA"
