# TWSE Tick Data Processing Demo

探索多種讀取和處理台證所逐筆資料的方法

## 測試方案
1. **Pandas read_fwf**: 使用 pandas 內建的固定寬度文件讀取
2. **Polars 拍平格式**: 將五檔價量拍平成獨立欄位，優化數據格式
3. **分檔存儲**: 按證券代號分檔案存儲，支援批量讀取

## 數據格式 (190 bytes per record)
- 證券代號 (6), 揭示時間 (12), 各種標記欄位
- 成交價格 (6), 成交張數 (8)
- 買進五檔價量 (70), 賣出五檔價量 (70)
- 揭示日期 (8), 撮合人員 (2)


In [15]:
# 導入必要的套件
import gzip
import struct
import json
from pathlib import Path
from typing import Dict, List, Iterator
from datetime import datetime
import warnings

import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq

warnings.filterwarnings('ignore')
print("📦 套件載入完成")


📦 套件載入完成


---
## 方法 1: Pandas read_fwf 方法

使用 pandas 內建的 `read_fwf` 函數讀取固定寬度檔案


In [16]:
def read_with_pandas_fwf(gz_file: Path, max_rows: int = 1000) -> pd.DataFrame:
    """
    使用 pandas read_fwf 讀取固定寬度檔案
    """
    # 定義欄位寬度 (基於 190 bytes 格式)
    colspecs = [
        (0, 6),     # securities_code
        (6, 18),    # display_time  
        (18, 19),   # remark
        (19, 20),   # trend_flag
        (20, 21),   # match_flag
        (21, 22),   # trade_ul_flag
        (22, 28),   # trade_price
        (28, 36),   # trade_volume
        (36, 37),   # buy_tick_size
        (37, 38),   # buy_ul_flag
        (38, 108),  # buy_5_price_volume (70 bytes)
        (108, 109), # sell_tick_size
        (109, 110), # sell_ul_flag
        (110, 180), # sell_5_price_volume (70 bytes) 
        (180, 188), # display_date
        (188, 190), # match_staff
    ]
    
    column_names = [
        'securities_code', 'display_time', 'remark', 'trend_flag', 'match_flag',
        'trade_ul_flag', 'trade_price', 'trade_volume', 'buy_tick_size', 
        'buy_ul_flag', 'buy_5_raw', 'sell_tick_size', 'sell_ul_flag', 
        'sell_5_raw', 'display_date', 'match_staff'
    ]
    
    print(f"🔍 使用 pandas.read_fwf 讀取: {gz_file}")
    print(f"📊 限制讀取行數: {max_rows:,}")
    
    # 直接從 gzip 檔案讀取
    with gzip.open(gz_file, 'rt', encoding='ascii', errors='ignore') as f:
        df = pd.read_fwf(
            f, 
            colspecs=colspecs,
            names=column_names,
            nrows=max_rows,
            dtype=str  # 先全部讀成字串
        )
    
    print(f"✅ 成功讀取 {len(df):,} 筆記錄")
    return df

# 測試 pandas 方法
gz_file = Path("../snapshot/Sample_new.gz")
if gz_file.exists():
    df_pandas = read_with_pandas_fwf(gz_file, max_rows=50)
    print("\n📋 Pandas 讀取結果預覽:")
    print(f"DataFrame shape: {df_pandas.shape}")
    print("\n前3筆記錄:")
    display(df_pandas[['securities_code', 'display_time', 'trade_price', 'trade_volume']].head(3))
else:
    print("❌ 測試檔案不存在")


🔍 使用 pandas.read_fwf 讀取: ../snapshot/Sample_new.gz
📊 限制讀取行數: 50
✅ 成功讀取 40 筆記錄

📋 Pandas 讀取結果預覽:
DataFrame shape: (40, 16)

前3筆記錄:


Unnamed: 0,securities_code,display_time,trade_price,trade_volume
0,50,83004446448,0,0
1,50,83009462008,0,0
2,50,83014478574,0,0


---
## 方法 2: Polars 拍平格式

使用 Polars 並將五檔價量拍平成獨立欄位，優化數據型別


In [17]:
# 定義拍平格式的 Schema
def create_flattened_schema() -> pa.Schema:
    """
    創建拍平格式的 Arrow Schema
    """
    fields = [
        # ("securities_code", pa.dictionary(pa.int8(), pa.string())),  # 類別型
        ("securities_code", pa.string()),
        ("datetime", pa.timestamp('us')),  # 合併時間
        ("remark", pa.dictionary(pa.int8(), pa.string())),
        ("trend_flag", pa.dictionary(pa.int8(), pa.string())),
        ("match_flag", pa.dictionary(pa.int8(), pa.string())),
        ("trade_ul_flag", pa.dictionary(pa.int8(), pa.string())),
        ("trade_price", pa.float64()),
        ("trade_volume", pa.int64()),
        ("buy_tick_size", pa.int8()),
        ("buy_ul_flag", pa.dictionary(pa.int8(), pa.string())),
        ("sell_tick_size", pa.int8()),
        ("sell_ul_flag", pa.dictionary(pa.int8(), pa.string())),
        ("match_staff", pa.dictionary(pa.int8(), pa.string())),
    ]
    
    # 買進五檔價量 (拍平)
    for i in range(1, 6):
        fields.extend([
            (f"bid_price_{i}", pa.float64()),
            (f"bid_volume_{i}", pa.int32()),
        ])
    
    # 賣出五檔價量 (拍平)
    for i in range(1, 6):
        fields.extend([
            (f"ask_price_{i}", pa.float64()),
            (f"ask_volume_{i}", pa.int32()),
        ])
    
    return pa.schema(fields)

FLATTENED_SCHEMA = create_flattened_schema()
print(f"📋 拍平格式 Schema 包含 {len(FLATTENED_SCHEMA)} 個欄位")
print("\n🔍 Schema 預覽:")
for i, field in enumerate(FLATTENED_SCHEMA):
    if i < 10:  # 只顯示前10個欄位
        print(f"  {field.name:20s}: {field.type}")
    elif i == 10:
        print("  ... (更多欄位)")
        break


📋 拍平格式 Schema 包含 33 個欄位

🔍 Schema 預覽:
  securities_code     : string
  datetime            : timestamp[us]
  remark              : dictionary<values=string, indices=int8, ordered=0>
  trend_flag          : dictionary<values=string, indices=int8, ordered=0>
  match_flag          : dictionary<values=string, indices=int8, ordered=0>
  trade_ul_flag       : dictionary<values=string, indices=int8, ordered=0>
  trade_price         : double
  trade_volume        : int64
  buy_tick_size       : int8
  buy_ul_flag         : dictionary<values=string, indices=int8, ordered=0>
  ... (更多欄位)


In [18]:
def parse_5_levels_flattened(raw_bytes: bytes) -> List[float]:
    """
    解析 70 bytes 的五檔價量，回傳拍平的 [price1, vol1, price2, vol2, ...]
    """
    result = []
    for i in range(5):
        offset = i * 14
        price_bytes = raw_bytes[offset:offset+6]
        volume_bytes = raw_bytes[offset+6:offset+14]
        
        try:
            price_str = price_bytes.decode().strip()
            volume_str = volume_bytes.decode().strip()
            
            price = float(price_str) / 100.0 if price_str else 0.0
            volume = int(volume_str) if volume_str else 0
        except (ValueError, UnicodeDecodeError):
            price = 0.0
            volume = 0
            
        result.extend([price, volume])
    
    return result

def parse_datetime(date_str: str, time_str: str) -> datetime:
    """
    合併日期和時間成 datetime 物件
    """
    try:
        # 處理時間格式 (HHMMSSSSSSSS -> HH:MM:SS.SSSSSS)
        if len(time_str) >= 6:
            hour = int(time_str[:2])
            minute = int(time_str[2:4])
            second = int(time_str[4:6])
            microsecond = int(time_str[6:].ljust(6, '0')[:6]) if len(time_str) > 6 else 0
        else:
            return datetime(2024, 1, 1)  # 預設值
        
        # 處理日期格式 (YYYYMMDD)
        year = int(date_str[:4])
        month = int(date_str[4:6])
        day = int(date_str[6:8])
        
        return datetime(year, month, day, hour, minute, second, microsecond)
    except (ValueError, IndexError):
        return datetime(2024, 1, 1)  # 預設值

def safe_categorical(value: str) -> str:
    """處理類別型數據"""
    return value.strip() if value.strip() else "Unknown"

print("🔧 拍平格式解析函數定義完成")


🔧 拍平格式解析函數定義完成


In [19]:
# 修正parse_line_flattened函數，使用正確的category處理
def parse_line_flattened_fixed(line: bytes) -> List:
    """
    解析單行記錄成拍平格式（修正版本）
    """
    if len(line) != 190:
        raise ValueError(f"Line length should be 190 bytes, got {len(line)}")
    
    # 使用 struct 解析
    FMT = struct.Struct("6s12s1s1s1s1s6s8s1s1s70s1s1s70s8s2s")
    rec = FMT.unpack(line)
    
    # 基本欄位
    securities_code = rec[0].decode().strip()
    time_str = rec[1].decode().strip()
    date_str = rec[14].decode().strip()
    
    # 合併成 datetime
    dt = parse_datetime(date_str, time_str)
    
    # 數值欄位
    try:
        trade_price = float(rec[6].decode().strip()) / 100.0 if rec[6].decode().strip() else 0.0
    except ValueError:
        trade_price = 0.0
        
    try:
        trade_volume = int(rec[7].decode().strip()) if rec[7].decode().strip() else 0
    except ValueError:
        trade_volume = 0
    
    # 解析五檔資料
    bid_data = parse_5_levels_flattened(rec[10])  # [price1, vol1, price2, vol2, ...]
    ask_data = parse_5_levels_flattened(rec[13])
    
    # 組裝最終記錄 - 使用修正的category處理
    result = [
        securities_code,
        dt,
        safe_categorical_fixed(rec[2].decode(), 'remark'),
        safe_categorical_fixed(rec[3].decode(), 'trend_flag'),
        safe_categorical_fixed(rec[4].decode(), 'match_flag'),
        safe_categorical_fixed(rec[5].decode(), 'trade_ul_flag'),
        trade_price,
        trade_volume,
        int(rec[8].decode().strip()) if rec[8].decode().strip() else 0,
        safe_categorical_fixed(rec[9].decode(), 'buy_ul_flag'),
        int(rec[11].decode().strip()) if rec[11].decode().strip() else 0,
        safe_categorical_fixed(rec[12].decode(), 'sell_ul_flag'),
        safe_categorical_fixed(rec[15].decode(), 'match_staff'),
    ]
    
    # 加入拍平的買進五檔
    result.extend(bid_data)
    # 加入拍平的賣出五檔
    result.extend(ask_data)
    
    return result

print("🔧 修正版解析函數定義完成")


🔧 修正版解析函數定義完成


In [20]:
# 修正category字段并重新生成拍平格式文件
def safe_categorical_fixed(value: str, field_name: str) -> str:
    """處理類別型數據，根據欄位設定預設值"""
    clean_value = value.strip()
    if clean_value:
        return clean_value
    
    # 根據欄位設定適當的預設值
    defaults = {
        'remark': 'Normal',           # 空白表示一般揭示
        'trend_flag': 'Normal',       # 空白表示未實施穩定措施  
        'match_flag': 'No',           # 空白表示未成交
        'trade_ul_flag': 'Normal',    # 空白表示正常
        'buy_ul_flag': 'Normal',      # 空白表示正常
        'sell_ul_flag': 'Normal',     # 空白表示正常
        'match_staff': 'Unknown'      # 撮合人員可能為空
    }
    
    return defaults.get(field_name, 'Unknown')

def convert_to_flattened_parquet_fixed(src: Path, dst: Path, max_rows: int = 10000) -> None:
    """
    轉換成拍平格式的 Parquet（修正版本）
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    
    print(f"🔄 轉換成拍平格式（修正版）: {src} -> {dst}")
    print(f"📊 限制處理: {max_rows:,} 筆記錄")
    
    records = []
    processed = 0
    
    with gzip.open(src, "rb") as fh:
        for line in fh:
            if processed >= max_rows:
                break
                
            try:
                clean_line = line.rstrip(b'\\n\\r')
                if len(clean_line) == 190:
                    record = parse_line_flattened_fixed(clean_line)
                    records.append(record)
                    processed += 1
                    
                    if processed % 1000 == 0:
                        print(f"⏳ 已處理 {processed:,} 筆記錄")
                        
            except Exception as e:
                print(f"⚠️ 第 {processed+1} 行解析失敗: {e}")
                continue
    
    if records:
        # 轉換成 Arrow Table
        cols = list(zip(*records))
        arrays = [pa.array(col, type=t) for col, t in zip(cols, FLATTENED_SCHEMA.types)]
        table = pa.Table.from_arrays(arrays, schema=FLATTENED_SCHEMA)
        
        # 寫入 Parquet
        pq.write_table(table, dst, compression="zstd", compression_level=7)
        
        file_size = dst.stat().st_size / 1024 / 1024
        print(f"✅ 轉換完成！共 {len(records):,} 筆記錄")
        print(f"💾 檔案大小: {file_size:.2f} MB")
    else:
        print("❌ 沒有成功解析的記錄")

# 重新生成修正版的拍平格式文件
if gz_file.exists():
    flattened_pq_fixed = Path("./demo_flattened_fixed.parquet")
    convert_to_flattened_parquet_fixed(gz_file, flattened_pq_fixed, max_rows=100)
    
    # 檢查修正後的category值
    if flattened_pq_fixed.exists():
        print("\\n🔍 修正後的Category字段檢查:")
        df_fixed = pl.read_parquet(flattened_pq_fixed)
        print("remark:", df_fixed['remark'].value_counts())
        print("trend_flag:", df_fixed['trend_flag'].value_counts())
        print("match_flag:", df_fixed['match_flag'].value_counts())
        print("trade_ul_flag:", df_fixed['trade_ul_flag'].value_counts())
        
        # 更新變數指向修正版檔案
        flattened_pq = flattened_pq_fixed
        print("\\n✅ 已更新為修正版檔案")
    
else:
    print("❌ 測試檔案不存在")


🔄 轉換成拍平格式（修正版）: ../snapshot/Sample_new.gz -> demo_flattened_fixed.parquet
📊 限制處理: 100 筆記錄
❌ 沒有成功解析的記錄


In [21]:
def parse_line_flattened(line: bytes) -> List:
    """
    解析單行記錄成拍平格式
    """
    if len(line) != 190:
        raise ValueError(f"Line length should be 190 bytes, got {len(line)}")
    
    # 使用 struct 解析
    FMT = struct.Struct("6s12s1s1s1s1s6s8s1s1s70s1s1s70s8s2s")
    rec = FMT.unpack(line)
    
    # 基本欄位
    securities_code = rec[0].decode().strip()
    time_str = rec[1].decode().strip()
    date_str = rec[14].decode().strip()
    
    # 合併成 datetime
    dt = parse_datetime(date_str, time_str)
    
    # 數值欄位
    try:
        trade_price = float(rec[6].decode().strip()) / 100.0 if rec[6].decode().strip() else 0.0
    except ValueError:
        trade_price = 0.0
        
    try:
        trade_volume = int(rec[7].decode().strip()) if rec[7].decode().strip() else 0
    except ValueError:
        trade_volume = 0
    
    # 解析五檔資料
    bid_data = parse_5_levels_flattened(rec[10])  # [price1, vol1, price2, vol2, ...]
    ask_data = parse_5_levels_flattened(rec[13])
    
    # 組裝最終記錄
    result = [
        securities_code,
        dt,
        safe_categorical(rec[2].decode()),
        safe_categorical(rec[3].decode()),
        safe_categorical(rec[4].decode()),
        safe_categorical(rec[5].decode()),
        trade_price,
        trade_volume,
        int(rec[8].decode().strip()) if rec[8].decode().strip() else 0,
        safe_categorical(rec[9].decode()),
        int(rec[11].decode().strip()) if rec[11].decode().strip() else 0,
        safe_categorical(rec[12].decode()),
        safe_categorical(rec[15].decode()),
    ]
    
    # 加入拍平的買進五檔
    result.extend(bid_data)
    # 加入拍平的賣出五檔
    result.extend(ask_data)
    
    return result

def convert_to_flattened_parquet(src: Path, dst: Path, max_rows: int = 10000) -> None:
    """
    轉換成拍平格式的 Parquet
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    
    print(f"🔄 轉換成拍平格式: {src} -> {dst}")
    print(f"📊 限制處理: {max_rows:,} 筆記錄")
    
    records = []
    processed = 0
    
    with gzip.open(src, "rb") as fh:
        for line in fh:
            if processed >= max_rows:
                break
                
            try:
                clean_line = line.rstrip(b'\n\r')
                if len(clean_line) == 190:
                    record = parse_line_flattened(clean_line)
                    records.append(record)
                    processed += 1
                    
                    if processed % 1000 == 0:
                        print(f"⏳ 已處理 {processed:,} 筆記錄")
                        
            except Exception as e:
                print(f"⚠️ 第 {processed+1} 行解析失敗: {e}")
                continue
    
    if records:
        # 轉換成 Arrow Table
        cols = list(zip(*records))
        arrays = [pa.array(col, type=t) for col, t in zip(cols, FLATTENED_SCHEMA.types)]
        table = pa.Table.from_arrays(arrays, schema=FLATTENED_SCHEMA)
        
        # 寫入 Parquet
        pq.write_table(table, dst, compression="zstd", compression_level=7)
        
        file_size = dst.stat().st_size / 1024 / 1024
        print(f"✅ 轉換完成！共 {len(records):,} 筆記錄")
        print(f"💾 檔案大小: {file_size:.2f} MB")
    else:
        print("❌ 沒有成功解析的記錄")

# 測試拍平格式轉換
if gz_file.exists():
    flattened_pq = Path("./demo_flattened.parquet")
    convert_to_flattened_parquet(gz_file, flattened_pq, max_rows=100)
else:
    print("❌ 測試檔案不存在")


🔄 轉換成拍平格式: ../snapshot/Sample_new.gz -> demo_flattened.parquet
📊 限制處理: 100 筆記錄
✅ 轉換完成！共 40 筆記錄
💾 檔案大小: 0.01 MB


In [22]:
# 預覽拍平格式結果
flattened_pq = Path("./demo_flattened.parquet")
if flattened_pq.exists():
    print("📊 拍平格式 Parquet 預覽:")
    
    # 使用 Polars 讀取
    df_flat = pl.read_parquet(flattened_pq)
    print(f"\n📏 資料形狀: {df_flat.shape}")
    print(f"💾 記憶體使用: {df_flat.estimated_size('mb'):.2f} MB")
    
    print("\n🔍 Schema 概覽:")
    # for name, dtype in list(df_flat.schema.items())[:15]:  # 前15個欄位
    #     print(f"  {name:20s}: {dtype}")
    # print("  ... (更多欄位)")
    for name, dtype in list(df_flat.schema.items()):
        print(f"  {name:20s}: {dtype}")
    
    print("\n📋 基本欄位預覽:")
    basic_cols = ['securities_code', 'datetime', 'trade_price', 'trade_volume', 
                  'bid_price_1', 'bid_volume_1', 'ask_price_1', 'ask_volume_1']
    display(df_flat.select(basic_cols).head(5))
    
    print("\n📈 數據統計:")
    stats = df_flat.select([
        pl.col('securities_code').n_unique().alias('不重複證券數'),
        pl.col('trade_price').filter(pl.col('trade_price') > 0).count().alias('有成交價筆數'),
        pl.col('trade_volume').sum().alias('總成交量'),
        pl.col('bid_price_1').filter(pl.col('bid_price_1') > 0).count().alias('有買一價筆數'),
    ])
    display(stats)
else:
    print("❌ 拍平格式檔案不存在")


📊 拍平格式 Parquet 預覽:

📏 資料形狀: (40, 33)
💾 記憶體使用: 0.01 MB

🔍 Schema 概覽:
  securities_code     : String
  datetime            : Datetime(time_unit='us', time_zone=None)
  remark              : Categorical(ordering='physical')
  trend_flag          : Categorical(ordering='physical')
  match_flag          : Categorical(ordering='physical')
  trade_ul_flag       : Categorical(ordering='physical')
  trade_price         : Float64
  trade_volume        : Int64
  buy_tick_size       : Int8
  buy_ul_flag         : Categorical(ordering='physical')
  sell_tick_size      : Int8
  sell_ul_flag        : Categorical(ordering='physical')
  match_staff         : Categorical(ordering='physical')
  bid_price_1         : Float64
  bid_volume_1        : Int32
  bid_price_2         : Float64
  bid_volume_2        : Int32
  bid_price_3         : Float64
  bid_volume_3        : Int32
  bid_price_4         : Float64
  bid_volume_4        : Int32
  bid_price_5         : Float64
  bid_volume_5        : Int32
  ask_p

securities_code,datetime,trade_price,trade_volume,bid_price_1,bid_volume_1,ask_price_1,ask_volume_1
str,datetime[μs],f64,i64,f64,i32,f64,i32
"""0050""",2024-11-11 08:30:04.446448,0.0,0,199.5,29,203.0,2
"""0050""",2024-11-11 08:30:09.462008,0.0,0,199.5,27,200.0,1
"""0050""",2024-11-11 08:30:14.478574,0.0,0,199.5,17,200.0,2
"""0050""",2024-11-11 08:30:19.493222,0.0,0,199.5,14,200.0,2
"""0050""",2024-11-11 08:30:24.508770,0.0,0,199.5,12,200.0,13



📈 數據統計:


不重複證券數,有成交價筆數,總成交量,有買一價筆數
u32,u32,i64,u32
2,20,190165,40


In [23]:
def split_by_securities_and_date(src_parquet: Path, output_dir: Path) -> Dict[str, Dict[str, Path]]:
    """
    將 Parquet 檔案按證券代號和日期分檔存儲
    結構: output_dir/YYYY-MM-DD/securities_code.parquet
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    
    print(f"📁 按證券代號和日期分檔: {src_parquet} -> {output_dir}/")
    
    # 讀取原始檔案
    df = pl.read_parquet(src_parquet)
    
    # 從datetime欄位提取日期
    df = df.with_columns([
        pl.col('datetime').dt.date().alias('date_str')
    ])
    
    # 獲取所有日期和證券代號組合
    date_securities = df.select(['date_str', 'securities_code']).unique().sort(['date_str', 'securities_code'])
    
    file_mapping = {}
    
    for row in date_securities.iter_rows(named=True):
        date_val = row['date_str']
        code = row['securities_code']
        
        # 創建日期目錄
        date_dir = output_dir / str(date_val)
        date_dir.mkdir(parents=True, exist_ok=True)
        
        # 過濾該日期和證券的資料
        filtered_df = df.filter(
            (pl.col('date_str') == date_val) & 
            (pl.col('securities_code') == code)
        ).drop('date_str')  # 移除輔助欄位
        
        # 儲存檔案
        output_file = date_dir / f"{code}.parquet"
        filtered_df.write_parquet(
            output_file, 
            compression="zstd", 
            compression_level=7
        )
        
        # 記錄檔案映射
        if str(date_val) not in file_mapping:
            file_mapping[str(date_val)] = {}
        file_mapping[str(date_val)][code] = output_file
        
        file_size = output_file.stat().st_size / 1024
        print(f"  ✅ {date_val}/{code}: {len(filtered_df):,} 筆記錄 -> {output_file} ({file_size:.1f} KB)")
    
    return file_mapping

def batch_read_by_date_and_securities(
    data_dir: Path, 
    target_date: str = None, 
    securities_codes: List[str] = None,
    engine: str = 'polars'  # 'polars' or 'pandas'
):
    """
    批量讀取指定日期和證券的資料
    
    Args:
        data_dir: 數據目錄
        target_date: 目標日期 (YYYY-MM-DD)，None表示所有日期
        securities_codes: 證券代號列表，None表示所有證券
        engine: 讀取引擎 ('polars' 或 'pandas')
    """
    
    # 確定要讀取的日期目錄
    if target_date:
        date_dirs = [data_dir / target_date] if (data_dir / target_date).exists() else []
    else:
        date_dirs = [d for d in data_dir.iterdir() if d.is_dir()]
    
    if not date_dirs:
        print("❌ 找不到符合條件的日期目錄")
        return pl.DataFrame() if engine == 'polars' else pd.DataFrame()
    
    print(f"📖 批量讀取資料:")
    print(f"  📅 日期: {target_date or '所有日期'} ({len(date_dirs)} 個目錄)")
    print(f"  📊 證券: {securities_codes or '所有證券'}")
    print(f"  🔧 引擎: {engine}")
    
    dfs = []
    total_records = 0
    
    for date_dir in sorted(date_dirs):
        date_str = date_dir.name
        print(f"\\n  📅 處理日期: {date_str}")
        
        # 確定要讀取的檔案
        if securities_codes:
            parquet_files = [date_dir / f"{code}.parquet" for code in securities_codes 
                           if (date_dir / f"{code}.parquet").exists()]
        else:
            parquet_files = list(date_dir.glob("*.parquet"))
        
        if not parquet_files:
            print(f"    ⚠️ 該日期下沒有符合條件的檔案")
            continue
            
        # 讀取該日期的所有檔案
        date_dfs = []
        for file_path in parquet_files:
            code = file_path.stem
            try:
                if engine == 'polars':
                    df = pl.read_parquet(file_path)
                else:  # pandas
                    df = pd.read_parquet(file_path)
                
                date_dfs.append(df)
                record_count = len(df)
                total_records += record_count
                print(f"    ✅ {code}: {record_count:,} 筆記錄")
                
            except Exception as e:
                print(f"    ❌ {code}: 讀取失敗 - {e}")
        
        # 合併該日期的所有資料
        if date_dfs:
            if engine == 'polars':
                date_combined = pl.concat(date_dfs)
            else:  # pandas
                date_combined = pd.concat(date_dfs, ignore_index=True)
            dfs.append(date_combined)
    
    # 合併所有日期的資料
    if dfs:
        if engine == 'polars':
            combined_df = pl.concat(dfs)
        else:  # pandas
            combined_df = pd.concat(dfs, ignore_index=True)
            
        print(f"\\n🔗 批量讀取完成")
        print(f"  📊 總計: {total_records:,} 筆記錄")
        print(f"  📏 最終形狀: {combined_df.shape}")
        return combined_df
    else:
        print("\\n❌ 沒有找到任何資料")
        return pl.DataFrame() if engine == 'polars' else pd.DataFrame()

print("🔧 進階批量讀取函數定義完成")


🔧 進階批量讀取函數定義完成


In [24]:
# 測試進階分檔功能
if flattened_pq.exists():
    # 1. 按日期和證券分檔
    date_based_dir = Path("./securities_by_date/")
    print("📁 測試按日期分檔功能:")
    file_mapping = split_by_securities_and_date(flattened_pq, date_based_dir)
    
    print(f"\\n📋 分檔結果:")
    for date_str, securities in file_mapping.items():
        print(f"  📅 {date_str}: {len(securities)} 個證券")
        for code, path in securities.items():
            print(f"    📊 {code} -> {path.name}")

else:
    print("❌ 無法進行進階測試，拍平格式檔案不存在")


📁 測試按日期分檔功能:
📁 按證券代號和日期分檔: demo_flattened.parquet -> securities_by_date/
  ✅ 2024-11-11/0050: 20 筆記錄 -> securities_by_date/2024-11-11/0050.parquet (12.4 KB)
  ✅ 2024-11-11/9958: 20 筆記錄 -> securities_by_date/2024-11-11/9958.parquet (12.1 KB)
\n📋 分檔結果:
  📅 2024-11-11: 2 個證券
    📊 0050 -> 0050.parquet
    📊 9958 -> 9958.parquet


In [25]:
# 測試批量讀取功能
date_based_dir = Path("./securities_by_date/")
if date_based_dir.exists():
    print("\\n" + "="*50)
    print("📖 測試批量讀取功能")
    print("="*50)
    
    # 1. 讀取單日單票 (使用polars)
    print("\\n🔸 測試 1: 單日單票 (0050)")
    try:
        df_single = batch_read_by_date_and_securities(
            date_based_dir, 
            target_date="2024-11-11", 
            securities_codes=["0050"],
            engine='polars'
        )
        if len(df_single) > 0:
            print(f"✅ 讀取成功: {df_single.shape}")
            print("前3筆預覽:")
            print(df_single.select(['securities_code', 'datetime', 'trade_price', 'bid_price_1']).head(3))
    except Exception as e:
        print(f"❌ 讀取失敗: {e}")
    
    # 2. 讀取單日多票 (使用pandas)
    print("\\n🔸 測試 2: 單日多票 (0050, 9958)")
    try:
        df_multi = batch_read_by_date_and_securities(
            date_based_dir, 
            target_date="2024-11-11", 
            securities_codes=["0050", "9958"],
            engine='pandas'
        )
        if len(df_multi) > 0:
            print(f"✅ 讀取成功: {df_multi.shape}")
            print("各證券筆數統計:")
            print(df_multi.groupby('securities_code').size())
    except Exception as e:
        print(f"❌ 讀取失敗: {e}")
    
    # 3. 讀取單日所有證券
    print("\\n🔸 測試 3: 單日所有證券")
    try:
        df_all = batch_read_by_date_and_securities(
            date_based_dir, 
            target_date="2024-11-11", 
            securities_codes=None,  # 所有證券
            engine='polars'
        )
        if len(df_all) > 0:
            print(f"✅ 讀取成功: {df_all.shape}")
            print("數據統計:")
            stats = df_all.select([
                pl.col('securities_code').n_unique().alias('證券數量'),
                pl.col('trade_price').filter(pl.col('trade_price') > 0).count().alias('有成交筆數'),
                pl.col('datetime').min().alias('最早時間'),
                pl.col('datetime').max().alias('最晚時間'),
            ])
            print(stats)
    except Exception as e:
        print(f"❌ 讀取失敗: {e}")

else:
    print("❌ 找不到按日期分檔的測試資料")


📖 測試批量讀取功能
\n🔸 測試 1: 單日單票 (0050)
📖 批量讀取資料:
  📅 日期: 2024-11-11 (1 個目錄)
  📊 證券: ['0050']
  🔧 引擎: polars
\n  📅 處理日期: 2024-11-11
    ✅ 0050: 20 筆記錄
\n🔗 批量讀取完成
  📊 總計: 20 筆記錄
  📏 最終形狀: (20, 33)
✅ 讀取成功: (20, 33)
前3筆預覽:
shape: (3, 4)
┌─────────────────┬────────────────────────────┬─────────────┬─────────────┐
│ securities_code ┆ datetime                   ┆ trade_price ┆ bid_price_1 │
│ ---             ┆ ---                        ┆ ---         ┆ ---         │
│ str             ┆ datetime[μs]               ┆ f64         ┆ f64         │
╞═════════════════╪════════════════════════════╪═════════════╪═════════════╡
│ 0050            ┆ 2024-11-11 08:30:04.446448 ┆ 0.0         ┆ 199.5       │
│ 0050            ┆ 2024-11-11 08:30:09.462008 ┆ 0.0         ┆ 199.5       │
│ 0050            ┆ 2024-11-11 08:30:14.478574 ┆ 0.0         ┆ 199.5       │
└─────────────────┴────────────────────────────┴─────────────┴─────────────┘
\n🔸 測試 2: 單日多票 (0050, 9958)
📖 批量讀取資料:
  📅 日期: 2024-11-11 (1 個目錄)
  📊 證券: ['0050'

---
## 方法 3: 分檔存儲 (按證券代號)

將不同證券代號的資料分別存儲，方便個股分析


In [26]:
def split_by_securities(src_parquet: Path, output_dir: Path) -> Dict[str, Path]:
    """
    將 Parquet 檔案按證券代號分檔存儲
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    
    print(f"📁 按證券代號分檔: {src_parquet} -> {output_dir}/")
    
    # 讀取原始檔案
    df = pl.read_parquet(src_parquet)
    securities_list = df.select('securities_code').unique().sort('securities_code')
    
    print(f"🔍 發現 {len(securities_list)} 個不同證券代號")
    
    file_mapping = {}
    
    for row in securities_list.iter_rows(named=True):
        code = row['securities_code']
        
        # 過濾該證券的資料
        security_df = df.filter(pl.col('securities_code') == code)
        
        # 儲存檔案
        output_file = output_dir / f"{code}.parquet"
        security_df.write_parquet(
            output_file, 
            compression="zstd", 
            compression_level=7
        )
        
        file_mapping[code] = output_file
        file_size = output_file.stat().st_size / 1024
        print(f"  ✅ {code}: {len(security_df):,} 筆記錄 -> {output_file.name} ({file_size:.1f} KB)")
    
    return file_mapping

def batch_read_securities(securities_dir: Path, codes = None) -> pl.DataFrame:
    """
    批量讀取指定證券代號的資料
    """
    if codes is None:
        # 讀取目錄下所有 .parquet 檔案
        parquet_files = list(securities_dir.glob("*.parquet"))
        codes = [f.stem for f in parquet_files]
    
    print(f"📖 批量讀取 {len(codes)} 個證券: {codes}")
    
    dfs = []
    for code in codes:
        file_path = securities_dir / f"{code}.parquet"
        if file_path.exists():
            df = pl.read_parquet(file_path)
            dfs.append(df)
            print(f"  ✅ {code}: {len(df):,} 筆記錄")
        else:
            print(f"  ❌ {code}: 檔案不存在")
    
    if dfs:
        combined_df = pl.concat(dfs)
        print(f"🔗 合併完成，總計 {len(combined_df):,} 筆記錄")
        return combined_df
    else:
        print("❌ 沒有找到任何資料")
        return pl.DataFrame()

# 測試分檔功能
if flattened_pq.exists():
    securities_dir = Path("./securities_split/")
    file_mapping = split_by_securities(flattened_pq, securities_dir)
    
    print(f"\n📁 分檔結果: {len(file_mapping)} 個檔案")
    for code, path in file_mapping.items():
        print(f"  {code} -> {path.name}")
else:
    print("❌ 無法進行分檔測試，拍平格式檔案不存在")


📁 按證券代號分檔: demo_flattened.parquet -> securities_split/
🔍 發現 2 個不同證券代號
  ✅ 0050: 20 筆記錄 -> 0050.parquet (12.4 KB)
  ✅ 9958: 20 筆記錄 -> 9958.parquet (12.1 KB)

📁 分檔結果: 2 個檔案
  0050 -> 0050.parquet
  9958 -> 9958.parquet


In [27]:
batch_read_securities(Path("./securities_split/"))

📖 批量讀取 2 個證券: ['9958', '0050']
  ✅ 9958: 20 筆記錄
  ✅ 0050: 20 筆記錄
🔗 合併完成，總計 40 筆記錄


securities_code,datetime,remark,trend_flag,match_flag,trade_ul_flag,trade_price,trade_volume,buy_tick_size,buy_ul_flag,sell_tick_size,sell_ul_flag,match_staff,bid_price_1,bid_volume_1,bid_price_2,bid_volume_2,bid_price_3,bid_volume_3,bid_price_4,bid_volume_4,bid_price_5,bid_volume_5,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,ask_price_4,ask_volume_4,ask_price_5,ask_volume_5
str,datetime[μs],cat,cat,cat,cat,f64,i64,i8,cat,i8,cat,cat,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32,f64,i32
"""9958""",2024-11-11 13:28:17.464384,"""T""","""Unknown""","""Unknown""","""Unknown""",184.5,9491,5,"""Unknown""",5,"""Unknown""","""AA""",185.0,8,184.5,115,184.0,173,183.5,46,183.0,56,185.5,7,186.0,12,186.5,7,187.0,43,187.5,26
"""9958""",2024-11-11 13:28:27.497621,"""T""","""Unknown""","""Unknown""","""Unknown""",184.5,9491,5,"""Unknown""",5,"""Unknown""","""AA""",184.5,115,184.0,173,183.5,46,183.0,56,182.5,34,185.0,6,185.5,7,186.0,12,186.5,7,187.0,43
"""9958""",2024-11-11 13:28:32.513228,"""T""","""Unknown""","""Unknown""","""Unknown""",184.5,9491,5,"""Unknown""",5,"""Unknown""","""AA""",184.5,115,184.0,175,183.5,46,183.0,52,182.5,34,185.0,5,185.5,7,186.0,12,186.5,7,187.0,43
"""9958""",2024-11-11 13:28:37.530112,"""T""","""Unknown""","""Unknown""","""Unknown""",184.5,9491,5,"""Unknown""",5,"""Unknown""","""AA""",184.5,115,184.0,175,183.5,46,183.0,52,182.5,34,185.0,3,185.5,7,186.0,12,186.5,7,187.0,43
"""9958""",2024-11-11 13:28:42.545459,"""T""","""Unknown""","""Unknown""","""Unknown""",184.5,9491,5,"""Unknown""",5,"""Unknown""","""AA""",184.5,115,184.0,175,183.5,46,183.0,53,182.5,36,185.0,1,185.5,7,186.0,12,186.5,7,187.0,43
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""0050""",2024-11-11 08:31:19.677393,"""T""","""Unknown""","""Unknown""","""Unknown""",0.0,0,5,"""Unknown""",5,"""Unknown""","""AA""",199.05,2,199.0,68,198.95,2,198.9,1,198.85,1,199.2,2,199.3,1,199.5,2,199.6,1,199.8,2
"""0050""",2024-11-11 08:31:24.694034,"""T""","""Unknown""","""Unknown""","""Unknown""",0.0,0,5,"""Unknown""",5,"""Unknown""","""AA""",199.05,2,199.0,68,198.95,2,198.9,1,198.85,1,199.2,2,199.3,1,199.5,3,199.6,1,199.8,2
"""0050""",2024-11-11 08:31:29.709667,"""T""","""Unknown""","""Unknown""","""Unknown""",0.0,0,5,"""Unknown""",5,"""Unknown""","""AA""",199.0,68,198.95,2,198.9,1,198.85,1,198.8,2,199.05,2,199.1,1,199.2,2,199.3,1,199.5,3
"""0050""",2024-11-11 08:31:34.726835,"""T""","""Unknown""","""Unknown""","""Unknown""",0.0,0,5,"""Unknown""",5,"""Unknown""","""AA""",199.0,74,198.95,2,198.9,1,198.85,1,198.8,2,199.05,1,199.1,1,199.2,2,199.3,1,199.5,4


---

In [28]:
import pandas as pd
pd.read_parquet("./securities_split/0050.parquet")

Unnamed: 0,securities_code,datetime,remark,trend_flag,match_flag,trade_ul_flag,trade_price,trade_volume,buy_tick_size,buy_ul_flag,...,ask_price_1,ask_volume_1,ask_price_2,ask_volume_2,ask_price_3,ask_volume_3,ask_price_4,ask_volume_4,ask_price_5,ask_volume_5
0,50,2024-11-11 08:30:04.446448,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,203.0,2,0.0,0,0.0,0,0.0,0,0.0,0
1,50,2024-11-11 08:30:09.462008,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,200.0,1,200.65,1,201.0,1,201.5,1,203.0,4
2,50,2024-11-11 08:30:14.478574,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,200.0,2,200.65,1,201.0,2,201.5,1,203.0,4
3,50,2024-11-11 08:30:19.493222,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,200.0,2,200.2,1,200.3,1,200.6,6,200.65,1
4,50,2024-11-11 08:30:24.508770,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,200.0,13,200.2,3,200.3,1,200.5,3,200.6,6
5,50,2024-11-11 08:30:29.525440,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,200.0,34,200.2,3,200.3,1,200.5,3,200.6,6
6,50,2024-11-11 08:30:34.542076,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,199.8,1,200.0,40,200.2,4,200.3,1,200.5,4
7,50,2024-11-11 08:30:39.555650,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,199.8,1,200.0,49,200.1,1,200.2,4,200.3,1
8,50,2024-11-11 08:30:44.571226,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,199.8,1,200.0,52,200.1,1,200.2,4,200.3,1
9,50,2024-11-11 08:30:49.585802,T,Unknown,Unknown,Unknown,0.0,0,5,Unknown,...,199.6,1,199.8,2,199.9,1,200.0,55,200.05,1
