In [14]:
import pyarrow.parquet as pq
import json
import re

def parquet_columns_to_json(parquet_file_path, output_json_path=None) -> str:
    """Describe the columns of a Parquet file as a JSON string.

    The schema is taken from the file's metadata via pyarrow; no row data is
    loaded.

    Args:
        parquet_file_path: Location of the Parquet file to inspect.
        output_json_path: Optional destination file. When truthy, the JSON
            text is also written there, encoded as UTF-8.

    Returns:
        A pretty-printed JSON array (2-space indent, non-ASCII characters
        kept verbatim) holding one object per column with the keys
        ``column_name``, ``data_type`` and ``nullable``. ``data_type`` is the
        string form of the column's Arrow type, e.g. ``"timestamp[ns]"``,
        ``"string"`` or ``"double"``.
    """
    # Use the structured Arrow schema instead of regex-parsing the printed
    # Parquet schema. Text parsing silently dropped columns whose name
    # contains "group" or non-word characters, dropped parameterised types
    # such as fixed_len_byte_array(16), and labelled every int64 timestamp
    # as nanoseconds regardless of its real unit.
    arrow_schema = pq.read_schema(parquet_file_path)

    columns_info = [
        {
            "column_name": field.name,
            "data_type": str(field.type),
            "nullable": field.nullable,
        }
        for field in arrow_schema
    ]

    columns_json = json.dumps(columns_info, indent=2, ensure_ascii=False)

    # Persisting the result is optional; the JSON text is returned either way.
    if output_json_path:
        with open(output_json_path, "w", encoding="utf-8") as f:
            f.write(columns_json)

    return columns_json

# Example usage: export the column description of the daily stock price
# file to columns_info.json, then echo the same JSON to stdout.
json_result = parquet_columns_to_json(
    r"D:\workspace\xiaoyao\data\stock_daily_price.parquet",
    "columns_info.json",
)
print(json_result)

[
  {
    "column_name": "date",
    "data_type": "timestamp[ns]",
    "nullable": true
  },
  {
    "column_name": "stock_code",
    "data_type": "string",
    "nullable": true
  },
  {
    "column_name": "open",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "close",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "low",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "high",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "volume",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "money",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "factor",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "high_limit",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "low_limit",
    "data_type": "double",
    "nullable": true
  },
  {
    "column_name": "avg",
    "data_type"