In [None]:
import redis
import pickle
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import threading
import time
import traceback
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor  # 线程池

# ---------------------- Configuration ----------------------
# Path of the key=value file read by load_redis_config().
REDIS_CONFIG_PATH = "redis.conf"
# Name of the Redis list that results are popped from.
RESULT_QUEUE = "function_results"
# Root directory under which the date=/stock_code= Parquet partitions are created.
STORAGE_ROOT = r"D:\workspace\xiaoyao\data\stock_minutely_price"
MAX_CONVERT_THREADS = 20  # Thread-pool size; matched to the number of workers
TEMP_DIR = r"D:\workspace\xiaoyao\redis\temp_csv"  # Temporary-file directory

# ---------------------- 工具函数 ----------------------
def load_redis_config(config_path):
    """Load Redis connection settings from a simple ``key=value`` text file.

    Recognised keys are ``host``, ``port`` and ``password``; every other key
    is ignored. Blank lines and lines starting with ``#`` are skipped.
    Whitespace around the key, the ``=`` sign and the value is ignored, so
    ``host = 10.0.0.1`` is accepted as well as ``host=10.0.0.1``. A leading
    UTF-8 byte-order mark is tolerated.

    Args:
        config_path: Path of the configuration file.

    Returns:
        A dict of keyword arguments with the keys ``host``, ``port``,
        ``password``, ``decode_responses``, ``socket_timeout`` and
        ``socket_keepalive``. Missing keys fall back to ``localhost``,
        ``6379`` and an empty password.

    Raises:
        FileNotFoundError: If ``config_path`` does not exist.
    """
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Redis配置文件不存在：{config_path}")

    host = "localhost"
    port = 6379
    password = ""

    # "utf-8-sig" decodes plain UTF-8 and also strips a BOM, which would
    # otherwise be glued to the first key and make it unrecognisable.
    with open(config_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            # Split on the first "=" only, so values may contain "=".
            key, separator, value = line.partition('=')
            if not separator:
                continue
            key = key.strip()
            value = value.strip()

            if key == 'host':
                host = value
            elif key == 'port':
                try:
                    port = int(value)
                except ValueError:
                    # Keep the current value and carry on with the next line.
                    print(f"警告：port配置格式错误，使用默认值{port}")
            elif key == 'password':
                password = value

    return {
        "host": host,
        "port": port,
        "password": password,
        "decode_responses": False,
        "socket_timeout": 30,
        "socket_keepalive": True
    }

# ---------------------- 结果处理类 ----------------------
class ResultProcessor:
    """Drain task results from a Redis list and append them to Parquet files.

    Every queue entry is a pickled dict that is read for the keys
    ``task_id``, ``status``, ``result`` (CSV text) and ``error``. The trade
    date and stock code of a task are looked up in the Redis hash
    ``task_metadata`` under the task id. Successful results are appended to
    ``STORAGE_ROOT/date=<trade_date>/stock_code=<stock_code>/data.parquet``
    by a thread pool; failures are collected in ``failed_tasks`` and dumped
    to a pickle file when the run ends.

    SECURITY: queue entries and metadata are decoded with ``pickle.loads``.
    Unpickling can execute arbitrary code, so the Redis server and everything
    that writes to it must be trusted.
    """

    # Longest single BLPOP wait, in seconds, when the socket timeout allows it.
    _MAX_POLL_SECONDS = 60
    # Gap, in seconds, kept between the BLPOP wait and the socket timeout.
    _POLL_MARGIN_SECONDS = 5

    def __init__(self, redis_config):
        """Create the directories, connect to Redis and set up the counters.

        Args:
            redis_config: Keyword arguments for ``redis.Redis`` (for example
                the dict returned by ``load_redis_config``).

        Raises:
            SystemExit: If the Redis server does not answer a ping.
        """
        # 1. Make sure the temporary and the final storage directory exist.
        os.makedirs(STORAGE_ROOT, exist_ok=True)
        os.makedirs(TEMP_DIR, exist_ok=True)
        print(f"✅ 临时文件目录：{TEMP_DIR}")
        print(f"✅ 最终存储目录：{STORAGE_ROOT}")

        # 2. Redis connection. The config is kept so the receive loop can
        #    rebuild the client after a connection error.
        self.redis_config = redis_config
        self.redis = redis.Redis(**redis_config)
        self._test_connection()

        # 3. Task metadata lives in a Redis hash; its size at start-up is
        #    taken as the number of tasks to wait for.
        self.redis_metadata_key = "task_metadata"
        self.total_tasks = self.redis.hlen(self.redis_metadata_key)
        print(f"✅ 从Redis获取任务元信息，共{self.total_tasks}个任务")

        # 4. Counters, all guarded by self.lock because pool threads and the
        #    receive loop both update them.
        self.processed_count = 0
        self.success_count = 0
        self.failed_count = 0
        self.lock = threading.Lock()

        # 5. Thread pool that performs the CSV -> Parquet conversion.
        self.thread_pool = ThreadPoolExecutor(max_workers=MAX_CONVERT_THREADS)

        # 6. Failure records, guarded by their own lock.
        self.failed_tasks = []
        self.failed_tasks_lock = threading.Lock()

    def _test_connection(self):
        """Ping Redis and abort the program if the server is unreachable.

        Raises:
            SystemExit: With exit status 1 when the ping fails for any reason.
        """
        try:
            self.redis.ping()
            print("✅ Redis连接成功")
        except Exception as e:
            print(f"❌ Redis连接失败：{e}")
            raise SystemExit(1)

    def _poll_timeout(self):
        """Return the BLPOP wait, in whole seconds, for one queue poll.

        A blocking pop that waits longer than the client's ``socket_timeout``
        can never report "no data": the socket read times out first and the
        client raises a timeout error instead of returning ``None``. The wait
        is therefore kept below the configured socket timeout. The result is
        never 0, because a BLPOP timeout of 0 means "block forever".

        Returns:
            ``_MAX_POLL_SECONDS`` when no socket timeout is configured or it
            is larger than that; otherwise the socket timeout minus
            ``_POLL_MARGIN_SECONDS``, with a floor of 1.
        """
        wait_seconds = self._MAX_POLL_SECONDS
        socket_timeout = self.redis_config.get("socket_timeout")
        if socket_timeout and socket_timeout <= wait_seconds:
            wait_seconds = max(1, int(socket_timeout) - self._POLL_MARGIN_SECONDS)
        return wait_seconds

    def _progress_line(self, icon):
        """Build the one-line progress report.

        The caller must hold ``self.lock`` so the counters are consistent.

        Args:
            icon: Leading marker of the line.

        Returns:
            The report text; the percentage is omitted when ``total_tasks``
            is not positive.
        """
        counts = f"成功{self.success_count}/失败{self.failed_count}/总计{self.processed_count}"
        if self.total_tasks > 0:
            progress = (self.processed_count / self.total_tasks) * 100
            return f"{icon} 处理中：进度 {progress:.2f}%，{counts}"
        return f"{icon} 处理中：{counts}"

    def _create_temp_file(self, task_id, csv_str):
        """Write the CSV text to a temporary file.

        Args:
            task_id: Task identifier, used in the file name and log messages.
            csv_str: CSV text to write.

        Returns:
            The path of the temporary file, or ``None`` if writing failed.
        """
        try:
            # Task id plus a time stamp with microseconds keeps names unique.
            temp_filename = f"temp_{task_id}_{datetime.now().strftime('%H%M%S%f')}.csv"
            temp_file_path = os.path.join(TEMP_DIR, temp_filename)

            # newline='' writes the text as received; without it Windows
            # would turn "\r\n" in the payload into "\r\r\n".
            with open(temp_file_path, 'w', encoding='utf-8', newline='') as f:
                f.write(csv_str)

            return temp_file_path
        except Exception as e:
            print(f"❌ 任务 {task_id} 创建临时文件失败：{str(e)}")
            return None

    def _append_to_parquet(self, task_id, temp_file_path, trade_date, stock_code):
        """Read the temporary CSV and append its rows to the partition file.

        Args:
            task_id: Task identifier, used in log messages.
            temp_file_path: Path of the CSV file to read.
            trade_date: Value of the ``date=`` partition directory.
            stock_code: Value of the ``stock_code=`` partition directory.

        Returns:
            ``True`` when the rows were written, ``False`` on any error (the
            error is printed, not raised).
        """
        try:
            # 1. Read the CSV; malformed rows are skipped rather than fatal.
            df = pd.read_csv(
                temp_file_path,
                encoding='utf-8',
                sep=',',
                on_bad_lines='skip',
                dtype={
                    "date": "str",
                    "stock_code": "str",
                    "time": "str",
                    "open": "float64",
                    "close": "float64",
                    "high": "float64",
                    "low": "float64",
                    "volume": "int64"
                }
            )

            # 2. Every required column must be present.
            required_cols = ["date", "stock_code", "time", "open", "close", "high", "low", "volume"]
            missing = [col for col in required_cols if col not in df.columns]
            if missing:
                raise ValueError(f"缺少必要字段：{missing}")

            # 3. Partition directory; the stock level is named "stock_code="
            #    so it matches the column name in the data.
            partition_dir = os.path.join(STORAGE_ROOT, f"date={trade_date}", f"stock_code={stock_code}")
            os.makedirs(partition_dir, exist_ok=True)
            parquet_path = os.path.join(partition_dir, "data.parquet")

            # 4. Convert and drop a stray "stock" column if one is present.
            table = pa.Table.from_pandas(df)
            if 'stock' in table.column_names:
                table = table.drop(['stock'])

            # NOTE(review): the read-concat-rewrite below is not synchronised
            # per file; it presumably relies on each (trade_date, stock_code)
            # pair being handled by one task at a time - confirm.
            if os.path.exists(parquet_path):
                existing_table = pq.read_table(parquet_path)
                # Apply the same clean-up to the file that is already there.
                if 'stock' in existing_table.column_names:
                    existing_table = existing_table.drop(['stock'])
                combined_table = pa.concat_tables([existing_table, table])
                pq.write_table(combined_table, parquet_path, compression="snappy")
            else:
                pq.write_table(table, parquet_path, compression="snappy")

            print(f"✅ 任务 {task_id}：成功追加到 {parquet_path}")
            return True
        except Exception as e:
            print(f"❌ 任务 {task_id} 追加Parquet失败：{str(e)}")
            return False

    def _delete_temp_file(self, task_id, temp_file_path):
        """Remove a temporary file if it exists.

        Args:
            task_id: Task identifier, used in log messages.
            temp_file_path: Path of the file to remove.

        Returns:
            ``True`` when the file is gone afterwards (including when it did
            not exist), ``False`` when removal raised an error.
        """
        try:
            if os.path.exists(temp_file_path):
                os.remove(temp_file_path)
                print(f"ℹ️  任务 {task_id}：已删除临时文件 {os.path.basename(temp_file_path)}")
            return True
        except Exception as e:
            print(f"⚠️  任务 {task_id} 删除临时文件失败：{str(e)}（需手动清理）")
            return False

    def _process_task(self, task_id, csv_str):
        """Persist one successful result; runs on a pool thread.

        Writes the CSV to a temporary file, looks up the task's trade date
        and stock code, appends the rows to the partition file, then removes
        the temporary file and the metadata entry. Errors are recorded in
        ``failed_tasks`` and counted; nothing is raised to the caller.

        Args:
            task_id: Task identifier (the field name in the metadata hash).
            csv_str: CSV text returned by the task.
        """
        temp_file_path = None

        try:
            # 1. Spool the CSV text to disk.
            temp_file_path = self._create_temp_file(task_id, csv_str)
            if not temp_file_path:
                raise Exception("创建临时文件失败")

            # 2. Look up the task metadata.
            metadata_value = self.redis.hget(self.redis_metadata_key, task_id)
            if not metadata_value:
                raise KeyError(f"任务{task_id}的元信息在Redis中不存在")
            # SECURITY: pickle.loads runs code embedded in the payload; the
            # Redis contents must come from a trusted producer.
            trade_date, stock_code = pickle.loads(metadata_value)

            # 3. Append the rows to the partition file.
            append_success = self._append_to_parquet(task_id, temp_file_path, trade_date, stock_code)
            if not append_success:
                raise Exception("追加Parquet失败")

            # 4. Clean up the temporary file and the metadata entry.
            self._delete_temp_file(task_id, temp_file_path)
            self.redis.hdel(self.redis_metadata_key, task_id)

            # 5. Count the success.
            with self.lock:
                self.success_count += 1

        except Exception as e:
            with self.failed_tasks_lock:
                self.failed_tasks.append({
                    "task_id": task_id,
                    "error": str(e),
                    "traceback": traceback.format_exc()[:1000]
                })

            with self.lock:
                self.failed_count += 1

            print(f"❌ 任务 {task_id} 处理失败：{str(e)}")

            # The temporary file is removed on failure as well.
            if temp_file_path:
                self._delete_temp_file(task_id, temp_file_path)

        finally:
            with self.lock:
                self.processed_count += 1
                # Report once per 100 processed tasks.
                if self.processed_count % 100 == 0:
                    print(self._progress_line("📊"))

    def receive_and_process_results(self):
        """Pop results from Redis until every task has been accounted for.

        Successful results with non-empty CSV text are submitted to the
        thread pool; results reported as failed, and entries that cannot be
        decoded, are recorded as failures immediately. The loop ends once
        ``processed_count`` reaches ``total_tasks``; then the pool is drained
        and shut down, the failure list is pickled to
        ``failed_tasks_<timestamp>.pkl`` in the working directory, and a
        summary is printed. When there are no tasks the method sleeps for
        10 seconds and returns.
        """
        print(f"✅ 开始接收结果，总任务数：{self.total_tasks}")

        if self.total_tasks == 0:
            print("ℹ️  没有任务需要处理，将等待10秒后退出")
            time.sleep(10)
            print("✅ 退出程序")
            return

        # Must stay below the socket timeout, see _poll_timeout().
        poll_timeout = self._poll_timeout()

        # Futures of everything handed to the pool, awaited after the loop.
        futures = []

        while True:
            with self.lock:
                if self.processed_count >= self.total_tasks:
                    break

            # Fetch the next result, rebuilding the client on connection loss.
            try:
                result_data = self.redis.blpop(RESULT_QUEUE, timeout=poll_timeout)
            except redis.exceptions.ConnectionError:
                print(f"⚠️ Redis连接断开，尝试重连...")
                self.redis = redis.Redis(**self.redis_config)
                time.sleep(5)
                continue
            except Exception as e:
                print(f"❌ 获取Redis结果失败：{str(e)}")
                time.sleep(3)
                continue

            if not result_data:
                # The wait expired without a result: report progress and
                # poll again.
                with self.lock:
                    print(self._progress_line("⏳"))
                continue

            _, result_bytes = result_data
            # Kept outside the try so a failure after the id was parsed is
            # still recorded under the real task id.
            task_id = "未知"
            try:
                # SECURITY: pickle.loads runs code embedded in the payload;
                # the queue must only be written by trusted workers.
                result = pickle.loads(result_bytes)
                task_id = result.get("task_id", "未知")
                csv_str = result.get("result", "")

                if result["status"] == "success" and csv_str.strip():
                    future = self.thread_pool.submit(self._process_task, task_id, csv_str)
                    futures.append(future)
                    print(f"ℹ️  任务 {task_id}：已提交到线程池处理")
                else:
                    # The worker reported a failure or sent no data.
                    with self.failed_tasks_lock:
                        self.failed_tasks.append({
                            "task_id": task_id,
                            "error": result.get("error", "远端执行失败")
                        })

                    with self.lock:
                        self.failed_count += 1
                        self.processed_count += 1

                    print(f"❌ 任务 {task_id}：远端执行失败 → {result.get('error', '未知原因')}")

            except Exception as e:
                # The entry could not be decoded or had an unexpected shape.
                with self.failed_tasks_lock:
                    self.failed_tasks.append({
                        "task_id": task_id,
                        "error": f"结果解析失败：{str(e)}",
                        "traceback": traceback.format_exc()[:500]
                    })

                with self.lock:
                    self.processed_count += 1
                    self.failed_count += 1

                print(f"❌ 结果解析失败：{str(e)}")

        # Wait for everything that was handed to the pool.
        print("✅ 所有任务已接收，等待线程池处理完成...")
        for future in futures:
            future.result()

        self.thread_pool.shutdown()

        # Persist the failure records for later inspection.
        if self.failed_tasks:
            failed_path = f"failed_tasks_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pkl"
            with open(failed_path, "wb") as f:
                pickle.dump(self.failed_tasks, f)
            print(f"⚠️  已保存 {len(self.failed_tasks)} 个失败任务到 {failed_path}")

        # Final summary.
        print("\n" + "="*60)
        print(f"处理完成：总任务数 {self.total_tasks}")
        if self.total_tasks > 0:
            print(f"成功：{self.success_count} ({self.success_count/self.total_tasks*100:.2f}%)")
            print(f"失败：{self.failed_count} ({self.failed_count/self.total_tasks*100:.2f}%)")
        else:
            print(f"成功：{self.success_count}")
            print(f"失败：{self.failed_count}")
        print(f"临时文件目录：{TEMP_DIR}（残留文件需手动清理）")
        print("="*60)

# ---------------------- 主函数 ----------------------
if __name__ == "__main__":
    try:
        # 1. Load the Redis connection settings.
        redis_config = load_redis_config(REDIS_CONFIG_PATH)
        print(f"✅ 已加载Redis配置：host={redis_config['host']}, port={redis_config['port']}")

        # 2. Build the processor and run it until every task is accounted for.
        processor = ResultProcessor(redis_config)
        processor.receive_and_process_results()

    except Exception as e:
        # This handler also catches errors raised long after start-up, and
        # the message alone rarely says where they came from, so the full
        # traceback is printed before exiting with a non-zero status.
        print(f"❌ 程序启动失败：{str(e)}")
        traceback.print_exc()
        raise SystemExit(1) from e
    

✅ 已加载Redis配置：host=220.203.1.124, port=6379
✅ 临时文件目录：D:\workspace\xiaoyao\redis\temp_csv
✅ 最终存储目录：D:\workspace\xiaoyao\data\stock_minutely_price
✅ Redis连接成功
✅ 从Redis获取任务元信息，共71718个任务
❌ 程序启动失败：'ResultProcessor' object has no attribute 'receive_and_process_results'


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
