In [1]:
#增强版数据工程模块

import logging
import pandas as pd
import numpy as np
import os
import gc
from typing import Dict, List, Optional, Tuple, Union
from pathlib import Path
import pyarrow.parquet as pq
from datetime import datetime, timedelta
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_regression
import warnings
warnings.filterwarnings('ignore')

import logging
from pathlib import Path
from datetime import datetime

# 配置日志
logging.basicConfig(
    level="INFO",
    format="%(asctime)s | %(levelname)8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# 配置文件日志处理器
file_handler = logging.FileHandler(f"flight_ranking_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
file_handler.setFormatter(logging.Formatter(
    "%(asctime)s | %(levelname)8s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
))

# 获取logger并添加文件处理器
logger = logging.getLogger(__name__)
logger.addHandler(file_handler)
logger.setLevel("INFO")
logger.info("核心控制器初始化完成")


2025-08-05 20:38:11 |     INFO | __main__ | 核心控制器初始化完成


In [2]:
class EnhancedDataEngineering:
    """增强版数据工程类 - 深度特征工程"""
    
    def __init__(self, logger=None):
        self.logger = logger
        self.processing_stats = {}
        self.feature_importance_cache = {}
        self.cluster_models = {}
        
    def log_info(self, message: str):
        """统一日志输出"""
        if self.logger:
            self.logger.info(message)
    
    
    def create_economic_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建具有经济学意义的特征"""
        df = df.copy()
        self.log_info("创建经济学特征...")
        
        # 1. 价格弹性和需求理论特征
        if 'totalPrice_bin' in df.columns and 'ranker_id' in df.columns:
            # 价格弹性指标（组内价格敏感度）
            group_price_stats = df.groupby('ranker_id')['totalPrice_bin'].agg([
                'mean', 'std', 'min', 'max', 'count'
            ]).reset_index()
            group_price_stats.columns = ['ranker_id', 'group_price_mean', 'group_price_std', 
                                       'group_price_min', 'group_price_max', 'group_size']
            
            df = df.merge(group_price_stats, on='ranker_id', how='left')
            
            # 相对价格位置（价格锚定效应）
            df['price_relative_position'] = (df['totalPrice_bin'] - df['group_price_min']) / (
                df['group_price_max'] - df['group_price_min'] + 1)
            
            # 价格离散系数（选择复杂度）
            df['price_coefficient_variation'] = df['group_price_std'] / (df['group_price_mean'] + 1)
            
            # 是否为极值选项（锚定效应）
            df['is_price_anchor_high'] = (df['totalPrice_bin'] == df['group_price_max']).astype('int8')
            df['is_price_anchor_low'] = (df['totalPrice_bin'] == df['group_price_min']).astype('int8')
            
            # 价格-质量感知比（基于舱位和价格）
            if 'avg_cabin_class' in df.columns:
                df['price_quality_ratio'] = df['totalPrice_bin'] / (df['avg_cabin_class'] + 1)
        
        # 2. 时间价值特征（Time Value of Money）
        if 'legs0_departureAt_hour' in df.columns:
            # 商务时间溢价（Business Time Premium）
            business_hours = [7, 8, 9, 17, 18, 19, 20]
            df['is_business_prime_time'] = df['legs0_departureAt_hour'].isin(business_hours).astype('int8')
            
            # 红眼航班折扣效应
            redeye_hours = [22, 23, 0, 1, 2, 3, 4, 5, 6]
            df['is_redeye_discount'] = df['legs0_departureAt_hour'].isin(redeye_hours).astype('int8')
            
            # 黄金时间（8-10点，18-20点）
            golden_hours = [8, 9, 18, 19]
            df['is_golden_time'] = df['legs0_departureAt_hour'].isin(golden_hours).astype('int8')
        
        # 3. 预订行为经济学特征
        if 'legs0_days_ahead' in df.columns:
            # 预订时机分类（行为经济学）
            df['booking_urgency'] = np.select([
                df['legs0_days_ahead'] <= 1,    # 紧急预订
                df['legs0_days_ahead'] <= 7,    # 临时预订
                df['legs0_days_ahead'] <= 14,   # 正常预订
                df['legs0_days_ahead'] <= 30,   # 提前预订
                df['legs0_days_ahead'] <= 60,   # 早期预订
            ], [4, 3, 2, 1, 0], default=0).astype('int8')
            
            # 最佳预订窗口（21-60天通常价格最优）
            df['in_optimal_booking_window'] = (
                (df['legs0_days_ahead'] >= 21) & (df['legs0_days_ahead'] <= 60)
            ).astype('int8')
            
            # 冲动购买指标（<=3天）
            df['is_impulse_booking'] = (df['legs0_days_ahead'] <= 3).astype('int8')
        
        # 4. 网络效应和市场集中度
        if 'legs0_segments0_marketingCarrier_code' in df.columns:
            # 航空公司市场份额（网络效应）
            carrier_market_share = df['legs0_segments0_marketingCarrier_code'].value_counts(normalize=True)
            df['carrier_market_share'] = df['legs0_segments0_marketingCarrier_code'].map(carrier_market_share)
            
            # 是否为市场领导者（>20%市场份额）
            df['is_market_leader'] = (df['carrier_market_share'] > 0.2).astype('int8')
        
        # 5. 服务质量感知特征
        if 'legs0_segments0_cabinClass' in df.columns and 'totalPrice_bin' in df.columns:
            # 性价比指标
            cabin_price_ratio = df.groupby('legs0_segments0_cabinClass')['totalPrice_bin'].mean()
            df['cabin_price_expectation'] = df['legs0_segments0_cabinClass'].map(cabin_price_ratio)
            df['price_expectation_gap'] = df['totalPrice_bin'] - df['cabin_price_expectation']
            
        # 6. 便利性溢价特征
        if 'total_segments' in df.columns:
            # 直飞溢价（便利性价值）
            df['convenience_score'] = np.select([
                df['total_segments'] == 2,  # 往返都直飞
                df['total_segments'] == 3,  # 一程直飞
                df['total_segments'] >= 4,  # 都需转机
            ], [2, 1, 0], default=0).astype('int8')
        
        return df
    
    
    def create_behavioral_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建行为经济学特征"""
        df = df.copy()
        self.log_info("创建行为经济学特征...")
        
        # 1. 选择过载理论（Choice Overload）
        if 'ranker_id' in df.columns:
            # 选择复杂度
            choice_complexity = df.groupby('ranker_id').agg({
                'Id': 'count',  # 选项数量
                'totalPrice_bin': ['std', 'max', 'min'],  # 价格分布
                'total_duration': ['std', 'mean'] if 'total_duration' in df.columns else ['count', 'count']
            }).reset_index()
            
            choice_complexity.columns = ['ranker_id', 'choice_set_size', 'price_variance', 
                                       'price_range_max', 'price_range_min', 'duration_variance', 'duration_mean']
            
            df = df.merge(choice_complexity, on='ranker_id', how='left')
            
            # 选择过载指标
            df['choice_overload_score'] = np.select([
                df['choice_set_size'] >= 20,  # 高过载
                df['choice_set_size'] >= 10,  # 中等过载
                df['choice_set_size'] >= 5,   # 轻微过载
            ], [3, 2, 1], default=0).astype('int8')
            
            # 选择集中度（基于价格分布）
            df['choice_concentration'] = 1 / (df['price_variance'] + 1)
        
        # 2. 损失厌恶特征（Loss Aversion）
        if 'miniRules0_monetaryAmount' in df.columns and 'miniRules1_monetaryAmount' in df.columns:
            # 潜在损失风险
            df['max_cancellation_loss'] = np.maximum(
                df['miniRules0_monetaryAmount'].fillna(0),
                df['miniRules1_monetaryAmount'].fillna(0)
            )
            
            # 损失厌恶阈值（相对于票价的百分比）
            df['loss_aversion_ratio'] = df['max_cancellation_loss'] / (df['totalPrice_bin'] + 1)
            
            # 低风险选项（可免费取消/改签）
            df['is_low_risk_option'] = (
                (df['miniRules0_monetaryAmount'] == 0) | 
                (df['miniRules1_monetaryAmount'] == 0)
            ).astype('int8')
        
        # 3. 社会认同特征（Social Proof）
        if 'legs0_segments0_marketingCarrier_code' in df.columns and 'ranker_id' in df.columns:
            # 同组最受欢迎航空公司
            popular_carrier_in_group = df.groupby('ranker_id')['legs0_segments0_marketingCarrier_code'].agg(
                lambda x: x.mode().iloc[0] if len(x.mode()) > 0 else -1
            ).reset_index()
            popular_carrier_in_group.columns = ['ranker_id', 'popular_carrier']
            
            df = df.merge(popular_carrier_in_group, on='ranker_id', how='left')
            df['is_popular_carrier_choice'] = (
                df['legs0_segments0_marketingCarrier_code'] == df['popular_carrier']
            ).astype('int8')
        
        # 4. 认知偏差特征
        if 'legs0_departureAt_hour' in df.columns and 'legs0_arrivalAt_hour' in df.columns:
            # 到达时间偏好（认知便利性）
            df['arrival_time_preference'] = np.select([
                df['legs0_arrivalAt_hour'].between(8, 12),   # 上午到达（便于安排）
                df['legs0_arrivalAt_hour'].between(13, 18),  # 下午到达
                df['legs0_arrivalAt_hour'].between(19, 22),  # 晚上到达
            ], [2, 1, 0], default=0).astype('int8')
        
        # 5. 框架效应特征（Framing Effect）
        if 'totalPrice_bin' in df.columns and 'taxes_bin' in df.columns:
            # 价格透明度（税费占比）
            df['price_transparency'] = df['taxes_bin'] / (df['totalPrice_bin'] + 1)
            
            # 隐藏成本感知
            df['hidden_cost_perception'] = np.select([
                df['price_transparency'] > 0.3,  # 高税费比例
                df['price_transparency'] > 0.15, # 中等税费比例
            ], [2, 1], default=0).astype('int8')
        
        return df
    
    
    def create_operational_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建运营相关特征"""
        df = df.copy()
        self.log_info("创建运营特征...")
        
        # 1. 航线运营效率特征
        if 'legs0_segments0_departureFrom_airport_iata' in df.columns and 'legs0_segments0_arrivalTo_airport_iata' in df.columns:
            # 航线标识符
            df['route_identifier'] = df['legs0_segments0_departureFrom_airport_iata'].astype(str) + '_' + \
                                   df['legs0_segments0_arrivalTo_airport_iata'].astype(str)
            
            # 航线频次（运营密度）
            route_frequency = df['route_identifier'].value_counts()
            df['route_frequency'] = df['route_identifier'].map(route_frequency)
            
            # 热门航线标识
            df['is_popular_route'] = (df['route_frequency'] > df['route_frequency'].quantile(0.8)).astype('int8')
        
        # 2. 运力管理特征
        if 'legs0_segments0_seatsAvailable' in df.columns:
            # 舱位紧张度
            df['seat_scarcity'] = np.select([
                df['legs0_segments0_seatsAvailable'] <= 3,   # 极度紧张
                df['legs0_segments0_seatsAvailable'] <= 7,   # 紧张
                df['legs0_segments0_seatsAvailable'] <= 15,  # 中等
            ], [3, 2, 1], default=0).astype('int8')
            
            # 舱位可得性压力
            df['availability_pressure'] = 1 / (df['legs0_segments0_seatsAvailable'] + 1)
        
        # 3. 网络连通性特征
        if 'total_segments' in df.columns:
            # 连接复杂度
            df['connection_complexity'] = df['total_segments'] - 2  # 基础往返为2段
            
            # 枢纽依赖度（多段航班通常依赖枢纽）
            df['hub_dependency'] = (df['total_segments'] > 2).astype('int8')
        
        # 4. 时刻表优化特征
        if 'legs0_departureAt_hour' in df.columns and 'legs0_arrivalAt_hour' in df.columns:
            # 时刻表效率（避开拥堵时段）
            congested_hours = [7, 8, 17, 18, 19]
            df['avoids_congestion'] = (
                ~df['legs0_departureAt_hour'].isin(congested_hours) &
                ~df['legs0_arrivalAt_hour'].isin(congested_hours)
            ).astype('int8')
            
            # 时间窗口利用率
            prime_departure = [8, 9, 10, 17, 18, 19]  # 优选出发时间
            df['optimal_departure_window'] = df['legs0_departureAt_hour'].isin(prime_departure).astype('int8')
        
        return df
    
    
    def create_customer_segmentation_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建客户细分特征"""
        df = df.copy()
        self.log_info("创建客户细分特征...")
        
        # 1. 商务旅客识别
        business_indicators = []
        
        if 'corporateTariffCode' in df.columns:
            business_indicators.append('corporateTariffCode != -1')
        if 'isVip' in df.columns:
            business_indicators.append('isVip == 1')
        if 'has_corporate_tariff' in df.columns:
            business_indicators.append('has_corporate_tariff == 1')
        
        if business_indicators:
            business_query = ' or '.join(business_indicators)
            df['business_traveler_score'] = df.eval(business_query).astype('int8')
        
        # 2. 价格敏感度细分
        if 'totalPrice_bin' in df.columns and 'ranker_id' in df.columns:
            # 价格选择行为
            price_behavior = df.groupby('ranker_id').apply(
                lambda x: (x['totalPrice_bin'] == x['totalPrice_bin'].min()).sum() / len(x)
            ).reset_index()
            price_behavior.columns = ['ranker_id', 'price_sensitive_ratio']
            
            df = df.merge(price_behavior, on='ranker_id', how='left')
            
            # 价格敏感度分类
            df['price_sensitivity_segment'] = np.select([
                df['price_sensitive_ratio'] > 0.7,  # 高价格敏感
                df['price_sensitive_ratio'] > 0.3,  # 中等价格敏感
            ], [2, 1], default=0).astype('int8')
        
        # 3. 便利性偏好细分
        if 'total_segments' in df.columns:
            # 便利性偏好指标
            convenience_pref = df.groupby('ranker_id').apply(
                lambda x: (x['total_segments'] == x['total_segments'].min()).sum() / len(x)
            ).reset_index()
            convenience_pref.columns = ['ranker_id', 'convenience_preference']
            
            df = df.merge(convenience_pref, on='ranker_id', how='left')
            
            # 便利性细分
            df['convenience_segment'] = np.select([
                df['convenience_preference'] > 0.6,  # 高便利性需求
                df['convenience_preference'] > 0.3,  # 中等便利性需求
            ], [2, 1], default=0).astype('int8')
        
        # 4. 忠诚度特征
        if 'has_frequent_flyer' in df.columns and 'legs0_segments0_marketingCarrier_code' in df.columns:
            # 品牌忠诚度指标
            df['brand_loyalty_indicator'] = (
                (df['has_frequent_flyer'] == 1) & 
                (df['legs0_segments0_marketingCarrier_code'] != -1)
            ).astype('int8')
        
        return df
    
    
    def create_market_dynamics_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建市场动态特征"""
        df = df.copy()
        self.log_info("创建市场动态特征...")
        
        # 1. 竞争强度特征
        if 'ranker_id' in df.columns and 'legs0_segments0_marketingCarrier_code' in df.columns:
            # 航空公司竞争数量
            carrier_competition = df.groupby('ranker_id')['legs0_segments0_marketingCarrier_code'].nunique().reset_index()
            carrier_competition.columns = ['ranker_id', 'carrier_competition_count']
            df = df.merge(carrier_competition, on='ranker_id', how='left')
            
            # 竞争激烈程度
            df['competition_intensity'] = np.select([
                df['carrier_competition_count'] >= 5,  # 激烈竞争
                df['carrier_competition_count'] >= 3,  # 中等竞争
                df['carrier_competition_count'] >= 2,  # 低竞争
            ], [3, 2, 1], default=0).astype('int8')
        
        # 2. 市场集中度特征
        if 'legs0_segments0_marketingCarrier_code' in df.columns:
            # HHI指数（赫芬达尔-赫希曼指数）近似
            carrier_shares = df['legs0_segments0_marketingCarrier_code'].value_counts(normalize=True)
            hhi_approx = (carrier_shares ** 2).sum()
            
            # 每个航空公司的市场地位
            df['carrier_market_position'] = df['legs0_segments0_marketingCarrier_code'].map(carrier_shares)
            
            # 市场地位分类
            df['market_position_tier'] = np.select([
                df['carrier_market_position'] > 0.3,  # 市场领导者
                df['carrier_market_position'] > 0.15, # 主要竞争者
                df['carrier_market_position'] > 0.05, # 挑战者
            ], [3, 2, 1], default=0).astype('int8')
        
        # 3. 供需平衡特征
        if 'legs0_segments0_seatsAvailable' in df.columns and 'ranker_id' in df.columns:
            # 供需比例
            demand_supply = df.groupby('ranker_id').agg({
                'Id': 'count',  # 需求（选项数量代理）
                'legs0_segments0_seatsAvailable': 'mean'  # 供给
            }).reset_index()
            demand_supply.columns = ['ranker_id', 'demand_proxy', 'avg_supply']
            demand_supply['supply_demand_ratio'] = demand_supply['avg_supply'] / (demand_supply['demand_proxy'] + 1)
            
            df = df.merge(demand_supply[['ranker_id', 'supply_demand_ratio']], on='ranker_id', how='left')
            
            # 供需状态分类
            df['supply_demand_state'] = np.select([
                df['supply_demand_ratio'] < 0.5,  # 供不应求
                df['supply_demand_ratio'] < 1.5,  # 均衡
                df['supply_demand_ratio'] >= 1.5, # 供过于求
            ], [2, 1, 0], default=0).astype('int8')
        
        return df
    
    
    def create_deep_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建深度交互特征"""
        df = df.copy()
        self.log_info("创建深度交互特征...")
        
        # 选择关键特征进行交互
        key_features = []
        
        # 收集存在的关键特征
        candidate_features = [
            'totalPrice_bin', 'is_business_prime_time', 'booking_urgency',
            'convenience_score', 'carrier_market_share', 'competition_intensity',
            'choice_overload_score', 'price_sensitivity_segment', 'business_traveler_score'
        ]
        
        for feature in candidate_features:
            if feature in df.columns:
                key_features.append(feature)
        
        if len(key_features) >= 3:
            # 创建重要的二次交互特征
            interactions = [
                ('totalPrice_bin', 'booking_urgency', 'price_urgency_interaction'),
                ('business_traveler_score', 'is_business_prime_time', 'business_time_match'),
                ('convenience_score', 'competition_intensity', 'convenience_competition'),
                ('price_sensitivity_segment', 'carrier_market_share', 'price_brand_interaction')
            ]
            
            for feat1, feat2, new_name in interactions:
                if feat1 in df.columns and feat2 in df.columns:
                    df[new_name] = (df[feat1] * df[feat2]).astype('int16')
        
        return df
    
    
    def create_polynomial_features(self, df: pd.DataFrame, degree: int = 2, 
                                 max_features: int = 50) -> pd.DataFrame:
        """创建多项式特征"""
        df = df.copy()
        self.log_info(f"创建{degree}次多项式特征...")
        
        # 选择数值特征进行多项式变换
        numeric_cols = []
        candidates = ['totalPrice_bin', 'total_duration', 'legs0_days_ahead', 
                     'group_size', 'carrier_market_share', 'route_frequency']
        
        for col in candidates:
            if col in df.columns:
                numeric_cols.append(col)
        
        if len(numeric_cols) >= 2:
            # 限制特征数量避免维度爆炸
            selected_cols = numeric_cols[:min(5, len(numeric_cols))]
            
            try:
                poly = PolynomialFeatures(degree=degree, interaction_only=True, 
                                        include_bias=False)
                
                # 小批量处理避免内存问题
                sample_size = min(10000, len(df))
                sample_df = df[selected_cols].fillna(0).iloc[:sample_size]
                
                poly_features = poly.fit_transform(sample_df)
                feature_names = poly.get_feature_names_out(selected_cols)
                
                # 选择最重要的多项式特征
                if len(feature_names) > max_features:
                    # 使用方差筛选
                    variances = np.var(poly_features, axis=0)
                    top_indices = np.argsort(variances)[-max_features:]
                    feature_names = feature_names[top_indices]
                    
                    # 对全量数据应用选中的特征
                    poly_full = poly.transform(df[selected_cols].fillna(0))
                    poly_selected = poly_full[:, top_indices]
                else:
                    poly_selected = poly.transform(df[selected_cols].fillna(0))
                
                # 添加到原始数据框
                for i, name in enumerate(feature_names):
                    if name not in selected_cols:  # 避免重复原始特征
                        df[f'poly_{name}'] = poly_selected[:, i].astype('int32')
                        
            except Exception as e:
                self.log_info(f"多项式特征创建失败: {str(e)}")
        
        return df
    
    
    def create_clustering_features(self, df: pd.DataFrame, n_clusters: int = 8) -> pd.DataFrame:
        """创建聚类特征"""
        df = df.copy()
        self.log_info(f"创建{n_clusters}类聚类特征...")
        
        # 选择聚类特征
        cluster_features = []
        candidates = ['totalPrice_bin', 'total_duration', 'business_traveler_score',
                     'booking_urgency', 'convenience_score', 'competition_intensity']
        
        for col in candidates:
            if col in df.columns:
                cluster_features.append(col)
        
        if len(cluster_features) >= 3:
            try:
                # 准备聚类数据
                cluster_data = df[cluster_features].fillna(0)
                
                # K-means聚类
                kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                
                # 使用样本进行拟合以提高效率
                sample_size = min(50000, len(cluster_data))
                sample_indices = np.random.choice(len(cluster_data), sample_size, replace=False)
                kmeans.fit(cluster_data.iloc[sample_indices])
                
                # 预测所有数据的聚类标签
                cluster_labels = kmeans.predict(cluster_data)
                df['customer_cluster'] = cluster_labels.astype('int8')
                
                # 计算到聚类中心的距离
                distances = kmeans.transform(cluster_data)
                df['cluster_distance'] = np.min(distances, axis=1).astype('float32')
                
                # 聚类稳定性（到最近两个中心的距离比）
                sorted_distances = np.sort(distances, axis=1)
                df['cluster_stability'] = (sorted_distances[:, 1] / (sorted_distances[:, 0] + 0.001)).astype('float32')
                
                # 保存聚类模型
                self.cluster_models['customer_kmeans'] = kmeans
                
            except Exception as e:
                self.log_info(f"聚类特征创建失败: {str(e)}")
        
        return df
    
    
    def create_embedding_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建嵌入特征（降维特征）"""
        df = df.copy()
        self.log_info("创建嵌入特征...")
        
        # 选择高维特征进行降维
        embedding_candidates = []
        for col in df.columns:
            if ('segments' in col and col.endswith('_code')) or col in ['ranker_id', 'companyID']:
                if df[col].nunique() > 10:  # 高基数分类特征
                    embedding_candidates.append(col)
        
        # 对每个高基数特征创建嵌入
        for col in embedding_candidates[:3]:  # 限制数量
            try:
                # 频率编码作为简单嵌入
                value_counts = df[col].value_counts()
                df[f'{col}_frequency'] = df[col].map(value_counts).fillna(0).astype('int16')
                
                # 稀有度编码（逆频率）
                df[f'{col}_rarity'] = (1 / (df[f'{col}_frequency'] + 1) * 1000).astype('int16')
                
                # 分位数编码
                freq_quantiles = df[f'{col}_frequency'].quantile([0.25, 0.5, 0.75])
                df[f'{col}_freq_tier'] = np.select([
                    df[f'{col}_frequency'] <= freq_quantiles[0.25],
                    df[f'{col}_frequency'] <= freq_quantiles[0.5],
                    df[f'{col}_frequency'] <= freq_quantiles[0.75],
                ], [0, 1, 2], default=3).astype('int8')
                
            except Exception as e:
                self.log_info(f"嵌入特征创建失败 {col}: {str(e)}")
        
        return df
    
    
    def create_temporal_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建时间序列特征"""
        df = df.copy()
        self.log_info("创建时间序列特征...")
        
        # 1. 季节性特征
        if 'legs0_departureAt_weekday' in df.columns:
            # 工作日/周末
            df['is_weekend'] = (df['legs0_departureAt_weekday'].isin([5, 6])).astype('int8')
            
            # 周中位置（周一=0, 周日=6）
            df['week_position'] = df['legs0_departureAt_weekday'].astype('int8')
            
            # 商务周期（周一-周四商务需求高）
            df['business_week_cycle'] = (df['legs0_departureAt_weekday'] <= 3).astype('int8')
        
        # 2. 假期效应（基于简化假期规则）
        if 'legs0_departureAt' in df.columns:
            # 节假日前后的出行高峰
            # 这里使用简化的假期检测，实际应用中需要真实假期数据
            
            # 月末月初效应（商务出行模式）
            if hasattr(df['legs0_departureAt'], 'dt'):
                df['is_month_end'] = (df['legs0_departureAt'].dt.day >= 25).astype('int8')
                df['is_month_start'] = (df['legs0_departureAt'].dt.day <= 5).astype('int8')
        
        # 3. 预订时间模式
        if 'legs0_days_ahead' in df.columns:
            # 预订时间聚类
            booking_clusters = [0, 1, 3, 7, 14, 21, 30, 60, 90]
            df['booking_time_cluster'] = pd.cut(df['legs0_days_ahead'], 
                                              bins=booking_clusters + [float('inf')], 
                                              labels=False, 
                                              duplicates='drop').fillna(0).astype('int8')
        
        return df
    
    
    def create_network_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建网络结构特征"""
        df = df.copy()
        self.log_info("创建网络结构特征...")
        
        # 1. 机场网络特征
        if ('legs0_segments0_departureFrom_airport_iata' in df.columns and 
            'legs0_segments0_arrivalTo_airport_iata' in df.columns):
            
            # 机场出发频次（枢纽重要性）
            departure_freq = df['legs0_segments0_departureFrom_airport_iata'].value_counts()
            df['departure_airport_centrality'] = df['legs0_segments0_departureFrom_airport_iata'].map(departure_freq).fillna(0)
            
            # 机场到达频次
            arrival_freq = df['legs0_segments0_arrivalTo_airport_iata'].value_counts()
            df['arrival_airport_centrality'] = df['legs0_segments0_arrivalTo_airport_iata'].map(arrival_freq).fillna(0)
            
            # 总体机场重要性
            df['airport_network_importance'] = (df['departure_airport_centrality'] + 
                                               df['arrival_airport_centrality']) / 2
            
            # 枢纽机场识别（前10%）
            hub_threshold = df['airport_network_importance'].quantile(0.9)
            df['is_hub_route'] = (df['airport_network_importance'] > hub_threshold).astype('int8')
        
        # 2. 航空公司网络特征
        if 'legs0_segments0_marketingCarrier_code' in df.columns:
            # 航空公司路线多样性
            carrier_routes = df.groupby('legs0_segments0_marketingCarrier_code')['route_identifier'].nunique()
            df['carrier_route_diversity'] = df['legs0_segments0_marketingCarrier_code'].map(carrier_routes).fillna(0)
            
            # 网络覆盖度分级
            df['carrier_network_tier'] = pd.qcut(df['carrier_route_diversity'], 
                                                q=4, labels=[0, 1, 2, 3], 
                                                duplicates='drop').fillna(0).astype('int8')
        
        return df
    
    
    def create_competitive_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建竞争态势特征"""
        df = df.copy()
        self.log_info("创建竞争态势特征...")
        
        if 'ranker_id' in df.columns:
            # 1. 价格竞争特征
            if 'totalPrice_bin' in df.columns:
                # 价格排名百分位
                df['price_percentile'] = df.groupby('ranker_id')['totalPrice_bin'].rank(pct=True)
                
                # 价格四分位数分类
                df['price_quartile'] = pd.qcut(df['price_percentile'], 
                                             q=4, labels=[0, 1, 2, 3], 
                                             duplicates='drop').fillna(0).astype('int8')
                
                # 与最低价的差距
                min_price = df.groupby('ranker_id')['totalPrice_bin'].transform('min')
                df['price_gap_from_min'] = df['totalPrice_bin'] - min_price
                
                # 价格竞争优势
                df['price_competitive_advantage'] = np.select([
                    df['price_quartile'] == 0,  # 最便宜25%
                    df['price_quartile'] == 1,  # 次便宜25%
                ], [2, 1], default=0).astype('int8')
            
            # 2. 时间竞争特征
            if 'total_duration' in df.columns:
                # 时间排名
                df['duration_rank'] = df.groupby('ranker_id')['total_duration'].rank()
                df['duration_percentile'] = df.groupby('ranker_id')['total_duration'].rank(pct=True)
                
                # 时间竞争优势
                df['time_competitive_advantage'] = (df['duration_percentile'] <= 0.3).astype('int8')
            
            # 3. 综合竞争力得分
            competitive_factors = []
            if 'price_competitive_advantage' in df.columns:
                competitive_factors.append('price_competitive_advantage')
            if 'time_competitive_advantage' in df.columns:
                competitive_factors.append('time_competitive_advantage')
            if 'convenience_score' in df.columns:
                competitive_factors.append('convenience_score')
            
            if len(competitive_factors) >= 2:
                df['overall_competitiveness'] = df[competitive_factors].sum(axis=1).astype('int8')
        
        return df
    
    
    def create_anomaly_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建异常检测特征"""
        df = df.copy()
        self.log_info("创建异常检测特征...")
        
        # 1. 价格异常
        if 'totalPrice_bin' in df.columns:
            # 价格异常值检测（基于IQR）
            Q1 = df['totalPrice_bin'].quantile(0.25)
            Q3 = df['totalPrice_bin'].quantile(0.75)
            IQR = Q3 - Q1
            
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            df['is_price_outlier'] = (
                (df['totalPrice_bin'] < lower_bound) | 
                (df['totalPrice_bin'] > upper_bound)
            ).astype('int8')
            
            # 价格异常程度
            df['price_anomaly_score'] = np.maximum(
                (lower_bound - df['totalPrice_bin']) / IQR,
                (df['totalPrice_bin'] - upper_bound) / IQR
            )
            df['price_anomaly_score'] = np.clip(df['price_anomaly_score'], 0, 5).astype('float32')
        
        # 2. 时间异常
        if 'total_duration' in df.columns:
            # 时间异常检测
            duration_mean = df['total_duration'].mean()
            duration_std = df['total_duration'].std()
            
            df['is_duration_outlier'] = (
                np.abs(df['total_duration'] - duration_mean) > 2 * duration_std
            ).astype('int8')
        
        # 3. 组合异常（价格-时间组合）
        if 'totalPrice_bin' in df.columns and 'total_duration' in df.columns:
            # 性价比异常（极高价格但极长时间，或极低价格但极短时间）
            df['value_anomaly'] = (
                ((df['totalPrice_bin'] > df['totalPrice_bin'].quantile(0.9)) & 
                 (df['total_duration'] > df['total_duration'].quantile(0.9))) |
                ((df['totalPrice_bin'] < df['totalPrice_bin'].quantile(0.1)) & 
                 (df['total_duration'] < df['total_duration'].quantile(0.1)))
            ).astype('int8')
        
        return df
    
    
    def create_meta_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """创建元特征（关于特征的特征）"""
        df = df.copy()
        self.log_info("创建元特征...")
        
        # 1. 特征完整性统计
        key_feature_cols = ['totalPrice_bin', 'total_duration', 'legs0_departureAt_hour', 
                           'business_traveler_score', 'booking_urgency']
        
        existing_key_features = [col for col in key_feature_cols if col in df.columns]
        
        if existing_key_features:
            # 关键特征的非缺失数量
            df['key_features_available'] = (df[existing_key_features] != -1).sum(axis=1).astype('int8')
            
            # 特征完整性比例
            df['feature_completeness'] = df['key_features_available'] / len(existing_key_features)
        
        # 2. 选择复杂度元特征
        if 'ranker_id' in df.columns:
            # 每个ranker_id的特征分布复杂度
            complexity_features = ['totalPrice_bin', 'total_duration', 'convenience_score']
            existing_complexity_features = [col for col in complexity_features if col in df.columns]
            
            if existing_complexity_features:
                # 计算组内特征方差（复杂度代理）
                for feature in existing_complexity_features:
                    variance_col = f'{feature}_group_variance'
                    df[variance_col] = df.groupby('ranker_id')[feature].transform('std').fillna(0).astype('float32')
        
        # 3. 特征相关性元特征
        if 'totalPrice_bin' in df.columns and 'total_duration' in df.columns:
            # 价格-时间相关性（组内）
            def group_correlation(group):
                if len(group) > 2:
                    return group['totalPrice_bin'].corr(group['total_duration'])
                return 0
            
            group_corr = df.groupby('ranker_id').apply(group_correlation).fillna(0)
            group_corr.name = 'price_duration_correlation'
            df = df.merge(group_corr.reset_index(), on='ranker_id', how='left')
        
        return df
    
    
    def apply_feature_selection(self, df: pd.DataFrame, target_col: str = 'selected', 
                               max_features: int = 200) -> pd.DataFrame:
        """应用特征选择"""
        df = df.copy()
        self.log_info(f"应用特征选择，目标最大特征数：{max_features}")
        
        if target_col not in df.columns:
            self.log_info(f"目标列 {target_col} 不存在，跳过特征选择")
            return df
        
        # 分离特征和目标
        feature_cols = [col for col in df.columns if col not in ['Id', 'ranker_id', target_col]]
        
        if len(feature_cols) <= max_features:
            self.log_info(f"特征数量({len(feature_cols)})未超过限制，跳过特征选择")
            return df
        
        try:
            # 使用互信息进行特征选择
            X = df[feature_cols].fillna(0)
            y = df[target_col].fillna(0)
            
            # 随机抽样以提高效率
            sample_size = min(50000, len(df))
            if len(df) > sample_size:
                sample_indices = np.random.choice(len(df), sample_size, replace=False)
                X_sample = X.iloc[sample_indices]
                y_sample = y.iloc[sample_indices]
            else:
                X_sample = X
                y_sample = y
            
            # 计算互信息
            mi_scores = mutual_info_regression(X_sample, y_sample, random_state=42)
            
            # 选择top特征
            top_indices = np.argsort(mi_scores)[-max_features:]
            selected_features = [feature_cols[i] for i in top_indices]
            
            # 保留选中的特征
            keep_cols = ['Id', 'ranker_id', target_col] + selected_features
            df_selected = df[keep_cols].copy()
            
            # 保存特征重要性
            feature_importance = dict(zip([feature_cols[i] for i in top_indices], 
                                        mi_scores[top_indices]))
            self.feature_importance_cache[target_col] = feature_importance
            
            self.log_info(f"特征选择完成：{len(feature_cols)} -> {len(selected_features)}")
            
            return df_selected
            
        except Exception as e:
            self.log_info(f"特征选择失败：{str(e)}")
            return df
    
    
    def process_enhanced_features(self, df: pd.DataFrame, 
                                feature_types: List[str] = None,
                                apply_selection: bool = True,
                                max_features: int = 200) -> pd.DataFrame:
        """
        增强特征工程主入口
        
        Args:
            df: 输入数据（已编码）
            feature_types: 要创建的特征类型
            apply_selection: 是否应用特征选择
            max_features: 最大特征数量
        """
        if feature_types is None:
            feature_types = ['economic', 'behavioral', 'operational', 'customer', 
                           'market', 'interaction', 'temporal', 'network', 
                           'competitive', 'anomaly', 'meta']
        
        df_processed = df.copy()
        original_shape = df_processed.shape
        
        self.processing_stats['original_shape'] = original_shape
        self.log_info(f"开始增强特征工程: {original_shape}")
        
        # 按类型创建特征
        feature_methods = {
            'economic': self.create_economic_features,
            'behavioral': self.create_behavioral_features,
            'operational': self.create_operational_features,
            'customer': self.create_customer_segmentation_features,
            'market': self.create_market_dynamics_features,
            'interaction': self.create_deep_interaction_features,
            'polynomial': lambda x: self.create_polynomial_features(x, degree=2),
            'clustering': self.create_clustering_features,
            'embedding': self.create_embedding_features,
            'temporal': self.create_temporal_features,
            'network': self.create_network_features,
            'competitive': self.create_competitive_features,
            'anomaly': self.create_anomaly_features,
            'meta': self.create_meta_features
        }
        
        for feature_type in feature_types:
            if feature_type in feature_methods:
                try:
                    self.log_info(f"创建 {feature_type} 特征...")
                    df_processed = feature_methods[feature_type](df_processed)
                        
                except Exception as e:
                    self.log_info(f"创建 {feature_type} 特征失败: {str(e)}")
                    continue
        
        # 特征选择
        if apply_selection and 'selected' in df_processed.columns:
            df_processed = self.apply_feature_selection(df_processed, max_features=max_features)
                    
        # 记录处理后状态
        final_shape = df_processed.shape
        self.processing_stats['final_shape'] = final_shape
        self.processing_stats['features_added'] = final_shape[1] - original_shape[1]
        
        self.log_info(f"增强特征工程完成: {original_shape} -> {final_shape}")
        self.log_info(f"新增特征: {self.processing_stats['features_added']}")
        
        del df
        gc.collect()
        
        return df_processed
    
    def get_feature_importance(self, target_col: str = 'selected') -> Dict:
        """获取特征重要性"""
        return self.feature_importance_cache.get(target_col, {})
    
    def get_cluster_models(self) -> Dict:
        """获取聚类模型"""
        return self.cluster_models
    
    def get_processing_summary(self) -> Dict:
        """获取处理总结"""
        summary = self.processing_stats.copy()
        summary['feature_importance'] = self.feature_importance_cache
        summary['cluster_models_count'] = len(self.cluster_models)
        return summary





In [3]:
class AutoFeatureDiscovery:
    """自动特征发现类 - 使用深度学习思想的探索式特征生成"""
    
    def __init__(self, logger=None):
        self.logger = logger
        self.discovered_features = {}
        self.feature_generators = []
    
    def log_info(self, message: str):
        """统一日志输出"""
        if self.logger:
            self.logger.info(message)
    
    
    def discover_arithmetic_features(self, df: pd.DataFrame, 
                                   top_k: int = 20) -> pd.DataFrame:
        """发现算术组合特征"""
        df = df.copy()
        self.log_info("发现算术组合特征...")
        
        # 选择数值特征
        numeric_cols = []
        for col in df.columns:
            if (df[col].dtype in ['int8', 'int16', 'int32', 'float32', 'float64'] and 
                col not in ['Id', 'ranker_id', 'selected']):
                numeric_cols.append(col)
        
        if len(numeric_cols) < 2:
            return df
        
        # 限制特征数量以避免组合爆炸
        selected_numeric = numeric_cols[:min(10, len(numeric_cols))]
        discovered_count = 0
        
        # 生成二元算术特征
        operations = [
            ('add', lambda x, y: x + y),
            ('subtract', lambda x, y: np.abs(x - y)),
            ('multiply', lambda x, y: x * y),
            ('divide', lambda x, y: x / (y + 1)),
            ('max', lambda x, y: np.maximum(x, y)),
            ('min', lambda x, y: np.minimum(x, y))
        ]
        
        candidate_features = []
        
        for i, col1 in enumerate(selected_numeric):
            for j, col2 in enumerate(selected_numeric[i+1:], i+1):
                for op_name, op_func in operations:
                    if discovered_count >= top_k:
                        break
                    
                    try:
                        feature_name = f'auto_{op_name}_{col1}_{col2}'
                        feature_values = op_func(df[col1].fillna(0), df[col2].fillna(0))
                        
                        # 检查特征质量
                        if self._is_good_feature(feature_values):
                            candidate_features.append((feature_name, feature_values))
                            discovered_count += 1
                            
                    except Exception:
                        continue
                        
                if discovered_count >= top_k:
                    break
            if discovered_count >= top_k:
                break
        
        # 添加发现的特征
        for feature_name, feature_values in candidate_features:
            df[feature_name] = feature_values.astype('int32')
        
        self.log_info(f"发现 {len(candidate_features)} 个算术特征")
        return df
    
    
    def discover_conditional_features(self, df: pd.DataFrame, 
                                    top_k: int = 15) -> pd.DataFrame:
        """发现条件特征"""
        df = df.copy()
        self.log_info("发现条件特征...")
        
        # 选择特征
        numeric_cols = [col for col in df.columns 
                       if df[col].dtype in ['int8', 'int16', 'int32', 'float32'] 
                       and col not in ['Id', 'ranker_id', 'selected']]
        
        if len(numeric_cols) < 2:
            return df
        
        discovered_count = 0
        
        # 生成条件特征
        for i, col1 in enumerate(numeric_cols[:8]):
            for j, col2 in enumerate(numeric_cols[:8]):
                if i == j or discovered_count >= top_k:
                    continue
                
                try:
                    # 条件1: col1 > median(col1) AND col2 > median(col2)
                    med1 = df[col1].median()
                    med2 = df[col2].median()
                    
                    feature_name1 = f'auto_both_high_{col1}_{col2}'
                    feature_values1 = ((df[col1] > med1) & (df[col2] > med2)).astype('int8')
                    
                    if self._is_good_feature(feature_values1):
                        df[feature_name1] = feature_values1
                        discovered_count += 1
                    
                    # 条件2: col1 > 75th percentile OR col2 > 75th percentile
                    p75_1 = df[col1].quantile(0.75)
                    p75_2 = df[col2].quantile(0.75)
                    
                    feature_name2 = f'auto_either_high_{col1}_{col2}'
                    feature_values2 = ((df[col1] > p75_1) | (df[col2] > p75_2)).astype('int8')
                    
                    if self._is_good_feature(feature_values2) and discovered_count < top_k:
                        df[feature_name2] = feature_values2
                        discovered_count += 1
                        
                except Exception:
                    continue
        
        self.log_info(f"发现 {discovered_count} 个条件特征")
        return df
    
    
    def discover_ranking_features(self, df: pd.DataFrame, 
                                top_k: int = 10) -> pd.DataFrame:
        """发现排名特征"""
        df = df.copy()
        self.log_info("发现排名特征...")
        
        if 'ranker_id' not in df.columns:
            return df
        
        # 选择数值特征
        numeric_cols = [col for col in df.columns 
                       if df[col].dtype in ['int8', 'int16', 'int32', 'float32'] 
                       and col not in ['Id', 'ranker_id', 'selected']]
        
        discovered_count = 0
        
        for col in numeric_cols[:top_k]:
            try:
                # 组内排名
                feature_name1 = f'auto_rank_{col}'
                df[feature_name1] = df.groupby('ranker_id')[col].rank().astype('int16')
                
                # 组内百分位排名
                feature_name2 = f'auto_pct_rank_{col}'
                df[feature_name2] = (df.groupby('ranker_id')[col].rank(pct=True) * 100).astype('int8')
                
                # 与组内最值的比较
                feature_name3 = f'auto_vs_max_{col}'
                group_max = df.groupby('ranker_id')[col].transform('max')
                df[feature_name3] = (df[col] / (group_max + 1) * 100).astype('int8')
                
                discovered_count += 3
                
            except Exception:
                continue
        
        self.log_info(f"发现 {discovered_count} 个排名特征")
        return df
    
    
    def discover_aggregation_features(self, df: pd.DataFrame, 
                                    top_k: int = 15) -> pd.DataFrame:
        """发现聚合特征"""
        df = df.copy()
        self.log_info("发现聚合特征...")
        
        if 'ranker_id' not in df.columns:
            return df
        
        # 选择数值特征
        numeric_cols = [col for col in df.columns 
                       if df[col].dtype in ['int8', 'int16', 'int32', 'float32'] 
                       and col not in ['Id', 'ranker_id', 'selected']]
        
        # 聚合函数
        agg_functions = {
            'mean': 'mean',
            'std': 'std', 
            'range': lambda x: x.max() - x.min(),
            'q75_q25': lambda x: x.quantile(0.75) - x.quantile(0.25)
        }
        
        discovered_count = 0
        
        for col in numeric_cols[:min(5, len(numeric_cols))]:
            for agg_name, agg_func in agg_functions.items():
                if discovered_count >= top_k:
                    break
                
                try:
                    feature_name = f'auto_group_{agg_name}_{col}'
                    
                    if agg_name in ['mean', 'std']:
                        feature_values = df.groupby('ranker_id')[col].transform(agg_func).fillna(0)
                    else:
                        feature_values = df.groupby('ranker_id')[col].transform(agg_func).fillna(0)
                    
                    if self._is_good_feature(feature_values):
                        df[feature_name] = feature_values.astype('float32')
                        discovered_count += 1
                        
                except Exception:
                    continue
        
        self.log_info(f"发现 {discovered_count} 个聚合特征")
        return df
    
    def _is_good_feature(self, feature_values: np.ndarray, 
                        min_variance: float = 0.01, 
                        max_missing_rate: float = 0.8) -> bool:
        """判断特征质量"""
        try:
            # 检查方差
            if np.var(feature_values) < min_variance:
                return False
            
            # 检查缺失率
            missing_rate = np.isnan(feature_values).mean()
            if missing_rate > max_missing_rate:
                return False
            
            # 检查唯一值数量
            unique_values = len(np.unique(feature_values[~np.isnan(feature_values)]))
            if unique_values < 2:
                return False
            
            return True
            
        except Exception:
            return False
    
    
    def auto_discover_features(self, df: pd.DataFrame, 
                             max_total_features: int = 50) -> pd.DataFrame:
        """自动特征发现主入口"""
        df_processed = df.copy()
        original_features = df_processed.shape[1]
        
        self.log_info(f"开始自动特征发现，原始特征数：{original_features}")
        
        # 按重要性顺序应用特征发现方法
        discovery_methods = [
            ('arithmetic', self.discover_arithmetic_features, max_total_features // 3),
            ('ranking', self.discover_ranking_features, max_total_features // 5),
            ('conditional', self.discover_conditional_features, max_total_features // 4),
            ('aggregation', self.discover_aggregation_features, max_total_features // 3)
        ]
        
        total_discovered = 0
        
        for method_name, method_func, max_features in discovery_methods:
            if total_discovered >= max_total_features:
                break
                
            try:
                before_count = df_processed.shape[1]
                df_processed = method_func(df_processed, top_k=max_features)
                after_count = df_processed.shape[1]
                
                method_discovered = after_count - before_count
                total_discovered += method_discovered
                
                self.log_info(f"{method_name} 方法发现 {method_discovered} 个特征")
                
            except Exception as e:
                self.log_info(f"{method_name} 特征发现失败: {str(e)}")
                continue
        
        final_features = df_processed.shape[1]
        self.discovered_features['auto_discovery'] = {
            'original_features': original_features,
            'final_features': final_features,
            'discovered_features': total_discovered
        }
        
        self.log_info(f"自动特征发现完成：{original_features} -> {final_features} 特征")
        return df_processed

In [4]:
class IntegratedDataEngineering:
    """集成数据工程类 - 整合所有特征工程方法"""
    
    def __init__(self, logger=None, config: Dict = None):
        self.logger = logger
        self.config = config or {}
        
        # 初始化各个组件
        self.enhanced_eng = EnhancedDataEngineering(logger=logger)
        self.auto_discovery = AutoFeatureDiscovery(logger=logger)
        
        # 处理统计
        self.processing_stats = {}
    
    def log_info(self, message: str):
        """统一日志输出"""
        if self.logger:
            self.logger.info(message)
    
    
    def create_all_features(self, df: pd.DataFrame, 
                          enhanced_features: bool = True,
                          auto_discovery: bool = True,
                          feature_selection: bool = True,
                          max_features: int = 200) -> pd.DataFrame:
        """创建所有类型的特征"""
        df_processed = df.copy()
        
        self.log_info("=" * 60)
        self.log_info("开始集成特征工程")
        self.log_info("=" * 60)
        
        original_shape = df_processed.shape
        self.processing_stats['original_shape'] = original_shape
        
        # 阶段1：增强特征工程
        if enhanced_features:
            self.log_info("阶段1：增强特征工程")
            
            enhanced_config = self.config.get('enhanced_features', {})
            feature_types = enhanced_config.get('feature_types', 
                                              ['economic', 'behavioral', 'operational', 
                                               'customer', 'market', 'temporal', 'network'])
            
            try:
                df_processed = self.enhanced_eng.process_enhanced_features(
                    df_processed, 
                    feature_types=feature_types,
                    apply_selection=False,  # 稍后统一进行特征选择
                    max_features=max_features * 2  # 给自动发现留空间
                )
                
                enhanced_shape = df_processed.shape
                self.processing_stats['enhanced_shape'] = enhanced_shape
                self.log_info(f"增强特征工程完成：{original_shape} -> {enhanced_shape}")
                
            except Exception as e:
                self.log_info(f"增强特征工程失败：{str(e)}")
                enhanced_shape = df_processed.shape
        
        # 阶段2：自动特征发现
        if auto_discovery:
            self.log_info("阶段2：自动特征发现")
            
            auto_config = self.config.get('auto_discovery', {})
            max_auto_features = auto_config.get('max_features', 50)
            
            try:
                df_processed = self.auto_discovery.auto_discover_features(
                    df_processed, 
                    max_total_features=max_auto_features
                )
                
                auto_shape = df_processed.shape
                self.processing_stats['auto_discovery_shape'] = auto_shape
                self.log_info(f"自动特征发现完成：{enhanced_shape} -> {auto_shape}")
                
            except Exception as e:
                self.log_info(f"自动特征发现失败：{str(e)}")
                auto_shape = df_processed.shape
        
        # 阶段3：特征选择
        if feature_selection and 'selected' in df_processed.columns:
            self.log_info("阶段3：特征选择")
            
            try:
                df_processed = self.enhanced_eng.apply_feature_selection(
                    df_processed, 
                    target_col='selected',
                    max_features=max_features
                )
                
                final_shape = df_processed.shape
                self.processing_stats['final_shape'] = final_shape
                self.log_info(f"特征选择完成：{auto_shape} -> {final_shape}")
                
            except Exception as e:
                self.log_info(f"特征选择失败：{str(e)}")
                final_shape = df_processed.shape
        else:
            final_shape = df_processed.shape
            self.processing_stats['final_shape'] = final_shape
                
        # 统计汇总
        total_features_added = final_shape[1] - original_shape[1]
        self.processing_stats['total_features_added'] = total_features_added
        
        self.log_info("=" * 60)
        self.log_info("集成特征工程完成")
        self.log_info(f"最终结果：{original_shape} -> {final_shape}")
        self.log_info(f"新增特征：{total_features_added}")
        self.log_info("=" * 60)
        
        return df_processed
    
    
    def process_segment_file(self, input_file: str, output_file: str,
                           enhanced_features: bool = True,
                           auto_discovery: bool = True,
                           feature_selection: bool = True,
                           max_features: int = 200) -> bool:
        """处理单个segment文件"""
        try:
            if not os.path.exists(input_file):
                self.log_info(f"输入文件不存在: {input_file}")
                return False
            
            # 检查文件是否为空
            pf = pq.ParquetFile(input_file)
            if pf.metadata.num_rows == 0:
                self.log_info(f"空文件，跳过: {input_file}")
                # 创建空的输出文件
                empty_df = pd.DataFrame()
                empty_df.to_parquet(output_file, index=False)
                return True
            
            # 读取数据
            df = pd.read_parquet(input_file)
            
            if len(df) == 0:
                self.log_info(f"空数据，跳过: {input_file}")
                df.to_parquet(output_file, index=False)
                return True
            
            # 集成特征工程
            df_processed = self.create_all_features(
                df, 
                enhanced_features=enhanced_features,
                auto_discovery=auto_discovery,
                feature_selection=feature_selection,
                max_features=max_features
            )
            
            # 保存结果
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            df_processed.to_parquet(output_file, index=False)
            
            self.log_info(f"处理完成: {input_file} -> {output_file}")
            self.log_info(f"  形状: {df.shape} -> {df_processed.shape}")
            
            return True
            
        except Exception as e:
            self.log_info(f"处理文件失败 {input_file}: {str(e)}")
            return False
    
    
    def process_all_segments(self, input_dir: str, output_dir: str, 
                           data_type: str,
                           enhanced_features: bool = True,
                           auto_discovery: bool = True,
                           feature_selection: bool = True,
                           max_features: int = 200) -> bool:
        """处理所有segment文件"""
        input_dir = Path(input_dir)
        output_dir = Path(output_dir)
        
        # 确保输出目录存在
        output_dir.mkdir(parents=True, exist_ok=True)
        
        success_count = 0
        total_count = 0
        
        for segment_level in [0, 1, 2, 3]:
            input_file = input_dir / f"{data_type}_segment_{segment_level}.parquet"
            output_file = output_dir / f"{data_type}_segment_{segment_level}.parquet"
            
            total_count += 1
            
            if self.process_segment_file(
                str(input_file), str(output_file),
                enhanced_features=enhanced_features,
                auto_discovery=auto_discovery,
                feature_selection=feature_selection,
                max_features=max_features
            ):
                success_count += 1
        
        self.log_info(f"批处理完成: {success_count}/{total_count} 个文件成功")
        return success_count == total_count
    
    def get_comprehensive_summary(self) -> Dict:
        """获取综合处理总结"""
        summary = {
            'processing_stats': self.processing_stats,
            'enhanced_engineering': self.enhanced_eng.get_processing_summary(),
            'auto_discovery': self.auto_discovery.discovered_features,
            'feature_importance': self.enhanced_eng.get_feature_importance(),
            'cluster_models': self.enhanced_eng.get_cluster_models()
        }
        return summary
    
    def save_feature_metadata(self, output_path: str):
        """保存特征元数据"""
        metadata = self.get_comprehensive_summary()
        
        try:
            import pickle
            with open(output_path, 'wb') as f:
                pickle.dump(metadata, f)
            self.log_info(f"特征元数据已保存: {output_path}")
        except Exception as e:
            self.log_info(f"保存特征元数据失败: {str(e)}")


In [5]:
# 使用示例和配置
ENHANCED_FEATURE_CONFIG = {
    'enhanced_features': {
        'feature_types': [
            'economic',      # 经济学特征
            'behavioral',    # 行为经济学特征  
            'operational',   # 运营特征
            'customer',      # 客户细分特征
            'market',        # 市场动态特征
            'temporal',      # 时间特征
            'network',       # 网络特征
            'competitive',   # 竞争特征
            'anomaly',       # 异常特征
            'meta'           # 元特征
        ]
    },
    'auto_discovery': {
        'max_features': 50,
        'enable_arithmetic': True,
        'enable_conditional': True,
        'enable_ranking': True,
        'enable_aggregation': True
    },
    'feature_selection': {
        'method': 'mutual_info',
        'max_features': 200,
        'selection_ratio': 0.8
    }
}

In [6]:
# 示例用法
config = ENHANCED_FEATURE_CONFIG

# 初始化集成特征工程
integrated_eng = IntegratedDataEngineering(
    logger=logger,  # 传入实际的logger
    config=config
)

# 处理单个数据类型的所有segments
def process_data_type_enhanced(data_type: str, 
                                segment_dir: str, 
                                output_dir: str):
    """处理单个数据类型"""
    
    success = integrated_eng.process_all_segments(
        input_dir=f"{segment_dir}/{data_type}",
        output_dir=f"{output_dir}/{data_type}",
        data_type=data_type,
        enhanced_features=True,
        auto_discovery=True,
        feature_selection=True,
        max_features=200
    )
    
    if success:
        # 保存特征元数据
        metadata_path = f"{output_dir}/{data_type}_feature_metadata.pkl"
        integrated_eng.save_feature_metadata(metadata_path)
        
        # 获取处理总结
        summary = integrated_eng.get_comprehensive_summary()
        print(f"{data_type} 处理总结:")
        print(f"  - 原始特征: {summary['processing_stats'].get('original_shape', [0, 0])[1]}")
        print(f"  - 最终特征: {summary['processing_stats'].get('final_shape', [0, 0])[1]}")
        print(f"  - 新增特征: {summary['processing_stats'].get('total_features_added', 0)}")
        
        return True
    
    return False

process_data_type_enhanced("test", "../data/aeroclub-recsys-2025/processed", "../data/aeroclub-recsys-2025/enhanced")

2025-08-05 20:38:48 |     INFO | __main__ | 开始集成特征工程
2025-08-05 20:38:48 |     INFO | __main__ | 阶段1：增强特征工程
2025-08-05 20:38:48 |     INFO | __main__ | 开始增强特征工程: (2427225, 103)
2025-08-05 20:38:48 |     INFO | __main__ | 创建 economic 特征...
2025-08-05 20:38:49 |     INFO | __main__ | 创建经济学特征...
2025-08-05 20:38:49 |     INFO | __main__ | 创建 behavioral 特征...
2025-08-05 20:38:50 |     INFO | __main__ | 创建行为经济学特征...
2025-08-05 20:38:52 |     INFO | __main__ | 创建 operational 特征...
2025-08-05 20:38:52 |     INFO | __main__ | 创建运营特征...
2025-08-05 20:38:54 |     INFO | __main__ | 创建 customer 特征...
2025-08-05 20:38:54 |     INFO | __main__ | 创建客户细分特征...
2025-08-05 20:38:58 |     INFO | __main__ | 创建 market 特征...
2025-08-05 20:38:58 |     INFO | __main__ | 创建市场动态特征...
2025-08-05 20:39:00 |     INFO | __main__ | 创建 temporal 特征...
2025-08-05 20:39:00 |     INFO | __main__ | 创建时间序列特征...
2025-08-05 20:39:00 |     INFO | __main__ | 创建 network 特征...
2025-08-05 20:39:01 |     INFO | __main__ | 创建网络结构特征.

test 处理总结:
  - 原始特征: 151
  - 最终特征: 287
  - 新增特征: 136


True

In [9]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import time
import os
import gc
import warnings
import subprocess
from typing import List, Dict, Tuple, Optional
warnings.filterwarnings('ignore')

class FeatureSelector:
    def __init__(self, max_features: int = 200):
        self.max_features = max_features
        self.selected_features = None
        
    def remove_constant_features(self, X: pd.DataFrame) -> List[str]:
        valid_features = []
        for col in tqdm(X.columns, desc="移除常量特征", leave=False):
            unique_ratio = X[col].nunique() / len(X)
            if X[col].nunique() > 1:
                mode_ratio = X[col].value_counts().iloc[0] / len(X)
                if mode_ratio < 0.95 and unique_ratio > 0.01:
                    valid_features.append(col)
        return valid_features
    
    def remove_correlated_features(self, X: pd.DataFrame) -> List[str]:
        corr_matrix = X.corr().abs()
        upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
        to_drop = [col for col in upper_triangle.columns if any(upper_triangle[col] > 0.95)]
        return [col for col in X.columns if col not in to_drop]
    
    def select_by_importance(self, X: pd.DataFrame, y: pd.Series, top_k: int) -> List[str]:
        y_binary = (y == 1).astype(int)
        
        if len(X) > 50000:
            sample_idx = np.random.choice(len(X), 50000, replace=False)
            X_sample, y_sample = X.iloc[sample_idx], y_binary.iloc[sample_idx]
        else:
            X_sample, y_sample = X, y_binary
        
        try:
            model = lgb.LGBMClassifier(n_estimators=50, max_depth=6, verbose=-1, random_state=42)
            model.fit(X_sample, y_sample)
            importance = dict(zip(X.columns, model.feature_importances_))
        except:
            model = RandomForestClassifier(n_estimators=50, max_depth=8, random_state=42, n_jobs=4)
            model.fit(X_sample, y_sample)
            importance = dict(zip(X.columns, model.feature_importances_))
        
        return [f[0] for f in sorted(importance.items(), key=lambda x: x[1], reverse=True)[:top_k]]
    
    def select_by_statistical(self, X: pd.DataFrame, y: pd.Series, top_k: int) -> List[str]:
        y_binary = (y == 1).astype(int)
        
        if len(X) > 30000:
            sample_idx = np.random.choice(len(X), 30000, replace=False)
            X_sample, y_sample = X.iloc[sample_idx], y_binary.iloc[sample_idx]
        else:
            X_sample, y_sample = X, y_binary
        
        try:
            selector = SelectKBest(score_func=f_classif, k=min(top_k, X_sample.shape[1]))
            selector.fit(X_sample, y_sample)
            scores = dict(zip(X.columns, selector.scores_))
            return [f[0] for f in sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_k]]
        except:
            return X.columns.tolist()[:top_k]
    
    def select_features(self, X: pd.DataFrame, y: pd.Series) -> List[str]:
        print(f"开始特征选择: {X.shape[1]} -> {self.max_features}")
        
        # 移除常量和相关特征
        features = self.remove_constant_features(X)
        X_filtered = X[features]
        
        with tqdm(total=1, desc="移除相关特征", leave=False) as pbar:
            features = self.remove_correlated_features(X_filtered)
            pbar.update(1)
        
        X_filtered = X_filtered[features]
        
        if len(features) <= self.max_features:
            self.selected_features = features
            print(f"特征选择完成: {len(features)} 个特征")
            return features
        
        # # 多方法选择
        # k = min(self.max_features // 2, len(features))
        
        # with tqdm(total=2, desc="特征选择") as pbar:
        #     importance_features = self.select_by_importance(X_filtered, y, k)
        #     pbar.update(1)
        #     statistical_features = self.select_by_statistical(X_filtered, y, k)
        #     pbar.update(1)
        
        # # 合并去重
        # combined = list(set(importance_features + statistical_features))
        # self.selected_features = combined[:self.max_features]
        
        self.selected_features = features
        print(f"特征选择完成: {len(self.selected_features)} 个特征")
        return self.selected_features

class BaseRanker:
    def __init__(self, name: str):
        self.name = name
        self.models = []
        
    def train_fold(self, X_train, y_train, train_groups, X_val, y_val, val_groups, fold_idx):
        raise NotImplementedError
    
    def predict_fold(self, X_test, fold_idx):
        raise NotImplementedError
    
    def predict_ensemble(self, X_test):
        predictions = [self.predict_fold(X_test, i) for i in range(len(self.models))]
        return np.mean(predictions, axis=0)

class XGBoostRanker(BaseRanker):
    def __init__(self, use_gpu=True):
        super().__init__("XGBoost")
        
        self.params = {
            'objective': 'rank:pairwise',
            'learning_rate': 0.05,
            'gamma': 1.0,
            'min_child_weight': 0.1,
            'max_depth': 8,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'eval_metric': 'ndcg@3',
            'seed': 42
        }
        
        if use_gpu and self._check_gpu():
            self.params.update({
                'tree_method': 'gpu_hist',
                'gpu_id': 0,
                'predictor': 'gpu_predictor'
            })
        else:
            self.params['n_jobs'] = 8
    
    def _check_gpu(self):
        try:
            return subprocess.run(['nvidia-smi'], capture_output=True).returncode == 0
        except:
            return False
    
    def train_fold(self, X_train, y_train, train_groups, X_val, y_val, val_groups, fold_idx):
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtrain.set_group(train_groups)
        
        dval = xgb.DMatrix(X_val, label=y_val)
        dval.set_group(val_groups)
        
        model = xgb.train(
            self.params, dtrain,
            num_boost_round=300,
            evals=[(dval, 'val')],
            early_stopping_rounds=15,
            verbose_eval=False
        )
        
        self.models.append(model)
        return model
    
    def predict_fold(self, X_test, fold_idx):
        dtest = xgb.DMatrix(X_test)
        return self.models[fold_idx].predict(dtest)

class LightGBMRanker(BaseRanker):
    def __init__(self, use_gpu=True):
        super().__init__("LightGBM")
        
        self.params = {
            'objective': 'lambdarank',
            'metric': 'ndcg',
            'ndcg_eval_at': [3],
            'learning_rate': 0.05,
            'num_leaves': 255,
            'max_depth': 8,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'verbose': -1,
            'seed': 42
        }
        
        if use_gpu and self._check_gpu():
            self.params.update({
                'device_type': 'gpu',
                'gpu_platform_id': 0,
                'gpu_device_id': 0
            })
        else:
            self.params['num_threads'] = 8
    
    def _check_gpu(self):
        try:
            return subprocess.run(['nvidia-smi'], capture_output=True).returncode == 0
        except:
            return False
    
    def train_fold(self, X_train, y_train, train_groups, X_val, y_val, val_groups, fold_idx):
        train_data = lgb.Dataset(X_train, label=y_train, group=train_groups)
        val_data = lgb.Dataset(X_val, label=y_val, group=val_groups, reference=train_data)
        
        model = lgb.train(
            self.params, train_data,
            num_boost_round=300,
            valid_sets=[val_data],
            callbacks=[lgb.early_stopping(15), lgb.log_evaluation(0)]
        )
        
        self.models.append(model)
        return model
    
    def predict_fold(self, X_test, fold_idx):
        return self.models[fold_idx].predict(X_test, num_iteration=self.models[fold_idx].best_iteration)

class StratifiedFlightRanker:
    def __init__(self, data_path: str, output_path: str, segment_num:int = 0, use_gpu: bool = True, 
                 max_rankers_per_layer: int = 1000, n_folds: int = 3, max_features: int = 200):
        self.data_path = data_path
        self.output_path = output_path
        self.segment_num = segment_num
        self.max_rankers_per_layer = max_rankers_per_layer
        self.n_folds = n_folds
        os.makedirs(self.output_path, exist_ok=True)
        
        self.rankers = {}
        self.selected_features = None
        self.feature_selector = FeatureSelector(max_features)
    
    def add_ranker(self, ranker: BaseRanker):
        self.rankers[ranker.name] = ranker
    
    def load_and_align_data(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        print("加载数据...")
        train_df = pd.read_parquet(os.path.join(self.data_path, f'train/train_segment_{self.segment_num}.parquet'))
        test_df = pd.read_parquet(os.path.join(self.data_path, f'test/test_segment_{self.segment_num}.parquet'))
        
        # 找共同特征
        train_features = [col for col in train_df.columns if col not in ['Id', 'ranker_id', 'selected']]
        test_features = [col for col in test_df.columns if col not in ['Id', 'ranker_id', 'selected']]
        common_features = sorted(list(set(train_features) & set(test_features)))
        
        # 清理数据
        with tqdm(total=2, desc="清理数据") as pbar:
            train_df = self._clean_data(train_df[['Id', 'ranker_id', 'selected'] + common_features])
            pbar.update(1)
            test_df = self._clean_data(test_df[['Id', 'ranker_id'] + common_features])
            pbar.update(1)
        
        # 特征选择
        if len(train_df) > 100000:
            sample_groups = np.random.choice(train_df['ranker_id'].unique(), 
                                           min(1000, len(train_df['ranker_id'].unique())), replace=False)
            sample_mask = train_df['ranker_id'].isin(sample_groups)
            X_sample = train_df.loc[sample_mask, common_features]
            y_sample = train_df.loc[sample_mask, 'selected']
        else:
            X_sample = train_df[common_features]
            y_sample = train_df['selected']
        
        self.selected_features = self.feature_selector.select_features(X_sample, y_sample)
        
        train_df = train_df[['Id', 'ranker_id', 'selected'] + self.selected_features]
        test_df = test_df[['Id', 'ranker_id'] + self.selected_features]
        
        print(f"数据加载完成: 训练集{train_df.shape}, 测试集{test_df.shape}")
        return train_df, test_df
    
    def _clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
        for col in df.columns:
            if df[col].dtype.name == 'object' and col not in ['Id', 'ranker_id']:
                try:
                    df[col] = pd.to_numeric(df[col], errors='coerce')
                except:
                    df[col] = df[col].astype('category').cat.codes
        
        return df.replace([np.inf, -np.inf], np.nan).fillna(-1)
    
    def create_stratified_layers(self, train_df: pd.DataFrame) -> List[List[str]]:
        unique_rankers = train_df['ranker_id'].unique()
        np.random.shuffle(unique_rankers)
        
        return [unique_rankers[i:i + self.max_rankers_per_layer].tolist() 
                for i in range(0, len(unique_rankers), self.max_rankers_per_layer)]
    
    def train_layer_cv(self, train_df: pd.DataFrame, layer_rankers: List[str]):
        layer_data = train_df[train_df['ranker_id'].isin(layer_rankers)].copy()
        
        X = layer_data[self.selected_features]
        y = layer_data['selected']
        groups = layer_data['ranker_id']
        
        gkf = GroupKFold(n_splits=self.n_folds)
        total_folds = len(self.rankers) * self.n_folds
        
        with tqdm(total=total_folds, desc=f"训练分层({len(layer_rankers)} rankers)") as pbar:
            for ranker_name, ranker in self.rankers.items():
                for fold_idx, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups=groups)):
                    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
                    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
                    
                    train_groups = layer_data.iloc[train_idx].groupby('ranker_id').size().to_numpy()
                    val_groups = layer_data.iloc[val_idx].groupby('ranker_id').size().to_numpy()
                    
                    pbar.set_postfix({"模型": ranker_name, "Fold": fold_idx+1})
                    ranker.train_fold(X_train, y_train, train_groups, X_val, y_val, val_groups, fold_idx)
                    pbar.update(1)
                    gc.collect()
    
    def predict_stratified(self, test_df: pd.DataFrame, model_weights: Optional[Dict] = None) -> pd.DataFrame:
        if model_weights is None:
            model_weights = {name: 1.0 for name in self.rankers.keys()}
        
        X_test = test_df[self.selected_features]
        final_scores = np.zeros(len(X_test))
        total_weight = 0
        
        with tqdm(total=len(self.rankers), desc="集成预测") as pbar:
            for name, ranker in self.rankers.items():
                if ranker.models:
                    pbar.set_postfix({"模型": name})
                    scores = ranker.predict_ensemble(X_test)
                    weight = model_weights.get(name, 1.0)
                    final_scores += weight * scores
                    total_weight += weight
                pbar.update(1)
        
        final_scores /= total_weight
        
        # 生成排名
        test_df_copy = test_df.copy()
        test_df_copy['score'] = final_scores
        
        rankings = []
        unique_rankers = test_df_copy['ranker_id'].unique()
        
        with tqdm(total=len(unique_rankers), desc="生成排名", leave=False) as pbar:
            for ranker_id in unique_rankers:
                group = test_df_copy[test_df_copy['ranker_id'] == ranker_id]
                group = group.sort_values('score', ascending=False)
                group['rank'] = range(1, len(group) + 1)
                rankings.append(group[['Id', 'rank']])
                pbar.update(1)
        
        rankings_df = pd.concat(rankings)
        results = test_df[['Id', 'ranker_id']].merge(rankings_df, on='Id', how='left')
        results = results.rename(columns={'rank': 'selected'})
        
        if results['selected'].isna().sum() > 0:
            max_rank = results.groupby('ranker_id')['selected'].transform('max')
            results['selected'] = results['selected'].fillna(max_rank + 1)
        
        results['selected'] = results['selected'].astype(int)
        return results
    
    def validate_rankings(self, rankings_df: pd.DataFrame) -> bool:
        if not np.issubdtype(rankings_df['selected'].dtype, np.integer):
            return False
        
        for ranker_id, group in rankings_df.groupby('ranker_id'):
            ranks = sorted(group['selected'].tolist())
            expected_ranks = list(range(1, len(ranks) + 1))
            if ranks != expected_ranks:
                return False
        
        return True
    
    def save_predictions(self, results: pd.DataFrame):
        filename = f"prediction_segment_{self.segment_num}.csv"
        output_path = os.path.join(self.output_path, filename)
        results[['Id', 'ranker_id', 'selected']].to_csv(output_path, index=False)
        
        # 保存特征列表
        with open(os.path.join(self.output_path, 'selected_features.txt'), 'w') as f:
            f.write('\n'.join(self.selected_features))
    
    def run_pipeline(self, model_weights: Optional[Dict] = None) -> pd.DataFrame:
        print("🚀 开始航班推荐排序流程")
        
        train_df, test_df = self.load_and_align_data()
        layers = self.create_stratified_layers(train_df)
        
        print(f"📊 分层训练: {len(layers)} 层, 每层最多 {self.max_rankers_per_layer} 个ranker")
        
        # 训练所有分层
        for i, layer_rankers in enumerate(tqdm(layers, desc="总体进度")):
            print(f"\n第 {i+1}/{len(layers)} 层")
            self.train_layer_cv(train_df, layer_rankers)
        
        print("\n🔮 开始预测...")
        predictions = self.predict_stratified(test_df, model_weights)
        
        print("✅ 验证排名...")
        if self.validate_rankings(predictions):
            self.save_predictions(predictions)
            print(f" 完成! 使用 {len(self.selected_features)} 个特征, {self.n_folds} 折交叉验证")
            return predictions
        else:
            raise ValueError("排名验证失败")

# 使用示例
if __name__ == "__main__":
    DATA_PATH = "../data/aeroclub-recsys-2025/enhanced"
    OUTPUT_PATH = "../data/aeroclub-recsys-2025/enhanced_predicted"
    
    all_submissions = []
    
    for segment_num in range(3):
        ranker = StratifiedFlightRanker(
            DATA_PATH, OUTPUT_PATH,
            max_rankers_per_layer=1000,
            n_folds=3,
            max_features=150,
            segment_num=segment_num
        )
        
        ranker.add_ranker(XGBoostRanker(use_gpu=True))
        ranker.add_ranker(LightGBMRanker(use_gpu=True))
        
        submission = ranker.run_pipeline({"XGBoost": 0.6, "LightGBM": 0.4})
        all_submissions.append(submission)
    
    
    # 假设 all_submissions 是包含三个DataFrame的列表
    final_submission = pd.concat(all_submissions, axis=0, ignore_index=True)

    # 保存为CSV文件
    final_submission.to_csv("final_submission.csv", index=False)
    

🚀 开始航班推荐排序流程
加载数据...


清理数据: 100%|██████████| 2/2 [00:15<00:00,  7.88s/it]


开始特征选择: 238 -> 150


                                                                

特征选择完成: 6 个特征
数据加载完成: 训练集(6089760, 9), 测试集(2427225, 8)
📊 分层训练: 19 层, 每层最多 1000 个ranker


总体进度:   0%|          | 0/19 [00:00<?, ?it/s]


第 1/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.796523


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:04<00:02,  1.47s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.808478


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.66s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.80s/it, 模型=LightGBM, Fold=3]
总体进度:   5%|▌         | 1/19 [00:10<03:16, 10.91s/it]

Early stopping, best iteration is:
[37]	valid_0's ndcg@3: 0.775471

第 2/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.784646
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[38]	valid_0's ndcg@3: 0.752934


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:02,  2.16s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:11<00:00,  1.98s/it, 模型=LightGBM, Fold=3]
总体进度:  11%|█         | 2/19 [00:22<03:17, 11.60s/it]

Early stopping, best iteration is:
[45]	valid_0's ndcg@3: 0.799464

第 3/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[20]	valid_0's ndcg@3: 0.819912


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:04<00:02,  1.39s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.776127
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.43s/it, 模型=LightGBM, Fold=3]
总体进度:  16%|█▌        | 3/19 [00:31<02:44, 10.29s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.78953

第 4/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.77026


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:02,  1.20s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.754098


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.72s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.55s/it, 模型=LightGBM, Fold=3]
总体进度:  21%|██        | 4/19 [00:41<02:29,  9.98s/it]

Early stopping, best iteration is:
[21]	valid_0's ndcg@3: 0.796619

第 5/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[22]	valid_0's ndcg@3: 0.750499
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.788002
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.59s/it, 模型=LightGBM, Fold=3]
总体进度:  26%|██▋       | 5/19 [00:50<02:18,  9.89s/it]

Early stopping, best iteration is:
[43]	valid_0's ndcg@3: 0.761774

第 6/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.718708


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.00it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[30]	valid_0's ndcg@3: 0.746779


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:02,  2.13s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.62s/it, 模型=LightGBM, Fold=3]
总体进度:  32%|███▏      | 6/19 [01:00<02:08,  9.91s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.832098

第 7/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[38]	valid_0's ndcg@3: 0.738232


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:06<00:04,  2.13s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[23]	valid_0's ndcg@3: 0.794105


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:09<00:02,  2.65s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.01s/it, 模型=LightGBM, Fold=3]
总体进度:  37%|███▋      | 7/19 [01:13<02:08, 10.67s/it]

Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.757089

第 8/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.775442


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:02,  1.12s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.77204


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.64s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.36s/it, 模型=LightGBM, Fold=3]
总体进度:  42%|████▏     | 8/19 [01:21<01:49,  9.95s/it]

Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.772409

第 9/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[36]	valid_0's ndcg@3: 0.799231


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:05<00:03,  1.98s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[30]	valid_0's ndcg@3: 0.772081


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:09<00:02,  2.71s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.04s/it, 模型=LightGBM, Fold=3]
总体进度:  47%|████▋     | 9/19 [01:34<01:47, 10.74s/it]

Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.766644

第 10/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[28]	valid_0's ndcg@3: 0.791561


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:05<00:03,  1.69s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.787272


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:01,  1.88s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.59s/it, 模型=LightGBM, Fold=3]
总体进度:  53%|█████▎    | 10/19 [01:43<01:33, 10.43s/it]

Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.803098

第 11/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.804299


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:02,  1.04s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.787573


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:05<00:01,  1.45s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.28s/it, 模型=LightGBM, Fold=3]
总体进度:  58%|█████▊    | 11/19 [01:51<01:17,  9.64s/it]

Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.773888

第 12/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.765832


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.05it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[24]	valid_0's ndcg@3: 0.761443




Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.40s/it, 模型=LightGBM, Fold=3]
总体进度:  63%|██████▎   | 12/19 [02:00<01:05,  9.32s/it]

Early stopping, best iteration is:
[1]	valid_0's ndcg@3: 0.791408

第 13/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.74894




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.766185


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:05<00:01,  1.41s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.28s/it, 模型=LightGBM, Fold=3]
总体进度:  68%|██████▊   | 13/19 [02:08<00:53,  8.88s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.760371

第 14/19 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.770223


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:02,  1.08s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.739496


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.73s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.41s/it, 模型=LightGBM, Fold=3]
总体进度:  74%|███████▎  | 14/19 [02:16<00:44,  8.82s/it]

Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.780158

第 15/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.767227




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[22]	valid_0's ndcg@3: 0.735906


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:01,  2.00s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.53s/it, 模型=LightGBM, Fold=3]
总体进度:  79%|███████▉  | 15/19 [02:26<00:35,  8.99s/it]

Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.76408

第 16/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.752433




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[22]	valid_0's ndcg@3: 0.745787


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:01,  1.93s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.67s/it, 模型=LightGBM, Fold=3]
总体进度:  84%|████████▍ | 16/19 [02:36<00:28,  9.37s/it]

Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.767723

第 17/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.761449




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.734501


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.78s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.72s/it, 模型=LightGBM, Fold=3]
总体进度:  89%|████████▉ | 17/19 [02:46<00:19,  9.72s/it]

Early stopping, best iteration is:
[25]	valid_0's ndcg@3: 0.793551

第 18/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.761829




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.775415




Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.39s/it, 模型=LightGBM, Fold=3]
总体进度:  95%|█████████▍| 18/19 [02:55<00:09,  9.36s/it]

Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.735859

第 19/19 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.761397
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.787351
Training until validation scores don't improve for 15 rounds


训练分层(560 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.33s/it, 模型=LightGBM, Fold=3]
总体进度: 100%|██████████| 19/19 [03:03<00:00,  9.67s/it]


Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.74664

🔮 开始预测...


集成预测: 100%|██████████| 2/2 [00:39<00:00, 19.81s/it, 模型=LightGBM]
                                                               

✅ 验证排名...
 完成! 使用 6 个特征, 3 折交叉验证
🚀 开始航班推荐排序流程
加载数据...


清理数据: 100%|██████████| 2/2 [00:34<00:00, 17.37s/it]


开始特征选择: 263 -> 150


                                                                

特征选择完成: 12 个特征
数据加载完成: 训练集(10865187, 15), 测试集(4111603, 14)
📊 分层训练: 67 层, 每层最多 1000 个ranker


总体进度:   0%|          | 0/67 [00:00<?, ?it/s]


第 1/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.690171
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.691364
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.54s/it, 模型=LightGBM, Fold=3]
总体进度:   1%|▏         | 1/67 [00:09<10:22,  9.43s/it]

Early stopping, best iteration is:
[28]	valid_0's ndcg@3: 0.704008

第 2/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.678358
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[47]	valid_0's ndcg@3: 0.687089
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.62s/it, 模型=LightGBM, Fold=3]
总体进度:   3%|▎         | 2/67 [00:19<10:34,  9.76s/it]

Early stopping, best iteration is:
[32]	valid_0's ndcg@3: 0.679216

第 3/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[31]	valid_0's ndcg@3: 0.676497
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.681295
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.54s/it, 模型=LightGBM, Fold=3]
总体进度:   4%|▍         | 3/67 [00:28<10:17,  9.65s/it]

Early stopping, best iteration is:
[36]	valid_0's ndcg@3: 0.675176

第 4/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.680265
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[26]	valid_0's ndcg@3: 0.645532
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.66s/it, 模型=LightGBM, Fold=3]
总体进度:   6%|▌         | 4/67 [00:39<10:22,  9.88s/it]

Early stopping, best iteration is:
[67]	valid_0's ndcg@3: 0.662977

第 5/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[34]	valid_0's ndcg@3: 0.709764
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.679747
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.57s/it, 模型=LightGBM, Fold=3]
总体进度:   7%|▋         | 5/67 [00:48<10:08,  9.81s/it]

Early stopping, best iteration is:
[33]	valid_0's ndcg@3: 0.664659

第 6/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[25]	valid_0's ndcg@3: 0.68418
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[10]	valid_0's ndcg@3: 0.692976
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.32s/it, 模型=LightGBM, Fold=3]
总体进度:   9%|▉         | 6/67 [00:57<09:24,  9.26s/it]

Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.677635

第 7/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[23]	valid_0's ndcg@3: 0.672105
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.708357
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.44s/it, 模型=LightGBM, Fold=3]
总体进度:  10%|█         | 7/67 [01:06<09:09,  9.16s/it]

Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.656341

第 8/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[34]	valid_0's ndcg@3: 0.62057
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.650752
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.68s/it, 模型=LightGBM, Fold=3]
总体进度:  12%|█▏        | 8/67 [01:16<09:23,  9.54s/it]

Early stopping, best iteration is:
[36]	valid_0's ndcg@3: 0.671915

第 9/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.697076
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.677156
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.11s/it, 模型=LightGBM, Fold=3]
总体进度:  13%|█▎        | 9/67 [01:29<10:16, 10.62s/it]

Early stopping, best iteration is:
[86]	valid_0's ndcg@3: 0.694455

第 10/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.68998
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[31]	valid_0's ndcg@3: 0.679322
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.70s/it, 模型=LightGBM, Fold=3]
总体进度:  15%|█▍        | 10/67 [01:39<10:03, 10.59s/it]

Early stopping, best iteration is:
[29]	valid_0's ndcg@3: 0.702038

第 11/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[63]	valid_0's ndcg@3: 0.672824
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[25]	valid_0's ndcg@3: 0.708207


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:09<00:02,  2.52s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.08s/it, 模型=LightGBM, Fold=3]
总体进度:  16%|█▋        | 11/67 [01:52<10:30, 11.26s/it]

Early stopping, best iteration is:
[28]	valid_0's ndcg@3: 0.68685

第 12/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.679198
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.6513
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.28s/it, 模型=LightGBM, Fold=3]
总体进度:  18%|█▊        | 12/67 [02:00<09:25, 10.27s/it]

Early stopping, best iteration is:
[22]	valid_0's ndcg@3: 0.682958

第 13/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[60]	valid_0's ndcg@3: 0.675989


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:06<00:04,  2.24s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.669874
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.76s/it, 模型=LightGBM, Fold=3]
总体进度:  19%|█▉        | 13/67 [02:11<09:25, 10.47s/it]

Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.718042

第 14/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.664392
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[63]	valid_0's ndcg@3: 0.679199


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:08<00:02,  2.45s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.66s/it, 模型=LightGBM, Fold=3]
总体进度:  21%|██        | 14/67 [02:21<09:11, 10.40s/it]

Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.680319

第 15/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.716127


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.05it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.749359


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:05<00:01,  1.61s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.31s/it, 模型=LightGBM, Fold=3]
总体进度:  22%|██▏       | 15/67 [02:30<08:26,  9.74s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.706558

第 16/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[32]	valid_0's ndcg@3: 0.686254
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[53]	valid_0's ndcg@3: 0.683064
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:11<00:00,  1.97s/it, 模型=LightGBM, Fold=3]
总体进度:  24%|██▍       | 16/67 [02:42<08:53, 10.46s/it]

Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.679624

第 17/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[31]	valid_0's ndcg@3: 0.712308


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:05<00:03,  1.76s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.705051
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.64s/it, 模型=LightGBM, Fold=3]
总体进度:  25%|██▌       | 17/67 [02:52<08:38, 10.37s/it]

Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.687562

第 18/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.665905
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[58]	valid_0's ndcg@3: 0.658888


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:09<00:02,  2.68s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.10s/it, 模型=LightGBM, Fold=3]
总体进度:  27%|██▋       | 18/67 [03:05<09:06, 11.15s/it]

Early stopping, best iteration is:
[27]	valid_0's ndcg@3: 0.668381

第 19/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[25]	valid_0's ndcg@3: 0.691361


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:05<00:03,  1.63s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[38]	valid_0's ndcg@3: 0.67231


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:09<00:02,  2.61s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.11s/it, 模型=LightGBM, Fold=3]
总体进度:  28%|██▊       | 19/67 [03:18<09:22, 11.71s/it]

Early stopping, best iteration is:
[24]	valid_0's ndcg@3: 0.658985

第 20/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.669772


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:04<00:02,  1.35s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.629156


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.61s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.51s/it, 模型=LightGBM, Fold=3]
总体进度:  30%|██▉       | 20/67 [03:27<08:37, 11.01s/it]

Early stopping, best iteration is:
[17]	valid_0's ndcg@3: 0.638853

第 21/67 层




Training until validation scores don't improve for 15 rounds



训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:02,  1.04s/it, 模型=LightGBM, Fold=2]

Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.696116


[A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[31]	valid_0's ndcg@3: 0.717331




Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.57s/it, 模型=LightGBM, Fold=3]
总体进度:  31%|███▏      | 21/67 [03:37<08:09, 10.63s/it]

Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.682063

第 22/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.724798


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.00it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.692216
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.23s/it, 模型=LightGBM, Fold=3]
总体进度:  33%|███▎      | 22/67 [03:45<07:20,  9.78s/it]

Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.720437

第 23/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.707804
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.681168
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.38s/it, 模型=LightGBM, Fold=3]
总体进度:  34%|███▍      | 23/67 [03:53<06:54,  9.43s/it]

Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.707022

第 24/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[37]	valid_0's ndcg@3: 0.706046
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.668685
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.39s/it, 模型=LightGBM, Fold=3]
总体进度:  36%|███▌      | 24/67 [04:02<06:35,  9.21s/it]

Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.684293

第 25/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.669244
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.677615
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.42s/it, 模型=LightGBM, Fold=3]
总体进度:  37%|███▋      | 25/67 [04:11<06:21,  9.09s/it]

Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.697786

第 26/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[17]	valid_0's ndcg@3: 0.667042
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.681115
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.55s/it, 模型=LightGBM, Fold=3]
总体进度:  39%|███▉      | 26/67 [04:21<06:19,  9.27s/it]

Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.672104

第 27/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[10]	valid_0's ndcg@3: 0.692585
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.684919
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.40s/it, 模型=LightGBM, Fold=3]
总体进度:  40%|████      | 27/67 [04:29<06:04,  9.11s/it]

Early stopping, best iteration is:
[21]	valid_0's ndcg@3: 0.701432

第 28/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.685535
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[27]	valid_0's ndcg@3: 0.638982
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.60s/it, 模型=LightGBM, Fold=3]
总体进度:  42%|████▏     | 28/67 [04:39<06:04,  9.34s/it]

Early stopping, best iteration is:
[21]	valid_0's ndcg@3: 0.661865

第 29/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.675252
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[32]	valid_0's ndcg@3: 0.693749
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.53s/it, 模型=LightGBM, Fold=3]
总体进度:  43%|████▎     | 29/67 [04:49<05:56,  9.38s/it]

Early stopping, best iteration is:
[17]	valid_0's ndcg@3: 0.647444

第 30/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[64]	valid_0's ndcg@3: 0.653218


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:06<00:04,  2.33s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[30]	valid_0's ndcg@3: 0.641128
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:11<00:00,  1.93s/it, 模型=LightGBM, Fold=3]
总体进度:  45%|████▍     | 30/67 [05:01<06:15, 10.15s/it]

Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.669307

第 31/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[35]	valid_0's ndcg@3: 0.741954
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[21]	valid_0's ndcg@3: 0.700302


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:02,  2.07s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.63s/it, 模型=LightGBM, Fold=3]
总体进度:  46%|████▋     | 31/67 [05:11<06:04, 10.13s/it]

Early stopping, best iteration is:
[10]	valid_0's ndcg@3: 0.685686

第 32/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.700957
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[28]	valid_0's ndcg@3: 0.709491


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.69s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.39s/it, 模型=LightGBM, Fold=3]
总体进度:  48%|████▊     | 32/67 [05:19<05:39,  9.70s/it]

Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.701015

第 33/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.717429
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.742036
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:06<00:00,  1.15s/it, 模型=LightGBM, Fold=3]
总体进度:  49%|████▉     | 33/67 [05:27<05:04,  8.95s/it]

Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.70623

第 34/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[27]	valid_0's ndcg@3: 0.703391
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.698804
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.60s/it, 模型=LightGBM, Fold=3]
总体进度:  51%|█████     | 34/67 [05:36<05:04,  9.24s/it]

Early stopping, best iteration is:
[38]	valid_0's ndcg@3: 0.730256

第 35/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.702147
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[26]	valid_0's ndcg@3: 0.687589
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.22s/it, 模型=LightGBM, Fold=3]
总体进度:  52%|█████▏    | 35/67 [05:44<04:40,  8.76s/it]

Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.660414

第 36/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[24]	valid_0's ndcg@3: 0.692514
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[20]	valid_0's ndcg@3: 0.684106
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.56s/it, 模型=LightGBM, Fold=3]
总体进度:  54%|█████▎    | 36/67 [05:54<04:39,  9.03s/it]

Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.656316

第 37/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[39]	valid_0's ndcg@3: 0.683916
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[55]	valid_0's ndcg@3: 0.716602
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.05s/it, 模型=LightGBM, Fold=3]
总体进度:  55%|█████▌    | 37/67 [06:06<05:02, 10.10s/it]

Early stopping, best iteration is:
[34]	valid_0's ndcg@3: 0.686482

第 38/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[24]	valid_0's ndcg@3: 0.65904
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.673049
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.39s/it, 模型=LightGBM, Fold=3]
总体进度:  57%|█████▋    | 38/67 [06:15<04:40,  9.67s/it]

Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.680184

第 39/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[17]	valid_0's ndcg@3: 0.685965
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.653435


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.70s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.36s/it, 模型=LightGBM, Fold=3]
总体进度:  58%|█████▊    | 39/67 [06:23<04:20,  9.30s/it]

Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.652492

第 40/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.690569
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[23]	valid_0's ndcg@3: 0.665519
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.66s/it, 模型=LightGBM, Fold=3]
总体进度:  60%|█████▉    | 40/67 [06:34<04:19,  9.60s/it]

Early stopping, best iteration is:
[17]	valid_0's ndcg@3: 0.671773

第 41/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[28]	valid_0's ndcg@3: 0.754368
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.694638


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:01,  1.98s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.76s/it, 模型=LightGBM, Fold=3]
总体进度:  61%|██████    | 41/67 [06:45<04:19, 10.00s/it]

Early stopping, best iteration is:
[24]	valid_0's ndcg@3: 0.693689

第 42/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.708294


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.06it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.694419


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:05<00:01,  1.48s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.22s/it, 模型=LightGBM, Fold=3]
总体进度:  63%|██████▎   | 42/67 [06:52<03:52,  9.30s/it]

Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.74947

第 43/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.682159


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.07it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.647268
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.23s/it, 模型=LightGBM, Fold=3]
总体进度:  64%|██████▍   | 43/67 [07:00<03:31,  8.82s/it]

Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.675735

第 44/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[49]	valid_0's ndcg@3: 0.676817


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:06<00:04,  2.23s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.695415


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:08<00:02,  2.12s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.69s/it, 模型=LightGBM, Fold=3]
总体进度:  66%|██████▌   | 44/67 [07:11<03:34,  9.32s/it]

Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.647792

第 45/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[30]	valid_0's ndcg@3: 0.693258


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:05<00:03,  1.71s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[20]	valid_0's ndcg@3: 0.659094


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:08<00:02,  2.18s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.70s/it, 模型=LightGBM, Fold=3]
总体进度:  67%|██████▋   | 45/67 [07:21<03:33,  9.69s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.656423

第 46/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.639806


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:04<00:02,  1.34s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.687862
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.50s/it, 模型=LightGBM, Fold=3]
总体进度:  69%|██████▊   | 46/67 [07:30<03:21,  9.60s/it]

Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.667489

第 47/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[6]	valid_0's ndcg@3: 0.657905
Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.688401


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:06<00:01,  1.63s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.36s/it, 模型=LightGBM, Fold=3]
总体进度:  70%|███████   | 47/67 [07:39<03:05,  9.25s/it]

Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.719203

第 48/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.667052
Training until validation scores don't improve for 15 rounds



训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:07<00:02,  2.03s/it, 模型=LightGBM, Fold=3]

Early stopping, best iteration is:
[29]	valid_0's ndcg@3: 0.717579


[A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.49s/it, 模型=LightGBM, Fold=3]
总体进度:  72%|███████▏  | 48/67 [07:48<02:56,  9.27s/it]

Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.699042

第 49/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.698636


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:01,  1.06it/s, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.704381


训练分层(1000 rankers):  83%|████████▎ | 5/6 [00:05<00:01,  1.63s/it, 模型=LightGBM, Fold=3][A

Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.59s/it, 模型=LightGBM, Fold=3]
总体进度:  73%|███████▎  | 49/67 [07:58<02:49,  9.44s/it]

Early stopping, best iteration is:
[30]	valid_0's ndcg@3: 0.657141

第 50/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.677553


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:04<00:02,  1.41s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.695855
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.61s/it, 模型=LightGBM, Fold=3]
总体进度:  75%|███████▍  | 50/67 [08:08<02:43,  9.61s/it]

Early stopping, best iteration is:
[19]	valid_0's ndcg@3: 0.718475

第 51/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[4]	valid_0's ndcg@3: 0.63964
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[39]	valid_0's ndcg@3: 0.709073
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.69s/it, 模型=LightGBM, Fold=3]
总体进度:  76%|███████▌  | 51/67 [08:19<02:37,  9.87s/it]

Early stopping, best iteration is:
[20]	valid_0's ndcg@3: 0.690296

第 52/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[15]	valid_0's ndcg@3: 0.674746
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[34]	valid_0's ndcg@3: 0.659624
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.46s/it, 模型=LightGBM, Fold=3]
总体进度:  78%|███████▊  | 52/67 [08:28<02:24,  9.63s/it]

Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.624792

第 53/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[25]	valid_0's ndcg@3: 0.67649
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[26]	valid_0's ndcg@3: 0.68478
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.62s/it, 模型=LightGBM, Fold=3]
总体进度:  79%|███████▉  | 53/67 [08:38<02:16,  9.76s/it]

Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.676392

第 54/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[10]	valid_0's ndcg@3: 0.686097
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[44]	valid_0's ndcg@3: 0.693933
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.64s/it, 模型=LightGBM, Fold=3]
总体进度:  81%|████████  | 54/67 [08:48<02:08,  9.89s/it]

Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.67354

第 55/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.662403
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[44]	valid_0's ndcg@3: 0.681117
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.02s/it, 模型=LightGBM, Fold=3]
总体进度:  82%|████████▏ | 55/67 [09:00<02:07, 10.66s/it]

Early stopping, best iteration is:
[47]	valid_0's ndcg@3: 0.682724

第 56/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[36]	valid_0's ndcg@3: 0.633748
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.687953
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.49s/it, 模型=LightGBM, Fold=3]
总体进度:  84%|████████▎ | 56/67 [09:10<01:52, 10.24s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.676092

第 57/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.663748
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[55]	valid_0's ndcg@3: 0.697356
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.64s/it, 模型=LightGBM, Fold=3]
总体进度:  85%|████████▌ | 57/67 [09:20<01:42, 10.21s/it]

Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.710102

第 58/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[39]	valid_0's ndcg@3: 0.715277
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[10]	valid_0's ndcg@3: 0.674497
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.49s/it, 模型=LightGBM, Fold=3]
总体进度:  87%|████████▋ | 58/67 [09:29<01:29,  9.92s/it]

Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.670956

第 59/67 层




Training until validation scores don't improve for 15 rounds





Early stopping, best iteration is:
[13]	valid_0's ndcg@3: 0.69094


训练分层(1000 rankers):  67%|██████▋   | 4/6 [00:03<00:02,  1.13s/it, 模型=LightGBM, Fold=2][A

Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.685231
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:06<00:00,  1.15s/it, 模型=LightGBM, Fold=3]
总体进度:  88%|████████▊ | 59/67 [09:36<01:12,  9.12s/it]

Early stopping, best iteration is:
[2]	valid_0's ndcg@3: 0.693199

第 60/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.688151
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.712473
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:06<00:00,  1.16s/it, 模型=LightGBM, Fold=3]
总体进度:  90%|████████▉ | 60/67 [09:44<00:59,  8.56s/it]

Early stopping, best iteration is:
[5]	valid_0's ndcg@3: 0.688461

第 61/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.665196
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[68]	valid_0's ndcg@3: 0.698169
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:11<00:00,  1.90s/it, 模型=LightGBM, Fold=3]
总体进度:  91%|█████████ | 61/67 [09:55<00:56,  9.49s/it]

Early stopping, best iteration is:
[16]	valid_0's ndcg@3: 0.693491

第 62/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.684825
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.670761
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.18s/it, 模型=LightGBM, Fold=3]
总体进度:  93%|█████████▎| 62/67 [10:03<00:44,  8.88s/it]

Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.704188

第 63/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.672404
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[10]	valid_0's ndcg@3: 0.683974
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.25s/it, 模型=LightGBM, Fold=3]
总体进度:  94%|█████████▍| 63/67 [10:10<00:34,  8.56s/it]

Early stopping, best iteration is:
[12]	valid_0's ndcg@3: 0.704675

第 64/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[24]	valid_0's ndcg@3: 0.657395
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.674754
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.29s/it, 模型=LightGBM, Fold=3]
总体进度:  96%|█████████▌| 64/67 [10:19<00:25,  8.43s/it]

Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.664709

第 65/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[47]	valid_0's ndcg@3: 0.65945
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.663916
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:09<00:00,  1.66s/it, 模型=LightGBM, Fold=3]
总体进度:  97%|█████████▋| 65/67 [10:29<00:17,  8.99s/it]

Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.611878

第 66/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[57]	valid_0's ndcg@3: 0.649631
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[25]	valid_0's ndcg@3: 0.648402
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:12<00:00,  2.14s/it, 模型=LightGBM, Fold=3]
总体进度:  99%|█████████▊| 66/67 [10:42<00:10, 10.23s/it]

Early stopping, best iteration is:
[49]	valid_0's ndcg@3: 0.665219

第 67/67 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[23]	valid_0's ndcg@3: 0.636254
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[3]	valid_0's ndcg@3: 0.680435
Training until validation scores don't improve for 15 rounds


训练分层(586 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.31s/it, 模型=LightGBM, Fold=3]
总体进度: 100%|██████████| 67/67 [10:50<00:00,  9.71s/it]

Early stopping, best iteration is:
[21]	valid_0's ndcg@3: 0.638288

🔮 开始预测...



集成预测: 100%|██████████| 2/2 [05:11<00:00, 155.58s/it, 模型=LightGBM]
                                                                 

✅ 验证排名...
 完成! 使用 12 个特征, 3 折交叉验证
🚀 开始航班推荐排序流程
加载数据...


清理数据: 100%|██████████| 2/2 [00:02<00:00,  1.03s/it]


开始特征选择: 284 -> 150


                                                                

特征选择完成: 15 个特征
数据加载完成: 训练集(1120257, 18), 测试集(358948, 17)
📊 分层训练: 6 层, 每层最多 1000 个ranker


总体进度:   0%|          | 0/6 [00:00<?, ?it/s]


第 1/6 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[17]	valid_0's ndcg@3: 0.70249
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[41]	valid_0's ndcg@3: 0.727786
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:08<00:00,  1.34s/it, 模型=LightGBM, Fold=3]
总体进度:  17%|█▋        | 1/6 [00:08<00:40,  8.10s/it]

Early stopping, best iteration is:
[20]	valid_0's ndcg@3: 0.670904

第 2/6 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[45]	valid_0's ndcg@3: 0.722652
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[21]	valid_0's ndcg@3: 0.711175
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:07<00:00,  1.27s/it, 模型=LightGBM, Fold=3]
总体进度:  33%|███▎      | 2/6 [00:15<00:31,  7.87s/it]

Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.698162

第 3/6 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[7]	valid_0's ndcg@3: 0.68609
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[9]	valid_0's ndcg@3: 0.705158
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:06<00:00,  1.13s/it, 模型=LightGBM, Fold=3]
总体进度:  50%|█████     | 3/6 [00:22<00:22,  7.40s/it]

Early stopping, best iteration is:
[18]	valid_0's ndcg@3: 0.69729

第 4/6 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[11]	valid_0's ndcg@3: 0.710869
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[29]	valid_0's ndcg@3: 0.698351
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.80s/it, 模型=LightGBM, Fold=3]
总体进度:  67%|██████▋   | 4/6 [00:33<00:17,  8.78s/it]

Early stopping, best iteration is:
[71]	valid_0's ndcg@3: 0.71045

第 5/6 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[59]	valid_0's ndcg@3: 0.73026
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[37]	valid_0's ndcg@3: 0.727658
Training until validation scores don't improve for 15 rounds


训练分层(1000 rankers): 100%|██████████| 6/6 [00:10<00:00,  1.80s/it, 模型=LightGBM, Fold=3]
总体进度:  83%|████████▎ | 5/6 [00:44<00:09,  9.54s/it]

Early stopping, best iteration is:
[26]	valid_0's ndcg@3: 0.713773

第 6/6 层




Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[37]	valid_0's ndcg@3: 0.698988
Training until validation scores don't improve for 15 rounds




Early stopping, best iteration is:
[8]	valid_0's ndcg@3: 0.654112
Training until validation scores don't improve for 15 rounds


训练分层(575 rankers): 100%|██████████| 6/6 [00:06<00:00,  1.11s/it, 模型=LightGBM, Fold=3]
总体进度: 100%|██████████| 6/6 [00:51<00:00,  8.53s/it]


Early stopping, best iteration is:
[14]	valid_0's ndcg@3: 0.669835

🔮 开始预测...


集成预测: 100%|██████████| 2/2 [00:03<00:00,  1.70s/it, 模型=LightGBM]
                                                             

✅ 验证排名...
 完成! 使用 15 个特征, 3 折交叉验证


完成预测，总计 6897776 行结果
