# 航班排名竞赛 - 业务逻辑理解与特征分析
## Flight Ranking Competition - Business Logic Understanding & Feature Analysis

#### 1. **智能缺失值处理**
- 自动移除缺失值超过80%的特征
- 按`ranker_id`分组分析，只处理有足够数据的搜索会话
- 对保留特征进行合理填充策略

#### 2. **分析维度**
- 商务vs休闲旅行者深度对比
- 价格敏感性和支付意愿分析
- 时间偏好模式（出发时间、预订时间）
- 航线复杂性偏好（直飞vs中转）
- 航空公司和机场偏好
- 用户聚类和忠诚度分析
- 企业差旅政策影响
- 舱位等级偏好
- 提前预订行为分析
- 往返vs单程偏好
- 用户转化率分析
- 拓展...

#### 3. **功能**
- **用户画像系统**：为每个用户创建详细的偏好档案
- **智能聚类**：将用户分为4个主要群体
- **交互式可视化**：使用Plotly创建动态图表
- **商业洞察**：生成可执行的商业建议

### 📊 系统架构

```
EnhancedFlightDataAnalyzer
├── 数据加载与预处理
├── 分组数据创建 (按ranker_id)
├── 用户画像生成
├── 多维度分析模块
│   ├── 商务vs休闲分析
│   ├── 价格敏感性分析
│   ├── 时间偏好分析
│   ├── 航线偏好分析
│   └── 用户聚类分析
└── 综合报告生成
```

### 🚀 使用方法

```python
# 初始化分析器（每个数据集单独创建一个分析器）
analyzer = EnhancedFlightDataAnalyzer(
    data_path='data/train.parquet',
    output_dir='flight_analysis'
)

# 运行完整分析
analyzer.run_analysis()

# 或者逐步运行各个分析模块
analyzer.load_and_preprocess_data()
analyzer.analyze_overall_patterns()
analyzer.analyze_user_behavior()
analyzer.generate_comprehensive_report()
```

### 🎯 关键洞察能力

1. **用户分群**：自动识别高价值商务用户和价格敏感休闲用户
2. **偏好预测**：基于历史行为预测用户选择倾向
3. **个性化推荐**：为不同用户群体提供定制化的航班推荐策略
4. **转化优化**：识别影响用户选择的关键因素

### 📈 输出报告

系统会生成：
- 多维度可视化图表
- 用户聚类分析结果
- 商业洞察和建议
- 详细的统计报告文件

这个系统特别适合：
- 航班推荐系统优化
- 用户行为分析
- 个性化营销策略制定
- 产品功能改进决策

In [5]:
#!/usr/bin/env python3

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
import os
import psutil
import time
from pathlib import Path
from datetime import datetime
import json
from typing import Dict, List, Optional, Tuple, Any, Union
from functools import wraps
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

warnings.filterwarnings('ignore')

def memory_monitor(func):
    """Decorator that reports wall-clock time and process memory for each call.

    After the wrapped callable returns, one line is printed containing the
    elapsed seconds, the process RSS in MB, and the RSS change across the
    call. The wrapped callable's return value is passed through unchanged.
    Nothing is printed when the callable raises.
    """
    bytes_per_mb = 1024**2

    @wraps(func)
    def wrapper(*args, **kwargs):
        proc = psutil.Process()
        rss_before_mb = proc.memory_info().rss / bytes_per_mb

        started = time.time()
        outcome = func(*args, **kwargs)
        elapsed = time.time() - started

        rss_after_mb = proc.memory_info().rss / bytes_per_mb
        delta_mb = rss_after_mb - rss_before_mb
        print(f"[{func.__name__}] 耗时: {elapsed:.2f}s | 内存: {rss_after_mb:.1f}MB ({delta_mb:+.1f}MB)")

        return outcome
    return wrapper

class EnhancedFlightDataAnalyzer:
    """航班数据分析器 - 支持单一数据集分析"""
    
    def __init__(self, data_path: str, output_dir: str = "flight_analysis", max_rows: int = 1000000):
        """
        初始化分析器
        
        Args:
            data_path: 数据文件路径
            output_dir: 输出目录
            max_rows: 最大处理行数（用于内存控制）
        """
        if not data_path or not os.path.exists(data_path):
            raise ValueError(f"数据文件路径无效: {data_path}")
            
        self.data_path = data_path
        self.output_dir = output_dir
        self.max_rows = max_rows
        self.chunk_size = 100000
        
        # 初始化目录
        Path(output_dir).mkdir(parents=True, exist_ok=True)
        for subdir in ['reports', 'plots', 'processed_data']:
            Path(f"{output_dir}/{subdir}").mkdir(exist_ok=True)
        
        # 数据存储
        self.data = None
        self.grouped_data = None
        self.analysis_results = {}
        self.data_info = {}
        
        print(f"航班数据分析器已初始化")
        print(f"- 数据文件: {data_path}")
        print(f"- 输出目录: {output_dir}")
        print(f"- 最大处理行数: {max_rows:,}")
    
    @memory_monitor
    def load_and_preprocess_data(self):
        """Load the data file, then run structure analysis, preprocessing and grouping.

        Files larger than 500 MB on disk go through the sampled loader; smaller
        files are loaded directly.

        Raises:
            ValueError: If the chosen loader returns ``None`` or an empty frame.
        """
        print("开始加载数据...")

        size_mb = os.path.getsize(self.data_path) / (1024**2)
        print(f"文件大小: {size_mb:.1f} MB")

        # Choose the loader from the on-disk size.
        needs_sampling = size_mb > 500
        if needs_sampling:
            print("文件较大，进行采样加载...")
            loader = self._load_large_file_sampled
        else:
            print("直接加载文件...")
            loader = self._load_file_direct
        self.data = loader()

        if self.data is None or len(self.data) == 0:
            raise ValueError("数据加载失败，请检查文件格式和路径")

        row_count = len(self.data)
        column_count = len(self.data.columns)
        print(f"数据加载完成 | 总行数: {row_count:,} | 列数: {column_count}")
        print(f"数据列: {list(self.data.columns)}")

        # Post-load pipeline: structure summary, feature preparation, and
        # per-ranker_id grouping (skipped inside the step if the id is absent).
        self._analyze_data_structure()
        self._preprocess_data()
        self._create_grouped_data()
    
    def _load_large_file_sampled(self) -> pd.DataFrame:
        """采样加载大文件"""
        try:
            # 先读取小样本了解数据结构
            sample_data = pd.read_parquet(self.data_path, nrows=10000)
            print(f"样本数据结构: {sample_data.shape}")
            print(f"列名: {list(sample_data.columns)}")
            
            # 计算总行数
            try:
                import pyarrow.parquet as pq
                parquet_file = pq.ParquetFile(self.data_path)
                total_rows = parquet_file.metadata.num_rows
                print(f"文件总行数: {total_rows:,}")
            except:
                total_rows = None
            
            # 根据内存限制决定采样策略
            if total_rows and total_rows > self.max_rows:
                # 随机采样
                sample_fraction = min(self.max_rows / total_rows, 1.0)
                print(f"采样比例: {sample_fraction:.2%}")
     
        except Exception as e:
            print(f"读取文件失败: {e}")
            return None
                
    def _load_large_file_sampled(self) -> pd.DataFrame:
        """采样加载大文件"""
        try:
            # 先读取小样本了解数据结构
            print("读取数据样本以了解结构...")
            
            # 使用pyarrow尝试获取文件信息
            try:
                import pyarrow.parquet as pq
                parquet_file = pq.ParquetFile(self.data_path)
                total_rows = parquet_file.metadata.num_rows
                print(f"文件总行数: {total_rows:,}")
                
                # 如果行数不大，直接读取
                if total_rows <= self.max_rows:
                    table = parquet_file.read()
                    return table.to_pandas()
                
                # 否则进行采样
                return self._sample_large_parquet(parquet_file, total_rows)
                
            except ImportError:
                print("pyarrow未安装，使用pandas直接读取...")
                return self._load_file_direct()
            except Exception as e:
                print(f"pyarrow读取失败: {e}")
                return self._load_file_direct()
                
        except Exception as e:
            print(f"采样加载失败: {e}")
            return self._load_file_direct()
    
    def _sample_large_parquet(self, parquet_file, total_rows: int) -> pd.DataFrame:
        """对大型parquet文件进行采样"""
        print(f"对大文件进行采样 ({total_rows:,} -> {self.max_rows:,})")
        
        try:
            # 计算采样比例
            sample_ratio = self.max_rows / total_rows
            batch_size = min(self.chunk_size, total_rows // 10)  # 分成10批处理
            
            sampled_chunks = []
            processed_rows = 0
            
            for batch in parquet_file.iter_batches(batch_size=batch_size):
                batch_df = batch.to_pandas()
                processed_rows += len(batch_df)
                
                # 对每个批次进行采样
                if len(batch_df) > 0:
                    sample_size = max(1, int(len(batch_df) * sample_ratio))
                    sampled_batch = batch_df.sample(n=sample_size, random_state=42)
                    sampled_chunks.append(sampled_batch)
                
                # 显示进度
                if processed_rows % (batch_size * 5) == 0:
                    print(f"已处理: {processed_rows:,}/{total_rows:,} ({processed_rows/total_rows:.1%})")
                
                # 如果已经采样到足够数据，停止
                current_sampled = sum(len(chunk) for chunk in sampled_chunks)
                if current_sampled >= self.max_rows:
                    break
            
            if sampled_chunks:
                result = pd.concat(sampled_chunks, ignore_index=True)
                # 确保不超过最大行数
                if len(result) > self.max_rows:
                    result = result.sample(n=self.max_rows, random_state=42)
                print(f"采样完成，最终数据量: {len(result):,}")
                return result
            else:
                print("采样失败，返回空DataFrame")
                return pd.DataFrame()
                
        except Exception as e:
            print(f"采样过程出错: {e}")
            # 尝试直接读取一部分数据
            try:
                table = parquet_file.read(use_threads=False)
                data = table.to_pandas()
                if len(data) > self.max_rows:
                    data = data.sample(n=self.max_rows, random_state=42)
                return data
            except:
                return pd.DataFrame()
                
        except Exception as e:
            print(f"采样加载失败: {e}")
            return self._load_file_direct()
    
    def _load_file_direct(self) -> pd.DataFrame:
        """直接加载文件"""
        try:
            # 先尝试读取全部数据
            data = pd.read_parquet(self.data_path)
            
            # 如果数据量超过限制，进行采样
            if len(data) > self.max_rows:
                print(f"数据量 ({len(data):,}) 超过限制 ({self.max_rows:,})，进行随机采样...")
                data = data.sample(n=self.max_rows, random_state=42)
                print(f"采样后数据量: {len(data):,}")
            
            return data
        except Exception as e:
            print(f"直接加载失败: {e}")
    def _load_parquet_with_pyarrow(self) -> pd.DataFrame:
        """使用pyarrow加载parquet文件"""
        try:
            import pyarrow.parquet as pq
            
            # 读取parquet文件
            parquet_file = pq.ParquetFile(self.data_path)
            total_rows = parquet_file.metadata.num_rows
            
            print(f"使用pyarrow读取，总行数: {total_rows:,}")
            
            # 如果行数超过限制，按批次读取并采样
            if total_rows > self.max_rows:
                print(f"分批读取并采样到 {self.max_rows:,} 行...")
                
                # 计算采样比例
                sample_ratio = self.max_rows / total_rows
                batch_size = min(self.chunk_size, self.max_rows)
                
                sampled_data = []
                for batch in parquet_file.iter_batches(batch_size=batch_size):
                    batch_df = batch.to_pandas()
                    
                    # 随机采样
                    if len(batch_df) > 0:
                        sample_size = max(1, int(len(batch_df) * sample_ratio))
                        sampled_batch = batch_df.sample(n=sample_size, random_state=42)
                        sampled_data.append(sampled_batch)
                    
                    # 如果已经采样到足够数据，停止
                    if sum(len(df) for df in sampled_data) >= self.max_rows:
                        break
                
                if sampled_data:
                    result = pd.concat(sampled_data, ignore_index=True)
                    # 确保不超过最大行数
                    if len(result) > self.max_rows:
                        result = result.sample(n=self.max_rows, random_state=42)
                    return result
                else:
                    return pd.DataFrame()
            else:
                # 直接读取全部数据
                table = parquet_file.read()
                return table.to_pandas()
                
        except ImportError:
            print("pyarrow未安装，尝试其他方法...")
            return self._load_parquet_fallback()
        except Exception as e:
            print(f"pyarrow加载失败: {e}")
            return self._load_parquet_fallback()
    
    def _load_parquet_fallback(self) -> pd.DataFrame:
        """parquet文件最后的备用方法"""
        try:
            # 尝试读取一小部分数据
            print("尝试读取数据样本...")
            
            # 先读取很小的样本来了解数据结构
            import tempfile
            import os
            
            # 创建临时文件进行测试读取
            data = pd.read_parquet(self.data_path)
            
            # 如果成功读取但数据量太大，进行采样
            if len(data) > self.max_rows:
                print(f"数据量过大，从 {len(data):,} 行采样到 {self.max_rows:,} 行")
                data = data.sample(n=self.max_rows, random_state=42)
            
            return data
            
        except Exception as e:
            print(f"所有加载方法都失败了: {e}")
            print("请检查文件格式和完整性")
            return pd.DataFrame()
    
    def _analyze_data_structure(self):
        """分析数据结构"""
        print("分析数据结构...")
        
        self.data_info = {
            'shape': self.data.shape,
            'columns': list(self.data.columns),
            'dtypes': self.data.dtypes.to_dict(),
            'missing_values': self.data.isnull().sum().to_dict(),
            'memory_usage_mb': self.data.memory_usage(deep=True).sum() / 1024**2
        }
        
        print(f"数据形状: {self.data_info['shape']}")
        print(f"内存使用: {self.data_info['memory_usage_mb']:.1f} MB")
        print(f"列数: {len(self.data_info['columns'])}")
        
        # 检查关键列
        key_columns = ['ranker_id', 'user_id', 'departure_datetime', 'booking_datetime', 'price', 'num_stops']
        available_columns = [col for col in key_columns if col in self.data.columns]
        missing_columns = [col for col in key_columns if col not in self.data.columns]
        
        print(f"可用关键列: {available_columns}")
        if missing_columns:
            print(f"缺失关键列: {missing_columns}")
    
    def _preprocess_data(self):
        """预处理数据"""
        print("预处理数据...")
        
        # 数据类型优化
        for col in self.data.columns:
            if self.data[col].dtype == 'object':
                try:
                    # 尝试转换为数值
                    self.data[col] = pd.to_numeric(self.data[col], errors='ignore')
                except:
                    pass
        
        # 处理时间列
        datetime_columns = ['departure_datetime', 'booking_datetime', 'arrival_datetime']
        for col in datetime_columns:
            if col in self.data.columns:
                try:
                    self.data[col] = pd.to_datetime(self.data[col], errors='coerce')
                    print(f"处理时间列: {col}")
                except:
                    print(f"时间列处理失败: {col}")
        
        # 特征工程
        if 'departure_datetime' in self.data.columns:
            self.data['departure_hour'] = self.data['departure_datetime'].dt.hour
            self.data['departure_day'] = self.data['departure_datetime'].dt.day_name()
            self.data['departure_month'] = self.data['departure_datetime'].dt.month
            self.data['is_weekend'] = self.data['departure_datetime'].dt.weekday >= 5
        
        if 'booking_datetime' in self.data.columns and 'departure_datetime' in self.data.columns:
            self.data['advance_booking_days'] = (
                self.data['departure_datetime'] - self.data['booking_datetime']
            ).dt.days
        
        # 价格相关特征
        if 'price' in self.data.columns:
            self.data['price_level'] = pd.cut(
                self.data['price'], 
                bins=5, 
                labels=['极低价', '低价', '中价', '高价', '极高价']
            )
        
        # 航线复杂性
        if 'num_stops' in self.data.columns:
            self.data['flight_type'] = self.data['num_stops'].apply(
                lambda x: '直飞' if x == 0 else ('一次中转' if x == 1 else '多次中转')
            )
        
        print("数据预处理完成")
    
    def _create_grouped_data(self):
        """创建用户分组数据"""
        # 寻找用户标识列
        user_id_col = 'ranker_id'
        
        if not user_id_col in self.data.columns:
            print("警告: 未找到用户标识列，将进行整体分析")
            self.grouped_data = None
            return
        
        print(f"使用用户标识列: {user_id_col}")
        print("创建用户分组数据...")
        
        # 按用户分组统计
        user_stats = []
        user_groups = self.data.groupby(user_id_col)
        
        for user_id, group in user_groups:
            if len(group) >= 3:  # 只处理有足够数据的用户
                stats = self._calculate_user_stats(group)
                stats[user_id_col] = user_id
                user_stats.append(stats)
        
        if user_stats:
            self.grouped_data = pd.DataFrame(user_stats)
            print(f"创建用户分组数据完成 | 用户数: {len(self.grouped_data):,}")
        else:
            print("警告: 没有足够的用户数据进行分组分析")
            self.grouped_data = None
    
    def _calculate_user_stats(self, group: pd.DataFrame) -> Dict:
        """计算用户统计信息"""
        stats = {
            'total_searches': len(group),
            'avg_price': group['price'].mean() if 'price' in group.columns and not group['price'].isna().all() else 0,
            'price_std': group['price'].std() if 'price' in group.columns and not group['price'].isna().all() else 0,
            'price_sensitivity': self._calculate_price_sensitivity(group),
            'preferred_departure_hour': group['departure_hour'].mode().iloc[0] if 'departure_hour' in group.columns and len(group['departure_hour'].dropna()) > 0 else 12,
            'weekend_ratio': group['is_weekend'].mean() if 'is_weekend' in group.columns else 0,
            'avg_advance_booking': group['advance_booking_days'].mean() if 'advance_booking_days' in group.columns and not group['advance_booking_days'].isna().all() else 0,
            'direct_flight_ratio': (group['num_stops'] == 0).mean() if 'num_stops' in group.columns else 0,
            'business_indicator': self._identify_business_traveler(group)
        }
        
        return stats
    
    def _calculate_price_sensitivity(self, group: pd.DataFrame) -> float:
        """计算价格敏感性"""
        if 'price' not in group.columns or len(group) < 3 or group['price'].isna().all():
            return 0.5
        
        price_data = group['price'].dropna()
        if len(price_data) < 3:
            return 0.5
        
        price_ranges = price_data.quantile([0.25, 0.75])
        low_price_selections = len(price_data[price_data <= price_ranges[0.25]])
        high_price_selections = len(price_data[price_data >= price_ranges[0.75]])
        
        if low_price_selections + high_price_selections == 0:
            return 0.5
        
        return low_price_selections / (low_price_selections + high_price_selections)
    
    def _identify_business_traveler(self, group: pd.DataFrame) -> float:
        """识别商务旅行者"""
        business_indicators = []
        
        # 工作日出行比例
        if 'is_weekend' in group.columns:
            weekday_ratio = 1 - group['is_weekend'].mean()
            business_indicators.append(weekday_ratio)
        
        # 早班机偏好
        if 'departure_hour' in group.columns:
            early_flight_ratio = (group['departure_hour'] <= 8).mean()
            business_indicators.append(early_flight_ratio)
        
        # 短期预订
        if 'advance_booking_days' in group.columns:
            short_booking_data = group['advance_booking_days'].dropna()
            if len(short_booking_data) > 0:
                short_booking_ratio = (short_booking_data <= 7).mean()
                business_indicators.append(short_booking_ratio)
        
        # 直飞偏好
        if 'num_stops' in group.columns:
            direct_ratio = (group['num_stops'] == 0).mean()
            business_indicators.append(direct_ratio)
        
        return np.mean(business_indicators) if business_indicators else 0.5
    
    @memory_monitor
    def analyze_overall_patterns(self):
        """Compute dataset-wide summaries and store them under ``overall_patterns``.

        Sections are produced only for columns that exist: price statistics,
        departure-hour and weekend counts, stop-count distribution, and
        advance-booking statistics. The plots are written before the result is
        stored in ``self.analysis_results``.
        """
        print("分析整体模式...")

        frame = self.data
        available = frame.columns
        patterns = {}

        # Price statistics over non-NaN prices.
        if 'price' in available:
            prices = frame['price'].dropna()
            patterns['price_analysis'] = {
                'mean': prices.mean(),
                'median': prices.median(),
                'std': prices.std(),
                'min': prices.min(),
                'max': prices.max(),
                'q25': prices.quantile(0.25),
                'q75': prices.quantile(0.75)
            }

        # Departure-time patterns: five most frequent hours, weekend counts.
        if 'departure_hour' in available:
            if 'is_weekend' in available:
                weekend_counts = frame['is_weekend'].value_counts().to_dict()
            else:
                weekend_counts = {}
            patterns['time_patterns'] = {
                'peak_hours': frame['departure_hour'].value_counts().head(5).to_dict(),
                'weekend_vs_weekday': weekend_counts
            }

        # Stop-count distribution.
        if 'num_stops' in available:
            stops = frame['num_stops']
            patterns['flight_type_analysis'] = {
                'direct_flights_ratio': (stops == 0).mean(),
                'stops_distribution': stops.value_counts().to_dict()
            }

        # Booking lead-time patterns over known lead times.
        if 'advance_booking_days' in available:
            lead_days = frame['advance_booking_days'].dropna()
            patterns['booking_patterns'] = {
                'avg_advance_days': lead_days.mean(),
                'median_advance_days': lead_days.median(),
                'last_minute_ratio': (lead_days <= 1).mean(),
                'planned_booking_ratio': (lead_days >= 14).mean()
            }

        # Render the charts, then publish the numbers.
        self._create_overall_pattern_plots(patterns)

        self.analysis_results['overall_patterns'] = patterns
        print("整体模式分析完成")
    
    def _create_overall_pattern_plots(self, patterns: Dict):
        """Write a 2x2 Plotly overview figure to ``plots/overall_patterns.html``.

        The panels are drawn straight from ``self.data`` (price histogram,
        departure-hour bars, stop-count bars, advance-booking histogram); a
        panel is left empty when its column is missing.

        Args:
            patterns: Accepted for interface compatibility; not read here.
        """
        frame = self.data
        plain_cell = {"secondary_y": False}
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('价格分布', '出发时间分布', '航班类型分布', '预订提前天数分布'),
            specs=[[dict(plain_cell), dict(plain_cell)],
                   [dict(plain_cell), dict(plain_cell)]]
        )

        # Top-left: price histogram.
        if 'price' in frame.columns:
            prices = frame['price'].dropna()
            fig.add_trace(go.Histogram(x=prices, name='价格分布', nbinsx=50), row=1, col=1)

        # Top-right: departures per hour, in hour order.
        if 'departure_hour' in frame.columns:
            per_hour = frame['departure_hour'].value_counts().sort_index()
            fig.add_trace(go.Bar(x=per_hour.index, y=per_hour.values, name='出发时间'), row=1, col=2)

        # Bottom-left: rows per stop count, in stop-count order.
        if 'num_stops' in frame.columns:
            per_stops = frame['num_stops'].value_counts().sort_index()
            fig.add_trace(go.Bar(x=per_stops.index, y=per_stops.values, name='中转次数'), row=2, col=1)

        # Bottom-right: advance-booking histogram.
        if 'advance_booking_days' in frame.columns:
            lead_days = frame['advance_booking_days'].dropna()
            fig.add_trace(go.Histogram(x=lead_days, name='预订提前天数', nbinsx=50), row=2, col=2)

        fig.update_layout(height=800, title_text="整体模式分析")
        fig.write_html(f"{self.output_dir}/plots/overall_patterns.html")
    
    @memory_monitor
    def analyze_user_behavior(self):
        """Compare "business" and "leisure" groups split at the median indicator.

        Groups whose ``business_indicator`` is at or above the median count as
        business users, the rest as leisure users. Per-cohort means are stored
        under ``self.analysis_results['user_behavior']`` after the comparison
        plots are written. Does nothing beyond printing when
        ``self.grouped_data`` is ``None``.
        """
        if self.grouped_data is None:
            print("跳过用户行为分析 - 无用户分组数据")
            return

        print("分析用户行为...")

        grouped = self.grouped_data

        def _profile(users):
            # Cohort size plus the mean of each per-group statistic.
            return {
                'count': len(users),
                'avg_price': users['avg_price'].mean(),
                'price_sensitivity': users['price_sensitivity'].mean(),
                'advance_booking': users['avg_advance_booking'].mean(),
                'direct_flight_ratio': users['direct_flight_ratio'].mean(),
                'weekend_ratio': users['weekend_ratio'].mean()
            }

        if 'business_indicator' in grouped.columns:
            cutoff = grouped['business_indicator'].median()
            is_business = grouped['business_indicator'] >= cutoff
            business_users = grouped[is_business]
            leisure_users = grouped[grouped['business_indicator'] < cutoff]

            behavior_analysis = {
                'business_users': _profile(business_users),
                'leisure_users': _profile(leisure_users)
            }

            # Render the charts, then publish the numbers.
            self._create_user_behavior_plots(business_users, leisure_users)

            self.analysis_results['user_behavior'] = behavior_analysis

        print("用户行为分析完成")
    
    def _create_user_behavior_plots(self, business_users: pd.DataFrame, leisure_users: pd.DataFrame):
        """Write a 2x2 cohort-comparison figure to ``plots/user_behavior_analysis.html``.

        Panels: overlaid price-sensitivity histograms, mean-price bars,
        overlaid advance-booking histograms, and mean direct-flight-ratio bars.

        Args:
            business_users: Per-group statistics of the business cohort.
            leisure_users: Per-group statistics of the leisure cohort.
        """
        plain_cell = {"secondary_y": False}
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=('价格敏感性对比', '平均价格对比', '提前预订天数对比', '直飞偏好对比'),
            specs=[[dict(plain_cell), dict(plain_cell)],
                   [dict(plain_cell), dict(plain_cell)]]
        )

        # Cohorts in drawing order, paired with their legend labels.
        cohorts = ((business_users, '商务用户'), (leisure_users, '休闲用户'))
        categories = [label for _, label in cohorts]

        def _add_histograms(column, row, col):
            # One semi-transparent histogram per cohort in the same panel.
            for users, label in cohorts:
                fig.add_trace(go.Histogram(x=users[column], name=label,
                                           opacity=0.7, nbinsx=20), row=row, col=col)

        def _add_mean_bars(column, name, row, col):
            # One bar per cohort showing the cohort mean of the column.
            means = [users[column].mean() for users, _ in cohorts]
            fig.add_trace(go.Bar(x=categories, y=means, name=name), row=row, col=col)

        _add_histograms('price_sensitivity', 1, 1)
        _add_mean_bars('avg_price', '平均价格', 1, 2)
        _add_histograms('avg_advance_booking', 2, 1)
        _add_mean_bars('direct_flight_ratio', '直飞比例', 2, 2)

        fig.update_layout(height=800, title_text="用户行为对比分析")
        fig.write_html(f"{self.output_dir}/plots/user_behavior_analysis.html")
    
    @memory_monitor
    def generate_comprehensive_report(self):
        """Assemble a Markdown report and write it to ``reports/comprehensive_analysis.md``.

        The report starts with a data overview taken from ``self.data`` and
        ``self.data_info``. Sections for overall patterns and user behavior
        are appended only when the corresponding keys exist in
        ``self.analysis_results``. A fixed recommendations section and a
        timestamped footer follow. The file is written as UTF-8 under
        ``self.output_dir``.
        """
        print("生成综合分析报告...")
        
        # Header: data overview and structure. Only the first ten column
        # names are listed.
        report = f"""# 航班数据分析报告

## 数据概况
- 数据文件: {self.data_path}
- 记录数: {len(self.data):,}
- 内存使用: {self.data_info['memory_usage_mb']:.1f} MB
- 分析时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## 数据结构
- 数据维度: {self.data_info['shape']}
- 列数: {len(self.data_info['columns'])}
- 主要列: {', '.join(self.data_info['columns'][:10])}

## 主要发现

### 1. 整体模式分析
"""
        
        # Overall-pattern subsections; each is optional because the analysis
        # only produces a section when its source column exists.
        if 'overall_patterns' in self.analysis_results:
            patterns = self.analysis_results['overall_patterns']
            
            if 'price_analysis' in patterns:
                price_info = patterns['price_analysis']
                report += f"""
**价格分析:**
- 平均价格: {price_info['mean']:.2f}
- 中位价格: {price_info['median']:.2f}
- 价格标准差: {price_info['std']:.2f}
- 价格范围: {price_info['min']:.2f} - {price_info['max']:.2f}
"""
            
            if 'flight_type_analysis' in patterns:
                flight_info = patterns['flight_type_analysis']
                report += f"""
**航班类型分析:**
- 直飞比例: {flight_info['direct_flights_ratio']:.2%}
- 中转分布: {flight_info['stops_distribution']}
"""
            
            if 'booking_patterns' in patterns:
                booking_info = patterns['booking_patterns']
                report += f"""
**预订模式分析:**
- 平均提前预订: {booking_info['avg_advance_days']:.1f} 天
- 临时预订比例: {booking_info['last_minute_ratio']:.2%}
- 计划预订比例: {booking_info['planned_booking_ratio']:.2%}
"""
        
        # User-behavior section; present only when the cohort analysis ran.
        if 'user_behavior' in self.analysis_results:
            behavior = self.analysis_results['user_behavior']
            report += f"""
### 2. 用户行为分析

**商务用户 ({behavior['business_users']['count']}人):**
- 平均价格: {behavior['business_users']['avg_price']:.2f}
- 价格敏感性: {behavior['business_users']['price_sensitivity']:.2f}
- 提前预订: {behavior['business_users']['advance_booking']:.1f} 天
- 直飞偏好: {behavior['business_users']['direct_flight_ratio']:.2%}

**休闲用户 ({behavior['leisure_users']['count']}人):**
- 平均价格: {behavior['leisure_users']['avg_price']:.2f}
- 价格敏感性: {behavior['leisure_users']['price_sensitivity']:.2f}
- 提前预订: {behavior['leisure_users']['advance_booking']:.1f} 天
- 直飞偏好: {behavior['leisure_users']['direct_flight_ratio']:.2%}
"""
        
        # Static recommendations plus a footer with timestamp and source.
        report += f"""
## 商业建议

### 整体策略
1. 优化价格策略，关注价格敏感用户群体
2. 提供多样化的航班选择（直飞/中转）
3. 针对不同预订习惯制定差异化服务

### 技术建议
1. 改进搜索排序算法，考虑用户历史行为
2. 实施个性化推荐系统
3. 优化移动端预订流程

---
*报告生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*
*数据来源: {self.data_path}*
"""
        
        # Persist the report.
        report_path = f"{self.output_dir}/reports/comprehensive_analysis.md"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)
        
        print(f"综合报告已保存到: {report_path}")
    
    def run_analysis(self):
        """Run the full pipeline: load, overall patterns, user behavior, report.

        Any exception is reported and re-raised. ``cleanup`` always runs
        afterwards, whether the pipeline succeeded or failed.
        """
        rule = "=" * 60
        print(rule)
        print("开始航班数据分析...")
        print(rule)

        try:
            # Stages run strictly in this order; later ones read the results
            # of earlier ones.
            self.load_and_preprocess_data()
            self.analyze_overall_patterns()
            self.analyze_user_behavior()  # skipped internally without grouped data
            self.generate_comprehensive_report()

            print(rule)
            print("分析完成！")
            print(rule)
            print("查看结果:")
            print(f"- 报告: {self.output_dir}/reports/")
            print(f"- 图表: {self.output_dir}/plots/")

        except Exception as e:
            print(f"分析过程中出现错误: {e}")
            raise
        finally:
            # Release the loaded data regardless of the outcome.
            self.cleanup()
    
    def cleanup(self):
        """清理内存"""
        if hasattr(self, 'data'):
            del self.data
        if hasattr(self, 'grouped_data'):
            del self.grouped_data
        gc.collect()
        print("内存清理完成")

# 便捷函数
def analyze_flight_data(data_path: str, output_dir: str = "flight_analysis", max_rows: int = 1000000):
    """Create an analyzer for ``data_path``, run the full pipeline and return it.

    Args:
        data_path: Path of the data file to analyze.
        output_dir: Directory that receives reports and plots.
        max_rows: Upper bound on the number of rows kept in memory.

    Returns:
        The ``EnhancedFlightDataAnalyzer`` instance after ``run_analysis``.
    """
    analyzer = EnhancedFlightDataAnalyzer(
        data_path=data_path,
        output_dir=output_dir,
        max_rows=max_rows,
    )
    analyzer.run_analysis()
    return analyzer

if __name__ == "__main__":
    # Banner line shared by every section heading below.
    separator = "=" * 60

    # Section: data files.
    print(separator, "测试数据文件", separator, sep="\n")

    test_files = [
        "../data/test.parquet",
        "../data/train.parquet"
    ]

    # Example 1: analyze the test split.
    print(separator, "分析测试数据", separator, sep="\n")
    try:
        test_analyzer = analyze_flight_data(
            data_path="../data/test.parquet",
            output_dir="../data/test_flight_analysis_results",
            max_rows=500000  # cap the row count to bound memory use
        )
        print("✅ 测试数据分析完成！")
    except Exception as e:
        print(f"❌ 测试数据分析失败: {e}")

    # Example 2: analyze the training split.
    print("\n" + separator, "分析训练数据", separator, sep="\n")
    try:
        train_analyzer = analyze_flight_data(
            data_path="../data/train.parquet",
            output_dir="../data/train_flight_analysis_results",
            max_rows=500000  # cap the row count to bound memory use
        )
        print("✅ 训练数据分析完成！")
    except Exception as e:
        print(f"❌ 训练数据分析失败: {e}")

    print("\n" + separator, "分析完成，请查看输出目录中的结果", separator, sep="\n")

测试数据文件
分析测试数据
航班数据分析器已初始化
- 数据文件: ../data/test.parquet
- 输出目录: ../data/test_flight_analysis_results
- 最大处理行数: 500,000
开始航班数据分析...
开始加载数据...
文件大小: 137.5 MB
直接加载文件...
数据量 (6,897,776) 超过限制 (500,000)，进行随机采样...
采样后数据量: 500,000
数据加载完成 | 总行数: 500,000 | 列数: 125
数据列: ['Id', 'bySelf', 'companyID', 'corporateTariffCode', 'frequentFlyer', 'nationality', 'isAccess3D', 'isVip', 'legs0_arrivalAt', 'legs0_departureAt', 'legs0_duration', 'legs0_segments0_aircraft_code', 'legs0_segments0_arrivalTo_airport_city_iata', 'legs0_segments0_arrivalTo_airport_iata', 'legs0_segments0_baggageAllowance_quantity', 'legs0_segments0_baggageAllowance_weightMeasurementType', 'legs0_segments0_cabinClass', 'legs0_segments0_departureFrom_airport_iata', 'legs0_segments0_duration', 'legs0_segments0_flightNumber', 'legs0_segments0_marketingCarrier_code', 'legs0_segments0_operatingCarrier_code', 'legs0_segments0_seatsAvailable', 'legs0_segments1_aircraft_code', 'legs0_segments1_arrivalTo_airport_city_iata', 'legs0_segments1_a