In [31]:
import pandas as pd
import pickle
import numpy as np

def load_model_from_dict(model_path):
    """Unpickle ``model_path`` and return the model object stored in it.

    If the pickle holds something other than a dict, that object is returned
    unchanged. If it holds a dict, the first value exposing a ``predict``
    attribute is returned: a fixed list of well-known keys is tried first, in
    order, and then every value of the dict in its own iteration order.

    Args:
        model_path: Path of the pickle file to read.

    Returns:
        The unpickled object, or the selected dict value having ``predict``.

    Raises:
        ValueError: The pickle is a dict and no value has a ``predict``
            attribute.
    """
    with open(model_path, 'rb') as handle:
        payload = pickle.load(handle)

    # Anything that is not a dict is assumed to be the model itself.
    if not isinstance(payload, dict):
        return payload

    preferred_keys = (
        'model', 'best_model', 'estimator', 'ranker', 'lgbm_model', 'trained_model',
    )
    # Preferred keys take priority; the remaining values act as a fallback.
    candidates = [payload[name] for name in preferred_keys if name in payload]
    candidates.extend(payload.values())

    for candidate in candidates:
        if hasattr(candidate, 'predict'):
            return candidate

    raise ValueError("No model found with predict method")

# Configuration: pickled ranker and the directory holding the test segments.
model_path = r"D:/kaggle/filght/data/aeroclub-recsys-2025/segmented/data/aeroclub-recsys-2025/models/LGBMRanker_segment_0.pkl"
test_dir = r"D:/kaggle/filght/data/aeroclub-recsys-2025/segmented/test"

# Load the model and read the feature names recorded in its booster, so each
# test frame can be projected onto exactly those columns.
# NOTE(review): the segment_0 model is applied to all three test segments
# below -- confirm this is intended rather than one model per segment.
model = load_model_from_dict(model_path)
feature_names = model.booster_.feature_name()

test_files = ["test_segment_0.parquet", "test_segment_1.parquet", "test_segment_2.parquet"]
all_results = []

for test_file in test_files:
    # Load one test segment.
    test_data = pd.read_parquet(f"{test_dir}/{test_file}")

    # Zero-fill every feature the model expects but this segment lacks.
    missing_features = set(feature_names) - set(test_data.columns)
    if missing_features:
        missing_data = pd.DataFrame(
            np.zeros((len(test_data), len(missing_features))),
            # Sorted so the filler columns are added in a deterministic order.
            columns=sorted(missing_features),
            # pd.concat(axis=1) aligns on index labels; reuse the segment's
            # index so the filler lines up row-for-row even when the parquet
            # file carries a non-default index.
            index=test_data.index,
        )
        test_data = pd.concat([test_data, missing_data], axis=1)

    # Predict using the columns in the order the model reports them.
    X_test = test_data[feature_names]
    predictions = model.predict(X_test)

    # Assemble the per-row scores next to their identifiers.
    results = test_data[['Id', 'ranker_id']].copy()
    results['prediction_score'] = predictions

    # Rank within each ranker_id by descending score; Id is the tie-breaker
    # so every row in a group receives a unique rank starting at 1.
    results = results.sort_values(
        ['ranker_id', 'prediction_score', 'Id'],
        ascending=[True, False, True],
    )
    results['selected'] = results.groupby('ranker_id').cumcount() + 1

    # Write this segment's ranking to its own CSV and keep it for the merge.
    final_results = results[['Id', 'ranker_id', 'selected']]
    final_results.to_csv(f"predictions_{test_file.replace('.parquet', '.csv')}", index=False)
    all_results.append(final_results)

# Merge all per-segment rankings into a single CSV.
if all_results:
    final_predictions = pd.concat(all_results, ignore_index=True)
    final_predictions.to_csv('final_predictions.csv', index=False)
    print(f"完成预测，总计 {len(final_predictions)} 行结果")

完成预测，总计 6897776 行结果
