In [1]:
import pandas as pd
import numpy as np
import torch
# 进度条库
from tqdm import trange
from sklearn.model_selection import train_test_split
import os

In [2]:
def process_data(data, seq_len=10):
    """修复了numpy类型转换问题的数据处理函数"""
    inputs = []
    labels = []
    
    for opt_id in trange(data['optID'].unique().shape[0]):
        opt_data = data[data['optID'] == data['optID'].unique()[opt_id]]
        opt_values = opt_data.iloc[:, 3:19].values.astype(np.float64)  # 提前转换类型
        
        for i in range(len(opt_values) - seq_len + 1):
            # 创建3x10x5的张量
            input_tensor = torch.zeros((3, seq_len, 5), dtype=torch.float64)
            
            # 通道0: 基础期权数据 (列3-8)
            input_tensor[0] = torch.from_numpy(opt_values[i:i+seq_len, 0:5])
            
            # 通道1: 希腊字母数据 (列8-13)
            input_tensor[1] = torch.from_numpy(opt_values[i:i+seq_len, 5:10])
            
            # 通道2: 价格数据 (列13-18)
            input_tensor[2] = torch.from_numpy(opt_values[i:i+seq_len, 10:15])
            
            # 标签: 第10天的settlePrice (列18)
            labels.append(torch.tensor(opt_values[i+seq_len-1, 15], dtype=torch.float64))
            inputs.append(input_tensor)
    
    return torch.stack(inputs), torch.stack(labels)

In [3]:
def save_data(data, path):
    """保存数据到指定路径"""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(data, path)

In [None]:
# 删除lewis_fft_results.csv中的strike列
import pandas as pd

# 读取文件
df = pd.read_csv(r'.\3DTensor-basedDL4OptionPricing\data_ipynb\lewis_fft_results.csv')

# 删除strike列（方法1：直接drop）
df.drop(columns=['strike'], inplace=True)

# 保存修改后的文件（覆盖原文件）
df.to_csv(r'.\3DTensor-basedDL4OptionPricing\data_ipynb\lewis_fft_results.csv', index=False)

In [None]:
# 主数据处理流程
def main():
    # 1. 加载原始数据
    train_data = pd.read_csv(r".\3DTensor-basedDL4OptionPricing\data_ipynb\train_data.csv")
    test_data = pd.read_csv(r".\3DTensor-basedDL4OptionPricing\data_ipynb\test_data(lewis_fft).csv")

    # 2. 划分训练集和验证集 (80%训练, 20%验证)
    train_data, val_data = train_test_split(
        train_data, 
        test_size=0.2, 
        random_state=42,
        stratify=train_data['optID']  # 按期权ID分层抽样
    )

    # 3. 处理三个数据集
    print("Processing training data...")
    train_input, train_label = process_data(train_data)
    
    print("Processing validation data...")
    val_input, val_label = process_data(val_data)
    
    print("Processing test data...")
    test_input, test_label = process_data(test_data)

    # 4. 保存数据
    base_path = r".\3DTensor-basedDL4OptionPricing\data_ipynb\torch-data-ipynb"
    
    save_data(train_input, os.path.join(base_path, "train_input(V).pt"))
    save_data(train_label, os.path.join(base_path, "train_label(V).pt"))
    
    save_data(val_input, os.path.join(base_path, "val_input.pt"))
    save_data(val_label, os.path.join(base_path, "val_label.pt"))
    
    # train 和validation数据都是从 train_data.csv中划分的
    # test数据是从 test_data(lewis_fft).csv中处理的
    save_data(test_input, os.path.join(base_path, "test_input(lewis_fft).pt"))
    save_data(test_label, os.path.join(base_path, "test_label(lewis-fft).pt"))

    # 5. 打印数据形状验证
    print("\nData shapes:")
    print(f"Train inputs: {train_input.shape}, labels: {train_label.shape}")
    print(f"Validation inputs: {val_input.shape}, labels: {val_label.shape}")
    print(f"Test inputs: {test_input.shape}, labels: {test_label.shape}")

In [14]:

if __name__ == "__main__":
    main()

  test_data = pd.read_csv(r"E:\pyProjects\3DTensor-basedDL4OptionPricing\data_ipynb\test_data(lewis_fft).csv")


Processing training data...


100%|██████████| 600/600 [00:01<00:00, 476.73it/s]


Processing validation data...


100%|██████████| 600/600 [00:00<00:00, 1790.37it/s]


Processing test data...


100%|██████████| 229/229 [00:02<00:00, 102.33it/s]



Data shapes:
Train inputs: torch.Size([25488, 3, 10, 5]), labels: torch.Size([25488])
Validation inputs: torch.Size([2996, 3, 10, 5]), labels: torch.Size([2996])
Test inputs: torch.Size([11386, 3, 10, 5]), labels: torch.Size([11386])
