In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import torch

# Pick the CUDA device when PyTorch can see one; otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced again in this cell, and the
# scikit-learn LogisticRegression used below does not take a torch device,
# so the message only reports what PyTorch detected.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("代码将在GPU上计算" if device.type == "cuda" else "代码将在CPU上计算")

# Load the merged factor/label table.
data = pd.read_csv('18-21_merged_data_label.csv')

# Factor columns are selected by position: the slice [9:267], i.e. the
# columns at 0-based positions 9 through 266 (up to 258 columns).
factor_columns = data.columns[9:267]

# Empty frame with one (Factor, Accuracy) row to be collected per factor.
result_df = pd.DataFrame(columns=['Factor', 'Accuracy'])

# Chronological split: 2018-07-01..2020-12-31 for fitting and
# 2021-01-01..2021-11-30 for evaluation (both bounds inclusive).
# NOTE(review): comparing against 'YYYY-MM-DD' strings assumes trade_date is
# stored as text in that format -- confirm against the CSV.
train_data = data.loc[data['trade_date'].between('2018-07-01', '2020-12-31')]
test_data = data.loc[data['trade_date'].between('2021-01-01', '2021-11-30')]

# Features are the factor columns; the target is the integer-cast 'label' column.
train_features, train_labels = train_data[factor_columns], train_data['label'].astype(int)
test_features, test_labels = test_data[factor_columns], test_data['label'].astype(int)


# Single-factor test: for every factor, fit a one-feature logistic regression
# on the training window and score its accuracy on the evaluation window.
#
# Scores are gathered in a plain list and converted to a DataFrame once after
# the loop. Growing a DataFrame with pd.concat on every iteration copies all
# previous rows each time (quadratic work) and concatenates onto an empty
# all-object frame, which is what the list avoids.
factor_scores = []
for factor in tqdm(factor_columns, desc='Performing Single Factor Test'):
    # Double brackets keep the single factor as a 2-D (n_samples, 1) frame,
    # which is the input shape the estimator expects.
    train_feature = train_features[[factor]]
    test_feature = test_features[[factor]]

    # Fit a fresh default-configured model for this factor only.
    model = LogisticRegression()
    model.fit(train_feature, train_labels)

    # Accuracy of the fitted model on the evaluation window.
    predictions = model.predict(test_feature)
    accuracy = accuracy_score(test_labels, predictions)

    factor_scores.append({'Factor': factor, 'Accuracy': accuracy})

# One row per factor, in the order of factor_columns, with a 0..N-1 index.
result_df = pd.DataFrame(factor_scores, columns=['Factor', 'Accuracy'])

# Rank the factors by accuracy (highest first) and keep the top n.
# n can be changed to select more or fewer factors.
n = 15
ranked_factors = result_df.sort_values(by='Accuracy', ascending=False)
selected_factors = ranked_factors.iloc[:n]

# Persist the selection (without the index column).
selected_factors.to_csv('18-21SelectedFactors.csv', index=False)

# Show the selection: header line first, then the table on the following lines.
print("选出的优质因子：", selected_factors, sep="\n")


代码将在GPU上计算


Performing Single Factor Test: 100%|██████████| 258/258 [00:02<00:00, 108.59it/s]

选出的优质因子：
                                                Factor  Accuracy
192  ('technical_indicators_overbought_oversold', '...  0.502662
201  ('technical_indicators_overbought_oversold', '...  0.473522
197  ('technical_indicators_overbought_oversold', 'J')  0.469506
193  ('technical_indicators_overbought_oversold', '...  0.453348
185  ('technical_indicators_momentum_reversal', 'CMO')  0.423461
187  ('technical_indicators_momentum_reversal', 'IMI')  0.420846
135  ('liquidity_factor', 'Absolute_Return_to_Volume')  0.420659
254          ('volatility_factor', 'Total_Volatility')  0.419819
198  ('technical_indicators_overbought_oversold', 'K')  0.417951
245  ('volatility_factor', 'Idiosyncratic_Volatility')  0.413094
138  ('liquidity_factor', 'Capitalization_Adjusted_...  0.410666
208         ('technical_indicators_trending', 'VMACD')  0.409545
247      ('volatility_factor', 'Maximum_Daily_Return')  0.407677
211         ('technical_indicators_volatility', 'CVI')  0.407397
163             




In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Seed NumPy, PyTorch (CPU) and every CUDA device with the same value so that
# the random split, weight initialisation and shuffling are repeatable.
seed = 42
for _apply_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _apply_seed(seed)

# Load the raw table.
df = pd.read_csv('18-21_SRl&Factor&SDPG_CN_data.csv')

# Keep the columns at 0-based positions 18..67 as model inputs, followed by
# the target ('label') and the date used for splitting ('trade_date').
# The last two entries of selected_cols are therefore never features.
selected_cols = [*df.columns[18:68], 'label', 'trade_date']
df = df[selected_cols]

# Chronological hold-out: 2021-01-01..2021-11-30 is the test period
# (both bounds inclusive).
# NOTE(review): assumes trade_date is stored as 'YYYY-MM-DD' text -- confirm.
in_train_period = df['trade_date'].between('2018-07-01', '2020-12-31')
in_test_period = df['trade_date'].between('2021-01-01', '2021-11-30')
test_df = df[in_test_period]

# Rows from 2018-07-01..2020-12-31 are split 80/20 at random (seeded) into
# training and validation sets.
train_df, val_df = train_test_split(df[in_train_period], test_size=0.2, random_state=seed)

# Dataset wrapping a DataFrame of factor columns plus a 'label' column.
class CustomDataset(Dataset):
    """Map-style dataset yielding ``{'factor_data', 'label'}`` samples.

    Features and labels are converted to tensors once, at construction time,
    so that fetching a sample is a cheap tensor index instead of a pandas row
    lookup followed by an object-to-float conversion on every access.

    Args:
        df: DataFrame containing the feature columns and a 'label' column.
        feature_cols: Optional explicit list of feature column names. When
            omitted, every column of ``df`` except 'label' and 'trade_date'
            is used, in the frame's column order.

    Raises:
        KeyError: if 'label' or a requested feature column is missing.
        ValueError: if a feature or label value cannot be converted to a number.
    """

    def __init__(self, df, feature_cols=None):
        self.df = df  # kept for callers that inspect the underlying frame

        if feature_cols is None:
            # Everything that is neither the target nor the split date is a feature.
            feature_cols = [col for col in df.columns if col not in ('label', 'trade_date')]
        self.feature_cols = list(feature_cols)

        # torch.tensor copies the data, so the dataset does not alias the frame.
        self.features = torch.tensor(df[self.feature_cols].to_numpy(dtype=np.float32))
        # Labels are truncated to integers, matching int(...) on each value.
        self.labels = torch.tensor(df['label'].to_numpy().astype(np.int64), dtype=torch.long)

    def __len__(self):
        """Number of rows (samples) in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return the sample at positional ``index``.

        Returns:
            dict with 'factor_data' (1-D float32 tensor, one entry per feature
            column) and 'label' (0-D int64 tensor).
        """
        return {
            'factor_data': self.features[index],
            'label': self.labels[index]
        }

# Hyper-parameters and compute device.
batch_size = 16
hidden_size = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
num_epochs = 20
learning_rate = 0.001  # tuned learning rate

# Wrap each split in a dataset (train, validation, test -- in that order).
train_dataset, val_dataset, test_dataset = (
    CustomDataset(split) for split in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader, test_loader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (val_dataset, test_dataset)
)

# Model definition.
class MLPClassifier(nn.Module):
    """Feed-forward classifier with two hidden layers.

    Each hidden layer is Linear -> ReLU -> Dropout(p=0.2); the output layer is
    a plain Linear producing one raw score (logit) per class. No softmax is
    applied in ``forward``.

    Args:
        input_size: number of input features per sample.
        hidden_size: width of both hidden layers.
        num_classes: number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        # A single ReLU and a single Dropout module are shared by both hidden layers.
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the class logits for input ``x`` (last dimension = input_size)."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        hidden = self.dropout(self.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Build the network: one input per feature column (selected_cols minus its
# last two entries, 'label' and 'trade_date') and three output classes.
# NOTE(review): num_classes = 3 assumes labels take the values 0, 1, 2 -- confirm.
input_size = len(selected_cols) - 2
num_classes = 3
model = MLPClassifier(input_size, hidden_size, num_classes).to(device)

# Cross-entropy on the raw logits, optimised with Adam plus a small L2 penalty.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Train for num_epochs epochs. After every epoch the model is evaluated on the
# validation loader, and its weights are written to 'Factor_best_model.pt'
# whenever the validation accuracy beats the best value seen so far.
#
# best_val_acc starts at -inf rather than 0.0 so that the first epoch always
# writes a checkpoint. With a 0.0 start and a strict '>' comparison, a run
# whose validation accuracy never rises above 0 would write nothing, leaving
# either no file or a stale one from an earlier run on disk.
best_val_acc = float('-inf')
for epoch in range(num_epochs):
    train_loss = 0.0
    train_preds = []
    train_labels = []

    progress_bar = tqdm(train_loader, desc='Epoch {:1d}'.format(epoch+1), leave=False)

    # --- training pass (dropout active) ---
    model.train()
    for batch in progress_bar:
        factor_data = batch['factor_data'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        logits = model(factor_data)

        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=1)
        train_preds.extend(preds.tolist())
        train_labels.extend(labels.tolist())

    # Mean of the per-batch losses (not weighted by batch size).
    train_loss /= len(train_loader)

    train_acc = accuracy_score(train_labels, train_preds)

    val_loss = 0.0
    val_preds = []
    val_labels = []

    # --- validation pass (dropout off, no gradients) ---
    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            factor_data = batch['factor_data'].to(device)
            labels = batch['label'].to(device)

            logits = model(factor_data)

            preds = torch.argmax(logits, dim=1)
            val_preds.extend(preds.tolist())
            val_labels.extend(labels.tolist())

            val_loss += criterion(logits, labels).item()

    # Mean of the per-batch losses (not weighted by batch size).
    val_loss /= len(val_loader)

    val_acc = accuracy_score(val_labels, val_preds)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
    print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}\n")

    # Keep only the weights of the best epoch by validation accuracy.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'Factor_best_model.pt')

# Evaluate the checkpointed weights on the test loader.
test_loss = 0.0
test_preds = []
test_labels = []

# map_location=device remaps the stored tensors onto the device used in this
# session, so a checkpoint written on a CUDA machine can still be loaded when
# only the CPU is available (and vice versa).
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load('Factor_best_model.pt', map_location=device))
model.eval()
with torch.no_grad():
    for batch in test_loader:
        factor_data = batch['factor_data'].to(device)
        labels = batch['label'].to(device)

        logits = model(factor_data)

        # Predicted class = index of the largest logit.
        preds = torch.argmax(logits, dim=1)
        test_preds.extend(preds.detach().cpu().numpy())
        test_labels.extend(labels.detach().cpu().numpy())

        test_loss += criterion(logits, labels).item()

# Mean of the per-batch losses (not weighted by batch size).
test_loss /= len(test_loader)

test_acc = accuracy_score(test_labels, test_preds)

print("------------------------")
print("Final Evaluation")
print("------------------------")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Acc: {test_acc:.4f}")


                                                           

Epoch 1/20
Train Loss: 0.8325, Train Acc: 0.6239
Val Loss: 0.7524, Val Acc: 0.6695


                                                           

Epoch 2/20
Train Loss: 0.7304, Train Acc: 0.6823
Val Loss: 0.7313, Val Acc: 0.6726


                                                           

Epoch 3/20
Train Loss: 0.6953, Train Acc: 0.7029
Val Loss: 0.7003, Val Acc: 0.6946


                                                           

Epoch 4/20
Train Loss: 0.6708, Train Acc: 0.7109
Val Loss: 0.6876, Val Acc: 0.7104


                                                           

Epoch 5/20
Train Loss: 0.6497, Train Acc: 0.7219
Val Loss: 0.6817, Val Acc: 0.7124


                                                           

Epoch 6/20
Train Loss: 0.6363, Train Acc: 0.7315
Val Loss: 0.6591, Val Acc: 0.7220


                                                           

Epoch 7/20
Train Loss: 0.6154, Train Acc: 0.7398
Val Loss: 0.6464, Val Acc: 0.7297


                                                           

Epoch 8/20
Train Loss: 0.6073, Train Acc: 0.7485
Val Loss: 0.6491, Val Acc: 0.7332


                                                           

Epoch 9/20
Train Loss: 0.5913, Train Acc: 0.7506
Val Loss: 0.6527, Val Acc: 0.7224


                                                            

Epoch 10/20
Train Loss: 0.5729, Train Acc: 0.7618
Val Loss: 0.6364, Val Acc: 0.7402


                                                            

Epoch 11/20
Train Loss: 0.5560, Train Acc: 0.7672
Val Loss: 0.6244, Val Acc: 0.7398


                                                            

Epoch 12/20
Train Loss: 0.5453, Train Acc: 0.7698
Val Loss: 0.6133, Val Acc: 0.7444


                                                            

Epoch 13/20
Train Loss: 0.5337, Train Acc: 0.7775
Val Loss: 0.6118, Val Acc: 0.7479


                                                            

Epoch 14/20
Train Loss: 0.5234, Train Acc: 0.7846
Val Loss: 0.6126, Val Acc: 0.7463


                                                            

Epoch 15/20
Train Loss: 0.5078, Train Acc: 0.7904
Val Loss: 0.6216, Val Acc: 0.7413


                                                            

Epoch 16/20
Train Loss: 0.5050, Train Acc: 0.7907
Val Loss: 0.6037, Val Acc: 0.7622


                                                            

Epoch 17/20
Train Loss: 0.4909, Train Acc: 0.7961
Val Loss: 0.6059, Val Acc: 0.7548


                                                            

Epoch 18/20
Train Loss: 0.4827, Train Acc: 0.8010
Val Loss: 0.6072, Val Acc: 0.7514


                                                            

Epoch 19/20
Train Loss: 0.4787, Train Acc: 0.8034
Val Loss: 0.5915, Val Acc: 0.7556


                                                            

Epoch 20/20
Train Loss: 0.4612, Train Acc: 0.8146
Val Loss: 0.6091, Val Acc: 0.7556

------------------------
Final Evaluation
------------------------
Test Loss: 0.6163
Test Acc: 0.7502
