# 训练分类模型。涨跌趋势分类模型

In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from factorFactory import FactorFactory
from sklearn.metrics import accuracy_score, roc_auc_score

### （1）数据预处理：

In [3]:
df = pd.read_csv('SPY_2020-01-01_2025-05-07_5m_raw.csv')

# 拿到 y
df_label = df.copy()
df_label['next_timestamp_return'] = df_label['close'].shift(-1) / df_label['close'] - 1
df_label['label'] = (df_label['next_timestamp_return'] > 0).astype(int)
y = df_label['label'].to_numpy()


# 拿到 X
X = df.copy()
ff = FactorFactory()
X = ff.generate_factors(X, include_bounded_factors=False)

X.shape, y.shape

((242741, 127), (242741,))

In [4]:
# 清洗数据
X['label'] = y

# 丢弃包含 NaN 的行
X = X.dropna()
# 分离出清洗后的 X 和 y
y = X['label'].to_numpy()
X = X.drop(columns=['label', 'open', 'high', 'low', 'close', 'timestamp'])

# 查看新形状
X.shape, y.shape

((242692, 122), (242692,))

In [5]:
# 分开数据
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#### 降维度（可选）

In [9]:
from sklearn.decomposition import PCA

# 拟合 PCA
pca = PCA()
pca.fit(X_train)

explained = pca.explained_variance_ratio_
cumulative = np.cumsum(explained)

percent = 0.6

# 找满足累计解释 >= percent% 的主成分个数
n_components = np.argmax(cumulative >= percent) + 1
print(f"前 {n_components} 个主成分累计解释了 {cumulative[n_components - 1]:.2%} 的方差")

前 1 个主成分累计解释了 99.75% 的方差


#### 标准化，归一化

In [1]:
from sklearn.preprocessing import StandardScaler

stdScaler = StandardScaler()
X_train_scaled = stdScaler.fit_transform(X_train)
X_test_scaled = stdScaler.transform(X_test)

X_train_scaled.shape, y_train.shape, X_test_scaled.shape, y_test.shape

NameError: name 'X_train' is not defined

### （2）训练模型：

#### 线形分类

In [5]:
from sklearn.linear_model import LogisticRegression

lgr = LogisticRegression(penalty='l2', C=1.0, max_iter=5000, random_state=42, n_jobs=-1)
lgr.fit(X_train_scaled, y_train)
acc_lgr = lgr.score(X_test_scaled, y_test)
auc_lgr = roc_auc_score(y_test, lgr.predict_proba(X_test_scaled)[:, 1])
print("LogisticRegression - Accuracy:", acc_lgr, "AUC:", auc_lgr)

LogisticRegression - Accuracy: 0.5194585796987989 AUC: 0.5288741131131051


In [6]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train_scaled, y_train)
acc_rf = rf.score(X_test_scaled, y_test)
auc_rf = roc_auc_score(y_test, rf.predict_proba(X_test_scaled)[:, 1])
print("Random Forest - Accuracy:", acc_rf, "AUC:", auc_rf)

Random Forest - Accuracy: 0.5095078184552628 AUC: 0.5131969758406619


In [7]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.01, random_state=42)
gb.fit(X_train_scaled, y_train)
acc_gb = gb.score(X_test_scaled, y_test)
auc_gb = roc_auc_score(y_test, gb.predict_proba(X_test_scaled)[:, 1])
print("Gradient Boosting - Accuracy:", acc_gb, "AUC:", auc_gb)

Gradient Boosting - Accuracy: 0.5178722264570758 AUC: 0.5290655070065187


In [None]:
from sklearn.svm import SVC

svc = SVC(probability=True, kernel='rbf', C=1.0, random_state=42)
svc.fit(X_train_scaled, y_train)
acc_svc = svc.score(X_test_scaled, y_test)
auc_svc = roc_auc_score(y_test, svc.predict_proba(X_test_scaled)[:, 1])
print("SVC - Accuracy:", acc_svc, "AUC:", auc_svc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_train_scaled, y_train)
acc_knn = knn.score(X_test_scaled, y_test)
auc_knn = roc_auc_score(y_test, knn.predict_proba(X_test_scaled)[:, 1])
print("KNN - Accuracy:", acc_knn, "AUC:", auc_knn)

In [None]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)
sgd.fit(X_train_scaled, y_train)
acc_sgd = sgd.score(X_test_scaled, y_test)
y_proba_sgd = sgd.predict_proba(X_test_scaled)[:, 1]
auc_sgd = roc_auc_score(y_test, y_proba_sgd)
print("SGDClassifier - Accuracy:", acc_sgd, "AUC:", auc_sgd)

In [None]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb.fit(X_train_scaled, y_train)
acc_nb = nb.score(X_test_scaled, y_test)
auc_nb = roc_auc_score(y_test, nb.predict_proba(X_test_scaled)[:, 1])
print("Naive Bayes - Accuracy:", acc_nb, "AUC:", auc_nb)