## 测量因子

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from Utility.factorFactory import FactorFactory
from Utility.registry import FACTOR_REGISTRY

  from .autonotebook import tqdm as notebook_tqdm


### （1）预处理数据：

In [2]:
# Load the raw SPY bar data from the working directory and show its
# (rows, columns) shape as the cell output.
df = pd.read_csv('SPY_2020-07-01_2021-07-01_minute.csv')
df.shape

(220161, 17)

In [3]:
# Display the column labels of the raw frame to see which fields are available.
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dollar_volume',
       'tick_count', 'trade_size_mean', 'trade_size_std', 'zero_return_count',
       'price_direction_ratio', 'large_trade_count', 'large_trade_volume',
       'vwap', 'large_trade_ratio', 'large_trade_volume_ratio'],
      dtype='object')

In [4]:
# Wrap the raw frame in a FactorFactory; every later cell drives this object.
# NOTE(review): the keyword meanings (label horizon, rolling window, scaling,
# number of factors kept) are defined in Utility.factorFactory, which is not
# visible here — confirm before tuning them.
finfact = FactorFactory(
    df,
    forward_period=5,
    window=100,
    scaler='minmax',
    top_k=500,
)

In [5]:
# Show the keys of FACTOR_REGISTRY, i.e. the factor names currently registered.
FACTOR_REGISTRY.keys()

dict_keys(['rsi', 'sma', 'ema', 'macd', 'bbpband'])

In [None]:
# Generate the factor columns, keep a handle on the resulting feature frame,
# and show its shape.
# NOTE(review): n_job=12 is presumably the worker count for mode='thread' —
# confirm against FactorFactory.generate_factors.
finfact.generate_factors(mode='thread', n_job=12, bounded_only=False)
df_featured = finfact.df_features
df_featured.shape

Applying Factors: 100%|██████████| 80/80 [00:04<00:00, 17.24it/s]
Applying Factors: 100%|██████████| 1200/1200 [01:01<00:00, 19.62it/s]
🔄 cross_op (thread):  21%|██        | 36496/172800 [00:13<01:30, 1507.59it/s] 

### （3）评测因子：

In [None]:
# Display whatever summary the factory reports for the generated factors.
finfact.get_summary()

In [None]:
# Advance the factory by one step and keep the value it returns.
# NOTE(review): presumably this expands/selects factors (k=10 per step) and
# updates finfact.df_features — confirm in FactorFactory.next.
cur_features = finfact.next(
    steps=1,
    k=10,
    mode='thread',
    n_job=12,
    bounded_only=True,
)

In [None]:
# Re-read the feature frame from the factory (after the finfact.next() call
# above) and list its column names as a plain Python list.
df_features = finfact.df_features
df_features.columns.tolist()

In [None]:
# Draw the factory's 2-D structure visualisation with a fixed random_state.
# NOTE(review): perplexity / n_neighbors / umap_components / pca_evp look like
# t-SNE, UMAP and PCA settings respectively — confirm in FactorFactory.
finfact.visualize_structure_2d(
    seq_len=256,
    perplexity=30,
    n_neighbors=10,
    random_state=42,
    pca_evp=0.6,
    umap_components=2,
)

In [None]:
# Evaluate clusterings (KMeans only) after a 2-component UMAP reduction and
# keep the returned report for inspection in the following cells.
# NOTE(review): this method takes `n_jobs`/`backend`, while generate_factors
# and next take `n_job`/`mode` — verify the keyword names against the API.
df_eval_report = finfact.evaluate_clusterings(
    dim_reduction='umap',
    reduction_params={'n_components': 2, 'n_neighbors': 10, 'min_dist': 0.1},
    algos=['KMeans'],
    n_jobs=12,
    backend='thread',
    seq_len=1,
)

In [None]:
# Display the clustering evaluation report produced by evaluate_clusterings.
df_eval_report

In [None]:
# Pick the report row with index label 12 as the "best" configuration.
# NOTE(review): 12 is hard-coded and was presumably read off the report by
# eye; it will silently point at a different row (or raise KeyError) if the
# report changes — consider selecting by a score column instead.
best_row = df_eval_report.loc[12]

In [None]:
# Count how many samples fall into each label stored under '_labels' in the
# chosen report row; value_counts() orders the result by descending count.
label_series = pd.Series(best_row['_labels'])
counts = label_series.value_counts()
print(counts)

#### IC分析

#### 相关性检测

In [None]:
# Move 'timestamp' from a column into the index so that the remaining columns
# of df_filtered are the features used by the correlation and PCA cells below.
df_filtered = finfact.df_features.set_index('timestamp')

In [None]:
# Pairwise correlation between all feature columns, drawn as a heatmap with a
# diverging colour map centred on zero.
# `sns` is seaborn, imported in the notebook's top import cell; the original
# cell used it without any import and failed with NameError on a fresh kernel.
# df_filtered is already a DataFrame, so it is used directly.
corr = df_filtered.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', center=0)
plt.title('Feature Correlation Matrix')
plt.show()

#### PCA分析

In [None]:
from sklearn.decomposition import PCA

# Fraction of the total variance that the retained components must explain.
percent = 0.99

# Fit a full PCA (all components kept) on the timestamp-indexed feature frame.
# NOTE(review): PCA.fit rejects NaN/inf and non-numeric columns; this assumes
# df_filtered is already clean — confirm.
pca = PCA()
pca.fit(df_filtered)

cumulative = np.cumsum(pca.explained_variance_ratio_)
# Smallest number of leading components whose cumulative ratio reaches
# `percent`. Clamped to the number of components so that a threshold that is
# never reached yields "all components" rather than 1.
n_PCs = min(int(np.searchsorted(cumulative, percent)) + 1, len(cumulative))

print(f"前 {n_PCs} 个主成分累计解释了 {cumulative[n_PCs - 1]:.2%} 的方差")

# Take the names from the exact frame PCA was fitted on, so that position j in
# pca.components_[i] lines up with feature_names[j]. The earlier `df_featured`
# snapshot was taken before finfact.next() and has a different column layout,
# which mislabelled the dominant features.
feature_names = df_filtered.columns

# Report the dominant feature of every retained principal component.
for rank, pc in enumerate(pca.components_[:n_PCs], start=1):
    # Position of the loading with the largest absolute value in this component.
    top_feature_idx = np.argmax(np.abs(pc))
    top_feature_name = feature_names[top_feature_idx]

    print(f"PC{rank} 的主导因子是: {top_feature_name}（贡献系数: {pc[top_feature_idx]:.4f}）")