## 测量因子

In [1]:
import pandas as pd
import numpy as np

from Utility.factorFactory import FactorFactory
import matplotlib.pyplot as plt

from Utility.registry import FACTOR_REGISTRY

  from .autonotebook import tqdm as notebook_tqdm


### （1）预处理数据：

In [2]:
# Load the SPY CSV from the current working directory (the file name indicates
# minute data covering 2020-07-01 to 2021-07-01).
# NOTE(review): the path is relative, so the notebook must be started from the data directory.
df = pd.read_csv ('SPY_2020-07-01_2021-07-01_minute.csv')
# Show (rows, columns) as the cell output.
df.shape

(220161, 17)

In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'dollar_volume',
       'tick_count', 'trade_size_mean', 'trade_size_std', 'zero_return_count',
       'price_direction_ratio', 'large_trade_count', 'large_trade_volume',
       'vwap', 'large_trade_ratio', 'large_trade_volume_ratio'],
      dtype='object')

In [4]:
# Build the factor factory on the raw frame. The keyword semantics live in
# Utility.factorFactory, not in this notebook — presumably forward_period is the
# forward-return horizon, window the rolling lookback and top_k the number of
# factors kept; TODO confirm against FactorFactory.
finfact = FactorFactory (df, forward_period=5, window=100, scaler='minmax', top_k=500, use_disk_cache=True)

IO Dump: 100%|██████████| 16/16 [00:00<00:00, 140.95it/s]
IO Dump: 0it [00:00, ?it/s]


In [5]:
# Show the keys currently present in FACTOR_REGISTRY.
FACTOR_REGISTRY.keys()

dict_keys(['rsi', 'sma', 'ema', 'macd', 'bbpband'])

In [6]:
# Run the factory's factor generation in 'thread' mode with bounded_only=True.
# NOTE(review): what "bounded" restricts is defined inside FactorFactory — confirm there.
finfact.generate_factors(mode='thread', bounded_only=True)

IO Load: 100%|██████████| 16/16 [00:00<00:00, 58.84it/s]
Applying Factors: 100%|██████████| 32/32 [00:02<00:00, 11.51it/s]
IO Dump: 100%|██████████| 112/112 [00:00<00:00, 261.97it/s]
🔄 Unary op: 100%|██████████| 32/32 [00:00<00:00, 51761.56it/s]
IO Dump: 100%|██████████| 32/32 [00:00<00:00, 718.11it/s]
🔄 cross op (thread): 100%|██████████| 512/512 [00:05<00:00, 99.98it/s] 


[IOcache] dropped 16 features!


IO Load: 100%|██████████| 1/1 [00:00<00:00, 69.86it/s]
ic eval: 100%|██████████| 640/640 [00:30<00:00, 20.78it/s]
IO Load: 100%|██████████| 640/640 [00:07<00:00, 87.20it/s] 
IO Delete: 100%|██████████| 140/140 [00:00<00:00, 192525.43it/s]


### （3）评测因子：

In [13]:
# Display the factory's summary table.
# NOTE(review): this cell carries execution count 13 while the following cell carries 12,
# so the saved output was produced out of order; re-run top to bottom before relying on it.
finfact.get_summary ()

Unnamed: 0,spearman_ic,pearson_ir,pca_coeff,spearman_ic_norm,pearson_ir_norm,pca_coeff_norm,combined_score
large_trade_ratio_minus_close,0.014017,1.890403,0.086086,0.622433,1.000000,0.998336,2.620770
high_minus_close,0.039323,0.571535,0.085624,0.981583,0.560640,0.992981,2.535204
price_direction_ratio_minus_high,0.013636,1.401288,0.086161,0.617034,0.837059,0.999210,2.453303
large_trade_volume_ratio_minus_low,0.013796,1.297378,0.086161,0.619297,0.802443,0.999211,2.420951
vwap_minus_close,0.027046,0.466066,0.086159,0.807341,0.525504,0.999191,2.332037
...,...,...,...,...,...,...,...
zero_return_count_minus_trade_size_std,0.008529,-0.042674,0.001496,0.544552,0.356026,0.017346,0.917923
trade_size_std_minus_large_trade_volume_ratio,0.006552,0.057721,0.000655,0.516489,0.389471,0.007599,0.913558
sin_(trade_size_mean)_minus_cos_(open)_mul_bbpband_10_1.5_(large_trade_ratio),0.002072,0.003913,0.007192,0.452914,0.371545,0.083407,0.907867
rsi_6_(sin_(volume)_mul_sin_(volume)),0.001846,0.017849,0.006569,0.449706,0.376188,0.076181,0.902075


In [12]:
# Advance the factory with k=10 in 'thread' mode and keep whatever it returns.
# NOTE(review): the return value and exact meaning of k are defined by FactorFactory.next — confirm there.
cur_features = finfact.next(k=10, mode='thread', bounded_only=True)

IO Load: 100%|██████████| 500/500 [00:06<00:00, 79.66it/s]
Applying Factors: 100%|██████████| 20/20 [00:02<00:00,  9.80it/s]
IO Dump: 100%|██████████| 70/70 [00:00<00:00, 307.60it/s]
🔄 Unary op: 100%|██████████| 20/20 [00:00<00:00, 37035.80it/s]
IO Dump: 100%|██████████| 20/20 [00:00<00:00, 1044.06it/s]
🔄 cross op (thread): 100%|██████████| 200/200 [00:03<00:00, 62.68it/s] 


[IOcache] dropped 10 features!


IO Load: 100%|██████████| 780/780 [00:09<00:00, 84.14it/s]
IO Load: 100%|██████████| 1/1 [00:00<00:00, 43.52it/s]
  np.subtract(arr, avg, out=arr, casting='unsafe', where=where)
ic eval: 100%|██████████| 780/780 [00:39<00:00, 19.75it/s]
IO Load: 100%|██████████| 780/780 [00:09<00:00, 81.60it/s]
IO Delete: 100%|██████████| 280/280 [00:00<00:00, 10962.74it/s]


In [None]:
# Keep a handle on the factory's feature frame for the cells below.
df_features = finfact.df_features
# List every column name as the cell output.
# NOTE(review): this may be a very long list; consider showing only a slice.
df_features.columns.tolist()

In [None]:
# Call the factory's 2-D structure visualisation with a fixed random_state.
# NOTE(review): the argument names suggest t-SNE (perplexity), UMAP (n_neighbors,
# umap_components) and a PCA explained-variance threshold (pca_evp) — confirm
# against FactorFactory.visualize_structure_2d.
finfact.visualize_structure_2d (seq_len=256,
                                perplexity=30,
                                n_neighbors=10,
                                random_state=42,
                                pca_evp=0.6,
                                umap_components=2
                                )

In [None]:
# Evaluate clusterings via the factory: 'umap' reduction with the given
# parameters, KMeans as the only algorithm, 12 jobs on the 'thread' backend.
# The result is stored for inspection in the following cells.
# NOTE(review): no random seed is passed here — results may vary between runs; confirm.
df_eval_report = finfact.evaluate_clusterings(
    dim_reduction='umap',
    reduction_params={'n_components': 2, 'n_neighbors': 10, 'min_dist': 0.1},
    algos= ['KMeans'],
    n_jobs=12,
    backend='thread',
    seq_len=1
)

In [None]:
# Display the clustering evaluation report.
df_eval_report

In [None]:
# Pick the row with index label 12 from the report.
# NOTE(review): 12 is hard-coded from a manual look at one run; the "best" row
# may sit at a different label after re-running — consider selecting by score column.
best_row = df_eval_report.loc[12]

In [None]:
# Wrap the selected row's '_labels' entry in a Series and count how often each label occurs.
label_series = pd.Series(best_row['_labels'])
counts = label_series.value_counts()
print(counts)

#### IC分析

#### 相关性检测

In [None]:
# Move 'timestamp' into the index so only the remaining columns feed the
# correlation and PCA cells below. No rows or columns are filtered here despite the name.
df_filtered = finfact.df_features.set_index('timestamp')

In [None]:
# Pairwise correlation between all feature columns, drawn as a heatmap.
# The previous version called `sns.heatmap`, but `sns` is never imported in this
# notebook, so the cell raised NameError on a fresh kernel. The plot is drawn
# with matplotlib (already imported as `plt`) instead.
corr = df_filtered.corr()

plt.figure(figsize=(12, 10))
# Fixing the colour range to [-1, 1] keeps zero correlation at the centre of
# the diverging 'coolwarm' map, matching the former `center=0` behaviour.
plt.imshow(corr.to_numpy(), cmap='coolwarm', vmin=-1, vmax=1,
           aspect='auto', interpolation='nearest')
plt.colorbar()
plt.title('Feature Correlation Matrix')
plt.show()

#### PCA分析

In [None]:
from sklearn.decomposition import PCA

# Fit a full PCA on the feature matrix and report how many leading components
# are needed to reach the target share of explained variance.
# NOTE(review): PCA.fit does not accept NaN/inf — confirm df_filtered is free of them.
pca = PCA()
pca.fit(df_filtered)
percent = 0.99

cumulative = np.cumsum(pca.explained_variance_ratio_)
# argmax on the boolean mask gives the first index reaching the threshold;
# +1 turns that index into a component count.
n_PCs = np.argmax(cumulative >= percent) + 1

print(f"前 {n_PCs} 个主成分累计解释了 {cumulative[n_PCs - 1]:.2%} 的方差")
# Fixed: this previously read `df_featured`, which is not defined anywhere and
# raised NameError. The names must come from the frame the PCA was fitted on.
feature_names = df_filtered.columns

# Print the dominant feature of every component that contributes to the threshold.
for i in range(n_PCs):
    # Loadings of the i-th principal component.
    pc = pca.components_[i]

    # Position of the feature with the largest absolute loading.
    top_feature_idx = np.argmax(np.abs(pc))
    top_feature_name = feature_names[top_feature_idx]

    print(f"PC{i + 1} 的主导因子是: {top_feature_name}（贡献系数: {pc[top_feature_idx]:.4f}）")