In [1]:
import pandas as pd
import numpy as np

# Project-local helpers: the factor generator and the time-series IC evaluator.
from factorFactory import FactorFactory
from Util import evaluate_time_series_ic
import matplotlib.pyplot as plt

In [2]:
# Load the raw SPY price/volume export (the file name suggests 5-minute bars);
# the path is resolved relative to the notebook's working directory.
df = pd.read_csv('SPY_2020-01-01_2025-05-07_5m_raw.csv')
# Show (rows, columns) as the cell output to sanity-check the load.
df.shape

(242741, 6)

In [3]:
# Build the project's factor generator and replace `df` with the frame it returns.
# NOTE: `df` is reassigned here, so re-running this cell without re-running the
# load cell passes the already-processed frame back into generate_factors.
finfact = FactorFactory()
df = finfact.generate_factors(df)
# Show the new (rows, columns) as the cell output.
df.shape

(242741, 210)

In [5]:
# Drop the leading incomplete rows: keep everything from the first row in which
# every column is non-null onward, then renumber the index from 0.
# (Presumably these are indicator warm-up rows; verify against FactorFactory.)
complete_rows = df.notnull().all(axis=1)
if not complete_rows.any():
    # Fail loudly with a clear message instead of an opaque IndexError.
    raise ValueError(
        "no row has every column populated; cannot locate the first fully valid row"
    )
# idxmax on a boolean Series yields the label of the first True value, which
# avoids materialising a filtered copy of the whole frame just to read one label.
first_valid_idx = complete_rows.idxmax()
df = df.loc[first_valid_idx:].reset_index(drop=True)

# Count the missing values that remain in each column after trimming.
null_counts = df.isnull().sum()
# Keep only the columns that still contain at least one missing value.
null_counts = null_counts[null_counts > 0]
print(null_counts)
df.shape, df.columns

Series([], dtype: int64)


((242542, 210),
 Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'rsi_6_open',
        'rsi_6_high', 'rsi_6_low', 'rsi_6_close',
        ...
        'pct_chg_5_low', 'pct_chg_5_close', 'pct_chg_10_open',
        'pct_chg_10_high', 'pct_chg_10_low', 'pct_chg_10_close',
        'pct_chg_20_open', 'pct_chg_20_high', 'pct_chg_20_low',
        'pct_chg_20_close'],
       dtype='object', length=210))

In [6]:
# Capture the column names once as a plain Python list, then print that list.
all_cols = df.columns.tolist()
print(all_cols)

['timestamp', 'open', 'high', 'low', 'close', 'volume', 'rsi_6_open', 'rsi_6_high', 'rsi_6_low', 'rsi_6_close', 'rsi_10_open', 'rsi_10_high', 'rsi_10_low', 'rsi_10_close', 'rsi_14_open', 'rsi_14_high', 'rsi_14_low', 'rsi_14_close', 'rsi_20_open', 'rsi_20_high', 'rsi_20_low', 'rsi_20_close', 'rsi_30_open', 'rsi_30_high', 'rsi_30_low', 'rsi_30_close', 'sma_5_open', 'sma_5_high', 'sma_5_low', 'sma_5_close', 'sma_10_open', 'sma_10_high', 'sma_10_low', 'sma_10_close', 'sma_20_open', 'sma_20_high', 'sma_20_low', 'sma_20_close', 'sma_30_open', 'sma_30_high', 'sma_30_low', 'sma_30_close', 'sma_50_open', 'sma_50_high', 'sma_50_low', 'sma_50_close', 'sma_100_open', 'sma_100_high', 'sma_100_low', 'sma_100_close', 'sma_200_open', 'sma_200_high', 'sma_200_low', 'sma_200_close', 'ema_5_open', 'ema_5_high', 'ema_5_low', 'ema_5_close', 'ema_10_open', 'ema_10_high', 'ema_10_low', 'ema_10_close', 'ema_12_open', 'ema_12_high', 'ema_12_low', 'ema_12_close', 'ema_20_open', 'ema_20_high', 'ema_20_low', 'ema

In [7]:
# Assemble the factor columns to evaluate, grouped in this order:
#   1) every RSI column        (0 to 100)
#   2) every ADX column        (0 to 100)
#   3) every Williams %R column (-100 to 0)
#   4) CMF (-1 to 1) and MFI (0 to 100), matched by exact column name
# The outer loop over prefixes keeps each family contiguous, in the order above.
usable_factors = [
    col
    for prefix in ("rsi_", "adx_", "wr_")
    for col in all_cols
    if col.startswith(prefix)
] + [col for col in all_cols if col in ("cmf_20", "mfi_14")]

print(len(usable_factors))

29


In [8]:
# Score every selected factor with the project's time-series IC helper, using
# `close` as the target column and `timestamp` as the date column; the returned
# table is displayed as the cell output.
# NOTE(review): the helper's implementation is not visible here -- confirm how
# it derives the prediction target from `target_col` and `forward`, and what
# `window` controls (presumably the sliding-window length behind the ic_sw_* columns).
results = evaluate_time_series_ic(
    df=df,
    factors=usable_factors,
    target_col='close',
    forward=3,        # look 3 timestamps ahead
    date_col='timestamp',
    window=100,
    n_jobs=8
)
results

Unnamed: 0_level_0,ic_global,ic_sw_mean,ic_sw_std,ir_sw
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rsi_6_high,-0.00659,-0.077092,0.149231,-0.516599
rsi_6_low,-0.009672,-0.080427,0.149238,-0.538919
rsi_6_close,-0.012005,-0.083634,0.149281,-0.560247
rsi_10_high,-0.00548,-0.097505,0.151366,-0.644165
rsi_10_close,-0.00958,-0.103169,0.152674,-0.675746
rsi_6_open,-0.006326,-0.073622,0.145405,-0.506321
rsi_10_open,-0.005498,-0.094463,0.148286,-0.637035
rsi_10_low,-0.008268,-0.100601,0.152177,-0.661077
rsi_14_open,-0.004269,-0.109878,0.148921,-0.737831
rsi_14_high,-0.004179,-0.112914,0.151797,-0.74385


In [9]:
# Select the high-quality factors: rows whose absolute global IC exceeds 0.01.
results[results['ic_global'].abs().gt(0.01)]

Unnamed: 0_level_0,ic_global,ic_sw_mean,ic_sw_std,ir_sw
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rsi_6_close,-0.012005,-0.083634,0.149281,-0.560247
wr_10,-0.014647,-0.066694,0.144033,-0.463046
wr_14,-0.01171,-0.074684,0.148011,-0.504586
wr_20,-0.01041,-0.086225,0.150913,-0.571353


In [10]:
# Keep only the factors whose absolute sliding-window IR exceeds 1.
results[results['ir_sw'].abs().gt(1)]

Unnamed: 0_level_0,ic_global,ic_sw_mean,ic_sw_std,ir_sw
factor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rsi_30_close,-0.001992,-0.153779,0.153288,-1.003204
