In [3]:
import numpy as np
import pandas as pd
import iisignature
from iisignature import sig, prepare, logsig, logsiglength
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
pip install iisignature

Collecting iisignature
  Downloading iisignature-0.24.tar.gz (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.3 MB[0m [31m3.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.3 MB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: iisignature
  Building wheel for iisignature (setup.py) ... [?25l[?25hdone
  Created wheel for iisignature: filename=iisignature-0.24-cp311-cp311-linux_x86_64.whl size=3246726 sha256=b96d9c7f98159faaad580e8bbc6fda2fcbb58a8b2daaed4c6133e33556cadbe1
  Stored in directory: /root/.cache/pip/wheels/1c/f4/57/0b4d3787a07f20a3cd1a9

In [4]:
x_train = pd.read_csv("/content/x_train.csv")
y_train = pd.read_csv("/content/y_train.csv")
x_test = pd.read_csv("/content/x_test.csv")
y_test = pd.read_csv("/content/test_rand.csv")

In [5]:
x_train.sort_values(by=['STOCK', "ID"]).reset_index(drop=True)

Unnamed: 0,ID,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,...,RET_16,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20
0,2377,1,0,37,12,5,94,-0.005967,0.136699,0.009031,...,-0.009219,-0.493354,-0.007660,-0.585497,-0.001063,-0.351363,0.005127,-0.324675,-0.019275,-0.291751
1,5198,4,0,37,12,5,94,0.001348,-0.269520,0.011100,...,-0.000762,-0.313575,0.007867,0.071338,0.007733,-0.405243,-0.003276,-0.424336,-0.010489,-0.050591
2,8017,5,0,37,12,5,94,-0.014405,0.192655,0.003614,...,0.002742,-0.367499,-0.005843,-0.405562,0.002930,-0.315935,0.010462,-0.474957,-0.003541,-0.260130
3,20826,11,0,37,12,5,94,0.008938,0.430916,0.002662,...,0.009097,0.023598,0.011266,0.079711,0.019038,-0.230167,-0.000287,-0.312123,0.008682,-0.226628
4,33843,21,0,37,12,5,94,-0.006523,-0.060371,-0.007632,...,-0.014461,-0.337686,-0.007224,-0.161117,-0.001461,-0.095494,0.012667,0.471895,-0.038752,1.532045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,391556,206,5716,50,17,7,114,0.029552,-0.075091,-0.001428,...,0.024553,0.076480,0.017026,0.170516,0.002276,-0.106224,-0.034597,0.123750,-0.015676,-0.228186
418591,394490,208,5716,50,17,7,114,0.008316,0.028099,-0.006688,...,0.020970,0.090287,0.002887,-0.050408,0.008736,-0.159294,0.027350,-0.022922,0.008186,-0.080569
418592,400150,210,5716,50,17,7,114,-0.004633,-0.173518,0.001687,...,-0.024517,0.695373,-0.015320,-0.149467,-0.035810,-0.262389,0.000896,-0.172450,0.008586,-0.482171
418593,403129,211,5716,50,17,7,114,0.010883,0.172313,0.008844,...,0.005832,-0.045512,-0.008823,-0.026153,-0.011428,-0.142636,0.011253,-0.224195,0.000609,-0.341878


In [6]:
df = x_train.sort_values(by=['STOCK', "ID"]).reset_index(drop=True)
df = df.drop(columns=['DATE'])
df = df.fillna(0)

ret_cols = [col for col in df.columns if col.startswith('RET')]
df[ret_cols] = df[ret_cols].apply(lambda x: np.log1p(x))

df

Unnamed: 0,ID,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,VOLUME_2,...,RET_16,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20
0,2377,0,37,12,5,94,-0.005985,0.136699,0.008990,-0.003109,...,-0.009262,-0.493354,-0.007689,-0.585497,-0.001064,-0.351363,0.005114,-0.324675,-0.019463,-0.291751
1,5198,0,37,12,5,94,0.001347,-0.269520,0.011039,-0.085622,...,-0.000762,-0.313575,0.007837,0.071338,0.007704,-0.405243,-0.003281,-0.424336,-0.010544,-0.050591
2,8017,0,37,12,5,94,-0.014510,0.192655,0.003607,0.353804,...,0.002739,-0.367499,-0.005860,-0.405562,0.002926,-0.315935,0.010408,-0.474957,-0.003547,-0.260130
3,20826,0,37,12,5,94,0.008898,0.430916,0.002659,0.175444,...,0.009056,0.023598,0.011203,0.079711,0.018859,-0.230167,-0.000287,-0.312123,0.008644,-0.226628
4,33843,0,37,12,5,94,-0.006544,-0.060371,-0.007661,-0.037806,...,-0.014567,-0.337686,-0.007250,-0.161117,-0.001462,-0.095494,0.012588,0.471895,-0.039523,1.532045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,391556,5716,50,17,7,114,0.029123,-0.075091,-0.001429,-0.026934,...,0.024256,0.076480,0.016883,0.170516,0.002273,-0.106224,-0.035209,0.123750,-0.015800,-0.228186
418591,394490,5716,50,17,7,114,0.008282,0.028099,-0.006711,-0.169174,...,0.020753,0.090287,0.002883,-0.050408,0.008698,-0.159294,0.026983,-0.022922,0.008153,-0.080569
418592,400150,5716,50,17,7,114,-0.004644,-0.173518,0.001686,-0.231339,...,-0.024822,0.695373,-0.015439,-0.149467,-0.036467,-0.262389,0.000896,-0.172450,0.008549,-0.482171
418593,403129,5716,50,17,7,114,0.010824,0.172313,0.008805,-0.198001,...,0.005815,-0.045512,-0.008862,-0.026153,-0.011494,-0.142636,0.011190,-0.224195,0.000609,-0.341878


In [7]:
ret_cols = sorted(
    [col for col in df.columns if col.startswith("RET_")],
    key=lambda x: int(x.split("_")[1])
)
volume_cols = sorted(
    [col for col in df.columns if col.startswith("VOLUME_")],
    key=lambda x: int(x.split("_")[1])
)

# expand RET and VOLUME
rets = df[ret_cols].values.ravel()
volumes = df[volume_cols].values.ravel()

# repeat INDUSTRY, INDUSTRY_GROUP, SECTOR, SUB_INDUSTRY for each RET and VOLUME
industry = np.repeat(df["INDUSTRY"].values, len(ret_cols))
industry_group = np.repeat(df["INDUSTRY_GROUP"].values, len(ret_cols))
sector = np.repeat(df["SECTOR"].values, len(ret_cols))
sub_industry = np.repeat(df["SUB_INDUSTRY"].values, len(ret_cols))

# generate day for RET{i} and VOLUME{i}
days = np.tile(np.arange(20,0,-1), len(df))


# generate ID and stock (repeat 20 times)
ids = np.repeat(df["ID"].values, len(ret_cols))
stocks = np.repeat(df["STOCK"].values, len(ret_cols))

df_signature = pd.DataFrame({
    "ID": ids,
    "STOCK": stocks,
    "DAY": days,
    "RET": rets,
    "VOLUME": volumes,
    "INDUSTRY": industry,
    "INDUSTRY_GROUP": industry_group,
    "SECTOR": sector,
    "SUB_INDUSTRY": sub_industry
})

df_signature = df_signature.sort_values(["STOCK", "ID", "DAY"])
df_signature

Unnamed: 0,ID,STOCK,DAY,RET,VOLUME,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY
19,2377,0,1,-0.019463,-0.291751,37,12,5,94
18,2377,0,2,0.005114,-0.324675,37,12,5,94
17,2377,0,3,-0.001064,-0.351363,37,12,5,94
16,2377,0,4,-0.007689,-0.585497,37,12,5,94
15,2377,0,5,-0.009262,-0.493354,37,12,5,94
...,...,...,...,...,...,...,...,...,...
8371884,416228,5716,16,-0.003120,-0.276386,50,17,7,114
8371883,416228,5716,17,-0.056791,-0.059554,50,17,7,114
8371882,416228,5716,18,0.006046,-0.296746,50,17,7,114
8371881,416228,5716,19,0.024358,-0.189710,50,17,7,114


### Issue RET and VOLUME has outliers, maybe need to drop them

In [8]:
print(df_signature.RET.max())
print(df_signature.RET.min())
print(df_signature.VOLUME.max())
print(df_signature.VOLUME.min())

4.7874880981224415
-2.2663186682994865
1232.1740087603175
-5.600055783253279


In [11]:
df_signature = df_signature[
    (df_signature['RET'] >= df_signature['RET'].quantile(0.005)) &
    (df_signature['RET'] <= df_signature['RET'].quantile(0.995)) &
    (df_signature['VOLUME'] >= df_signature['VOLUME'].quantile(0.005)) &
    (df_signature['VOLUME'] <= df_signature['VOLUME'].quantile(0.995))
]

Day SIG_3 === 19?? why?

In [13]:
def calculate_signature(df, order=3, sig_col=[]):
    grouped = df.groupby(["STOCK", "ID", 'INDUSTRY', 'INDUSTRY_GROUP', 'SECTOR', 'SUB_INDUSTRY'])
    signatures = []
    keys = []

    for (stock, id_, industry, industry_group, sector, sub_industry), group in grouped:
        group = group.sort_values("DAY", ascending=True)

        # Compute additional indicators:
        group['MOMENTUM'] = group['RET'].diff().fillna(0)
        # group['SMA_RET'] = group['RET'].rolling(window=3, min_periods=1).mean()
        # group['VOLATILITY'] = group['RET'].rolling(window=3, min_periods=1).std().fillna(0)
        group['SMA_VOLUME'] = group['VOLUME'].rolling(window=3, min_periods=1).mean()
        # Since RET is log-transformed, cumulative return is computed via exponentiating the cumsum
        # group['CUM_RET'] = np.exp(group['RET'].cumsum()) - 1

        path = group[sig_col].values.astype(np.float64)

        base_sig = iisignature.sig(path, order)
        sig = np.insert(base_sig, 0, 1.0) # augmented signature
        signatures.append(sig)
        keys.append((stock, id_, industry, industry_group, sector, sub_industry))

    sig_length = iisignature.siglength(len(sig_col), order)
    sig_columns = [f"SIG_{i}" for i in range(sig_length+1)]

    df_signature = pd.DataFrame(signatures, columns=sig_columns)
    df_signature[["STOCK", "ID", 'INDUSTRY', 'INDUSTRY_GROUP', 'SECTOR', 'SUB_INDUSTRY']] = keys

    # Reorganize columns
    cols_order = ["STOCK", "ID", 'INDUSTRY', 'INDUSTRY_GROUP', 'SECTOR', 'SUB_INDUSTRY'] + sig_columns
    df_signature = df_signature[cols_order]

    return df_signature

# Original features: RET, VOLUME, DAY
# Additional 5 features: MOMENTUM, SMA_RET, VOLATILITY, SMA_VOLUME, CUM_RET
sig_lst = [
    "RET",          # Original log-transformed return
    "VOLUME",       # Trading volume
    "DAY",          # Time indicator
    "MOMENTUM",     # First difference of RET
    # "SMA_RET",      # 3-day simple moving average of RET
    # "VOLATILITY",   # 3-day rolling standard deviation of RET
    "SMA_VOLUME",   # 3-day simple moving average of VOLUME
    # "CUM_RET"       # Cumulative return: exp(cumsum(RET)) - 1
    ]
df_final = calculate_signature(df_signature, order=3, sig_col=sig_lst)
df_final

[1;30;43m流式输出内容被截断，只能显示最后 5000 行内容。[0m
5652 99570
5652 102295
5652 107843
5652 110846
5652 113530
5652 116384
5652 123835
5652 126839
5652 129600
5652 132391
5652 137976
5652 140956
5652 143756
5652 146476
5652 151841
5652 154808
5652 160437
5652 167929
5652 173470
5652 176436
5652 179439
5652 182439
5652 185359
5652 190878
5652 193862
5652 196818
5652 202043
5652 207286
5652 210283
5652 215656
5652 218602
5652 221416
5652 229586
5652 232549
5652 247300
5652 250177
5652 255294
5652 260918
5652 263740
5652 266745
5652 269709
5652 272452
5652 275388
5652 283398
5652 288735
5652 293754
5652 296712
5652 299642
5652 302518
5652 308035
5652 313144
5652 316116
5652 319088
5652 330944
5652 336247
5652 339125
5652 344518
5652 347358
5652 350337
5652 363198
5652 366177
5652 373351
5652 376159
5652 383815
5652 391505
5652 394444
5652 397168
5652 400104
5652 403093
5652 405779
5652 416177
5653 5151
5653 7971
5653 10815
5653 18457
5653 63179
5653 68366
5653 75952
5653 86416
5653 91406
5653 102296

Unnamed: 0,STOCK,ID,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,SIG_0,SIG_1,SIG_2,SIG_3,...,SIG_146,SIG_147,SIG_148,SIG_149,SIG_150,SIG_151,SIG_152,SIG_153,SIG_154,SIG_155
0,0,2377,37,12,5,94,1.0,0.013478,0.428449,19.0,...,0.000957,0.014810,-0.186846,0.000521,-0.000430,-0.010830,-0.447852,2.053303,-0.003519,0.006626
1,0,5198,37,12,5,94,1.0,0.011891,-0.218929,19.0,...,-0.000129,0.000265,0.041227,-0.000042,0.001409,-0.000763,0.025077,0.531083,-0.001262,-0.000909
2,0,8017,37,12,5,94,1.0,-0.010962,0.452785,19.0,...,0.000200,0.000808,-0.022517,0.000088,-0.001202,-0.001312,0.002075,0.256479,-0.000947,0.010497
3,0,20826,37,12,5,94,1.0,0.000254,0.657544,19.0,...,0.000003,-0.001654,-0.018307,0.000001,-0.000241,0.000178,-0.015720,0.182750,0.000392,0.010741
4,0,33843,37,12,5,94,1.0,0.032979,-1.592417,19.0,...,-0.001113,-0.003338,0.249961,-0.000390,-0.022180,0.002762,0.127567,27.744835,0.001417,-0.746880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,5716,391556,50,17,7,114,1.0,0.044924,0.153095,19.0,...,0.001134,-0.007119,0.000702,0.000143,0.000554,0.003502,-0.047801,0.828372,0.000086,0.000497
418591,5716,394490,50,17,7,114,1.0,0.000129,0.108667,19.0,...,0.000019,-0.000505,-0.027009,0.000005,-0.001875,0.000194,0.004026,0.206992,0.000834,0.000292
418592,5716,400150,50,17,7,114,1.0,-0.013193,0.308652,19.0,...,0.003563,0.012251,-0.366435,0.000034,0.052786,0.000306,-0.650324,3.462449,-0.032832,0.007911
418593,5716,403129,50,17,7,114,1.0,0.010215,0.514191,19.0,...,0.000275,-0.002166,0.045480,0.000029,-0.000273,0.001088,-0.017277,0.398116,0.000707,0.003245


In [14]:
df_final = pd.merge(df_final, y_train, on=["ID"], how="inner")
# df_final['RET'] = df_final["RET"].astype(int) # Convert RET to binary classification target
df_final['RET'] = df_final["RET"].astype(int)
df_final

Unnamed: 0,STOCK,ID,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,SIG_0,SIG_1,SIG_2,SIG_3,...,SIG_147,SIG_148,SIG_149,SIG_150,SIG_151,SIG_152,SIG_153,SIG_154,SIG_155,RET
0,0,2377,37,12,5,94,1.0,0.013478,0.428449,19.0,...,0.014810,-0.186846,0.000521,-0.000430,-0.010830,-0.447852,2.053303,-0.003519,0.006626,0
1,0,5198,37,12,5,94,1.0,0.011891,-0.218929,19.0,...,0.000265,0.041227,-0.000042,0.001409,-0.000763,0.025077,0.531083,-0.001262,-0.000909,0
2,0,8017,37,12,5,94,1.0,-0.010962,0.452785,19.0,...,0.000808,-0.022517,0.000088,-0.001202,-0.001312,0.002075,0.256479,-0.000947,0.010497,1
3,0,20826,37,12,5,94,1.0,0.000254,0.657544,19.0,...,-0.001654,-0.018307,0.000001,-0.000241,0.000178,-0.015720,0.182750,0.000392,0.010741,1
4,0,33843,37,12,5,94,1.0,0.032979,-1.592417,19.0,...,-0.003338,0.249961,-0.000390,-0.022180,0.002762,0.127567,27.744835,0.001417,-0.746880,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,5716,391556,50,17,7,114,1.0,0.044924,0.153095,19.0,...,-0.007119,0.000702,0.000143,0.000554,0.003502,-0.047801,0.828372,0.000086,0.000497,1
418591,5716,394490,50,17,7,114,1.0,0.000129,0.108667,19.0,...,-0.000505,-0.027009,0.000005,-0.001875,0.000194,0.004026,0.206992,0.000834,0.000292,1
418592,5716,400150,50,17,7,114,1.0,-0.013193,0.308652,19.0,...,0.012251,-0.366435,0.000034,0.052786,0.000306,-0.650324,3.462449,-0.032832,0.007911,0
418593,5716,403129,50,17,7,114,1.0,0.010215,0.514191,19.0,...,-0.002166,0.045480,0.000029,-0.000273,0.001088,-0.017277,0.398116,0.000707,0.003245,1


In [None]:
# df_final.to_csv('signature.csv') # uncomment when exporting data

In [15]:
industry_lst = df_final.INDUSTRY.unique() # 'INDUSTRY'
industry_group_lst = df_final.INDUSTRY_GROUP.unique() # 'INDUSTRY_GROUP'
sector_lst = df_final.SECTOR.unique() # 'SECTOR'
sub_industry_lst = df_final.SUB_INDUSTRY.unique() # 'SUB_INDUSTRY'

In [16]:
param_grid = {"alpha": np.logspace(-3, 3, num=10)}
results = []

for idx, df_grouped in df_final.groupby("SECTOR"):
    try:
        # Clean memory
        import gc
        gc.collect()

        # Build feature matrix
        feature_cols = [c for c in df_grouped.columns if c.startswith("SIG_")]
        X = df_grouped[feature_cols].values
        y = np.array(df_grouped["RET"])

        # Train/validation split
        X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Scale the features
        scaler = StandardScaler()
        X_train_sub = scaler.fit_transform(X_train_sub)
        X_val_sub = scaler.transform(X_val_sub)

        # Grid search (5-fold CV)
        gs = GridSearchCV(
            estimator=RidgeClassifier(),
            param_grid=param_grid,
            cv=5,
            scoring="accuracy",
            # scoring="f1",
            n_jobs=2
        )
        gs.fit(X_train_sub, y_train_sub)

        # Evaluate on validation set
        best_alpha = gs.best_params_["alpha"]
        best_model = gs.best_estimator_
        y_val_pred = best_model.predict(X_val_sub)
        acc = accuracy_score(y_val_sub, y_val_pred)
        report = classification_report(y_val_sub, y_val_pred)

        # Store results
        results.append({
            "Grouped_by": idx,
            "Num_Samples": len(df_grouped),
            "Train_Samples": len(X_train_sub),
            "Val_Samples": len(X_val_sub),
            "Best_Alpha": best_alpha,
            "Val_Accuracy": acc,
            "Report": report,
            "Model": best_model
        })
    except Exception as e:
        print(f"Error processing group {idx}: {str(e)}")
        continue



# Summarize
df_results = pd.DataFrame(results)
print(df_results[[
    "Grouped_by", "Num_Samples", "Train_Samples", "Val_Samples", "Best_Alpha", "Val_Accuracy"
]])

    Grouped_by  Num_Samples  Train_Samples  Val_Samples   Best_Alpha  \
0            0         6304           5043         1261  1000.000000   
1            1        21264          17011         4253     0.001000   
2            2        18967          15173         3794     0.100000   
3            3        55473          44378        11095   215.443469   
4            4        63519          50815        12704  1000.000000   
5            5        17295          13836         3459     2.154435   
6            6        55123          44098        11025    46.415888   
7            7        87903          70322        17581  1000.000000   
8            8        70843          56674        14169    10.000000   
9            9         5555           4444         1111    10.000000   
10          10        13295          10636         2659    46.415888   
11          11         3054           2443          611     0.464159   

    Val_Accuracy  
0       0.493259  
1       0.541265  
2     

In [17]:
df_results

Unnamed: 0,Grouped_by,Num_Samples,Train_Samples,Val_Samples,Best_Alpha,Val_Accuracy,Report,Model
0,0,6304,5043,1261,1000.0,0.493259,precision recall f1-score ...,RidgeClassifier(alpha=1000.0)
1,1,21264,17011,4253,0.001,0.541265,precision recall f1-score ...,RidgeClassifier(alpha=0.001)
2,2,18967,15173,3794,0.1,0.510543,precision recall f1-score ...,RidgeClassifier(alpha=0.1)
3,3,55473,44378,11095,215.443469,0.525732,precision recall f1-score ...,RidgeClassifier(alpha=215.44346900318823)
4,4,63519,50815,12704,1000.0,0.522749,precision recall f1-score ...,RidgeClassifier(alpha=1000.0)
5,5,17295,13836,3459,2.154435,0.519514,precision recall f1-score ...,RidgeClassifier(alpha=2.154434690031882)
6,6,55123,44098,11025,46.415888,0.516916,precision recall f1-score ...,RidgeClassifier(alpha=46.41588833612773)
7,7,87903,70322,17581,1000.0,0.525055,precision recall f1-score ...,RidgeClassifier(alpha=1000.0)
8,8,70843,56674,14169,10.0,0.519585,precision recall f1-score ...,RidgeClassifier(alpha=10.0)
9,9,5555,4444,1111,10.0,0.533753,precision recall f1-score ...,RidgeClassifier(alpha=10.0)


In [11]:
import pandas as pd
a = pd.read_parquet('datasets/sig_data_SP500_w10_o3_d3_weighted_gamma0.5_clusters.parquet').query('CLUSTER == 8')
a.symbol.unique()

array(['SW', 'VLTO'], dtype=object)