In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import iisignature
from iisignature import sig, prepare, logsig, logsiglength

In [2]:
# The raw QRT files live outside the notebook's folder. Set the QRT_DATA_DIR environment
# variable to point at them on another machine; the fallback is the original local folder.
DATA_DIR = Path(
    os.environ.get(
        "QRT_DATA_DIR",
        r"C:\Users\shirl\OneDrive\桌面\ucb capstone qrt\raw data provided by qrt",
    )
)
x_train = pd.read_csv(DATA_DIR / "x_train.csv")

In [6]:
# Peek at the first five rows to check the raw column layout.
x_train.head(n=5)

Unnamed: 0,ID,DATE,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,...,RET_16,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20
0,0,0,2,18,5,3,44,-0.015748,0.147931,-0.015504,...,0.059459,0.630899,0.003254,-0.379412,0.008752,-0.110597,-0.012959,0.174521,-0.002155,-0.000937
1,1,0,3,43,15,6,104,0.003984,,-0.09058,...,0.015413,,0.003774,,-0.018518,,-0.028777,,-0.034722,
2,2,0,4,57,20,8,142,0.00044,-0.096282,-0.058896,...,0.008964,-0.010336,-0.017612,-0.354333,-0.006562,-0.519391,-0.012101,-0.356157,-0.006867,-0.308868
3,3,0,8,1,1,1,2,0.031298,-0.42954,0.007756,...,-0.031769,0.012105,0.033824,-0.290178,-0.001468,-0.663834,-0.01352,-0.562126,-0.036745,-0.631458
4,4,0,14,36,12,5,92,0.027273,-0.847155,-0.039302,...,-0.038461,-0.277083,-0.012659,0.139086,0.004237,-0.017547,0.004256,0.57951,-0.040817,0.802806


In [7]:
# Order rows by stock and then by sample ID, discard the DATE column and zero-fill gaps.
df = (
    x_train
    .sort_values(by=["STOCK", "ID"])
    .reset_index(drop=True)
    .drop(columns=["DATE"])
    .fillna(0)
)

# Replace every RET* column r with log(1 + r); zero-filled entries stay at 0.
ret_cols = [name for name in df.columns if name.startswith("RET")]
df[ret_cols] = np.log1p(df[ret_cols])

df.head()

Unnamed: 0,ID,STOCK,INDUSTRY,INDUSTRY_GROUP,SECTOR,SUB_INDUSTRY,RET_1,VOLUME_1,RET_2,VOLUME_2,...,RET_16,VOLUME_16,RET_17,VOLUME_17,RET_18,VOLUME_18,RET_19,VOLUME_19,RET_20,VOLUME_20
0,2377,0,37,12,5,94,-0.005985,0.136699,0.00899,-0.003109,...,-0.009262,-0.493354,-0.007689,-0.585497,-0.001064,-0.351363,0.005114,-0.324675,-0.019463,-0.291751
1,5198,0,37,12,5,94,0.001347,-0.26952,0.011039,-0.085622,...,-0.000762,-0.313575,0.007837,0.071338,0.007704,-0.405243,-0.003281,-0.424336,-0.010544,-0.050591
2,8017,0,37,12,5,94,-0.01451,0.192655,0.003607,0.353804,...,0.002739,-0.367499,-0.00586,-0.405562,0.002926,-0.315935,0.010408,-0.474957,-0.003547,-0.26013
3,20826,0,37,12,5,94,0.008898,0.430916,0.002659,0.175444,...,0.009056,0.023598,0.011203,0.079711,0.018859,-0.230167,-0.000287,-0.312123,0.008644,-0.226628
4,33843,0,37,12,5,94,-0.006544,-0.060371,-0.007661,-0.037806,...,-0.014567,-0.337686,-0.00725,-0.161117,-0.001462,-0.095494,0.012588,0.471895,-0.039523,1.532045


In [10]:
# Column names ordered by their numeric suffix, so RET_2 comes before RET_10.
ret_cols = sorted(
    (col for col in df.columns if col.startswith("RET_")),
    key=lambda col: int(col.split("_")[1]),
)
volume_cols = sorted(
    (col for col in df.columns if col.startswith("VOLUME_")),
    key=lambda col: int(col.split("_")[1]),
)

# Take the day indices from the column suffixes instead of hard-coding 1..20, and make
# sure RET_i and VOLUME_i line up before the two blocks are flattened side by side.
day_numbers = np.array([int(col.split("_")[1]) for col in ret_cols])
volume_days = [int(col.split("_")[1]) for col in volume_cols]
assert day_numbers.tolist() == volume_days, (
    "RET_* and VOLUME_* columns must cover exactly the same days"
)
n_days = len(day_numbers)

# Flatten row by row: every sample contributes n_days consecutive entries.
rets = df[ret_cols].to_numpy().ravel()
volumes = df[volume_cols].to_numpy().ravel()

# Day index for each flattened entry (the day pattern repeats once per sample).
days = np.tile(day_numbers, len(df))

# Sample ID and stock, each repeated once per day so they align with the flattened values.
ids = np.repeat(df["ID"].to_numpy(), n_days)
stocks = np.repeat(df["STOCK"].to_numpy(), n_days)

df_signature = pd.DataFrame({
    "ID": ids,
    "STOCK": stocks,
    "DAY": days,
    "RET": rets,
    "VOLUME": volumes
})

df_signature = df_signature.sort_values(["STOCK", "ID", "DAY"])
df_signature.head(20)

Unnamed: 0,ID,STOCK,DAY,RET,VOLUME
0,2377,0,1,-0.005985,0.136699
1,2377,0,2,0.00899,-0.003109
2,2377,0,3,0.008262,0.015042
3,2377,0,4,0.002634,0.570881
4,2377,0,5,0.012658,0.746002
5,2377,0,6,0.038965,1.461322
6,2377,0,7,-0.000854,-0.037151
7,2377,0,8,0.02858,0.118536
8,2377,0,9,-0.002852,-0.225504
9,2377,0,10,-0.006113,-0.268285


In [11]:
def calculate_signature(df, depth=3, channels=("RET", "VOLUME", "DAY")):
    """Compute one truncated path signature per (STOCK, ID) group.

    Each group's rows are ordered by DAY descending, the ``channels`` columns are
    stacked into a path, and ``iisignature.sig`` is evaluated on that path.
    A leading constant 1.0 is prepended so that ``SIG_0`` is always 1.
    NOTE(review): the descending DAY order is kept from the original code; presumably
    smaller DAY values are the more recent observations -- confirm against the dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with columns "STOCK", "ID", "DAY" and every name in
        ``channels``.
    depth : int, default 3
        Truncation level passed to ``iisignature.sig`` / ``iisignature.siglength``.
    channels : sequence of str, default ("RET", "VOLUME", "DAY")
        Columns that form the path. The number of output columns is derived from
        the length of this sequence, not from a hard-coded dimension.

    Returns
    -------
    pandas.DataFrame
        One row per (STOCK, ID) pair, in sorted key order, with columns
        ``SIG_0 .. SIG_n`` followed by "STOCK" and "ID". An input without rows
        yields an empty frame with the same columns.
    """
    channels = list(channels)

    # The signature width depends on how many channels the path has.
    sig_length = iisignature.siglength(len(channels), depth)
    sig_columns = [f"SIG_{i}" for i in range(sig_length + 1)]

    # Sort once up front instead of once per group: groupby keeps the row order
    # inside each group, so every group still arrives in descending DAY order.
    ordered = df.sort_values("DAY", ascending=False, kind="stable")

    signatures = []
    keys = []
    for (stock, id_), group in ordered.groupby(["STOCK", "ID"]):
        path = group[channels].values.astype(np.float64)

        # Prepend the constant term; iisignature.sig starts at level 1.
        level_terms = iisignature.sig(path, depth)
        signatures.append(np.insert(level_terms, 0, 1.0))
        keys.append((stock, id_))

    result = pd.DataFrame(signatures, columns=sig_columns)
    # Assign the key columns one at a time so an empty input also works.
    result["STOCK"] = [stock for stock, _ in keys]
    result["ID"] = [id_ for _, id_ in keys]

    return result

# Build the depth-3 signature feature table, one row per (STOCK, ID) pair.
df_final = calculate_signature(df=df_signature, depth=3)
df_final

Unnamed: 0,SIG_0,SIG_1,SIG_2,SIG_3,SIG_4,SIG_5,SIG_6,SIG_7,SIG_8,SIG_9,...,SIG_32,SIG_33,SIG_34,SIG_35,SIG_36,SIG_37,SIG_38,SIG_39,STOCK,ID
0,1.0,0.013478,0.428449,-19.0,9.082757e-05,0.009108,-0.428579,-0.003333,0.091784,-4.956083,...,-0.102650,1.126432,0.044290,-2.204561,60.680047,-2.201951,-0.087743,-1143.166667,0,2377
1,1.0,0.011891,-0.218929,-19.0,7.069959e-05,-0.007773,-0.196316,0.005170,0.023965,3.954660,...,0.127537,-0.066829,-0.116247,-0.381386,7.583428,0.314765,-5.739110,-1143.166667,0,5198
2,1.0,-0.010962,0.452785,-19.0,6.008520e-05,0.006720,-0.098989,-0.011684,0.102507,-1.873926,...,-0.068548,-0.211418,0.178865,-1.542704,34.266897,-2.813371,46.791971,-1143.166667,0,8017
3,1.0,0.000254,0.657544,-19.0,3.217722e-08,-0.002544,0.077847,0.002711,0.216182,-2.244209,...,0.013742,-0.302089,-0.057454,-3.168119,9.863237,0.936377,92.435108,-1143.166667,0,20826
4,1.0,0.032979,-1.592417,-19.0,5.438006e-04,-0.048787,-0.676390,-0.003729,1.267895,32.774865,...,-0.007140,0.476700,0.025107,-1.288694,-26.644716,-0.711383,37.252379,-1143.166667,0,33843
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418590,1.0,0.044924,0.153095,-19.0,1.009070e-03,-0.015567,-0.448465,0.022444,0.011719,-4.980251,...,0.095900,0.867096,-0.059219,-0.437152,-13.880403,3.414761,-12.738529,-1143.166667,5716,391556
418591,1.0,0.000129,0.108667,-19.0,8.311179e-09,0.004220,0.070598,-0.004206,0.005904,0.115477,...,-0.041252,-0.583668,0.018718,-0.735698,-12.246397,0.985782,26.834676,-1143.166667,5716,394490
418592,1.0,-0.013193,0.308652,-19.0,8.703255e-05,-0.064216,0.098804,0.060144,0.047633,-9.412588,...,0.358276,1.271780,-0.429097,-2.279437,-21.898073,-2.078660,-22.758782,-1143.166667,5716,400150
418593,1.0,0.010215,0.514191,-19.0,5.217518e-05,0.002404,-0.000213,0.002849,0.132196,-3.360734,...,-0.075941,0.602905,0.020663,-1.329850,-10.061456,1.540369,65.915190,-1143.166667,5716,403129


In [None]:
# df_final.to_csv('signature.csv') # uncomment when exporting data