In [None]:
import torch
import pandas as pd 
import numpy as np
# AMA table: parse the 'Date' column as datetimes and use it as the row index.
df_AMA_m = pd.read_csv('AMA.csv')
df_AMA_m = df_AMA_m.assign(Date=pd.to_datetime(df_AMA_m['Date'])).set_index('Date')

# Feature and return-rate tensors, converted to float32 numpy arrays.
# NOTE(review): torch.load unpickles the file — only load trusted artefacts.
features, rr = (
    torch.load(tensor_path).float().numpy()
    for tensor_path in (
        './DL4MultiFactors/features.pt',
        './DL4MultiFactors/return_rate.pt',
    )
)


### Equal-weighted factor

In [None]:
# Equal-weighted composite factor.
# For every period i: flip each feature by its sign in `direction`, z-score each
# feature across the stocks that have a complete feature vector, and store the
# plain average of the z-scores. Stocks with any NaN feature stay NaN.
factor_ew = pd.DataFrame(np.nan, index= df_AMA_m.index, columns=df_AMA_m.columns)
n,d,t = features.shape
# One sign per feature, multiplied into the features before averaging
# (presumably +1 = higher is better, -1 = lower is better — TODO confirm).
direction = np.array([1, -1, -1, -1, -1, -1, 1, -1, -1, 1, -1, 1])
for i in range(t):
    signed = features[:,:,i]*direction
    valid_index = ~np.any(np.isnan(signed),axis = 1)
    if not valid_index.any():
        # No stock has a complete feature vector this period; the row stays NaN.
        continue
    X_valid = signed[valid_index]
    mean = np.mean(X_valid, axis=0)
    std = np.std(X_valid, axis=0)
    # A feature that is constant across the cross-section has std 0; dividing
    # by it would yield NaN and wipe out the whole row. Leave it centred at 0.
    std[std == 0] = 1
    z_scores = (X_valid - mean) / std
    factor_ew.iloc[i, valid_index] = np.mean(z_scores, axis=1)

### Linear Regression

In [None]:
from sklearn import linear_model

# Expanding-window linear-regression factor.
# Period 0 has no history, so it falls back to the plain mean of the z-scored
# features. Every later period i fits an OLS model on all (stock, period)
# samples from periods [0, i) and scores the period-i cross-section with it.
factor_lr = pd.DataFrame(np.nan, index= df_AMA_m.index, columns=df_AMA_m.columns)
n,d,t = features.shape
for i in range(t):
    # Period-i cross-section: keep stocks with a complete feature vector and
    # z-score each feature across them.
    X = features[:,:,i]
    valid_index = ~np.any(np.isnan(X),axis = 1)
    if not valid_index.any():
        # Nothing to score (predict() would raise on 0 samples); row stays NaN.
        continue
    X_now = X[valid_index]
    mean = np.mean(X_now, axis=0)
    std = np.std(X_now, axis=0)
    std[std == 0] = 1  # constant feature: avoid 0/0 -> NaN
    X_now = (X_now - mean) / std

    if i == 0:
        # NOTE(review): unlike the equal-weighted cell, no `direction` sign is
        # applied before averaging here — confirm this is intended.
        factor_lr.iloc[i, valid_index] = np.mean(X_now, axis=1)
        continue

    # Training set: stack periods [0, i) into one (n*i, d) sample matrix.
    # features[:, :, :i] is (n, d, i); the feature axis must be moved last
    # BEFORE reshaping, otherwise a C-order reshape interleaves feature and
    # time values inside each row. After the transpose, row s*i + p holds the
    # d features of stock s in period p, which is exactly the order produced by
    # reshaping rr[:, :, :i] (shape (n, 1, i)) to (n*i, 1).
    feature = features[:,:,:i].transpose(0, 2, 1).reshape((n*i, d))
    return_rate = rr[:,:,:i].reshape((n*i, 1))
    # Drop samples with a NaN in any feature or in the return.
    train_index = ~(np.any(np.isnan(feature),axis = 1) | np.any(np.isnan(return_rate),axis = 1))
    X_train = feature[train_index]
    mean = np.mean(X_train, axis=0)
    std = np.std(X_train, axis=0)
    std[std == 0] = 1
    X_train = (X_train - mean) / std
    y_train = return_rate[train_index].reshape(-1)

    reg = linear_model.LinearRegression()
    reg.fit(X_train, y_train)
    factor_lr.iloc[i,valid_index] = reg.predict(X_now)

### SVR

In [None]:
from sklearn import svm

# Rolling-window SVR factor.
# Period 0 has no history, so it falls back to the plain mean of the z-scored
# features. The SVR is re-fitted whenever i % 12 == 1 (i = 1, 13, 25, ...) on
# the preceding periods [max(0, i-12), i) and reused for the following periods
# until the next re-fit.
factor_svr = pd.DataFrame(np.nan, index= df_AMA_m.index, columns=df_AMA_m.columns)
n,d,t = features.shape
reg = svm.SVR()
for i in range(t):
    if i % 12 == 1:
        print(i)
        # Training window: 1 period at i == 1, otherwise the last 12 periods.
        start = max(0, i - 12)
        window = i - start
        # features[:, :, start:i] is (n, d, window); move the feature axis last
        # BEFORE reshaping, otherwise a C-order reshape interleaves feature and
        # time values inside each row. After the transpose, row s*window + p
        # holds the d features of stock s in period start + p, matching the
        # order of rr[:, :, start:i] (shape (n, 1, window)) reshaped to
        # (n*window, 1).
        feature = features[:,:,start:i].transpose(0, 2, 1).reshape((n*window, d))
        return_rate = rr[:,:,start:i].reshape((n*window, 1))
        # Drop samples with a NaN in any feature or in the return.
        train_index = ~(np.any(np.isnan(feature),axis = 1) | np.any(np.isnan(return_rate),axis = 1))
        X_train = feature[train_index]
        mean = np.mean(X_train, axis=0)
        std = np.std(X_train, axis=0)
        std[std == 0] = 1  # constant feature: avoid 0/0 -> NaN
        X_train = (X_train - mean) / std
        y_train = return_rate[train_index].reshape(-1)
        reg.fit(X_train, y_train)

    # Period-i cross-section: keep stocks with a complete feature vector and
    # z-score each feature across them.
    X = features[:,:,i]
    valid_index = ~np.any(np.isnan(X),axis = 1)
    if not valid_index.any():
        # Nothing to score (predict() would raise on 0 samples); row stays NaN.
        continue
    X_now = X[valid_index]
    mean = np.mean(X_now, axis=0)
    std = np.std(X_now, axis=0)
    std[std == 0] = 1
    X_now = (X_now - mean) / std

    if i == 0:
        factor_svr.iloc[i, valid_index] = np.mean(X_now, axis=1)
    else:
        factor_svr.iloc[i,valid_index] = reg.predict(X_now)

### SVR + RFE


In [None]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE

# Rolling-window linear-SVR factor with recursive feature elimination (RFE).
# Period 0 has no training set, so it falls back to the equal-weighted mean of
# the standardised features. Whenever i % 12 == 1 (i = 1, 13, 25, ...) the
# scaler, the RFE feature selection and the SVR are re-fitted on periods
# [max(0, i-12), i); they are reused for the following periods until the next
# re-fit.
factor_svr_rfe = pd.DataFrame(np.nan, index= df_AMA_m.index, columns=df_AMA_m.columns)
n,d,t = features.shape
# features: stocks (5523) x features (12) x time (114 months)
n_features_to_select = 5  # number of features RFE keeps
base_svr = svm.SVR(kernel='linear')  # unfitted template; RFE clones it internally

for i in range(t):
    if i % 12 == 1:
        print(i)
        # Training window: 1 period at i == 1, otherwise the last 12 periods.
        start = max(0, i - 12)
        window = i - start
        # features[:, :, start:i] is (n, d, window); move the feature axis last
        # BEFORE reshaping, otherwise a C-order reshape interleaves feature and
        # time values inside each row. After the transpose, row s*window + p
        # holds the d features of stock s in period start + p, matching the
        # order of rr[:, :, start:i] (shape (n, 1, window)) reshaped to
        # (n*window, 1).
        feature = features[:,:,start:i].transpose(0, 2, 1).reshape((n*window, d))
        return_rate = rr[:,:,start:i].reshape((n*window, 1))
        # Drop samples with a NaN in any feature or in the return.
        train_index = ~(np.any(np.isnan(feature),axis = 1) | np.any(np.isnan(return_rate),axis = 1))
        # Feature normalisation, fitted on the training window only.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(feature[train_index])
        y_train = return_rate[train_index].reshape(-1)
        # RFE feature selection and model training. After fit(), RFE keeps a
        # clone of the estimator fitted on the selected columns as
        # `estimator_`, so a separate re-fit on the reduced matrix is not needed.
        rfe = RFE(estimator=base_svr, n_features_to_select=n_features_to_select)
        rfe.fit(X_train, y_train)
        reg = rfe.estimator_

    # Period-i cross-section: keep stocks with a complete feature vector.
    X = features[:,:,i]
    valid_index = ~np.any(np.isnan(X),axis = 1)
    if not valid_index.any():
        # Nothing to score (transform/predict raise on 0 samples); row stays NaN.
        continue
    X_test = X[valid_index]

    if i == 0:
        X_test = StandardScaler().fit_transform(X_test)
        factor_svr_rfe.iloc[i, valid_index] = np.mean(X_test, axis=1)
        continue

    # Scale with the training-window scaler, keep the RFE-selected columns, predict.
    X_test = scaler.transform(X_test)
    X_test_rfe = rfe.transform(X_test)
    factor_svr_rfe.iloc[i,valid_index] = reg.predict(X_test_rfe)