In [2]:
import torch
import random
import pandas as pd
import numpy as np
import os
import math
import pickle
from collections import Counter
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler, LabelEncoder

# 构建数据集

In [63]:
DATA_DIR = ".\\datasets"
SP500_LIST_PATH = ".\\datasets\\SP500_Companies.csv"
SP500_PATH = ".\\datasets\\SP500_datasets"

csv_path=SP500_LIST_PATH
stock_path=SP500_PATH

In [71]:
# the information of stocks(i.e. name & category of sector).
SP500_name = pd.read_csv(csv_path, encoding='ISO-8859-1')

# original information of stock price.
SP500_stock = {}
for target in SP500_name["Symbol"].unique():
    da = {}
    da["category"] = SP500_name[SP500_name.Symbol == target]["Sector"].iloc[0]
    out_name = os.path.join(stock_path, target + ".csv")
    da["stock_price"] = pd.read_csv(out_name, encoding='ISO-8859-1')[64:]
    SP500_stock[target] = da

In [72]:
SP500_stock['LVS']

{'category': 'Consumer Services',
 'stock_price':             Date       Open       High        Low      Close  Adj Close  \
 64    2015-04-07  56.720001  56.919998  55.970001  56.009998  42.979973   
 65    2015-04-08  56.799999  57.599998  55.750000  56.930000  43.685955   
 66    2015-04-09  57.610001  59.900002  57.599998  59.560001  45.704109   
 67    2015-04-10  59.400002  59.430000  58.669998  59.290001  45.496929   
 68    2015-04-13  58.930000  59.000000  57.759998  58.049999  44.545399   
 ...          ...        ...        ...        ...        ...        ...   
 2009  2022-12-23  46.369999  46.740002  45.900002  46.520000  46.169579   
 2010  2022-12-27  47.160000  48.750000  46.980000  48.459999  48.094967   
 2011  2022-12-28  47.980000  48.080002  46.169998  46.180000  45.832142   
 2012  2022-12-29  46.540001  47.919998  46.340000  47.080002  46.725361   
 2013  2022-12-30  46.570000  48.110001  46.439999  48.070000  47.707905   
 
         Volume  
 64     2681600  
 

In [73]:
# Let all stocks has the same date.
print(SP500_stock.keys())
need_day = np.array(SP500_stock["AAPL"]["stock_price"]["Date"])
for target in SP500_stock.keys():
    SP500_stock[target]["stock_price"] = SP500_stock[target]["stock_price"][
        SP500_stock[target]["stock_price"]["Date"].isin(need_day)].reset_index(
        drop=True
    )
    SP500_stock[target]["stock_price"].index = SP500_stock[target]["stock_price"]["Date"]
print('same date over')

dict_keys(['LVS', 'NWS', 'NWSA', 'LYV', 'DIS', 'ROL', 'CMG', 'MGM', 'CTAS', 'HLT', 'SBUX', 'CMCSA', 'DPZ', 'BKNG', 'YUM', 'MCD', 'MAR', 'DRI', 'EXPE', 'CZR', 'CHTR', 'WBD', 'RCL', 'CCL', 'WYNN', 'PARA', 'NCLH', 'CRM', 'PANW', 'TYL', 'CSGP', 'NOW', 'VRSK', 'CDNS', 'IBM', 'SNPS', 'INTU', 'PTC', 'ADSK', 'PAYC', 'FICO', 'ANSS', 'ADBE', 'MSCI', 'NFLX', 'FTNT', 'ROP', 'TRMB', 'ORCL', 'EA', 'AKAM', 'BR', 'ATVI', 'META', 'FDS', 'MSFT', 'JKHY', 'ADP', 'VRSN', 'IT', 'FFIV', 'PAYX', 'MTCH', 'GOOG', 'ACN', 'GOOGL', 'EPAM', 'CDW', 'J', 'JNPR', 'CSCO', 'CTSH', 'GEN', 'FIS', 'TTWO', 'DXC', 'TAP', 'CLX', 'VFC', 'TSN', 'EL', 'CHD', 'MNST', 'BF-B', 'CL', 'MKC', 'NKE', 'PEP', 'KDP', 'KMB', 'PG', 'KO', 'HSY', 'K', 'MDLZ', 'HRL', 'CAG', 'PM', 'CPB', 'GIS', 'RL', 'MO', 'STZ', 'IFF', 'SJM', 'WELL', 'VTR', 'DLR', 'EQIX', 'AMT', 'CPT', 'SBAC', 'PGR', 'IRM', 'AJG', 'O', 'ICE', 'MKTX', 'ARE', 'WY', 'PLD', 'KIM', 'CBRE', 'PRU', 'FI', 'EQR', 'MMC', 'ESS', 'UDR', 'REG', 'MAA', 'BRO', 'CCI', 'AON', 'CME', 'CBOE', 'M

In [74]:
### feature ###

# normalize stock price
normalize_scalar = {}
for target in SP500_stock.keys():
    scaler = StandardScaler()
    nor_data = scaler.fit_transform(np.array(SP500_stock[target]["stock_price"]["Close"]).reshape(-1, 1)).ravel()
    SP500_stock[target]["stock_price"]["nor_close"] = nor_data
    normalize_scalar[target] = scaler

print('normalize over')

# calculate return ratio
for target in SP500_stock.keys():
    return_tratio = []
    data = np.array(SP500_stock[target]["stock_price"]["Close"])
    for i in range(len(data)):
        if i == 0:
            return_tratio.append(0)
        else:
            return_tratio.append((data[i] - data[i - 1]) / data[i - 1])
    SP500_stock[target]["stock_price"]["return ratio"] = return_tratio
print('calculate return ratio over')

# feature of c_open / c_close / c_low
for target in SP500_stock.keys():
    function = lambda x, y: (x / y) - 1
    data = SP500_stock[target]["stock_price"]
    data["c_open"] = list(map(function, data["Open"], data["Close"]))
    data["c_high"] = list(map(function, data["High"], data["Close"]))
    data["c_low"] = list(map(function, data["Low"], data["Close"]))
print('calculate c_feature over')

normalize over
calculate return ratio over
calculate c_feature over


In [75]:
# optimize  5 / 10 / 15 / 20 / 25 / 30 days moving average
for target in SP500_stock.keys():
    data = SP500_stock[target]["stock_price"]["Close"]
    for i in [5, 10, 15, 20, 25, 30]:
        rolling_avg = data.rolling(window=i).mean()
        SP500_stock[target]["stock_price"]["{}-days".format(i)] = (rolling_avg / data - 1).fillna(0)
print('calculate moving average over')

calculate moving average over


In [76]:
SP500_stock['AAPL']['stock_price']

Unnamed: 0_level_0,Date,Open,High,Low,Close,Adj Close,Volume,nor_close,return ratio,c_open,c_high,c_low,5-days,10-days,15-days,20-days,25-days,30-days
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-04-07,2015-04-07,31.910000,32.029999,31.495001,31.502501,28.310480,140049200,-0.873963,0.000000,0.012935,0.016745,-0.000238,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2015-04-08,2015-04-08,31.462500,31.600000,31.242500,31.400000,28.218369,149316800,-0.876049,-0.003254,0.001990,0.006369,-0.005016,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2015-04-09,2015-04-09,31.462500,31.645000,31.165001,31.639999,28.434044,129936000,-0.871164,0.007643,-0.005610,0.000158,-0.015013,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2015-04-10,2015-04-10,31.487499,31.802500,31.315001,31.775000,28.555368,160752000,-0.868416,0.004267,-0.009048,0.000865,-0.014477,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2015-04-13,2015-04-13,32.092499,32.142502,31.652500,31.712500,28.499199,145460400,-0.869688,-0.001967,0.011983,0.013559,-0.001892,-0.003358,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,2022-12-23,130.919998,132.419998,129.639999,131.860001,131.127060,63814900,1.168633,-0.002798,-0.007129,0.004247,-0.016836,0.007447,0.037760,0.053481,0.066745,0.080504,0.089390
2022-12-27,2022-12-27,131.380005,131.410004,128.720001,130.029999,129.307236,69007800,1.131387,-0.013878,0.010382,0.010613,-0.010075,0.018027,0.041244,0.059796,0.076302,0.089170,0.099680
2022-12-28,2022-12-28,129.669998,131.029999,125.870003,126.040001,125.339417,85438400,1.050178,-0.030685,0.028800,0.039591,-0.001349,0.040321,0.058791,0.084423,0.104372,0.116677,0.128610
2022-12-29,2022-12-29,127.989998,130.479996,127.730003,129.610001,128.889572,75703700,1.122839,0.028324,-0.012499,0.006712,-0.014505,0.002654,0.019134,0.048726,0.066847,0.079571,0.092269


In [77]:
# category of sector (one hot encoding)(unnecessary)
# label = LabelEncoder()
# label.fit(SP500_name["Sector"].unique())

# for target in SP500_stock.keys():
#     for label in SP500_name["Sector"].unique():
#         cate = SP500_stock[target]["category"]
#         if label != cate:
#             SP500_stock[target]["stock_price"]["label_%s" % (label)] = 0
#         if label == cate:
#             SP500_stock[target]["stock_price"]["label_%s" % (label)] = 1
# print('one hot sector over')

# total feature
features = {}
for target in SP500_stock.keys():
#     features[target] = SP500_stock[target]["stock_price"].iloc[30:,
#                        [4] + list(range(7, len(SP500_stock[target]["stock_price"].columns)))].reset_index(drop=True)
    features[target] = SP500_stock[target]["stock_price"].iloc[30:,7:].reset_index(drop=True)
print('all features over')

all features over


In [78]:
features['AAPL']

Unnamed: 0,nor_close,return ratio,c_open,c_high,c_low,5-days,10-days,15-days,20-days,25-days,30-days
0,-0.853304,-0.000922,0.004767,0.006227,-0.003306,-0.009779,-0.020474,-0.020369,-0.014727,-0.017360,-0.019064
1,-0.853355,-0.000077,-0.000461,0.007074,-0.005536,-0.003475,-0.016515,-0.019565,-0.014097,-0.016276,-0.017846
2,-0.846588,0.010226,-0.010046,0.001827,-0.011873,-0.009849,-0.021805,-0.026324,-0.023423,-0.024644,-0.026562
3,-0.840736,0.008753,-0.007092,0.003244,-0.008601,-0.012751,-0.026581,-0.032966,-0.031043,-0.030756,-0.033640
4,-0.855594,-0.022031,0.022990,0.025382,-0.003857,0.008610,-0.002106,-0.010708,-0.010384,-0.008298,-0.011158
...,...,...,...,...,...,...,...,...,...,...,...
1915,1.168633,-0.002798,-0.007129,0.004247,-0.016836,0.007447,0.037760,0.053481,0.066745,0.080504,0.089390
1916,1.131387,-0.013878,0.010382,0.010613,-0.010075,0.018027,0.041244,0.059796,0.076302,0.089170,0.099680
1917,1.050178,-0.030685,0.028800,0.039591,-0.001349,0.040321,0.058791,0.084423,0.104372,0.116677,0.128610
1918,1.122839,0.028324,-0.012499,0.006712,-0.014505,0.002654,0.019134,0.048726,0.066847,0.079571,0.092269


In [79]:
# movement of stock
Y_buy_or_not = {}
for target in SP500_stock.keys():
    Y_buy_or_not[target] = (features[target]["return ratio"] >= 0) * 1
    features[target]['buy_or_not'] = Y_buy_or_not[target]
print('movement label over')

movement label over


In [80]:
features['AAPL']

Unnamed: 0,nor_close,return ratio,c_open,c_high,c_low,5-days,10-days,15-days,20-days,25-days,30-days,buy_or_not
0,-0.853304,-0.000922,0.004767,0.006227,-0.003306,-0.009779,-0.020474,-0.020369,-0.014727,-0.017360,-0.019064,0
1,-0.853355,-0.000077,-0.000461,0.007074,-0.005536,-0.003475,-0.016515,-0.019565,-0.014097,-0.016276,-0.017846,0
2,-0.846588,0.010226,-0.010046,0.001827,-0.011873,-0.009849,-0.021805,-0.026324,-0.023423,-0.024644,-0.026562,1
3,-0.840736,0.008753,-0.007092,0.003244,-0.008601,-0.012751,-0.026581,-0.032966,-0.031043,-0.030756,-0.033640,1
4,-0.855594,-0.022031,0.022990,0.025382,-0.003857,0.008610,-0.002106,-0.010708,-0.010384,-0.008298,-0.011158,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1915,1.168633,-0.002798,-0.007129,0.004247,-0.016836,0.007447,0.037760,0.053481,0.066745,0.080504,0.089390,0
1916,1.131387,-0.013878,0.010382,0.010613,-0.010075,0.018027,0.041244,0.059796,0.076302,0.089170,0.099680,0
1917,1.050178,-0.030685,0.028800,0.039591,-0.001349,0.040321,0.058791,0.084423,0.104372,0.116677,0.128610,0
1918,1.122839,0.028324,-0.012499,0.006712,-0.014505,0.002654,0.019134,0.048726,0.066847,0.079571,0.092269,1


In [83]:
int(len(features["AAPL"])*0.7)

1344

In [84]:
## Trianing & Testing ##
train_size = 0.8
test_size = 0.2
days = len(features["AAPL"])

train_day = int(days * train_size)

# data of training set and testing set
train_data = {}
test_data = {}

train_Y_buy_or_not = {}
test_Y_buy_or_not = {}

train_return_ratio = {}
test_return_ratio = {}

train_x, train_y1, train_y2, test_x, test_y1, test_y2 = [], [], [], [], [], []

for target in SP500_stock.keys():
    train_data[target] = features[target].drop(['return ratio', 'buy_or_not'], axis=1).iloc[:train_day, :]
    train_Y_buy_or_not[target] = features[target]['buy_or_not'][:train_day]
    train_return_ratio[target] = features[target]['return ratio'][:train_day]
    
    train_x.append(train_data[target])
    train_y1.append(train_Y_buy_or_not[target])
    train_y2.append(train_return_ratio[target])
    

    test_data[target] = features[target].drop(['return ratio', 'buy_or_not'], axis=1).iloc[train_day:, :]
    test_Y_buy_or_not[target] = features[target]['buy_or_not'][train_day:]
    test_return_ratio[target] = features[target]['return ratio'][train_day:]
    
    test_x.append(test_data[target])
    test_y1.append(test_Y_buy_or_not[target])
    test_y2.append(test_return_ratio[target])    
print('train&test over')

train&test over


In [85]:
train = {}
train["x"] = np.array(train_x)
train["y_return ratio"] = np.array(train_y1)
train["y_up_or_down"] = np.array(train_y2)

test = {}
test["x"] = np.array(test_x)
test["y_return ratio"] = np.array(test_y1)
test["y_up_or_down"] = np.array(test_y2)

data = {"train": train, "test": test}

In [89]:
data['train']['x'].shape

(480, 1536, 10)

In [90]:
data['train']['y_return ratio'].shape

(480, 1536)

In [87]:
data['test']['x'].shape

(480, 384, 10)

In [88]:
with open('./datasets/sp500_data2.pkl', "wb") as f:
    pickle.dump(data, f)
f.close()

# 尝试利用FFT计算单股票周期

In [21]:
df = pd.read_csv('./datasets/test.csv')
df

Unnamed: 0,date,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,y_reg
0,2015-02-17,0.609252,-0.001489,0.019196,-0.011749,-0.027006,-0.049661,-0.059672,-0.070396,-0.074261,-0.077291,0.000000
1,2015-02-18,0.621339,-0.003798,0.004954,-0.011559,-0.019452,-0.047028,-0.057695,-0.066314,-0.072583,-0.076079,0.002151
2,2015-02-19,0.612971,0.000496,0.006946,-0.001985,-0.005755,-0.037242,-0.051386,-0.058996,-0.067882,-0.070784,-0.001486
3,2015-02-20,0.601814,0.004640,0.006628,-0.005302,0.001624,-0.028235,-0.044087,-0.051839,-0.062383,-0.066048,-0.001985
4,2015-02-23,0.449336,0.010901,0.013797,-0.007494,0.023744,0.004599,-0.012457,-0.021248,-0.033555,-0.038347,-0.027175
...,...,...,...,...,...,...,...,...,...,...,...,...
1979,2022-12-23,-0.684020,-0.003224,0.004729,-0.013328,0.000301,0.020185,0.020120,0.012296,-0.005374,-0.014625,0.000000
1980,2022-12-27,-0.503650,-0.026826,0.005984,-0.030541,-0.031077,-0.020450,-0.019673,-0.023267,-0.041197,-0.050537,0.041702
1981,2022-12-28,-0.715632,0.038978,0.041143,-0.000217,0.015894,0.022521,0.027833,0.026602,0.009441,-0.001364,-0.047049
1982,2022-12-29,-0.631955,-0.011470,0.017842,-0.015718,-0.002719,-0.000574,0.008114,0.007232,-0.006126,-0.018224,0.019489


In [22]:
data = df.iloc[0:192,1:]
data = data.to_numpy()
data = torch.tensor(data)
data

tensor([[ 6.0925e-01, -1.4893e-03,  1.9196e-02,  ..., -7.4261e-02,
         -7.7291e-02,  0.0000e+00],
        [ 6.2134e-01, -3.7979e-03,  4.9538e-03,  ..., -7.2583e-02,
         -7.6079e-02,  2.1513e-03],
        [ 6.1297e-01,  4.9609e-04,  6.9456e-03,  ..., -6.7882e-02,
         -7.0784e-02, -1.4861e-03],
        ...,
        [-7.6119e-01,  2.6265e-03,  1.5540e-02,  ...,  5.3438e-02,
          4.4685e-02, -7.3865e-03],
        [-7.3051e-01, -9.5610e-03,  3.9113e-03,  ...,  4.5789e-02,
          3.8737e-02,  7.2226e-03],
        [-7.9280e-01,  1.8743e-02,  2.2271e-02,  ...,  5.9828e-02,
          5.4429e-02, -1.4559e-02]], dtype=torch.float64)

In [23]:
def FFT_for_Period(x, k=2):
    # [T, C]
    xf = torch.fft.rfft(x, dim=0)  # 对时间维度进行FFT
    # find period by amplitudes
    frequency_list = abs(xf).mean(-1)  # 沿特征维度求平均
    frequency_list[0] = 0  # 忽略直流分量
    _, top_list = torch.topk(frequency_list, k)  # 找出最大的k个频率
    top_list = top_list.detach().cpu().numpy()  # 转换为NumPy数组
    period = x.shape[0] // top_list  # 计算周期
    return period, abs(xf)[:, top_list]  # 返回周期和对应频率的振幅

In [24]:
period, weight = FFT_for_Period(data, k=6)

In [25]:
period

array([ 96, 192,  64,  27,  24,  32], dtype=int64)

In [27]:
# 96 192 block

# 4 16 78 192

In [28]:
weight.shape

torch.Size([97, 6])

# 多股票周期

In [3]:
with open("./datasets/sp500_data2.pkl", 'rb') as f:
    sp500 = pickle.load(f)

In [4]:
sp500.keys()

dict_keys(['train', 'test'])

In [5]:
sp500['train']['y_return ratio'].shape

(480, 1587)

In [6]:
sp500['train']['x'].shape

(480, 1587, 10)

In [7]:
def FFT_for_Period2(x, k=2):
    # [B, T, C]
    xf = torch.fft.rfft(x, dim=1)
    # find period by amplitudes
    frequency_list = abs(xf).mean(0).mean(-1)
    frequency_list[0] = 0
    _, top_list = torch.topk(frequency_list, k)
    top_list = top_list.detach().cpu().numpy()
    period = x.shape[1] // top_list
    return period, abs(xf).mean(-1)[:, top_list]

In [8]:
sp500['train']['x'][:,0:96,:].shape

(480, 96, 10)

In [9]:
1578//32

49

In [10]:
lists = []
weight_matrices = []
for i in range(49):
    x = torch.tensor(sp500['train']['x'][:,i*32:(i+1)*32,:])
    period, weight = FFT_for_Period2(x, k=6)
    lists.append(period)
    weight_matrices.append(weight)

In [11]:
from collections import Counter
import numpy as np

# 将所有列表合并成一个大列表
all_elements = []
for lst in lists:
    all_elements.extend(lst)

# 计算每个元素的出现次数
element_counts = Counter(all_elements)

# 计算每个元素的总 weight
total_weights = {element: 0 for element in element_counts}
for lst, matrix in zip(lists, weight_matrices):
    for i, element in enumerate(lst):
        total_weights[element] += torch.sum(matrix[:, i]).item()  # 使用torch.sum()并将结果转换为Python标量

# 打印结果
for element, count in element_counts.items():
    print(f"元素 {element} 出现了 {count} 次，总 weight 是 {total_weights[element]}")

元素 32 出现了 49 次，总 weight 是 11538.03066346975
元素 16 出现了 49 次，总 weight 是 6220.017524495532
元素 10 出现了 49 次，总 weight 是 4685.3853599664
元素 8 出现了 49 次，总 weight 是 3564.5474813009096
元素 6 出现了 47 次，总 weight 是 2889.182659986676
元素 5 出现了 38 次，总 weight 是 1883.76159784494
元素 4 出现了 11 次，总 weight 是 528.6750634058911
元素 2 出现了 2 次，总 weight 是 344.42159151375415


In [19]:
element_counts.keys()

dict_keys([32, 16, 10, 8, 6, 5, 4, 2])

In [21]:
time_step_list = list(element_counts.keys())[1:-1]
time_step_list

[16, 10, 8, 6, 5, 4]

In [22]:
period_list = random.sample(time_step_list, 4)
period_list

[6, 4, 16, 10]

In [103]:
# 将所有列表合并成一个大列表
all_elements = []
for lst in big_list:
    all_elements.extend(lst)

# 计算每个元素的出现次数
element_counts = Counter(all_elements)

# 找出出现次数最多的元素
most_common_element, count = element_counts.most_common(1)[0]

print(f"出现次数最多的元素是：{most_common_element}，出现了 {count} 次")

出现次数最多的元素是：21，出现了 49 次


In [105]:
element_counts

Counter({21: 49, 10: 49, 7: 49, 5: 49, 3: 48, 4: 38, 2: 12})

In [106]:
element_counts.keys()

dict_keys([21, 10, 7, 2, 5, 4, 3])

In [78]:
x = torch.tensor(sp500['train']['x'][:,:96,:])

In [79]:
period, weight = FFT_for_Period2(x, k=8)

In [80]:
plist = list(period)
random.sample(plist, 4)

[10, 4, 32, 5]

In [75]:
period

array([96, 48, 32, 19, 24, 16, 13, 10], dtype=int64)

In [38]:
weight

tensor([[4.0901, 4.9544, 2.6029,  ..., 2.5530, 1.8428, 1.4130],
        [4.5345, 2.3719, 1.8184,  ..., 0.8166, 0.8248, 1.0244],
        [4.2921, 2.9802, 1.8067,  ..., 1.1092, 0.7119, 1.0659],
        ...,
        [2.0259, 0.8587, 0.9936,  ..., 0.9705, 0.3870, 0.3352],
        [4.7146, 1.3396, 1.7141,  ..., 1.0693, 0.4388, 1.0547],
        [2.0233, 2.0244, 0.6792,  ..., 0.6841, 0.1863, 0.6908]],
       dtype=torch.float64)

In [39]:
weight.shape

torch.Size([480, 8])

In [40]:
period_weight = F.softmax(weight, dim=1)
period_weight

tensor([[0.2442, 0.5795, 0.0552,  ..., 0.0525, 0.0258, 0.0168],
        [0.7514, 0.0864, 0.0497,  ..., 0.0182, 0.0184, 0.0225],
        [0.6443, 0.1735, 0.0537,  ..., 0.0267, 0.0180, 0.0256],
        ...,
        [0.3313, 0.1031, 0.1180,  ..., 0.1153, 0.0643, 0.0611],
        [0.8460, 0.0289, 0.0421,  ..., 0.0221, 0.0118, 0.0218],
        [0.2912, 0.2915, 0.0759,  ..., 0.0763, 0.0464, 0.0768]],
       dtype=torch.float64)

In [41]:
period_weight.shape

torch.Size([480, 8])

In [42]:
period_weight = period_weight.unsqueeze(1).unsqueeze(1).repeat(1, T, N, 1)
period_weight.shape

NameError: name 'T' is not defined

# 探索hats里wikidata的关系数据

In [45]:
!dir "datasets/hats_data/relation/"

 驱动器 E 中的卷是 中卷
 卷的序列号是 B008-A223

 E:\_Project\Python\MSGAT\datasets\hats_data\relation 的目录

2024-01-10  17:17    <DIR>          .
2024-01-10  17:17    <DIR>          ..
2019-11-11  09:46        60,836,024 adj_mat.pkl
2019-11-11  09:46             4,804 ordered_ticker.pkl
2019-11-11  09:46       114,480,884 rel_mat.pkl
2019-11-11  09:46           287,801 rel_num.pkl
               4 个文件    175,609,513 字节
               2 个目录 64,545,128,448 可用字节


In [46]:
with open("./datasets/hats_data/relation/adj_mat.pkl", 'rb') as file1:
    adj_mat = pickle.load(file1)

with open("./datasets/hats_data/relation/ordered_ticker.pkl", 'rb') as file2:
    ordered_ticker = pickle.load(file2)
    
with open("./datasets/hats_data/relation/rel_mat.pkl", 'rb') as file3:
    rel_mat = pickle.load(file3)
    
with open("./datasets/hats_data/relation/rel_num.pkl", 'rb') as file4:
    rel_num = pickle.load(file4)

In [47]:
# 关系矩阵
# (85, 423, 423) 代表85种关系，每种关系是423×423的矩阵，1代表有这个关系，0代表没有
adj_mat

array([[[0, 1, 1, ..., 1, 0, 1],
        [1, 0, 1, ..., 1, 0, 1],
        [1, 1, 0, ..., 1, 0, 1],
        ...,
        [1, 1, 1, ..., 0, 0, 1],
        [0, 0, 0, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 0, 0]],

       [[0, 0, 0, ..., 1, 0, 0],
        [0, 0, 0, ..., 0, 1, 1],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [1, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 1],
        [0, 1, 0, ..., 0, 1, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 1],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 1],
        [0, 1, 0, ..., 0, 1, 0]],

       ...,

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]],

       [[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 

In [48]:
# 排好序的股票名称
# len = 423
ordered_ticker

['MSCI',
 'CPRI',
 'SO',
 'CHTR',
 'AEP',
 'CE',
 'MXIM',
 'DUK',
 'CPRT',
 'FRC',
 'APTV',
 'WCG',
 'JEF',
 'LIN',
 'EVRG',
 'SRE',
 'WELL',
 'ATO',
 'HFC',
 'FTNT',
 'WLTW',
 'TTWO',
 'NEE',
 'MDT',
 'UNH',
 'JKHY',
 'BR',
 'TFX',
 'CBRE',
 'FLT',
 'BHGE',
 'WAB',
 'ROL',
 'IPGP',
 'BKNG',
 'ABMD',
 'AMT',
 'LEG',
 'CF',
 'UAL',
 'PBCT',
 'HOG',
 'RRC',
 'NOC',
 'NSC',
 'NOV',
 'OXY',
 'DG',
 'AXP',
 'PKG',
 'CINF',
 'SJM',
 'AON',
 'ED',
 'UDR',
 'AVY',
 'URI',
 'BDX',
 'HST',
 'KEY',
 'PPG',
 'JBHT',
 'HUM',
 'DHI',
 'IDXX',
 'BK',
 'ZBH',
 'CL',
 'PHM',
 'FITB',
 'EXPE',
 'COP',
 'PNW',
 'NWL',
 'HBAN',
 'AYI',
 'SYMC',
 'ACN',
 'SIG',
 'STZ',
 'A',
 'DISH',
 'SNPS',
 'HSIC',
 'PKI',
 'TXT',
 'TDG',
 'DISCA',
 'KLAC',
 'MAA',
 'ADP',
 'ZION',
 'DFS',
 'AFL',
 'AMD',
 'RSG',
 'KMB',
 'MSI',
 'L',
 'UNP',
 'PXD',
 'FRT',
 'KR',
 'FMC',
 'NRG',
 'BIIB',
 'GPS',
 'DRE',
 'ATVI',
 'ALGN',
 'VLO',
 'XEL',
 'VNO',
 'ULTA',
 'EW',
 'PNR',
 'STT',
 'CPB',
 'RE',
 'FLS',
 'CB',
 'GM',
 'CTL

In [49]:
# 85种关系，423支股票，每一行代表与这支股票包含这种关系的股票索引，后面全填充0所以是不规则的389
# (85, 423, 398)
rel_mat

array([[[  1.,   2.,   3., ...,   0.,   0.,   0.],
        [  0.,   2.,   3., ...,   0.,   0.,   0.],
        [  0.,   1.,   3., ...,   0.,   0.,   0.],
        ...,
        [  0.,   1.,   2., ...,   0.,   0.,   0.],
        [189., 335.,   0., ...,   0.,   0.,   0.],
        [  0.,   1.,   2., ...,   0.,   0.,   0.]],

       [[  4.,   5.,   7., ...,   0.,   0.,   0.],
        [  3.,   6.,   8., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        ...,
        [  0.,   4.,   5., ...,   0.,   0.,   0.],
        [  1.,   3.,   6., ...,   0.,   0.,   0.],
        [  1.,   3.,   6., ...,   0.,   0.,   0.]],

       [[  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  6.,  16.,  19., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        ...,
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  1.,   6.,  16., ...,   0.,   0.,   0.],
        [  1.,   6.,  16., ...,   0.,   0.,   0.]],

       ...,

       [[  0.,   0.,   0

In [50]:
rel_mat[:,:,-1]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [51]:
rel_mat[0,:,:]

array([[  1.,   2.,   3., ...,   0.,   0.,   0.],
       [  0.,   2.,   3., ...,   0.,   0.,   0.],
       [  0.,   1.,   3., ...,   0.,   0.,   0.],
       ...,
       [  0.,   1.,   2., ...,   0.,   0.,   0.],
       [189., 335.,   0., ...,   0.,   0.,   0.],
       [  0.,   1.,   2., ...,   0.,   0.,   0.]])

In [52]:
# 代表85种关系，每种关系 每只股票与其余多少支股票有这种关系
# (85, 423)
rel_num

array([[369, 370, 369, ..., 369,   2, 370],
       [295, 103,   0, ..., 295, 103, 103],
       [  0, 124,   0, ...,   0, 124, 124],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]], dtype=int64)

# test

In [83]:
import yfinance as yf
test_df = yf.download('BRK-B', start="2001-01-01", end="2023-01-01", proxy="http://127.0.0.1:7890")

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2001-01-02,47.459999,49.500000,47.299999,48.220001,48.220001,2300000
2001-01-03,48.220001,48.599998,46.200001,46.700001,46.700001,1105000
2001-01-04,46.599998,46.799999,45.040001,45.880001,45.880001,1110000
2001-01-05,45.799999,45.799999,44.799999,44.980000,44.980000,830000
2001-01-08,45.060001,45.500000,43.639999,44.840000,44.840000,805000
...,...,...,...,...,...,...
2022-12-23,302.880005,306.570007,300.929993,306.489990,306.489990,2460400
2022-12-27,306.450012,308.579987,304.649994,305.549988,305.549988,2730900
2022-12-28,304.769989,307.459991,303.260010,303.429993,303.429993,2628200
2022-12-29,305.940002,309.380005,305.239990,309.059998,309.059998,2846200


In [25]:
sp = pd.read_csv("./datasets/SP500_Companies.csv", encoding='ISO-8859-1')
sector_list = sp["Sector"].unique()
len_list = []
for sector in sector_list:
    len_list.append(len(sp[sp["Sector"] == sector]))
len_array = np.array(len_list)
len_array

array([27, 49, 29, 90, 44, 46, 22,  9, 12, 27, 12,  9, 30, 21,  7, 15, 12,
       16,  3])

In [4]:
a = torch.rand(388,480,7,29)
for idx,_ in enumerate(a):
    print(idx)
    print(_.shape)

0
torch.Size([480, 7, 29])
1
torch.Size([480, 7, 29])
2
torch.Size([480, 7, 29])
3
torch.Size([480, 7, 29])
4
torch.Size([480, 7, 29])
5
torch.Size([480, 7, 29])
6
torch.Size([480, 7, 29])
7
torch.Size([480, 7, 29])
8
torch.Size([480, 7, 29])
9
torch.Size([480, 7, 29])
10
torch.Size([480, 7, 29])
11
torch.Size([480, 7, 29])
12
torch.Size([480, 7, 29])
13
torch.Size([480, 7, 29])
14
torch.Size([480, 7, 29])
15
torch.Size([480, 7, 29])
16
torch.Size([480, 7, 29])
17
torch.Size([480, 7, 29])
18
torch.Size([480, 7, 29])
19
torch.Size([480, 7, 29])
20
torch.Size([480, 7, 29])
21
torch.Size([480, 7, 29])
22
torch.Size([480, 7, 29])
23
torch.Size([480, 7, 29])
24
torch.Size([480, 7, 29])
25
torch.Size([480, 7, 29])
26
torch.Size([480, 7, 29])
27
torch.Size([480, 7, 29])
28
torch.Size([480, 7, 29])
29
torch.Size([480, 7, 29])
30
torch.Size([480, 7, 29])
31
torch.Size([480, 7, 29])
32
torch.Size([480, 7, 29])
33
torch.Size([480, 7, 29])
34
torch.Size([480, 7, 29])
35
torch.Size([480, 7, 29])
36

In [8]:
torch.rand(388,1).shape

torch.Size([388, 1])

In [9]:
np.array(torch.rand(351,480)).ravel().shape

(168480,)