In [1]:
import torch
mat = torch.Tensor([[1,2,3],[4,5,6],[7,8,9]])
print(mat)

tensor([[1., 2., 3.],
        [4., 5., 6.],
        [7., 8., 9.]])


In [2]:
mask = torch.LongTensor([1,0,1]).reshape(3,1)
print(mask.size())
print(mat*mask)
print(torch.mul(mat, mask))
print(torch.mul(mat, mask.reshape(1,3)))

torch.Size([3, 1])
tensor([[1., 2., 3.],
        [0., 0., 0.],
        [7., 8., 9.]])
tensor([[1., 2., 3.],
        [0., 0., 0.],
        [7., 8., 9.]])
tensor([[1., 0., 3.],
        [4., 0., 6.],
        [7., 0., 9.]])


In [3]:
print(torch.mm(mat, mask.float()))

tensor([[ 4.],
        [10.],
        [16.]])


In [5]:
print(mat.to_sparse())
print(mat.to_sparse().t())

tensor(indices=tensor([[0, 0, 0, 1, 1, 1, 2, 2, 2],
                       [0, 1, 2, 0, 1, 2, 0, 1, 2]]),
       values=tensor([1., 2., 3., 4., 5., 6., 7., 8., 9.]),
       size=(3, 3), nnz=9, layout=torch.sparse_coo)
tensor(indices=tensor([[0, 1, 2, 0, 1, 2, 0, 1, 2],
                       [0, 0, 0, 1, 1, 1, 2, 2, 2]]),
       values=tensor([1., 2., 3., 4., 5., 6., 7., 8., 9.]),
       size=(3, 3), nnz=9, layout=torch.sparse_coo)


In [5]:
import torch
from torch_sparse import SparseTensor

edge_index = torch.LongTensor([[1,0,0], [2,1,1]])
num_nodes = 3
adj = SparseTensor(row=edge_index[0], col=edge_index[1], sparse_sizes=(num_nodes, num_nodes))

In [11]:
print(adj)

SparseTensor(row=tensor([0, 0, 1]),
             col=tensor([1, 1, 2]),
             size=(3, 3), nnz=3, density=33.33%)


In [13]:
from torch_geometric.nn import GCNConv, GINConv  #, Sequential
from torch.nn import ReLU, Linear, Sequential

x = torch.FloatTensor([[1,1,1], [0.5,0.5,0.5], [2,2,2]])
conv = GCNConv(3, 32)
out1 = conv(x, edge_index)
out2 = conv(x, adj.t())
assert torch.allclose(out1, out2)

conv = GINConv(nn=Sequential(Linear(3, 32), ReLU(), Linear(32, 32)))
out1 = conv(x, edge_index)
out2 = conv(x, adj.t())
assert torch.allclose(out1, out2)

In [2]:
import numpy as np
import pandas as pd
import tushare as ts
from utils import *
print(ts.__version__)

ts.set_token('1f2f092156dfe775a88e478f3a54565e0adab29246c83814e95fa0dd')
pro = ts.pro_api()

1.2.84


In [6]:
# 交易日历
# trading_day = pd.read_csv("./data/trading_day_20091009_20211214.csv")
trading_day = pro.trade_cal(exchange='', start_date='20060101', end_date='20200110')
open_day = trading_day[trading_day['is_open']==1]
open_day = open_day[open_day['cal_date'].between('20060101','20191231')]  # between: [left, right]
print(len(open_day))
# [20060104, 20191231]

3404


In [7]:
open_day['cal_date'].to_csv("./data/trading_opendays_3404.txt", header=None, index=None, sep=' ')

In [3]:
num_days = 3404
num_stocks = 1931  # or 1271
stock_list_path = "./data/stock_codes_1931.txt"
date_list_path = "./data/trading_opendays_3404.txt"
mask = np.zeros((num_days, num_stocks))
close_price = np.zeros((num_days, num_stocks))

stock_id_map = get_stock_id_mapping(stock_list_path)
days_id_map = get_days_id_mapping(date_list_path)

In [4]:
for i, code in enumerate(stock_id_map.keys()):
    # tmp = pro.daily(ts_code=code, start_date='20100101', end_date='20191231')
    # adj='qfq' / 'hfq' 日线前复权/后复权, None是无复权
    try:
        tmp = ts.pro_bar(ts_code=code, adj='hfq', start_date='20060101', end_date='20200110')
        tmp.to_csv(f'./data/close_hfq_2006/{code}.csv', index=None)
        # 不是每支股票都正好有条数3404据
        if i==0:
            print(tmp.columns)
        if i%200==0:  print(f'./data/close_hfq_2006/{code}.csv')
    except:
        print("error", i, code)


Index(['ts_code', 'trade_date', 'open', 'high', 'low', 'close', 'pre_close',
       'change', 'pct_chg', 'vol', 'amount'],
      dtype='object')
./data/close_hfq_2006/000001.SZ.csv
./data/close_hfq_2006/000631.SZ.csv
./data/close_hfq_2006/000931.SZ.csv
./data/close_hfq_2006/600182.SH.csv
./data/close_hfq_2006/600425.SH.csv
./data/close_hfq_2006/600678.SH.csv
./data/close_hfq_2006/600901.SH.csv
./data/close_hfq_2006/601866.SH.csv
./data/close_hfq_2006/603303.SH.csv
./data/close_hfq_2006/603739.SH.csv


In [8]:
s_list = pd.read_csv("./data/stock_main_list_2139_9.csv")

for idx,row in s_list.iterrows():
    stock_id = stock_id_map.get(row['ts_code'], -1)
    if stock_id == -1:
        continue

    cur_prices = pd.read_csv('./data/close_hfq_2006/'+str(row['ts_code'])+'.csv')
    
    # fill moving average leading nan
    # fill_ma_nan(cur_prices)  
    normaliza_cols = ['close']
    # normalize: x/x.max()
    for col in normaliza_cols:
        cur_prices[col] = cur_prices[col] / cur_prices[col].max()

    for iidx, rrow in cur_prices.iterrows():
        trade_date_id = days_id_map.get(str(rrow['trade_date']), -1)
        if trade_date_id == -1:
            continue
        mask[trade_date_id, stock_id] = 1 # 这一天有收盘价，因此可以预测这一天的价格
        close_price[trade_date_id, stock_id] = rrow[['close']].values

    print(stock_id, row['ts_code'], mask[:, stock_id].sum())

0 000001.SZ 3271.0
1 000002.SZ 3234.0
2 000004.SZ 3075.0
3 000005.SZ 2967.0
4 000006.SZ 3244.0
5 000007.SZ 2714.0
6 000008.SZ 3067.0
7 000009.SZ 3219.0
8 000010.SZ 2424.0
9 000011.SZ 3214.0
10 000012.SZ 3301.0
11 000014.SZ 3287.0
12 000016.SZ 3262.0
13 000017.SZ 2299.0
14 000019.SZ 3029.0
15 000020.SZ 2904.0
16 000021.SZ 3316.0
17 000023.SZ 3045.0
18 000025.SZ 3318.0
19 000026.SZ 3313.0
20 000027.SZ 3281.0
21 000028.SZ 3235.0
22 000029.SZ 2553.0
23 000030.SZ 2569.0
24 000031.SZ 3131.0
25 000032.SZ 3108.0
26 000034.SZ 2672.0
27 000035.SZ 2348.0
28 000036.SZ 3298.0
29 000037.SZ 3052.0
30 000038.SZ 1519.0
31 000039.SZ 3319.0
32 000040.SZ 3179.0
33 000042.SZ 3126.0
34 000045.SZ 3184.0
35 000046.SZ 3253.0
36 000048.SZ 3180.0
37 000049.SZ 3357.0
38 000050.SZ 3135.0
39 000055.SZ 3337.0
40 000056.SZ 3153.0
41 000058.SZ 2896.0
42 000059.SZ 3258.0
43 000060.SZ 3364.0
44 000061.SZ 3170.0
45 000062.SZ 3235.0
46 000063.SZ 3299.0
47 000065.SZ 3233.0
48 000066.SZ 3111.0
49 000068.SZ 2796.0
50 000069.

In [9]:
print(np.count_nonzero(np.isnan(close_price)))
print(close_price.shape, mask.shape)
print(close_price.shape, np.count_nonzero(close_price)/(3404*1931), np.count_nonzero(np.isnan(close_price))/(3404*1931))

170
(3404, 1931) (3404, 1931)
(3404, 1931) 0.7003931463943173 2.58628925911028e-05


In [10]:
np.save("./data/mask_3404_1931.npy", mask)
np.save("./data/close_price_3404_1931.npy", close_price)

In [13]:
# forward fill nan
close_price[close_price==0] = np.nan
print(close_price.shape, np.count_nonzero(np.isnan(close_price))/(3404*1931))
for i in range(num_stocks):
    df = pd.DataFrame(close_price[:, i])
    df = df.fillna(method='ffill')
    # print(df.shape, df.isna().sum().sum())
    close_price[:, i:i+1] = df.fillna(0.0).copy().values
print(close_price.shape, np.count_nonzero(np.isnan(close_price))/(3404*1931))

(3404, 1931) 0.2996327164982739
(3404, 1931) 0.0


In [14]:
# 未上市的股票的nan都用0填充了
np.save("./data/close_price_ffil_3404_1931.npy", close_price)

In [2]:
import torch
t = torch.Tensor([[300]])
print(t.size())
print(t.flatten().size())
print(t.flatten().tolist())
print(type(t.flatten().tolist()))

torch.Size([1, 1])
torch.Size([1])
[300.0]
<class 'list'>


In [None]:
import torch


def normalize(x):
    # row nornalize torch tensor
    return torch.nn.functional.normalize(t, p=1.0, dim=1)


t = torch.tensor([[1000, 10, 0.5], [765, 5, 0.35], [800, 7, 0.09]])
print(normalize(t))

In [None]:
sector_industry

这些图都是不带self loop的
sector_industry:
NASDAQ [1026, 1026, 97]
NYSE [1737, 1737, 108]

wiki_relation:
NASDAQ [1026, 1026, 43]
NYSE [1737, 1737, 33]

In [7]:
# multihot -> onehot graph
import numpy as np
rsr_graph_path = ["../Temporal_Relational_Stock_Ranking/data/relation/sector_industry/NASDAQ_industry_relation.npy",\
                "../Temporal_Relational_Stock_Ranking/data/relation/sector_industry/NYSE_industry_relation.npy",\
                "../Temporal_Relational_Stock_Ranking/data/relation/wikidata/NASDAQ_wiki_relation.npy",\
                "../Temporal_Relational_Stock_Ranking/data/relation/wikidata/NYSE_wiki_relation.npy"]
onehot_save_path = ["./data/graphs/NASDAQ_industry_relation_1026_1026.npy",\
                "./data/graphs/NYSE_industry_relation_1737_1737.npy", \
                "./data/graphs/NASDAQ_wiki_relation_1026_1026.npy", \
                "./data/graphs/NYSE_wiki_relation_1737_1737.npy"]

for i in range(4):
    multihot_graph = np.load(rsr_graph_path[i])
    num_stocks = multihot_graph.shape[0]
    onehot_graph = np.amax(multihot_graph, axis=-1)
    print(onehot_graph.shape, sum(sum(onehot_graph)) / (1026*1026), onehot_save_path[i])
    np.save(onehot_save_path[i], onehot_graph)

'''
(1026, 1026) 0.050929250785616845 ./data/graphs/NASDAQ_industry_relation_1026_1026.npy
(1737, 1737) 0.26991685950852873 ./data/graphs/NYSE_industry_relation_1737_1737.npy
(1026, 1026) 0.0028147312183425858 ./data/graphs/NASDAQ_wiki_relation_1026_1026.npy
(1737, 1737) 0.00969434089881407 ./data/graphs/NYSE_wiki_relation_1737_1737.npy
'''


(1026, 1026) 0.050929250785616845 ./data/graphs/NASDAQ_industry_relation_1026_1026.npy
(1737, 1737) 0.26991685950852873 ./data/graphs/NYSE_industry_relation_1737_1737.npy
(1026, 1026) 0.0028147312183425858 ./data/graphs/NASDAQ_wiki_relation_1026_1026.npy
(1737, 1737) 0.00969434089881407 ./data/graphs/NYSE_wiki_relation_1737_1737.npy


In [16]:
# multihot -> onehot graph
import numpy as np
rsr_graph_path = ["../Temporal_Relational_Stock_Ranking/data/relation/sector_industry/NASDAQ_industry_relation.npy",\
                "../Temporal_Relational_Stock_Ranking/data/relation/sector_industry/NYSE_industry_relation.npy",\
                "../Temporal_Relational_Stock_Ranking/data/relation/wikidata/NASDAQ_wiki_relation.npy",\
                "../Temporal_Relational_Stock_Ranking/data/relation/wikidata/NYSE_wiki_relation.npy"]
#onehot_save_path = ["./data/graphs/NASDAQ_industry_relation_1026_1026.npy",\
#                "./data/graphs/NYSE_industry_relation_1737_1737.npy", \
#                "./data/graphs/NASDAQ_wiki_relation_1026_1026.npy", \
#                "./data/graphs/NYSE_wiki_relation_1737_1737.npy"]
onehot_save_path = {"NASDAQ":"./data/graphs/NASDAQ_1026_1026.npy", "NYSE": "./data/graphs/NYSE_1737_1737.npy"}
# NASDAQ: 0,2
# NYSE: 1,3
market = "NYSE"
industry_multihot_graph = np.load(rsr_graph_path[1])
wiki_multihot_graph = np.load(rsr_graph_path[3])
multihot_graph = np.concatenate([industry_multihot_graph, wiki_multihot_graph], axis=-1)
num_stocks = multihot_graph.shape[0]
onehot_graph = np.amax(multihot_graph, axis=-1)
print(onehot_graph.shape, sum(sum(onehot_graph)) / (1737*1737))
np.save(onehot_save_path[market], onehot_graph)


(1737, 1737) 0.09668135924769213


In [8]:
for i in range(1737):
    if sum(multihot_graph[i][i]) != 33:
        print(i, sum(multihot_graph[i][i]))

0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 1
38 1
39 1
40 1
41 1
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 1
51 1
52 1
53 1
54 1
55 1
56 1
57 1
58 1
59 1
60 1
61 1
62 1
63 1
64 1
65 1
66 1
67 1
68 1
69 1
70 1
71 1
72 1
73 1
74 1
75 1
76 1
77 1
78 1
79 1
80 1
81 1
82 1
83 1
84 1
85 1
86 1
87 1
88 1
89 1
90 1
91 1
92 1
93 1
94 1
95 1
96 1
97 1
98 1
99 1
100 1
101 1
102 1
103 1
104 1
105 1
106 1
107 1
108 1
109 1
110 1
111 1
112 1
113 1
114 1
115 1
116 1
117 1
118 1
119 1
120 1
121 1
122 1
123 1
124 1
125 1
126 1
127 1
128 1
129 1
130 1
131 1
132 1
133 1
134 1
135 1
136 1
137 1
138 1
139 1
140 1
141 1
142 1
143 1
144 1
145 1
146 1
147 1
148 1
149 1
150 1
151 1
152 1
153 1
154 1
155 1
156 1
157 1
158 1
159 1
160 1
161 1
162 1
163 1
164 1
165 1
166 1
167 1
168 1
169 1
170 1
171 1
172 1
173 1
174 1
175 1
176 1
177 1
178 1
179 1
180 1
181 1
182 1
183 1
184 1


In [4]:
def load_EOD_data(data_path, market_name, tickers, steps=1):
    # tickers: stocks
    eod_data = []
    masks = []
    ground_truth = []
    base_price = []
    for index, ticker in enumerate(tickers):
        single_EOD = np.genfromtxt(
            os.path.join(data_path, market_name + '_' + ticker + '_1.csv'),
            dtype=np.float32, delimiter=',', skip_header=False
        )
        if market_name == 'NASDAQ':
            # remove the last day since lots of missing data
            single_EOD = single_EOD[:-1, :]
        if index == 0:
            print('single EOD data shape:', single_EOD.shape)
            eod_data = np.zeros([len(tickers), single_EOD.shape[0],
                                 single_EOD.shape[1] - 1], dtype=np.float32)
            masks = np.ones([len(tickers), single_EOD.shape[0]],
                            dtype=np.float32)
            ground_truth = np.zeros([len(tickers), single_EOD.shape[0]],
                                    dtype=np.float32)
            base_price = np.zeros([len(tickers), single_EOD.shape[0]],
                                  dtype=np.float32)
        for row in range(single_EOD.shape[0]):
            if abs(single_EOD[row][-1] + 1234) < 1e-8:
                masks[index][row] = 0.0
            elif row > steps - 1 and abs(single_EOD[row - steps][-1] + 1234) \
                    > 1e-8:
                ground_truth[index][row] = \
                    (single_EOD[row][-1] - single_EOD[row - steps][-1]) / \
                    single_EOD[row - steps][-1]
            for col in range(single_EOD.shape[1]):
                if abs(single_EOD[row][col] + 1234) < 1e-8:
                    single_EOD[row][col] = 1.1
        eod_data[index, :, :] = single_EOD[:, 1:]
        base_price[index, :] = single_EOD[:, -1]
    return eod_data, masks, ground_truth, base_price


def get_batch(self, offset=None):
    if offset is None:
        offset = random.randrange(0, self.valid_index)
    seq_len = self.parameters['seq']
    mask_batch = self.mask_data[:, offset: offset + seq_len + self.steps]
    mask_batch = np.min(mask_batch, axis=1)
    return self.embedding[:, offset, :], \
            np.expand_dims(mask_batch, axis=1), \
            np.expand_dims(
                self.price_data[:, offset + seq_len - 1], axis=1
            ), \
            np.expand_dims(
                self.gt_data[:, offset + seq_len + self.steps - 1], axis=1
            )

In [10]:
import os
import numpy as np

data_path = "../Temporal_Relational_Stock_Ranking/data/2013-01-01"  # p
market = "NYSE"  # m
tickers_fname = market + "_tickers_qualify_dr-0.98_min-5_smooth.csv"  # t


tickers = np.genfromtxt(os.path.join(data_path, '..', tickers_fname), \
    dtype=str, delimiter='\t', skip_header=False)
print('#tickers selected:', len(tickers))  # 1026

eod_data, mask_data, gt_data, price_data = \
            load_EOD_data(data_path, market, tickers, steps=1)
print(eod_data.shape, mask_data.shape, gt_data.shape, price_data.shape)
'''
NASDAQ
single EOD data shape: (1245, 6)
(1026, 1245, 5) (1026, 1245) (1026, 1245) (1026, 1245)
NYSE
single EOD data shape: (1245, 6)
(1737, 1245, 5) (1737, 1245) (1737, 1245) (1737, 1245)
'''


#tickers selected: 1737
single EOD data shape: (1245, 6)
(1737, 1245, 5) (1737, 1245) (1737, 1245) (1737, 1245)


'\nNASDAQ\nsingle EOD data shape: (1245, 6)\n(1026, 1245, 5) (1026, 1245) (1026, 1245) (1026, 1245)\nNYSE\nsingle EOD data shape: (1245, 6)\n(1737, 1245, 5) (1737, 1245) (1737, 1245) (1737, 1245)\n'

In [11]:
print(eod_data.shape, mask_data.shape, gt_data.shape, price_data.shape)
# (1026, 1245, 5) (1026, 1245) (1026, 1245) (1026, 1245)
# (1737, 1245, 5) (1737, 1245) (1737, 1245) (1737, 1245)

(1737, 1245, 5) (1737, 1245) (1737, 1245) (1737, 1245)


In [12]:
# --label_cnt=1, --input_dim=5
# 1245 = 756 + 252 + 237
market = "NYSE"

x_data = np.concatenate([np.expand_dims(gt_data, axis=-1), eod_data], axis=-1) 
x_data = np.transpose(x_data, (1,0,2))
mask_data = np.transpose(mask_data, (1,0))
print(x_data.shape, mask_data.shape)
# (1245, 1206, 6) (1245, 1206)
# (1245, 1737, 6) (1245, 1737)

train_x, valid_x, test_x = x_data[:756,:, :], x_data[756:756+252, :, :], x_data[756+252:, :, :]
print(train_x.shape, valid_x.shape, test_x.shape)
# (756, 1026, 6) (252, 1026, 6) (237, 1026, 6)
# (756, 1737, 6) (252, 1737, 6) (237, 1737, 6)

train_mask, valid_mask, test_mask = mask_data[:756, :], mask_data[756:756+252, :], mask_data[756+252:, :]
print(train_mask.shape, valid_mask.shape, test_mask.shape)
# (756, 1026) (252, 1026) (237, 1026)
# (756, 1737) (252, 1737) (237, 1737)

(1245, 1737, 6) (1245, 1737)
(756, 1737, 6) (252, 1737, 6) (237, 1737, 6)
(756, 1737) (252, 1737) (237, 1737)


In [13]:
np.save("./data/"+market+"/train_756_1737_6.npy", train_x)
np.save("./data/"+market+"/valid_252_1737_6.npy", valid_x)
np.save("./data/"+market+"/test_237_1737_6.npy", test_x)

np.save("./data/"+market+"/train_mask_756_1737.npy", train_mask)
np.save("./data/"+market+"/valid_mask_252_1737.npy", valid_mask)
np.save("./data/"+market+"/test_mask_237_1737.npy", test_mask)

In [1]:
import torch

q = torch.ones((10,4,3))
k = torch.ones((10,4,3))
v = torch.ones((10,4,3))

alpha = torch.mm(q, k.transpose(0, 2))
print(alpha.size())

  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: self must be a matrix