In [1]:
import json
import os
import numpy as np
import torch
import random

In [2]:
root_path = 'data/'

### Fetch data from text files

In [3]:
raw_data = {}
all_files = os.listdir(root_path)

for i, filename in enumerate(all_files):
    
    # Only load 1/N of all stocks
    if i % 3 == 0:
    
        len_stocks = len(all_files)
        print("Loading stock {}/{} ({})   ".format(i + 1, len_stocks, filename), end='\r')
        
        with open(root_path + filename) as f:
            if not filename.startswith('.'):
                data = json.load(f)
                                
                prices = []
                dates = []
                for k, v in data.items():
                    vals = [ float(v["4. close"]), abs(float(v["2. high"]) - float(v["3. low"])), float(v["5. volume"]) ]
                    prices.append(np.array(vals))
                    dates.append(k)
                
                # reverse so that data is increasing in time
                prices.reverse()
                dates.reverse()
                raw_data[filename.split('.')[0]] = (prices, dates)

print("")
print("Total number of stocks: " + str(len(raw_data)))

Loading stock 3562/3563 (RDCM.txt)      
Total number of stocks: 1187


### Format data into time series data points with sequence length 122

In [4]:
X_train, X_val = [], []
y_train, y_val = [], []

train_mean, val_mean = [], []
train_std, val_std = [], []

window_size = 122 # a third of a year

plen = 0
for i, items in enumerate(raw_data.items()):
    print("({}/{})".format(i, len(raw_data.items())), end="\r")
    k, v = items
    prices, _ = v
    if len(prices) < window_size + 1:
        continue
    
    prices = torch.tensor(prices).float()
    
    # used for validating that no data is missing
    plen += prices.shape[0] - window_size - 1
    
    for j in range(prices.shape[0] - window_size - 1):
        window = prices[j:j+window_size+1]     # window from 1st to 122nd and pred at 123rd
        
        mean = torch.mean(window, dim=0)
        std = torch.std(window, dim=0)
        
        has_zeros = False
        for k in range(std.shape[0]):
            if std[k] == 0:
                has_zeros = True
                
        if has_zeros:
            break
        
        norm_window = (window - mean) / std
        norm_x = norm_window[:window_size]
        norm_y = norm_window[window_size][0]
        
        # 90% training to 10% validation
        
        if random.randrange(10) != 0:
            X_train.append(norm_x.unsqueeze(0))
            y_train.append(norm_y.item())
            train_mean.append(mean)
            train_std.append(std)
        else:
            X_val.append(norm_x.unsqueeze(0))
            y_val.append(norm_y.item())
            val_mean.append(mean)
            val_std.append(std)
            
print("Converting to torch tensors...")
X_train = torch.cat(X_train)
y_train = torch.tensor(y_train).unsqueeze(1)
X_val = torch.cat(X_val)
y_val = torch.tensor(y_val).unsqueeze(1)

train_mean = torch.stack(train_mean)
val_mean = torch.stack(val_mean)
train_std = torch.stack(train_std)
val_std = torch.stack(val_std)
    
print("Done")

Converting to torch tensors...
Done


### Validate data size and shape

In [5]:
print("X_train shape: \t\t", X_train.shape)
print("X_val shape: \t\t", X_val.shape)
print("y_train shape: \t\t", y_train.shape)
print("y_val shape: \t\t", y_val.shape)
print("")

print("train_mean shape: \t", train_mean.shape)
print("val_mean: \t\t", val_mean.shape)
print("train_std: \t\t", train_std.shape)
print("val_std: \t\t", val_std.shape)
print("")

# # false means that something is wrong
# print(plen == X_train.shape[0] + X_val.shape[0])
# print(plen == y_train.shape[0] + y_val.shape[0])
# print(plen == train_mean.shape[0] + val_mean.shape[0])
# print(plen == train_std.shape[0] + val_mean.shape[0])

X_train shape: 		 torch.Size([2542795, 122, 3])
X_val shape: 		 torch.Size([282732, 122, 3])
y_train shape: 		 torch.Size([2542795, 1])
y_val shape: 		 torch.Size([282732, 1])

train_mean shape: 	 torch.Size([2542795, 3])
val_mean: 		 torch.Size([282732, 3])
train_std: 		 torch.Size([2542795, 3])
val_std: 		 torch.Size([282732, 3])



### Save data to a file

In [None]:
torch.save((X_train, X_val, y_train, y_val, train_mean, val_mean, train_std, val_std), "assets/all.pt")