In [None]:
%pip install -r requirements.txt

In [1]:
import pandas as pd
import torch
from exp.exp_main import Exp_Main
import random
import numpy as np
import yfinance as yf
import os
from sklearn.preprocessing import MinMaxScaler
import joblib

In [3]:
# Pull daily bars for the ^GSPC ticker (S&P 500 index) from Yahoo Finance.
df = yf.download(
    "^GSPC",
    start="2000-01-01",
    end="2023-04-30",
    interval='1d',
)

[*********************100%%**********************]  1 of 1 completed


In [4]:
target = 'Close'

# Turn the index into an ordinary first column and name that column 'date'.
df = df.reset_index()
df_cols = ['date', *df.columns[1:]]
df = df.set_axis(df_cols, axis=1)

# Move the target column to the last position, keeping the others in order.
df_cols.append(df_cols.pop(df_cols.index(target)))
df = df[df_cols]
print(df.head())

        date         Open         High          Low    Adj Close      Volume  \
0 2000-01-03  1469.250000  1478.000000  1438.359985  1455.219971   931800000   
1 2000-01-04  1455.219971  1455.219971  1397.430054  1399.420044  1009000000   
2 2000-01-05  1399.420044  1413.270020  1377.680054  1402.109985  1085500000   
3 2000-01-06  1402.109985  1411.900024  1392.099976  1403.449951  1092300000   
4 2000-01-07  1403.449951  1441.469971  1400.729980  1441.469971  1225200000   

         Close  
0  1455.219971  
1  1399.420044  
2  1402.109985  
3  1403.449951  
4  1441.469971  


In [9]:
def scaler(df, time_col = 'date', target = 'Close', features = 'M', also_return_target_scaler = False,
           scaler_path = 'target_scaler.pkl', fit_rows = None):
    """Min-max scale the columns of ``df`` independently and persist the target scaler.

    Each scaled column gets its own ``MinMaxScaler``; ``time_col`` is never scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame containing ``time_col`` and ``target`` (plus any other
        feature columns when ``features`` is ``'M'`` or ``'MS'``).
    time_col : str
        Name of the timestamp column, copied through unchanged.
    target : str
        Name of the target column.
    features : {'S', 'M', 'MS'}
        ``'S'`` keeps only ``time_col`` and ``target`` and scales the target.
        ``'M'`` / ``'MS'`` keep every column and scale all of them except
        ``time_col``.
    also_return_target_scaler : bool
        When true, return ``(scaled_df, target_scaler)`` instead of just the frame.
    scaler_path : str or None
        Where the fitted target scaler is written with ``joblib.dump``.
        Pass ``None`` to skip writing a file.
    fit_rows : int or None
        If given, the scalers are fit on the first ``fit_rows`` rows only and
        then applied to every row, so statistics from later (e.g. held-out)
        rows do not influence the scaling. Later rows may then fall outside
        ``[0, 1]``. ``None`` fits on all rows.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, MinMaxScaler)

    Raises
    ------
    NotImplementedError
        If ``features`` is not one of ``'S'``, ``'M'``, ``'MS'``.
    """
    if features == 'S':
        df = df[[time_col, target]]
        feature_cols = []
    elif features in ('M', 'MS'):
        feature_cols = list(df.drop(columns=[target, time_col]).columns)
    else:
        raise NotImplementedError('NotImplemented!')

    scaled_df = df.copy()

    def _fit_and_scale(col):
        # Fit on the leading `fit_rows` rows (all rows when None), transform every row.
        values = df[col].values.reshape((-1, 1))
        col_scaler = MinMaxScaler()
        col_scaler.fit(values[:fit_rows])
        scaled_df[col] = col_scaler.transform(values).reshape(-1)
        return col_scaler

    for col in feature_cols:
        _fit_and_scale(col)
    target_scaler = _fit_and_scale(target)

    # Persist the target scaler so predictions can be mapped back to price units later.
    if scaler_path is not None:
        joblib.dump(target_scaler, scaler_path)

    if also_return_target_scaler:
        return scaled_df, target_scaler
    return scaled_df

In [10]:
# Scale every non-date column with the defaults (features='M', target='Close').
# Side effect: the fitted target scaler is written to 'target_scaler.pkl'.
# NOTE(review): the scalers are fit on the full history, including the rows that
# are later assigned to the test split.
scaled_df = scaler(df)

In [11]:
# Inspect the scaled frame; the date column is left unscaled.
scaled_df

Unnamed: 0,date,Open,High,Low,Adj Close,Volume,Close
0,2000-01-03,0.191497,0.189829,0.187582,0.189001,0.051867,0.189001
1,2000-01-04,0.188096,0.184304,0.177631,0.175457,0.058822,0.175457
2,2000-01-05,0.174570,0.174130,0.172829,0.176110,0.065713,0.176110
3,2000-01-06,0.175222,0.173798,0.176335,0.176436,0.066326,0.176436
4,2000-01-07,0.175547,0.180969,0.178433,0.185664,0.078299,0.185664
...,...,...,...,...,...,...,...
5863,2023-04-24,0.836993,0.836005,0.838991,0.839923,0.264399,0.839923
5864,2023-04-25,0.835626,0.832129,0.827713,0.824047,0.326353,0.824047
5865,2023-04-26,0.826257,0.823214,0.822357,0.820251,0.313595,0.820251
5866,2023-04-27,0.823229,0.834993,0.828663,0.839513,0.305805,0.839513


In [12]:
def splitter(df, test_size = 0.2):
    """Cut ``df`` into a leading part and a trailing part without shuffling.

    The trailing part holds roughly ``test_size`` of the rows; the boundary is
    ``int(n_rows - n_rows * test_size)``, i.e. truncated toward zero.

    Returns
    -------
    tuple
        ``(leading_rows, trailing_rows)`` as positional ``iloc`` slices of ``df``.
    """
    n_rows = df.shape[0]
    cutoff = int(n_rows - n_rows * test_size)

    head, tail = df.iloc[:cutoff], df.iloc[cutoff:]
    return head, tail

In [18]:
# Unshuffled split with the default test_size=0.2: leading rows -> train, trailing rows -> test.
train_df , test_df = splitter(scaled_df)

In [19]:
# Sanity-check the split sizes.
# NOTE(review): this expression yields two shapes, but the saved output lists three —
# the recorded output is stale and should be regenerated.
train_df.shape , test_df.shape

((4694, 7), (1174, 7), (587, 7))

In [20]:
# Write one CSV per split to <DATASET_ROOT>/<split>/df.csv.
DATASET_ROOT = 'dataset_example/WindData/dataset'

# Previously the test frame was also written as the validation set, so anything
# tuned on validation data was tuned on the test data. Hold out the most recent
# slice of the training period instead; 0.125 of an 80% train split is ~10% of
# the full history. train_df itself is not reassigned, so the cell can be re-run.
fit_df, val_df = splitter(train_df, test_size=0.125)

for split_name, split_df in (('train', fit_df), ('val', val_df), ('test', test_df)):
    split_dir = os.path.join(DATASET_ROOT, split_name)
    # Create the folder on first use so a fresh checkout does not fail here.
    os.makedirs(split_dir, exist_ok=True)
    split_df.to_csv(os.path.join(split_dir, 'df.csv'), index=False)

In [2]:
def main(seed=2021):
    """Build the experiment configuration, then train and/or test via ``Exp_Main``.

    Parameters
    ----------
    seed : int or None
        Seed applied to ``random``, ``numpy`` and ``torch`` before the
        experiment is built so repeated runs start from the same RNG state.
        Pass ``None`` to leave the global RNG state untouched.

    When ``args.is_training`` is truthy, each of the ``args.itr`` iterations
    creates a fresh experiment, trains it and then tests it. Otherwise a single
    experiment is created and only ``test`` is called.
    """
    class Args:
        """Plain attribute container passed to ``Exp_Main`` as its configuration."""

        def __init__(self):
            # status
            self.is_training = 1
            # model id for saving
            self.model_id = 'test4'
            # model name
            self.model = 'LSTM'
            # Whether to save loss plots or not
            self.plot_flag = 1
            # Base dir to save test results
            self.test_dir = ''
            # Whether to print inter-epoch losses
            self.verbose = 1
            # dataset type, Wind or WindGraph
            self.data = 'Wind'
            # root path of the data file
            self.root_path = './dataset_example/WindData/dataset/'
            # data file
            self.data_path = 'df.csv'
            # optional target station for non-graph models
            self.target = 'Close'
            # freq for time features encoding
            self.freq = 'b'
            # location of model checkpoints
            self.checkpoints = './checkpoints/'
            # Whether to checkpoint or not
            self.checkpoint_flag = 1
            # number of closest nodes for graph connectivity, None --> complete graph
            self.n_closest = None
            # Whether to use all stations or just target for non-spatial models
            self.all_stations = 0
            # Only use every nth point. Set data_step = 1 for full dataset
            self.data_step = 1
            # Minimum number of nodes in a graph
            self.min_num_nodes = 2
            # forecasting task, options:[M, S]; M:multivariate input, S:univariate input
            self.features = 'S'
            # input sequence length
            self.seq_len = 5
            # start token length. Note that Graph models only use label_len and pred_len
            self.label_len = 1
            # prediction sequence length
            self.pred_len = 1
            # Number of encoder input features
            self.enc_in = 1
            # Number of decoder input features
            self.dec_in = 1
            # output size, note that it is assumed that the target features are placed last
            self.c_out = 1
            # dimension of model
            self.d_model = 512
            # num of heads
            self.n_heads = 8
            # number of encoder layers for non-spatial and number of LSTM or MLP layers for GraphLSTM and GraphMLP
            self.e_layers = 4
            # num of decoder layers
            self.d_layers = 4
            # Number of sequential graph blocks in GNN
            self.gnn_layers = 2
            # dimension of fcn
            self.d_ff = 2048
            # window size of moving average for Autoformer
            self.moving_avg = 25
            # attn factor
            self.factor = 3
            # whether to use distilling in encoder
            self.distil = True
            # dropout
            self.dropout = 0.5
            # time features encoding, options:[timeF, fixed, learned]
            self.embed = 'timeF'
            # activation
            self.activation = 'gelu'
            # whether to output attention in encoder
            self.output_attention = False
            # Local attention length for LogSparse Transformer
            self.win_len = 6
            # Restart attention length for LogSparse Transformer
            self.res_len = None
            # Key/Query convolution kernel length for LogSparse Transformer
            self.qk_ker = 4
            # Whether to apply ConvAttn for values (in addition to K/Q for LogSparseAttn)
            self.v_conv = 0
            # Whether to apply logsparse mask for LogSparse Transformer
            self.sparse_flag = 1
            # Whether to find top keys instead of queries in Informer
            self.top_keys = 0
            # Kernel size for the 1DConv value embedding
            self.kernel_size = 3
            # The training strategy to use for the LSTM model. recursive or mixed_teacher_forcing
            self.train_strat_lstm = 'mixed_teacher_forcing'
            # Whether to apply layernorm to outputs of Enc or Dec in FFTransformer
            self.norm_out = 1
            # Number of wavelet decompositions for FFTransformer
            self.num_decomp = 4
            # Whether to apply MLP to GNN outputs
            self.mlp_out = 0
            # data loader num workers
            self.num_workers = 0
            # experiments times
            self.itr = 1
            # train epochs
            self.train_epochs = 1
            # batch size of train input data
            self.batch_size = 32
            # early stopping patience
            self.patience = 5
            # optimizer learning rate
            self.learning_rate = 0.1
            # Rate for which to decay lr with
            self.lr_decay_rate = 0.8
            # exp description
            self.des = 'test'
            # loss function
            self.loss = 'mse'
            # adjust learning rate
            self.lradj = 'type1'
            # use gpu
            self.use_gpu = torch.cuda.is_available()
            # gpu
            self.gpu = 0
            # use multiple gpus, still experimental for graph data
            self.use_multi_gpu = False
            # device ids of multiple gpus
            self.devices = '0,1,2,3'

        def __repr__(self):
            # One "name = value" line per attribute, in definition order.
            return '\n'.join([f'{key} = {value}' for key, value in self.__dict__.items()])

    def _setting_name(run_idx):
        # Identifier for one run, passed to exp.train / exp.test.
        return '{}_{}_{}_ft{}_sl{}_ll{}_pl{}_{}'.format(
            args.model_id,
            args.model,
            args.data,
            args.features,
            args.seq_len,
            args.label_len,
            args.pred_len,
            run_idx)

    # Seed every RNG the notebook imports so weight init and shuffling are repeatable.
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    args = Args()

    # Setup for multi-GPU if specified
    if args.use_gpu and args.use_multi_gpu:
        args.devices = args.devices.replace(' ', '')
        args.device_ids = [int(id_) for id_ in args.devices.split(',')]
        args.gpu = args.device_ids[0]

    print('Args in experiment:')
    print(args)

    if args.is_training:
        for ii in range(args.itr):
            # setting record of experiments
            setting = _setting_name(ii)

            exp = Exp_Main(args)  # set experiments
            print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting))
            exp.train(setting)

            print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
            exp.test(setting, base_dir=args.test_dir)

            torch.cuda.empty_cache()
    else:
        # Test-only mode always uses run index 0.
        setting = _setting_name(0)
        exp = Exp_Main(args)  # set experiments
        print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting))
        exp.test(setting, base_dir=args.test_dir)
        torch.cuda.empty_cache()

# Kick off the experiment when this code is executed directly (the recorded output
# below shows it runs when the cell is executed in the notebook).
if __name__ == "__main__":
    main()

Args in experiment:
is_training = 1
model_id = test4
model = LSTM
plot_flag = 1
test_dir = 
verbose = 1
data = Wind
root_path = ./dataset_example/WindData/dataset/
data_path = df.csv
target = Close
freq = b
checkpoints = ./checkpoints/
checkpoint_flag = 1
n_closest = None
all_stations = 0
data_step = 1
min_num_nodes = 2
features = S
seq_len = 5
label_len = 1
pred_len = 1
enc_in = 1
dec_in = 1
c_out = 1
d_model = 8
n_heads = 1
e_layers = 1
d_layers = 1
gnn_layers = 1
d_ff = 4
moving_avg = 25
factor = 3
distil = True
dropout = 0.5
embed = timeF
activation = gelu
output_attention = False
win_len = 6
res_len = None
qk_ker = 4
v_conv = 0
sparse_flag = 1
top_keys = 0
kernel_size = 3
train_strat_lstm = mixed_teacher_forcing
norm_out = 1
num_decomp = 4
mlp_out = 0
num_workers = 0
itr = 1
train_epochs = 1
batch_size = 32
patience = 5
learning_rate = 0.1
lr_decay_rate = 0.8
des = test
loss = mse
lradj = type1
use_gpu = False
gpu = 0
use_multi_gpu = False
devices = 0,1,2,3
Use CPU
>>>>>>>start tr