# Imports

In [1]:
from modules.data import  data_reader, data_loader
from modules.sr import result_saver
from modules.models import lag_llama
from modules.experiment.tscv import get_tscv_results, get_summary, extract_metrics
from modules.visualization import graphs
from modules.fine_tuning import lag_llama_ft


In [2]:
"""
import os
print(os.getcwd())
"""

'\nimport os\nprint(os.getcwd())\n'

In [3]:
"""
from datetime import datetime

# Get the current date and time
current_datetime = datetime.now()

# Format the date and time as a string
datetime_str = current_datetime.strftime("%Y-%d-%m %H:%M:%S")

# Return the date and time as a string
NOW = datetime_str
"""

'\nfrom datetime import datetime\n\n# Get the current date and time\ncurrent_datetime = datetime.now()\n\n# Format the date and time as a string\ndatetime_str = current_datetime.strftime("%Y-%d-%m %H:%M:%S")\n\n# Return the date and time as a string\nNOW = datetime_str\n'

# Parameters

In [4]:
# --- Experiment parameters ---------------------------------------------------
# Constraint: FOLDS * PREDICTION_LENGTH + TRAIN_SIZE must NOT exceed len(data).

TICKER = "AAPL"
TYPE_OF_DATA = "stock"  # only stock prices are stored at the moment
FREQUENCY = "daily"     # only daily frequency is available at the moment

# NOTE(review): an earlier comment stated that only PREDICTION_LENGTH > 1 is
# supported, yet the value used here is 1 - confirm which one is correct.
PREDICTION_LENGTH = 1
CONTEXT_LENGTH = 245    # set to 245 for testing purposes

# A fixed `FOLDS = 10` for TSCV used to live here (it had been reduced for
# testing purposes); the assignment is disabled in this cell.

# NOTE(review): an earlier comment said "works only for these two", but four
# model names are listed - confirm the supported set.
MODELS = [
    "arima",
    "llama",
    "autoregressor",
    "fine-tuned Llama",
]
METRICS = ["r2", "mse", "mae", "rmse", "mda", "mape"]

# --- Fine-tuning parameters --------------------------------------------------
BATCH_SIZE = 10
MAX_EPOCHS = 5

# --- Data parameters ---------------------------------------------------------
FT_START_DATE = "2022-07-07"
START_DATE = "2023-07-07"
END_DATE = "2024-07-07"

# --- Derived -----------------------------------------------------------------
# TODO: introduce a separate TRAIN_PERIOD; for now the training window is the
# same as the context length.
TRAIN_SIZE = CONTEXT_LENGTH



# Data

In [5]:
"""
data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY)
# this is just in case there are more CSVs of the same type and frequency, the data should be the first in the list
#if len(data) > 1:
data = data[0]

simple_data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY, match = ["simple"])
#if len(simple_data) > 1:
simple_data = simple_data[0]

train_data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY, match = ["train"])
train_data = train_data[0]

test_data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY, match = ["test"])
test_data = test_data[0]
"""

'\ndata = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY)\n# this is just in case there are more CSVs of the same type and frequency, the data should be the first in the list\n#if len(data) > 1:\ndata = data[0]\n\nsimple_data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY, match = ["simple"])\n#if len(simple_data) > 1:\nsimple_data = simple_data[0]\n\ntrain_data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY, match = ["train"])\ntrain_data = train_data[0]\n\ntest_data = data_reader.read_data(type = TYPE_OF_DATA, frequency = FREQUENCY, match = ["test"])\ntest_data = test_data[0]\n'

In [6]:
# Evaluation window: START_DATE .. END_DATE.
DATA_CONFIG = dict(
    ticker=TICKER,
    frequency=FREQUENCY,
    start=START_DATE,
    end=END_DATE,
)

# Fine-tuning window: FT_START_DATE .. START_DATE, i.e. it ends where the
# evaluation window begins.
FT_DATA_CONFIG = dict(
    ticker=TICKER,
    frequency=FREQUENCY,
    start=FT_START_DATE,
    end=START_DATE,
)

In [7]:
# Series used for the cross-validated evaluation (configured by DATA_CONFIG).
data = data_loader.get_data(
    data_type=TYPE_OF_DATA,
    kwargs=DATA_CONFIG,
)

# Series used for fine-tuning (configured by FT_DATA_CONFIG).
ft_data = data_loader.get_data(
    data_type=TYPE_OF_DATA,
    kwargs=FT_DATA_CONFIG,
)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


# Additional parameters

In [8]:
# Sizes of the evaluation and fine-tuning series loaded above.
DATA_LENGTH = len(data)
FT_LENGTH = len(ft_data)

# Largest number of folds that still satisfies
#   FOLDS * PREDICTION_LENGTH + TRAIN_SIZE <= len(data)
# for the configured TRAIN_SIZE and PREDICTION_LENGTH.
MAX_FOLDS = (DATA_LENGTH - TRAIN_SIZE) // PREDICTION_LENGTH
if MAX_FOLDS < 1:
    raise ValueError(
        f"Not enough data for a single fold: DATA_LENGTH={DATA_LENGTH}, "
        f"TRAIN_SIZE={TRAIN_SIZE}, PREDICTION_LENGTH={PREDICTION_LENGTH}"
    )

# Request 5 folds, but never more than the data allows. Previously the
# computed maximum was discarded by an unconditional `FOLDS = 5`, so the
# constraint above was not enforced for shorter series.
FOLDS = min(5, MAX_FOLDS)


# Fine tuning

In [9]:
# Convert the fine-tuning series into the training input for Lag-Llama.
# prediction_length=0 is passed through unchanged; presumably it means no
# points are held out of the series - TODO confirm in lag_llama.prepare_data.
ft_train_data = lag_llama.prepare_data(
    data=ft_data,
    prediction_length=0,
    frequency=FREQUENCY,
)

In [10]:
# Build the Lag-Llama object to be fine-tuned, configured from the
# experiment and fine-tuning parameters defined earlier.
predictor = lag_llama_ft.get_predictor(
    prediction_length=PREDICTION_LENGTH,
    context_length=CONTEXT_LENGTH,
    batch_size=BATCH_SIZE,
    max_epochs=MAX_EPOCHS,
)

In [None]:
# Fine-tune on the prepared training data.
# NOTE(review): `predictor` is rebound to whatever `.train(...)` returns, so
# re-running this cell calls `.train` on that returned object rather than on
# the freshly built one - confirm the cell is safe to run more than once.
predictor = predictor.train(
    ft_train_data,
    cache_data=True,
    shuffle_buffer_length=1000,
)

In [None]:
# Run the time-series cross-validation with the fine-tuned predictor.
# `r` and `p` are displayed further down as the per-model metrics table and
# the per-fold predictions table (including the actual values).
r, p = get_tscv_results(
    data=data,
    prediction_horizon=PREDICTION_LENGTH,
    context_length=CONTEXT_LENGTH,
    folds=FOLDS,
    frequency=FREQUENCY,
    predictor=predictor,
)

# Saving results

In [34]:
# Display the evaluation metrics (one row per model) returned by get_tscv_results.
r

Unnamed: 0,r2,mse,mae,rmse,mda,mape
arima,0.450407,14.482253,3.190176,3.805556,0.6,0.014594
lag_llama,-58.940031,1579.470616,38.714596,39.742554,0.2,0.17656
autoregressor,0.447312,14.563802,3.34167,3.816255,0.6,0.015282
ft_lag_llama,-0.700098,44.799023,5.472705,6.693207,0.6,0.02515


In [35]:
# Display the predictions of each model next to the actual values.
p

Unnamed: 0,arima,lag_llama,autoregressor,ft_lag_llama,actual
0,213.656341,178.009499,213.25,218.587265,214.100006
1,214.125477,188.782514,214.100006,215.746447,210.619995
2,210.232611,177.785137,210.619995,203.581973,216.75
3,217.31034,176.589256,216.75,217.426447,220.270004
4,220.542241,181.412634,220.270004,220.673695,221.550003
5,221.632904,174.763386,221.550003,220.005372,226.339996


In [18]:
# Verbose variant of the experiment name.
# NOTE(review): this value is overwritten by the next cell before it is used.
# It also contains a "/" (in "CONTEXT_LENGTH/TRAIN_SIZE") and has no ".csv"
# suffix, so it would not be usable as a single file name - consider removing
# this cell.
experiment_name = f"PREDICTION_LENGTH={PREDICTION_LENGTH}__TICKER={TICKER}__FREQUENCY={FREQUENCY}__TYPE_OF_DATA={TYPE_OF_DATA}__FOLDS={FOLDS}__CONTEXT_LENGTH/TRAIN_SIZE={CONTEXT_LENGTH}__FT_START_DATE={FT_START_DATE}__START_DATE={START_DATE}__END_DATE={END_DATE}__FT_LENGTH={FT_LENGTH}__DATA_LENGTH={DATA_LENGTH}" 

In [27]:
# File name that encodes every experiment setting as `TAG=value`, joined by a
# double underscore and terminated with ".csv".
experiment_name = "__".join(
    f"{tag}={value}"
    for tag, value in (
        ("P_L", PREDICTION_LENGTH),
        ("T", TICKER),
        ("FR", FREQUENCY),
        ("T_O_D", TYPE_OF_DATA),
        ("FO", FOLDS),
        ("C_L_T_S", CONTEXT_LENGTH),
        ("FT_S_D", FT_START_DATE),
        ("S_D", START_DATE),
        ("E_D", END_DATE),
        ("FT_L", FT_LENGTH),
        ("D_L", DATA_LENGTH),
    )
) + ".csv"

In [28]:
# Show the generated file name for a quick visual check.
experiment_name

'P_L=1__T=AAPL__FR=daily__T_O_D=stock__FO=6__C_L_T_S=245__FT_S_D=2022-07-07__S_D=2023-07-07__E_D=2024-07-07__FT_L=251__D_L=251.csv'

In [29]:
# Persist the metrics table under the "evaluation" results type ...
result_saver.save_results(
    r,
    experiment_name,
    type="evaluation",
)
# ... and the predictions table under the "prediction" results type.
result_saver.save_results(
    p,
    experiment_name,
    type="prediction",
)

Data has been written to 'c:\Users\topco\Dokumenti\MSc Banking and Digital Finance UCL\Modules\Dissertation\MSc_dissertation\results\evaluation\P_L=1__T=AAPL__FR=daily__T_O_D=stock__FO=6__C_L_T_S=245__FT_S_D=2022-07-07__S_D=2023-07-07__E_D=2024-07-07__FT_L=251__D_L=251.csv'.
Data has been written to 'c:\Users\topco\Dokumenti\MSc Banking and Digital Finance UCL\Modules\Dissertation\MSc_dissertation\results\prediction\P_L=1__T=AAPL__FR=daily__T_O_D=stock__FO=6__C_L_T_S=245__FT_S_D=2022-07-07__S_D=2023-07-07__E_D=2024-07-07__FT_L=251__D_L=251.csv'.


# Models

In [None]:
# arima


In [None]:
# lag llama

# Fine tuning an estimator

In [None]:
# Prepare the training data for Lag-Llama fine-tuning.
# NOTE(review): this cell is identical to the one under "# Fine tuning"
# earlier in the notebook - consider keeping only one copy.
ft_train_data = lag_llama.prepare_data(data=ft_data, 
                                       prediction_length=0, 
                                       frequency=FREQUENCY)

In [None]:
# Create the Lag-Llama predictor object.
# NOTE(review): this cell is identical to the one under "# Fine tuning"
# earlier in the notebook; running it replaces the already fine-tuned
# `predictor` with a fresh one - consider keeping only one copy.
predictor = lag_llama_ft.get_predictor(prediction_length=PREDICTION_LENGTH, 
                                       context_length=CONTEXT_LENGTH, 
                                       batch_size=BATCH_SIZE, 
                                       max_epochs=MAX_EPOCHS)

In [None]:
# Fine-tune the predictor object on the prepared training data.
# NOTE(review): this cell is identical to the training cell under
# "# Fine tuning" earlier in the notebook. It rebinds `predictor` to the
# return value of `.train(...)`, so confirm it is safe to run more than once.
predictor = predictor.train(ft_train_data, 
                            cache_data = True, 
                            shuffle_buffer_length = 1000)

# SKLearn TSCV

In [None]:
# Run the time-series cross-validation and unpack three results.
# NOTE(review): the same call with the same arguments is unpacked into only
# two names (`r, p`) earlier in the notebook; one of the two cells does not
# match the current return value of get_tscv_results - confirm and align.
# `a` is consumed by graphs.prediction_visualisation further down.
r, p, a = get_tscv_results(data = data,
                           prediction_horizon=PREDICTION_LENGTH,
                           context_length=CONTEXT_LENGTH, 
                           folds=FOLDS, 
                           frequency=FREQUENCY, 
                           predictor=predictor)

In [None]:
# Build one summary per entry of `r` by indexing it positionally (r[0], r[1], ...).
# NOTE(review): this assumes `r` is an integer-indexable sequence; earlier in
# the notebook `r` is displayed as a single metrics table - confirm the type.
s = [get_summary(r[i]) for i in range(len(r))]

In [None]:
# Split the summaries into three tables; judging by the names these hold the
# per-model means, medians and standard deviations - TODO confirm.
means_df, medians_df, stds_df = extract_metrics(s, MODELS)

# Visualisation

In [None]:
# Static plots built from the summaries `s` for the listed MODELS and METRICS.
graphs.standard_visualisation(MODELS, METRICS, s)

In [None]:
# Interactive counterpart of the plot above, fed with the same inputs.
graphs.interactive_visualisation(MODELS, METRICS, s)

In [None]:
# Plot the predictions `p` of each model together with `a`.
# NOTE(review): `a` only exists if the three-value get_tscv_results cell above
# was run; presumably it holds the actual values - confirm.
graphs.prediction_visualisation(MODELS, p, a)

TODO (visualisation):
- Label the graph.
- Label the axes, using timestamps on the time axis.
- Show the history of the actual series alongside the predictions.
- Add a comparison table in addition to the graph.