# Imports

In [1]:
from modules.data import data_loader, data_splitter, data_reader
from modules.models import arima, lag_llama
from modules.evaluation import evaluate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from modules.experiment.experiment import run_experiment
from modules.data.tscv import create_tscv_dataset
from modules.experiment.tscv import get_tscv_results, mda


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from itertools import islice
from sklearn.model_selection import TimeSeriesSplit

# Parameters

In [2]:
# Number of future steps each model forecasts; also used as the test-window
# size of the sklearn TimeSeriesSplit further down.
PREDICTION_LENGTH = 10 # currently only works for PREDICTION_LENGTH > 1
FREQUENCY = "daily" # currently we only have daily frequency
TYPE_OF_DATA = "stock" # currently we only have stock prices saved
MODELS = ["arima", "llama"] # currently works only for these two

# Parameters we want to add
PREDICTION_HORIZON = 3 # can use anything as long as it complies with data length
# TODO: TRAIN_PERIOD = ...  (context lengths; should take a look into this)
# TODO: TRAIN_SIZE = ...
FOLDS = 10  # number of folds for TSCV
# Context window length passed to create_tscv_dataset and to the Lag-Llama forecast call.
CONTEXT_LENGTH = 64



# Notes

 - autoregressor
 - mean directional accuracy
 - ask
 - for each time-series create a whole dataframe for TSCV
  - problem with this is with time horizon that is >1 
  - prediction horizon is only for that value in the future

# TO DO

 - create the TSCV experiment. It needs to create a table. Metrics for each fold
 - lit review
 - content for the presentation by monday!!
 - review data leakage risk
 - writing of the dissertation
 - autoregressor
 - MDA metric
 - frequency

 Writing
 - lit review
 - lit reading

# Loading the data

In [3]:
# Load the data selected by TYPE_OF_DATA (all 500 S&P 500 stocks); only the
# first element of whatever read_data returns is kept.
data = data_reader.read_data(TYPE_OF_DATA)[0]
# Dataset used by the TSCV cells below, which read its "y" column.
# NOTE(review): "get_simle_data" looks like a misspelling of "get_simple_data"
# in modules.data.data_loader -- if so, rename it there and here together.
simple_data = data_loader.get_simle_data()

[*********************100%%**********************]  1 of 1 completed


# Prepare the data

In [4]:
# pass full data at maximum granularity and produce according to frequency parameter

In [6]:
# Build the time-series cross-validation dataset from the simple data,
# one keyword argument per line for readability.
tscv_data = create_tscv_dataset(
    data=simple_data,
    context_length=CONTEXT_LENGTH,
    n_folds=FOLDS,
    prediction_horizon=PREDICTION_HORIZON,
    max_folds=False,
)

# Models

In [7]:
# arima


In [8]:
# lag llama

# Run regular experiment

In [None]:
# Regular (non-TSCV) experiment on the full stock data.
results = run_experiment(
    data=data,
    prediction_length=PREDICTION_LENGTH,
)

# Evaluation

In [None]:
results

# Run TSCV experiment

Look up types of TSCV.

In [10]:
# Run the experiment entry point in TSCV mode on the pre-built folds; in this
# mode the return value is unpacked into the metrics dict and the predictions.
tscv_results, prediction = run_experiment(
    data=tscv_data,
    tscv=True,
)

y0 done  1
------------
y1 done  2
------------
y2 done  3
------------
y3 done  4
------------
y4 done  5
------------
y5 done  6
------------
y6 done  7
------------
y7 done  8
------------
y8 done  9
------------
y9 done  10
------------


In [11]:
tscv_results

{'r2': {'arima': -0.8036159605688358, 'llama': -4.690321882253741},
 'mse': {'arima': 9.014638013311012, 'llama': 28.440750730306206},
 'mae': {'arima': 2.712121007999187, 'llama': 4.178632465124013},
 'rmse': {'arima': 3.0024386776936867, 'llama': 5.332987036390226}}

# SKLearn TSCV

In [3]:
# Reload the dataset for the sklearn TSCV section (same call as in the
# "Loading the data" cell, so this section can be run on its own).
simple_data = data_loader.get_simle_data()

[*********************100%%**********************]  1 of 1 completed


In [13]:
# sklearn time-series splitter: FOLDS splits, each with a test window of
# PREDICTION_LENGTH observations, so every fold's validation set has the
# same length as the forecasts requested from the models.
tscv = TimeSeriesSplit(n_splits=FOLDS, test_size=PREDICTION_LENGTH)


In [14]:
# Extract the target time series (column "y") from the loaded data; it is
# the object handed to tscv.split to generate the train/test indices.
series = simple_data["y"]

In [15]:
# Manual time-series cross-validation of AutoARIMA vs. Lag-Llama using the
# sklearn splitter `tscv`. For every fold both models are fit/queried on the
# training slice and scored against the held-out slice.

# Metric name -> scoring callable taking (y_true, y_pred). RMSE is derived
# as the square root of the MSE.
metric_fns = {
    "r2": r2_score,
    "mse": mean_squared_error,
    "mae": mean_absolute_error,
    "rmse": lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
}

# Lists of models and metrics, and the empty result dict:
# results[metric][model]["fold_<i>"] is a list holding that fold's score.
models = ["arima", "llama"]
metrics = list(metric_fns)
results = {
    metric: {model: {f"fold_{i}": [] for i in range(FOLDS)} for model in models}
    for metric in metrics
}

# Iterate over all folds; enumerate supplies the fold number used in the keys.
for i, (train_index, test_index) in enumerate(tscv.split(series)):
    # Subset the original data according to the train/test split.
    train = simple_data.iloc[train_index]
    valid = list(simple_data.iloc[test_index]["y"])

    # Fit AutoARIMA on the training slice and forecast PREDICTION_LENGTH steps.
    arima_model = arima.get_autoarima(train)
    autoarima_predictions = arima.autoarima_predictions(arima_model, PREDICTION_LENGTH)

    # Lag-Llama returns a pair; only the forecasts are used here.
    lag_llama_predictions, _tss = lag_llama.get_lam_llama_forecast(
        train, PREDICTION_LENGTH, context_length=CONTEXT_LENGTH
    )
    # Collapse the first forecast's samples to a point forecast by averaging
    # over axis 0 (presumably the sample dimension -- TODO confirm).
    lag_llama_predictions = list(lag_llama_predictions[0].samples.mean(axis=0))

    # Record every metric for every model under this fold's key.
    fold_predictions = {
        "arima": autoarima_predictions,
        "llama": lag_llama_predictions,
    }
    for model, predicted in fold_predictions.items():
        for metric, metric_fn in metric_fns.items():
            results[metric][model][f"fold_{i}"].append(metric_fn(valid, predicted))

In [None]:
results

In [4]:
# Per-fold TSCV evaluation through the project helper; the displayed `r` has
# the same metric -> model -> fold layout as the manual sklearn loop's results.
# NOTE(review): the recorded output of this cell is "KeyError: 'mda'" --
# presumably get_tscv_results writes to an 'mda' entry that its results dict
# never creates; confirm in modules.experiment.tscv.
# NOTE(review): PREDICTION_HORIZON (3) is passed here, whereas the manual
# loop forecasts PREDICTION_LENGTH (10) steps -- verify which is intended.
r = get_tscv_results(simple_data, PREDICTION_HORIZON, CONTEXT_LENGTH, FOLDS)

KeyError: 'mda'

In [5]:
r

{'r2': {'arima': {'fold_0': [-3.691877661141132],
   'fold_1': [-0.010542006383208902],
   'fold_2': [-5.611156232296624],
   'fold_3': [-2.0666207482946946],
   'fold_4': [-0.7687106529191812],
   'fold_5': [-0.3431518091946324],
   'fold_6': [-5.351096735700188],
   'fold_7': [-1.6938904126202843],
   'fold_8': [-0.011681380345721193],
   'fold_9': [-4.330864243718463]},
  'llama': {'fold_0': [-9.938959190778798],
   'fold_1': [-130.6562234861198],
   'fold_2': [-95.39530034216908],
   'fold_3': [-94.28116563747474],
   'fold_4': [-5.02102572543716],
   'fold_5': [-341.32386826008013],
   'fold_6': [-85.48149383149917],
   'fold_7': [-111.13917794113777],
   'fold_8': [-112.20774898633088],
   'fold_9': [-161.13711291878053]}},
 'mse': {'arima': {'fold_0': [13.875785862988721],
   'fold_1': [0.31214081152770684],
   'fold_2': [5.642126724971973],
   'fold_3': [2.991871755262922],
   'fold_4': [123.78820905652573],
   'fold_5': [3.945837239182269],
   'fold_6': [51.01199731770291],
  