In [74]:
import yfinance as yf

# Fetch historical data for Google (GOOGL)
data_google = yf.download("GOOGL", start="2015-01-01", end="2023-01-01")
print(data_google.head())
type(data_google)
data_google.columns

[*********************100%***********************]  1 of 1 completed

Price                      Adj Close      Close       High        Low  \
Ticker                         GOOGL      GOOGL      GOOGL      GOOGL   
Date                                                                    
2015-01-02 00:00:00+00:00  26.381865  26.477501  26.790001  26.393999   
2015-01-05 00:00:00+00:00  25.879185  25.973000  26.399500  25.887501   
2015-01-06 00:00:00+00:00  25.240501  25.332001  26.060499  25.277500   
2015-01-07 00:00:00+00:00  25.166271  25.257500  25.574499  25.182501   
2015-01-08 00:00:00+00:00  25.253954  25.345501  25.375000  24.750999   

Price                           Open    Volume  
Ticker                         GOOGL     GOOGL  
Date                                            
2015-01-02 00:00:00+00:00  26.629999  26480000  
2015-01-05 00:00:00+00:00  26.357500  41182000  
2015-01-06 00:00:00+00:00  26.025000  54456000  
2015-01-07 00:00:00+00:00  25.547501  46918000  
2015-01-08 00:00:00+00:00  25.075500  73054000  





MultiIndex([('Adj Close', 'GOOGL'),
            (    'Close', 'GOOGL'),
            (     'High', 'GOOGL'),
            (      'Low', 'GOOGL'),
            (     'Open', 'GOOGL'),
            (   'Volume', 'GOOGL')],
           names=['Price', 'Ticker'])

In [75]:
data_google = data_google.reset_index()
print(data_google['Date'].head())
data_google.columns

0   2015-01-02 00:00:00+00:00
1   2015-01-05 00:00:00+00:00
2   2015-01-06 00:00:00+00:00
3   2015-01-07 00:00:00+00:00
4   2015-01-08 00:00:00+00:00
Name: Date, dtype: datetime64[ns, UTC]


MultiIndex([(     'Date',      ''),
            ('Adj Close', 'GOOGL'),
            (    'Close', 'GOOGL'),
            (     'High', 'GOOGL'),
            (      'Low', 'GOOGL'),
            (     'Open', 'GOOGL'),
            (   'Volume', 'GOOGL')],
           names=['Price', 'Ticker'])

In [76]:
import pandas as pd

# Ensure 'Date' column is present
#print(data_google.columns)

data_google.columns = ['_'.join(col).strip() for col in data_google.columns.values]

# Inspect the DataFrame

data_google = data_google.rename(columns={'Date_': 'Date'})

# Melt the dataframe to long format
data_google_long = data_google.melt(id_vars="Date", var_name="Price_Ticker", value_name="Value")

# Split the 'Price_Ticker' column into 'Price' and 'Ticker'
data_google_long[['Price', 'Ticker']] = data_google_long['Price_Ticker'].str.split('_', expand=True)
data_google_long = data_google_long.drop(columns=['Price_Ticker'])

# Add a time_idx column
data_google_long["time_idx"] = data_google_long.groupby(["Ticker"]).cumcount()

# Add a group_id column
data_google_long["group_id"] = data_google_long["Ticker"].astype("category").cat.codes

# Filter for relevant columns
data_google_long = data_google_long[["Date", "time_idx", "group_id", "Price", "Value"]]

# Pivot to get each price type as a separate column
data_google_wide = data_google_long.pivot(index=["Date", "time_idx", "group_id"], columns="Price", values="Value").reset_index()

# Rename columns
data_google_wide.columns.name = None
print(data_google_wide.head())

                       Date  time_idx  group_id  Adj Close      Close  \
0 2015-01-02 00:00:00+00:00         0         0  26.381865        NaN   
1 2015-01-02 00:00:00+00:00      2014         0        NaN  26.477501   
2 2015-01-02 00:00:00+00:00      4028         0        NaN        NaN   
3 2015-01-02 00:00:00+00:00      6042         0        NaN        NaN   
4 2015-01-02 00:00:00+00:00      8056         0        NaN        NaN   

        High        Low       Open  Volume  
0        NaN        NaN        NaN     NaN  
1        NaN        NaN        NaN     NaN  
2  26.790001        NaN        NaN     NaN  
3        NaN  26.393999        NaN     NaN  
4        NaN        NaN  26.629999     NaN  


In [77]:
from pytorch_forecasting import TimeSeriesDataSet, GroupNormalizer

# Define parameters
max_prediction_length = 30  # Predict the next 30 days
max_encoder_length = 120  # Use the past 120 days as input
training_cutoff = data_google_wide["time_idx"].max() - max_prediction_length

# Create the dataset
training = TimeSeriesDataSet(
    data_google_wide[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="Close",  # Predict the 'Close' price
    group_ids=["group_id"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    static_categoricals=["group_id"],
    static_reals=[],
    time_varying_known_categoricals=[],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=["Open", "High", "Low", "Close", "Adj Close", "Volume"],
    target_normalizer=GroupNormalizer(groups=["group_id"], transformation="softplus"),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

# Create validation dataset
validation = TimeSeriesDataSet.from_dataset(training, data_google_wide, min_prediction_idx=training_cutoff + 1)

# Create dataloaders
batch_size = 64
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

ValueError: Data type of category group_id was found to be numeric - use a string type / categorified string