In [1]:
#Conda environment python 3.12.10 chronos_project
#Run as administrator

#pip install datasets
import datasets
import pandas as pd 
#In Anaconda prompt: conda activate chronos_project, then conda install matplotlib
import matplotlib.pyplot as plt

#Time series forecasting libraries
from autogluon.timeseries import TimeSeriesPredictor
import torch
from chronos import BaseChronosPipeline
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
import random as rn

#Progress bar
from tqdm import tqdm
#Show the installed PyTorch version followed by its CUDA version, one per line
print(torch.__version__, torch.version.cuda, sep="\n")

2.5.1+cu121
12.1


# Big dataset

In [4]:
def to_pandas_streaming_all(ds: datasets.Dataset) -> pd.DataFrame:
    """Materialize a streaming dataset (or shard) as a long-format DataFrame.

    Every sample yielded by ``ds`` is collected in memory, a DataFrame with one
    row per sample is built, and all sequence-typed columns are exploded
    together so that each element of a sequence becomes its own row. Column
    dtypes are then inferred from the exploded values.

    Args:
        ds: Iterable dataset to convert. It must expose ``features`` so that
            the sequence-typed columns can be identified.

    Returns:
        A DataFrame with one row per sequence element. Non-sequence columns
        are repeated on every row of their sample, and the index keeps the
        position of the originating sample (so it contains duplicates).

    Raises:
        ValueError: Raised by ``DataFrame.explode`` when the sequence columns
            of a sample do not all have the same length.
    """
    # Iterate the dataset received as argument, not a notebook-level global,
    # so the function works for whichever shard the caller passes in.
    records = list(tqdm(ds, desc="Procesando shard"))

    # Streaming samples arrive one by one; build the frame once from the list.
    df = pd.DataFrame(records)

    # Sequence-typed features that are actually present in the frame.
    sequence_columns = [
        col
        for col, feature in ds.features.items()
        if isinstance(feature, datasets.Sequence) and col in df.columns
    ]

    # explode() rejects an empty column list, so skip it when nothing applies.
    if not sequence_columns:
        return df.infer_objects()

    # Explode all sequence columns in a single call: exploding them one after
    # another would pair every target value with every timestamp of the same
    # series instead of keeping the two sequences aligned element by element.
    return df.explode(sequence_columns).infer_objects()

In [5]:
#Load the training corpus: 10M TSMixup augmentations of real-world data.
#With streaming=True the samples are read lazily and the result is an IterableDataset.
ds = datasets.load_dataset(
    path="autogluon/chronos_datasets",
    name="training_corpus_tsmixup_10m",
    split="train",
    streaming=True,
)
#ds.set_format("numpy")  # sequences returned as numpy arrays

Resolving data files:   0%|          | 0/200 [00:00<?, ?it/s]

In [6]:
#The dataset is stored in 200 shards, so it can be processed one part at a time
print(ds)

IterableDataset({
    features: ['target', 'id', 'timestamp'],
    num_shards: 200
})


In [7]:
print(ds.features)
#Check the type of each feature: all of them are consistent with the dataset description

{'target': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'id': Value(dtype='string', id=None), 'timestamp': Sequence(feature=Value(dtype='timestamp[ms]', id=None), length=-1, id=None)}


In [8]:
# Shard the dataset into its 200 parts and take the first one (index 0).
ds_split = ds.shard(num_shards=200, index=0)

In [9]:
#Confirm that the selection now holds a single shard
print(ds_split)

IterableDataset({
    features: ['target', 'id', 'timestamp'],
    num_shards: 1
})


In [15]:
#Iterate the streamed shard and collect it into a DataFrame with one row per time series
df_shard0=pd.DataFrame(ds_split)

In [16]:
#Preview the wide frame: every 'target' and 'timestamp' cell holds a whole sequence
df_shard0.head(10)

Unnamed: 0,target,id,timestamp
0,"[0.613463059343371, 0.5711616398408842, 0.5182...",T0000000,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",T0000001,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
2,"[1.0770299311167582, 4.308119724467033, 1.0770...",T0000002,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
3,"[0.004624637833987918, 0.00547248810355237, 0....",T0000003,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
4,"[0.4764301971138915, 0.49643825495907135, 0.53...",T0000004,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
5,"[0.43618684847808536, 0.42986687631831216, 0.4...",T0000005,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
6,"[2.401216908955567, 2.4339980271665644, 2.4831...",T0000006,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
7,"[0.5051982397841839, 0.5233518735046003, 0.522...",T0000007,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
8,"[0.5321381187621922, 0.5171483125998776, 0.479...",T0000008,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."
9,"[-8.913489815619967e-06, -1.1187203725208862e-...",T0000009,"[1970-01-01 00:00:00, 1970-01-01 01:00:00, 197..."


In [17]:
df_shard0.info()
#Originally, the dataset has 50,000 time series, each with a sequence of more than 1,000 values.
#As we need to process each value as a new row, we will use just a sample to be able to process it.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   target     50000 non-null  object
 1   id         50000 non-null  object
 2   timestamp  50000 non-null  object
dtypes: object(3)
memory usage: 1.1+ MB


In [18]:
#Choose a random row to compare characteristics.
#The index is drawn from the full length of the frame, so every series can be
#picked and the bound stays valid whatever the size of the shard.
a=rn.randrange(len(df_shard0))

In [19]:
#Compare the length of the first sequence with the length of the randomly chosen one
len(df_shard0.loc[0, "target"]) == len(df_shard0.loc[a, "target"])
#Not all the sequences have the same length

False

In [None]:
#Check how a single 'target' cell is stored (a plain Python list)
type(df_shard0["target"][0])

list

In [None]:
#Convert the dataset to a pandas dataframe -> long format
#NOTE(review): this rebinds df_shard0 from the wide frame (one row per series) to the long
#one, so the cells above that index a whole sequence no longer apply to it after this point.
df_shard0 = to_pandas_streaming_all(ds_split)
#Save df_shard0 to csv
#df_shard0.to_csv("df_shard0.csv", index=False)
#As this corpus is too big, it is preferable to work with a smaller dataset such as m4_daily to run the experiments

# M4 dataset

The dataset schema has the following structure:

* Each dataset row corresponds to a single (univariate or multivariate) time series.

* There exists one column with name id and type string that contains the unique identifier of each time series.

* There exists one column of type Sequence with dtype timestamp[ms]. This column contains the timestamps of the observations. Timestamps are guaranteed to have a regular frequency that can be obtained with pandas.infer_freq.

* There exists at least one column of type Sequence with numeric (float, double, or int) dtype. These columns can be interpreted as target time series.

* For each row, all columns of type Sequence have same length. Remaining columns of types other than Sequence (e.g., string or float) can be interpreted as static covariates.

M4 dataset:
This dataset is a collection of 100,000 time series used in the fourth edition of the Makridakis forecasting competition (M4). It consists of yearly, quarterly, monthly and other (weekly, daily and hourly) time series, which were used as part of the training corpus of the Chronos model.

https://paperswithcode.com/dataset/m4

In [None]:
#Load the M4 daily subset (no streaming here).
#NOTE(review): this rebinds `ds`, which held the streaming training corpus in the previous section.
ds = datasets.load_dataset("autogluon/chronos_datasets", "m4_daily", split="train")
ds.set_format("numpy")  # sequences returned as numpy arrays


train-00000-of-00001.parquet:   0%|          | 0.00/65.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4227 [00:00<?, ? examples/s]

In [3]:
#Inspect the loaded dataset: its columns and number of rows
print(ds)

Dataset({
    features: ['id', 'timestamp', 'target', 'category'],
    num_rows: 4227
})


In [4]:
#Inspect the feature types: 'timestamp' and 'target' are sequences, 'id' and 'category' are strings
print(ds.features)


{'id': Value(dtype='string', id=None), 'timestamp': Sequence(feature=Value(dtype='timestamp[ms]', id=None), length=-1, id=None), 'target': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'category': Value(dtype='string', id=None)}


In [5]:
#NOTE(review): repeats the set_format call already made right after loading m4_daily; presumably redundant.
ds.set_format("numpy")  # sequences returned as numpy arrays

In [6]:
def to_pandas(ds: datasets.Dataset) -> "pd.DataFrame":
    """Return ``ds`` as a long data frame with one row per sequence element."""
    # Gather every Sequence-typed feature; they are exploded jointly below.
    sequence_columns = []
    for name in ds.features:
        if isinstance(ds.features[name], datasets.Sequence):
            sequence_columns.append(name)

    wide_frame = ds.to_pandas()
    long_frame = wide_frame.explode(sequence_columns)
    return long_frame.infer_objects()


In [9]:
#Convert the M4 daily dataset to long format: one row per observation of each series
m4_df= to_pandas(ds)

## EDA

In [10]:
#Preview the long frame; the index repeats because exploded rows keep the index of their source row
m4_df.head(10)

Unnamed: 0,id,timestamp,target,category
0,T000000,1994-03-01 12:00:00,1017.1,Macro
0,T000000,1994-03-02 12:00:00,1019.3,Macro
0,T000000,1994-03-03 12:00:00,1017.0,Macro
0,T000000,1994-03-04 12:00:00,1019.2,Macro
0,T000000,1994-03-05 12:00:00,1018.7,Macro
0,T000000,1994-03-06 12:00:00,1015.6,Macro
0,T000000,1994-03-07 12:00:00,1018.5,Macro
0,T000000,1994-03-08 12:00:00,1018.3,Macro
0,T000000,1994-03-09 12:00:00,1018.4,Macro
0,T000000,1994-03-10 12:00:00,1021.5,Macro


In [None]:
m4_df.dtypes 
#The data types of timestamp (datetime64[ns]) and target (float64) are adequate

id                   object
timestamp    datetime64[ns]
target              float64
category             object
dtype: object

In [None]:
#Data type of id
#Data type of category
#How many categories do we have?
#How many ids do we have?
#Does each id always correspond to the same category?
#Is there any null data?
#Are all the days present in the timestamp? Or
#  do we need to produce null data to fill the sequences?
#How does the timestamp behave? Plot the current timestamp
#What are the start and the end of the timestamp?
#How many values do we have in the target?

#How do I need to pass the data to chronos?
