In [108]:
import pandas as pd
from app.utils.data_loader import load_csv
from config import get_historical_files_path
import re
import torch
import torch.nn as nn
from torch.utils.data import  TensorDataset, Dataset, DataLoader

from sklearn.model_selection import TimeSeriesSplit

In [199]:
# Locate and load the raw historical price CSV.
# NOTE(review): presumably get_historical_files_path resolves the file name inside the
# project's historical-data directory and load_csv returns a pandas DataFrame (later
# cells call .copy() and index columns on it) — confirm against those helpers.
file_path = get_historical_files_path('Bleached Softwood Kraft Pulp Futures Historical Data.csv')
data = load_csv(file_path)

In [200]:
def change_data_type(input: pd.DataFrame):
    """
    Convert the string-formatted market columns of a price table to floats.

    Columns handled (each must be present and hold strings):
        - 'Price', 'Open', 'High', 'Low': thousands separators (',') are removed.
        - 'Vol.': a trailing 'K' / 'M' / 'B' suffix is expanded
          (x1e3 / x1e6 / x1e9); the placeholder '-' becomes 0.0.
        - 'Change %': the '%' sign is removed and the value is divided by 100.

    Args:
        input (pd.DataFrame): table containing the six columns listed above.

    Returns:
        pd.DataFrame: a copy of `input` with those columns converted to floats.
        Other columns are left untouched and `input` itself is not modified.

    Raises:
        KeyError: if one of the expected columns is missing.
        ValueError: if a cell cannot be parsed as a number.
    """
    suffix_scale = {'K': 1e3, 'M': 1e6, 'B': 1e9}

    def parse_number(text):
        # '4,744.0' -> 4744.0
        return float(text.replace(',', ''))

    def parse_volume(text):
        # '-' marks a day without a reported volume.
        if text == '-':
            return 0.0
        text = text.strip()
        scale = suffix_scale.get(text[-1:].upper())
        if scale is None:
            # No magnitude suffix: the value is already in units, do not rescale it.
            return parse_number(text)
        return parse_number(text[:-1]) * scale

    def parse_percent(text):
        # '-0.72%' -> -0.0072
        return float(text.replace('%', '')) / 100

    converters = {
        'Price': parse_number,
        'Open': parse_number,
        'High': parse_number,
        'Low': parse_number,
        'Vol.': parse_volume,
        'Change %': parse_percent,
    }

    df = input.copy()
    for col, convert in converters.items():
        # Read from the local copy, never from a notebook-level global, so the
        # result depends only on the argument that was passed in.
        df[col] = df[col].map(convert)
    return df

In [201]:
def data_preprocessing(raw_data):
    """
    Turn the raw price table into a numeric, chronologically ordered frame.

    Steps, in order: parse 'Date', derive 'Year' / 'Month' / 'Day' columns from
    it, convert the string columns via `change_data_type`, sort by date with a
    fresh 0..n-1 index, and finally drop the 'Date' column.

    Args:
        raw_data (pd.DataFrame): raw table with a 'Date' column plus the
            columns `change_data_type` expects.

    Returns:
        pd.DataFrame: processed copy; `raw_data` is not modified.
    """
    frame = raw_data.copy()

    # Parse once, then fan the timestamp out into calendar components.
    frame['Date'] = pd.to_datetime(frame['Date'])
    calendar = frame['Date'].dt
    frame['Year'] = calendar.year
    frame['Month'] = calendar.month
    frame['Day'] = calendar.day

    typed = change_data_type(frame)

    # Oldest row first; the date itself is only needed for ordering.
    ordered = typed.sort_values('Date').reset_index(drop=True)
    return ordered.drop(columns='Date')

In [202]:
# Replace the raw frame with its processed version.
# NOTE(review): this rebinds `data`, so the cell is not idempotent — data_preprocessing
# drops the 'Date' column, and running it again on the already-processed frame fails
# when it looks up data['Date']. Reload the CSV before re-running this cell.
data = data_preprocessing(data)

In [203]:
# Display the processed frame (the rendered output below is truncated to head and tail rows).
data

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Year,Month,Day
0,4744.0,4748.0,4752.0,4734.0,170.0,0.0013,2020,1,2
1,4740.0,4738.0,4750.0,4730.0,160.0,-0.0008,2020,1,3
2,4706.0,4722.0,4734.0,4688.0,740.0,-0.0072,2020,1,6
3,4704.0,4698.0,4720.0,4688.0,590.0,-0.0004,2020,1,7
4,4764.0,4718.0,4800.0,4718.0,1520.0,0.0128,2020,1,8
...,...,...,...,...,...,...,...,...,...
1138,5626.0,5616.0,5670.0,5612.0,360.0,-0.0004,2024,9,10
1139,5546.0,5602.0,5624.0,5502.0,2180.0,-0.0142,2024,9,11
1140,5536.0,5566.0,5582.0,5510.0,180.0,-0.0018,2024,9,12
1141,5552.0,5532.0,5578.0,5526.0,310.0,0.0029,2024,9,13


### Prepare dataset: TensorDataset

In [204]:
# Split the processed frame into model inputs (every column except 'Price')
# and the regression target ('Price').
df_features = data.drop('Price', axis=1)
df_label = data['Price']

# Column selectors reused by the dataset classes below.
# `features` is a pandas Index (not a plain list); `label` is a one-element list,
# so selecting with it yields a single-column DataFrame rather than a Series.
features = df_features.columns
label = ['Price']

In [205]:
# Display the feature frame to check that 'Price' was removed.
df_features

Unnamed: 0,Open,High,Low,Vol.,Change %,Year,Month,Day
0,4748.0,4752.0,4734.0,170.0,0.0013,2020,1,2
1,4738.0,4750.0,4730.0,160.0,-0.0008,2020,1,3
2,4722.0,4734.0,4688.0,740.0,-0.0072,2020,1,6
3,4698.0,4720.0,4688.0,590.0,-0.0004,2020,1,7
4,4718.0,4800.0,4718.0,1520.0,0.0128,2020,1,8
...,...,...,...,...,...,...,...,...
1138,5616.0,5670.0,5612.0,360.0,-0.0004,2024,9,10
1139,5602.0,5624.0,5502.0,2180.0,-0.0142,2024,9,11
1140,5566.0,5582.0,5510.0,180.0,-0.0018,2024,9,12
1141,5532.0,5578.0,5526.0,310.0,0.0029,2024,9,13


In [206]:
# Convert features and target to float32 tensors.
# X is 2-D (rows x feature columns); y is 1-D because df_label is a single Series.
X = torch.tensor(df_features.to_numpy(), dtype=torch.float32)
y = torch.tensor(df_label.to_numpy(), dtype=torch.float32)

In [207]:
# Pair every feature row with its target value.
# NOTE(review): despite the `df_` prefix this is a TensorDataset, not a DataFrame,
# and the DataModule cell further down is constructed from `data`, not from this object.
df_train = TensorDataset(X, y)

### Prepare dataset: Custom Dataset

In [208]:
class CustomDataset(Dataset):
    """
    Map-style dataset exposing selected DataFrame columns as float32 tensors.

    The whole frame is converted once in `__init__`; `__getitem__` only indexes
    the pre-built tensors.
    """

    def __init__(self, df_input: pd.DataFrame, features: list, label: 'str | list', accelerator='cpu'):
        """
        Args:
            df_input (pd.DataFrame): input DataFrame.
            features (list): column names to use as features.
            label (str | list): column name(s) to use as the target. A single
                name gives a 1-D target tensor; a list of names gives a 2-D
                tensor with one column per name.
            accelerator (str | torch.device): device the tensors are moved to.
        """
        # torch.tensor with an explicit dtype replaces the legacy
        # torch.FloatTensor constructor and always copies, so the tensors never
        # alias the DataFrame's memory.
        self.features = torch.tensor(
            df_input[features].to_numpy(), dtype=torch.float32
        ).to(accelerator)
        self.label = torch.tensor(
            df_input[label].to_numpy(), dtype=torch.float32
        ).to(accelerator)

    def __len__(self):
        """
        Returns the total number of samples (rows).
        """
        return len(self.features)

    def __getitem__(self, index):
        """
        Retrieve one sample at the given index.

        Args:
            index (int): position of the sample to retrieve.

        Returns:
            tuple(features, target): both as tensors.
        """
        features = self.features[index]
        target = self.label[index]
        return features, target

In [211]:
class DataModule(nn.Module):
    """
    Builds per-fold train/validation DataLoaders for walk-forward cross-validation.

    Fold boundaries come from sklearn's TimeSeriesSplit, so the rows of
    `df_train` must already be in chronological order.

    NOTE(review): this class holds no parameters or sub-modules; the nn.Module
    base is kept only so existing isinstance checks keep working.
    """

    def __init__(self, df_train, batch_size, features, label, N_fold, df_valid=None, accelerator='cpu'):
        """
        Args:
            df_train (pd.DataFrame): chronologically ordered data to split into folds.
            batch_size (int): batch size used by every DataLoader.
            features (list): feature column names.
            label (str | list): target column name(s).
            N_fold (int): number of time-series folds.
            df_valid (pd.DataFrame, optional): stored on the instance but not
                used by any method of this class.
            accelerator (str | torch.device): device the dataset tensors are moved to.
        """
        super().__init__()
        self.df = df_train
        self.batch_size = batch_size
        self.df_valid = df_valid
        self.accelerator = accelerator

        # Datasets are built lazily by train_loader / valid_loader.
        self.train_dataset = None
        self.valid_dataset = None

        self.features = features
        self.label = label
        self.N_fold = N_fold

        self.setup()

    def setup(self, test_days=30):
        """
        Compute the positional train/validation row indices of every fold.

        Args:
            test_days (int): number of rows in each fold's validation window.

        Populates `self.index_dict` as
        {fold: {"train_idx": ndarray, "val_idx": ndarray}}.
        """
        tss = TimeSeriesSplit(n_splits=self.N_fold, test_size=test_days)
        # Split the frame this module was given, not a notebook-level global.
        self.index_dict = {
            fold: {"train_idx": train_idx, "val_idx": val_idx}
            for fold, (train_idx, val_idx) in enumerate(tss.split(self.df))
        }

    def _make_dataset(self, fold, split_key):
        """Build the CustomDataset for one side ('train_idx' / 'val_idx') of a fold."""
        positions = self.index_dict[fold][split_key]
        # TimeSeriesSplit yields row positions, so select with iloc; matching them
        # against index labels is only correct for a default RangeIndex.
        return CustomDataset(
            self.df.iloc[positions],
            features=self.features,
            label=self.label,
            accelerator=self.accelerator
        )

    def train_loader(self, fold, num_workers=0):
        """
        Build the training DataLoader of `fold` and keep its dataset in
        `self.train_dataset`.

        Args:
            fold (int): fold number, 0 .. N_fold - 1 (KeyError otherwise).
            num_workers (int): forwarded to DataLoader.

        Returns:
            DataLoader: unshuffled, so rows keep their chronological order.
        """
        self.train_dataset = self._make_dataset(fold, 'train_idx')
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=num_workers)

    def valid_loader(self, fold, num_workers=0):
        """
        Build the validation DataLoader of `fold` and keep its dataset in
        `self.valid_dataset`.

        Args:
            fold (int): fold number, 0 .. N_fold - 1 (KeyError otherwise).
            num_workers (int): forwarded to DataLoader.

        Returns:
            DataLoader: unshuffled, so rows keep their chronological order.
        """
        self.valid_dataset = self._make_dataset(fold, 'val_idx')
        # Must be a DataLoader: Dataset(...) cannot be constructed with these arguments.
        return DataLoader(self.valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=num_workers)


In [212]:
# Number of walk-forward folds.
N_fold = 5
# The constructor calls setup(), so the fold indices are computed right here.
# `features` / `label` are the column selectors defined in the TensorDataset section.
data_module = DataModule(df_train=data, batch_size=500, features=features, label=label, N_fold=N_fold, accelerator='cpu')

In [213]:
# Build the training loader of every fold as a smoke test.
# The returned loaders are discarded; each call overwrites data_module.train_dataset,
# so after the loop it holds the training set of the last fold only.
for fold in range(N_fold):
    data_module.train_loader(fold)

In [214]:
# Shape (rows, feature columns) of the training tensor left over from the last fold built above.
data_module.train_dataset.features.shape

torch.Size([1113, 8])