In [1]:
import os
import pandas as pd
from app.utils.data_loader import load_csv
import re
from configs import Configs
import torch
import torch.nn as nn
from torch.utils.data import  TensorDataset, Dataset, DataLoader

from sklearn.model_selection import TimeSeriesSplit

from pytorch_lightning import LightningDataModule, LightningModule, Trainer

In [2]:
file_path = os.path.join(Configs.HISTORICAL_DATA_DIR, 'Bleached Softwood Kraft Pulp Futures Historical Data.csv')
data = load_csv(file_path)

In [3]:
class TrainingConfig:
    def __init__(self):

        self.usegpu = True
        self.seed = 42
        self.batch_size = 300
        self.lr = 1e-3
        self.n_hidden = [16, 32, 32, 16]
        self.epochs = 100
        self.N_fold = 5
        self.weight_decay = 1e-4

config = TrainingConfig()

In [4]:
# for defining the variable type
from dataclasses import dataclass, field

@dataclass
class Product:
    name: str
    price: int
    qty: int

    # withi field(init=False), amount is not required when init
    amount: int = field(init=False)

    def __post_init__(self):
        self.amount = self.price * self.qty

itemA = Product(name='itemA', price=100, qty=2)
print(itemA)

Product(name='itemA', price=100, qty=2, amount=200)


In [5]:
deivce = torch.device('cuda' if torch.cuda.is_available() and config.usegpu else 'cpu' )

In [None]:
def change_data_type(input: pd.DataFrame):

    df = input.copy()
    cols = ['Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']

    for col in cols:
        if col == 'Vol.':
            df[col] = df[col].apply(lambda x: float(re.sub('K', '', x))*1000 if x != '-' else 0)
        elif col == 'Change %':
            df[col] = df[col].apply(lambda x: float(re.sub('%', '', x)) /100)
        else:
            df[col] = df[col].apply(lambda x: float(re.sub(',', '', x)))
    return df

In [7]:
def data_preprocessing(raw_data):
    data = raw_data.copy()

    data['Date'] = pd.to_datetime(data['Date'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day

    data = change_data_type(data)

    data = data.sort_values('Date').reset_index(drop=True)


    data = data.drop('Date', axis=1)

    return data

In [8]:
data = data_preprocessing(data)

In [9]:
data

Unnamed: 0,Price,Open,High,Low,Vol.,Change %,Year,Month,Day
0,4744.0,4748.0,4752.0,4734.0,170.0,0.0013,2020,1,2
1,4740.0,4738.0,4750.0,4730.0,160.0,-0.0008,2020,1,3
2,4706.0,4722.0,4734.0,4688.0,740.0,-0.0072,2020,1,6
3,4704.0,4698.0,4720.0,4688.0,590.0,-0.0004,2020,1,7
4,4764.0,4718.0,4800.0,4718.0,1520.0,0.0128,2020,1,8
...,...,...,...,...,...,...,...,...,...
1138,5626.0,5616.0,5670.0,5612.0,360.0,-0.0004,2024,9,10
1139,5546.0,5602.0,5624.0,5502.0,2180.0,-0.0142,2024,9,11
1140,5536.0,5566.0,5582.0,5510.0,180.0,-0.0018,2024,9,12
1141,5552.0,5532.0,5578.0,5526.0,310.0,0.0029,2024,9,13


### Prepare dataset: TensorDataset

In [10]:
df_features = data.drop('Price', axis=1)
df_label = data['Price']

features = df_features.columns
label = ['Price']

In [11]:
df_features

Unnamed: 0,Open,High,Low,Vol.,Change %,Year,Month,Day
0,4748.0,4752.0,4734.0,170.0,0.0013,2020,1,2
1,4738.0,4750.0,4730.0,160.0,-0.0008,2020,1,3
2,4722.0,4734.0,4688.0,740.0,-0.0072,2020,1,6
3,4698.0,4720.0,4688.0,590.0,-0.0004,2020,1,7
4,4718.0,4800.0,4718.0,1520.0,0.0128,2020,1,8
...,...,...,...,...,...,...,...,...
1138,5616.0,5670.0,5612.0,360.0,-0.0004,2024,9,10
1139,5602.0,5624.0,5502.0,2180.0,-0.0142,2024,9,11
1140,5566.0,5582.0,5510.0,180.0,-0.0018,2024,9,12
1141,5532.0,5578.0,5526.0,310.0,0.0029,2024,9,13


In [12]:
X = torch.tensor(df_features.to_numpy(), dtype=torch.float32)
y = torch.tensor(df_label.to_numpy(), dtype=torch.float32)

In [13]:
df_train = TensorDataset(X, y)

Prepare dataset: Custom Dataset

In [14]:
class CustomDataset(Dataset):
    def __init__(self, df_input: pd.DataFrame, features: list, label: str, accelerator='cpu'):
        """
        Args:
            df_input (pd.DataFrame): input DataFrame
            features (list): list of column names to use as features.
            label (str): column name to use as the target.
        """
        self.features = torch.FloatTensor(df_input[features].to_numpy()).to(accelerator)
        self.label = torch.FloatTensor(df_input[label].to_numpy()).to(accelerator)

    def __len__(self):
        """
        Returns the total number of data
        """
        return len(self.features)
    
    def __getitem__(self, index):
        """
        Retrieve one sample at the given index

        Args:
            idx(int): index of the sample to retrieve

        Returns:
            tuple(feature, target): as tensor
        """
        features = self.features[index]
        target = self.label[index]
        return features, target

In [15]:
class DataModule(nn.Module):
    def __init__(self, df_train, batch_size, features, label, N_fold, accelerator):
        super().__init__()
        self.df = df_train
        self.batch_size = batch_size
        self.accelerator = accelerator
        
        # initial the datasets as None
        self.tain_dataset = None
        self.valid_dataset = None

        self.features = features
        self.label = label
        self.N_fold = N_fold



        self.setup()

    def setup(self, test_days=30):    
        self.index_dict = {}
        tss = TimeSeriesSplit(n_splits=self.N_fold, test_size=test_days)
        for i, (train_idx, val_idx) in enumerate(tss.split(data)):
            self.index_dict[i] = {
                "train_idx": train_idx,
                "val_idx": val_idx
            }


    def train_loader(self, fold, num_workers=0):
        self.train_dataset = CustomDataset(
            self.df[self.df.index.isin(self.index_dict[fold]['train_idx'])],
            features=self.features,
            label=self.label,
            accelerator=self.accelerator
        )
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=num_workers)
    
    def valid_loader(self, fold, num_workers=0):
        self.valid_dataset = CustomDataset(
            self.df[self.df.index.isin(self.index_dict[fold]['val_idx'])],
            features=self.features,
            label=self.label,
            accelerator=self.accelerator
        )
        return DataLoader(self.valid_dataset, batch_size=self.batch_size, shuffle=False, num_workers=num_workers)


In [16]:
data_module = DataModule(df_train=data, batch_size=config.batch_size, features=features, label=label, N_fold=config.N_fold, accelerator=deivce)

In [17]:
for fold in range(config.N_fold):
    data_module.train_loader(fold)
    data_module.valid_loader(fold)

In [18]:
data_module.valid_dataset.features.shape

torch.Size([30, 8])

### Model

In [21]:
class NN(LightningModule):
    def __init__(self, input_size, hidden_dims, lr, weight_decay, dropouts=None):
        super().__init__()
        layers = []
        input_dim = input_size

        for i, hidden_dim in enumerate(hidden_dims):
            layers.append(nn.BatchNorm1d(input_dim))

            if i > 0:
                layers.append(nn.ReLU())

            layers.append(nn.Linear(input_dim, hidden_dim))

            # if i < len(dropouts):
            #     layers.append(nn.Dropout(dropouts[i]))

            input_dim = hidden_dim
            
        # now the input_dim is the final round of hidden layer
        layers.append(nn.Linear(input_dim, 1))

        self.model = nn.Sequential(*layers)
        self.lr = lr
        self.weight_decay = weight_decay

        self.criterion = nn.MSELoss()


    def forward(self, x):
        # 1 means run one times
        return self.model(x).squeeze(-1) * 1
    
    def training_step(self, batch):
        x, y = batch
        y_pred = self(x)

        loss = self.criterion(y_pred, y)

        self.log('train_loss', loss, on_step=False, on_epoch=True, batch_size=x.size(0))

        return loss
    

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        schedular = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.1, patience=5, verbose=True)

        return {
            'optimizer': optimizer,
            'lr_schedular': {
                'schedular': schedular,
                'monitor': 'val_loss'
            }
        }
    
    

### Train

In [24]:
for fold in range(config.N_fold):
    
    input_size = data_module.train_dataset.features.shape[1]

    model = NN(
        input_size=input_size,
        hidden_dims=config.n_hidden,
        lr=config.lr,
        weight_decay=config.weight_decay
    )

    test_trainer = Trainer(
        fast_dev_run=True,
    )

    test_trainer.fit(
        model,
        data_module.train_loader(fold),
        data_module.valid_loader(fold)
    )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\core\optimizer.py:377: Found unsupported keys in the optimizer configuration: {'lr_schedular'}

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 2.5 K  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
2.5 K     Trainable params
0         Non-trainable params
2.5 K     Total params
0.010     Total estimated model params size (MB)
14        M

Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 43.34it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 36.25it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\core\optimizer.py:377: Found unsupported keys in the optimizer configuration: {'lr_schedular'}

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 2.5 K  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
2.5 K     Trainable params
0         Non-trainable params
2.5 K     Total params
0.010     Total estimated model params size (MB)
14        M


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 41.54it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 38.33it/s]


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\core\optimizer.py:377: Found unsupported keys in the optimizer configuration: {'lr_schedular'}

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 2.5 K  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
2.5 K     Trainable params
0         Non-trainable params
2.5 K     Total params
0.010     Total estimated model params size (MB)
14        M

Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 52.65it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 45.45it/s]

GPU available: False, used: False





TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\core\optimizer.py:377: Found unsupported keys in the optimizer configuration: {'lr_schedular'}

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 2.5 K  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
2.5 K     Trainable params
0         Non-trainable params
2.5 K     Total params
0.010     Total estimated model params size (MB)
14        Modules in train mode
0         Mod

Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 49.19it/s]

  return F.mse_loss(input, target, reduction=self.reduction)
`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 43.02it/s]

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.





c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:68: You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\core\optimizer.py:377: Found unsupported keys in the optimizer configuration: {'lr_schedular'}

  | Name      | Type       | Params | Mode 
-------------------------------------------------
0 | model     | Sequential | 2.5 K  | train
1 | criterion | MSELoss    | 0      | train
-------------------------------------------------
2.5 K     Trainable params
0         Non-trainable params
2.5 K     Total params
0.010     Total estimated model params size (MB)
14        Modules in train mode
0         Modules in eval mode
c:\Users\sean.chang\AppData\Local\anaconda3\envs\kmp\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many worker

Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] 

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 12.58it/s]

`Trainer.fit` stopped: `max_steps=1` reached.


Epoch 0: 100%|██████████| 1/1 [00:00<00:00, 12.16it/s]


In [9]:
from enum import Enum

class LLMType(Enum):
    
    OpenAIChat = 'openai_chat'
    AzureOpenAIChat = 'azure_openai_chat'
    
    def __repr__(self) -> str:
        # return as a string, like "azure_openai_chat"
        return f"'{self.value}'"

In [10]:
LLMType.AzureOpenAIChat

'azure_openai_chat'

In [11]:
import os
from pydantic import BaseModel, Field
from llm.config.enum import LLMType

class LLMParameter(BaseModel):
    """LLM Parameters model."""
    
    type: LLMType = Field(
        description="The type of LLM model to use.", default=LLMType.AzureOpenAIChat
    )
    
    api_key: str | None = Field(
        description="The API key to use for the LLM service.",
        default_factory=lambda: os.getenv("OPENAI_API_KEY"),  # Load from .env by default
    )
    
    api_base: str | None = Field(
        description="The base URL for the LLM API.",
        default_factory=lambda: os.getenv("OPENAI_API_BASE"), 
    )
    
    api_version: str | None = Field(
        description="The version of the LLM API to use.",
        default_factory=lambda: os.getenv("OPENAI_API_VERSION"), 
    )
    
    deployment_name: str | None = Field(
        description="The deployment name to use for the LLM service.",
        default_factory=lambda: os.getenv("OPENAI_DEPLOYMENT_NAME"),  
    )
    
    temperature: float | None = Field(
        description="The temperature to use for token generation.",
        default=0,
    )


In [13]:
class Settings(BaseSettings):
    field_one: Optional[str]

    model_config = SettingsConfigDict(env_file='local.env')

In [14]:
settings = Settings()
settings.model_dump(mode='json')

{'field_one': 'one'}

In [3]:
from devtools import pformat
from pydantic import BaseModel, Field
from llm.config.enum import LLMType
from llm.config.llm_parameters import LLMParameters
import yaml
class LLMConfig(BaseModel):
    def __repr__(self) -> str:
        """Get a string representation."""
        return pformat(self, highlight=False)
    
    type: LLMType = Field(
        description='The type of LLM model to use', default=LLMType.AzureOpenAIChat
    )
    
    llm: LLMParameter = Field(
        description="The LLM configuration to use.", default_factory=LLMParameters
    )
    



def load_config_from_file(file_path: str) -> LLMConfig:
    """Load LLM configuration from a YAML file."""
    with open(file_path, "r") as file:
        config_data = yaml.safe_load(file)
    return LLMConfig(**config_data.get("llm", {}))

# Example usage
config = load_config_from_file("test.yaml")
print(config)


NameError: name 'LLMParameter' is not defined

In [4]:
from llm.config.enum import LLMType
from langchain_openai import AzureChatOpenAI
from llm.config.llm_config import LLMConfig


class LLMBase:
    def __init__(self, config: LLMConfig):
        self.config = config
    
    def get_llm(self, llm_type=LLMType.AzureOpenAIChat):
        if llm_type == LLMType.AzureOpenAIChat:
            self.llm = self.get_aoai_llm()
        else:
            raise NotImplementedError(f"LLM Type {llm_type} is not supported yet.")
        
    def get_aoai_llm(self):

        return AzureChatOpenAI(
            azure_endpoint=self.config.llm.api_base,
            openai_api_version=self.config.llm.api_version,
            azure_deployment=self.config.llm.deployment_name,
            openai_api_key=self.config.llm.api_key,
            temperature=self.config.llm.temperature
        )

In [9]:
from llm.bots.base import LLMBase
from llm.config.llm_config import LLMConfig
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts.chat import ChatPromptTemplate

class SummaryBot(LLMBase):
    def __init__(self, config: LLMConfig):
        super().__init__(config)
        print(config.llm.api_base)
        self.model = self.get_llm()

    def summarize(self, input_string, prompt):
        
        prompt = ChatPromptTemplate.from_messages(
            [
                (
                    "system", prompt
                ),
                (
                    "human","{input_string}"
                )
            ]
        )
        
        chain = (
            {"input_string": RunnablePassthrough()}
            | prompt
            | self.model
            | StrOutputParser()
        )
        
        return chain.invoke(input_string)

In [10]:
SummaryBot(config=LLMConfig)

<class 'llm.config.llm_config.LLMConfig'>


AttributeError: llm