In [1]:
import torch
import mlflow
import datetime
import logging
import yaml

from dataclasses import dataclass, field
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from torchmetrics import Metric, MetricCollection
from torchmetrics.classification import (
    BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinaryMatthewsCorrCoef,
    MulticlassAccuracy, MulticlassAUROC, MulticlassF1Score)
from pathlib import Path

from src.datasets.dual_input import DualInputSequenceDataset
from src.models.gru import GRUModel
from src.data.pipeline import IngestionPipeline
from src.train import train_model
from src.utils.utils import CustomReduceLROnPlateau, collate_with_macro

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

def load_yaml_file(path):
    with open(path) as stream:
        try:
            config_dict=yaml.safe_load(stream)
            return config_dict
        except yaml.YAMLError as e:
            TypeError(f"Config file could not be loaded: {e}")
    

@dataclass
class TrainConfig:
    firm_data: str
    macro_data: list[str]
    bankruptcy_col: str
    company_col: str
    revenue_cap: int = 3_000
    num_classes: int = 2
    batch_size: int = 32
    epochs: int = 40
    lr: float = 1e-3
    hidden_size: int = 64
    num_layers: int = 2
    dropout: float = .2
    threshold: float = 0.5
    scheduler_factor: float=0.85
    scheduler_patience: int = 50
    min_lr:float = 0.0
    decay_ih: float = 1e-5
    decay_hh: float = 1e-5
    decay_other: float = 1e-5
    train_fract: float = .8
    seed: int = 42
    device: str = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
    metrics: list[str] = field(default_factory=lambda: ["f1", "accuracy"])
    
    def get_metrics(self) -> MetricCollection:
        """Constructs a MetricCollection from the specified config"""
        if self.num_classes == 2:
            available = {
                "f1": BinaryF1Score(self.threshold),
                "accuracy": BinaryAccuracy(self.threshold),
                "auroc": BinaryAUROC(),
                "matthews": BinaryMatthewsCorrCoef(self.threshold)
            }
        else:
            available = {
                "f1": MulticlassF1Score(num_classes=self.num_classes),
                "accuracy": MulticlassAccuracy(num_classes=self.num_classes),
                "auroc": MulticlassAUROC(num_classes=self.num_classes)
            }
        selected = {k: available[k] for k in self.metrics if k in available}
        return MetricCollection(selected)
    
def _make_class_weights(labels: torch.Tensor, num_classes:int) -> torch.Tensor:
    """Helper function to compute class weights."""
    counts = torch.bincount(labels.long(), minlength=num_classes)
    weights = counts.sum() / (num_classes * counts.float())
    return weights
    
def train_model_from_config(cfg: TrainConfig) -> GRUModel:
    """Main training function"""
    return train(
        company_data_path = Path(cfg.firm_data),
        macro_data_path = [Path(path) for path in cfg.macro_data],
        bankruptcy_col = cfg.bankruptcy_col,
        company_col=cfg.company_col,
        revenue_cap=cfg.revenue_cap,
        metrics=cfg.get_metrics().to(cfg.device),
        device=cfg.device,
        num_layers=cfg.num_classes,
        hidden_size=cfg.hidden_size,
        output_size=cfg.num_classes,
        epochs=cfg.epochs,
        lr=cfg.lr,
        train_fract=cfg.train_fract,
        dropout=cfg.dropout,
        scheduler_factor=cfg.scheduler_factor,
        scheduler_patience=cfg.scheduler_patience,
        decay_ih=cfg.decay_ih,
        decay_hh=cfg.decay_hh,
        decay_other=cfg.decay_other,
        seed=cfg.seed
    )

def train(
    company_data_path: str,
    macro_data_path: list[str],
    bankruptcy_col: str,
    company_col: str,
    revenue_cap: int,
    metrics: list[Metric],
    seed: int,
    num_layers: int = 2,
    hidden_size: int = 64,
    output_size: int = 1,
    epochs: int = 50,
    lr: float = 1e-3,
    train_fract: float = 0.8,
    dropout: float = 0.2,
    scheduler_factor: float = 0.85,
    scheduler_patience: int = 50,
    min_lr: float = 0.0,
    decay_ih:float = 1e-5,
    decay_hh:float = 1e-5,
    decay_other:float = 1e-5,
    device: str="cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
):  
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    
    ingestion = IngestionPipeline(
        company_data_path=company_data_path,
        macro_data_path=macro_data_path,
        company_col=company_col,
        bankruptcy_col=bankruptcy_col,
        sheet_name="Results",
        revenue_cap=revenue_cap
    )
    ingestion.load()
    series = ingestion.process_data()
    financials, macro, labels = series.export_tensors()
    
    dataset = DualInputSequenceDataset(
        firm_tensor = financials,
        macro_tensor = macro,
        labels = labels
    )
    
    train_ds, val_ds, seed = dataset.stratified_split(train_fract)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_with_macro)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_with_macro)

    logger.info(f"Device: {device}")
    
    metrics_dict = dict()
    for metric in metrics:
        metrics_dict[metric._get_name()] = metric.to(device)
    metrics = metrics_dict 
    
    train_ds = train_ds.to_device(device)
    val_ds = val_ds.to_device(device)
    
    firm_input_size, macro_input_size = dataset.input_dims()
    
    mlflow.set_tracking_uri('http://127.0.0.1:8080')
    mlflow.set_experiment('bankruptcy-predictions')
    
    mlflow.log_param("seed", seed)
    
    with mlflow.start_run():
        model = GRUModel(
            firm_input_size=firm_input_size,
            macro_input_size=macro_input_size,
            hidden_size=hidden_size,
            output_size=output_size,
            num_layers=num_layers,
            dropout=dropout
        )
        
        model = model.to(device)
        
        pos_weight = dataset.pos_weight()
        loss_fn = BCEWithLogitsLoss(pos_weight=pos_weight)
        
        # Logging hyperparameters
        mlflow.log_param("hidden_size", hidden_size)
        mlflow.log_param("output_size", output_size)
        mlflow.log_param("num_layers", num_layers)
        mlflow.log_param("dropout", dropout)
        mlflow.log_param("lr", lr)

        ih_params = []
        hh_params = []
        other_params = []

        for name, param in model.named_parameters():
            if 'weight_ih' in name:
                ih_params.append(param)
            elif 'weight_hh' in name:
                hh_params.append(param)
            else:
                other_params.append(param)
        
        optimizer = Adam([
                {'params': ih_params, 'weight_decay': decay_ih},
                {'params': hh_params, 'weight_decay': decay_hh},
                {'params': other_params, 'weight_decay': decay_other},
            ], lr=lr
        )
        scheduler=CustomReduceLROnPlateau(
            optimizer=optimizer,
            mode="min",
            factor=scheduler_factor,
            patience=scheduler_patience,
            min_lr=min_lr
        )
        
        train_model(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            scheduler=scheduler,
            device=device,
            epochs=epochs,
            metrics=metrics
        )
        
        mlflow.pytorch.log_model(model, f"model_{datetime.datetime.now()}")
        torch.save(obj = model.state_dict(), f = f"model_{datetime.datetime.now()}.pth")
    
    return model

In [2]:
config_dict = load_yaml_file("config/model_config.yml")
cfg = TrainConfig(**config_dict)

company_data_path = Path(cfg.firm_data)
macro_data_path = [Path(path) for path in cfg.macro_data]
bankruptcy_col = str(cfg.bankruptcy_col)
company_col=str(cfg.company_col)
revenue_cap=int(cfg.revenue_cap)
metrics=cfg.get_metrics().to(cfg.device)
device=str(cfg.device)
num_layers=int(cfg.num_classes)
hidden_size=int(cfg.hidden_size)
output_size=1
epochs=int(cfg.epochs)
lr=float(cfg.lr)
train_fract=float(cfg.train_fract)
dropout=int(cfg.dropout)
scheduler_factor=float(cfg.scheduler_factor)
scheduler_patience=int(cfg.scheduler_patience)
decay_ih=float(cfg.decay_ih)
decay_hh=float(cfg.decay_hh)
decay_other=float(cfg.decay_other)
seed=int(cfg.seed)

ingestion = IngestionPipeline(
    company_path=company_data_path,
    macro_paths=macro_data_path,
    company_col=company_col,
    bankruptcy_col=bankruptcy_col,
    revenue_cap=revenue_cap
)

In [3]:
ingestion.run()

INFO:src.data.loaders:Reading file: data/demo_data.xlsx
INFO:src.data.loaders:Dropping high-revenue outliers...
INFO:src.data.loaders:Loading 3 macroeconomic series...
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/h1/hrjhnsw55w3fh7wq8fc7_bcm0000gn/T/tmpb0h5k1st/tq29nhl7.json
DEBUG:cmdstanpy:input tempfile: /var/folders/h1/hrjhnsw55w3fh7wq8fc7_bcm0000gn/T/tmpb0h5k1st/nwfxs8k3.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/guillaumedecina-halmi/miniforge3/lib/python3.12/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=4432', 'data', '

In [4]:
X, M, y = ingestion.get_tensors()

INFO:src.data.tensor_factory:Converting financial series to tensors...
INFO:src.data.tensor_factory:Scaling financial data with RobustScaler...
INFO:src.data.tensor_factory:Shaped financial data tensor: (6338, 3, 4)
INFO:src.data.tensor_factory:Shaped macro data tensor: torch.Size([3, 422])


In [5]:
dataset = DualInputSequenceDataset(
        firm_tensor = X,
        macro_tensor = M,
        labels = y
    )

train_ds, val_ds, seed = dataset.stratified_split(train_fract)

In [6]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_with_macro)
val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_with_macro)

In [7]:
metrics.to(device)
train_ds = train_ds.to_device(device)
val_ds = val_ds.to_device(device)

firm_input_size, macro_input_size = dataset.input_dims()

mlflow.set_tracking_uri('http://127.0.0.1:8080')
mlflow.set_experiment('bankruptcy-predictions')

mlflow.log_param("seed", seed)

2947

In [16]:
mlflow.end_run()

🏃 View run caring-tern-39 at: http://127.0.0.1:8080/#/experiments/387584985157093548/runs/53ffe63e18164788be8088bf5f677b88
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/387584985157093548


In [9]:
with mlflow.start_run():
    model = GRUModel(
        firm_input_size=firm_input_size,
        macro_input_size=macro_input_size,
        hidden_size=hidden_size,
        output_size=output_size,
        num_layers=num_layers,
        dropout=dropout
    )
    
    model = model.to(device)
    
    pos_weight = dataset.pos_weight()
    loss_fn = BCEWithLogitsLoss(pos_weight=pos_weight)
    
    # Logging hyperparameters
    mlflow.log_param("hidden_size", hidden_size)
    mlflow.log_param("output_size", output_size)
    mlflow.log_param("num_layers", num_layers)
    mlflow.log_param("dropout", dropout)
    mlflow.log_param("lr", lr)

    ih_params = []
    hh_params = []
    other_params = []

    for name, param in model.named_parameters():
        if 'weight_ih' in name:
            ih_params.append(param)
        elif 'weight_hh' in name:
            hh_params.append(param)
        else:
            other_params.append(param)
    
    optimizer = Adam([
            {'params': ih_params, 'weight_decay': decay_ih},
            {'params': hh_params, 'weight_decay': decay_hh},
            {'params': other_params, 'weight_decay': decay_other},
        ], lr=lr
    )
    scheduler=CustomReduceLROnPlateau(
        optimizer=optimizer,
        mode="min",
        factor=scheduler_factor,
        patience=scheduler_patience,
        min_lr=0.0
    )
    
    train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
        epochs=epochs,
        metrics=metrics
    )
    
    mlflow.pytorch.log_model(model, f"model_{datetime.datetime.now()}")
    torch.save(obj = model.state_dict(), f = f"model_{datetime.datetime.now()}.pth")

Epoch 10/100 | Loss: 1.00572 | ACCURACY: 0.70158 | AUROC: 0.82659 | F1: 0.07688 | MATTHEWS: 0.13534 | LR: 0.00100:  10%|█         | 10/100 [00:39<05:51,  3.90s/it]

Early stopping at epoch 10





🏃 View run painted-sheep-140 at: http://127.0.0.1:8080/#/experiments/387584985157093548/runs/bffc2f977fb04db49aa670bd988c60a5
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/387584985157093548


In [10]:
print(X.shape)
print(M.shape)
print(y.shape)

(6338, 3, 4)
torch.Size([3, 422])
(6338,)


In [14]:
import torch
import mlflow
import datetime
import logging
import yaml

from dataclasses import dataclass, field
from torch.optim import Adam
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import DataLoader
from torchmetrics import Metric, MetricCollection
from torchmetrics.classification import (
    BinaryAccuracy, BinaryAUROC, BinaryF1Score, BinaryMatthewsCorrCoef,
    MulticlassAccuracy, MulticlassAUROC, MulticlassF1Score)
from pathlib import Path

from src.datasets.dual_input import DualInputSequenceDataset
from src.models.gru import GRUModel
from src.data.pipeline import IngestionPipeline
from src.train import train_model
from src.utils.utils import CustomReduceLROnPlateau, collate_with_macro

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

def load_yaml_file(path):
    with open(path) as stream:
        try:
            config_dict=yaml.safe_load(stream)
            return config_dict
        except yaml.YAMLError as e:
            TypeError(f"Config file could not be loaded: {e}")
    

@dataclass
class TrainConfig:
    firm_data: str
    macro_data: list[str]
    bankruptcy_col: str
    company_col: str
    revenue_cap: int = 3_000
    num_classes: int = 2
    batch_size: int = 32
    epochs: int = 40
    lr: float = 1e-3
    hidden_size: int = 64
    num_layers: int = 2
    dropout: float = .2
    threshold: float = 0.5
    scheduler_factor: float=0.85
    scheduler_patience: int = 50
    min_lr:float = 0.0
    decay_ih: float = 1e-5
    decay_hh: float = 1e-5
    decay_other: float = 1e-5
    train_fract: float = .8
    seed: int = 42
    device: str = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
    metrics: list[str] = field(default_factory=lambda: ["f1", "accuracy"])
    
    def get_metrics(self) -> MetricCollection:
        """Constructs a MetricCollection from the specified config"""
        if self.num_classes == 2:
            available = {
                "f1": BinaryF1Score(self.threshold),
                "accuracy": BinaryAccuracy(self.threshold),
                "auroc": BinaryAUROC(),
                "matthews": BinaryMatthewsCorrCoef(self.threshold)
            }
        else:
            available = {
                "f1": MulticlassF1Score(num_classes=self.num_classes),
                "accuracy": MulticlassAccuracy(num_classes=self.num_classes),
                "auroc": MulticlassAUROC(num_classes=self.num_classes)
            }
        selected = {k: available[k] for k in self.metrics if k in available}
        return MetricCollection(selected)
    
def train_model_from_config(cfg: TrainConfig) -> GRUModel:
    """Main training function"""
    return train(
        company_path = Path(cfg.firm_data),
        macro_paths = [Path(path) for path in cfg.macro_data],
        bankruptcy_col = str(cfg.bankruptcy_col),
        company_col=str(cfg.company_col),
        revenue_cap=int(cfg.revenue_cap),
        metrics=cfg.get_metrics().to(cfg.device),
        device=str(cfg.device),
        num_layers=int(cfg.num_classes),
        hidden_size=int(cfg.hidden_size),
        output_size=1,
        epochs=int(cfg.epochs),
        lr=float(cfg.lr),
        train_fract=float(cfg.train_fract),
        dropout=int(cfg.dropout),
        scheduler_factor=float(cfg.scheduler_factor),
        scheduler_patience=int(cfg.scheduler_patience),
        decay_ih=float(cfg.decay_ih),
        decay_hh=float(cfg.decay_hh),
        decay_other=float(cfg.decay_other),
        seed=int(cfg.seed)
    )

def train(
    company_path: str,
    macro_paths: list[str],
    bankruptcy_col: str,
    company_col: str,
    revenue_cap: int,
    metrics: list[Metric],
    seed: int,
    num_layers: int = 2,
    hidden_size: int = 64,
    output_size: int = 1,
    epochs: int = 50,
    lr: float = 1e-3,
    train_fract: float = 0.8,
    dropout: float = 0.2,
    scheduler_factor: float = 0.85,
    scheduler_patience: int = 50,
    min_lr: float = 0.0,
    decay_ih:float = 1e-5,
    decay_hh:float = 1e-5,
    decay_other:float = 1e-5,
    device: str="cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"
):  
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)
    
    ingestion = IngestionPipeline(
        company_path=company_path,
        macro_paths=macro_paths,
        company_col=company_col,
        bankruptcy_col=bankruptcy_col,
        revenue_cap=revenue_cap
    )
    
    ingestion.run()
    X, M, y = ingestion.get_tensors()
    
    dataset = DualInputSequenceDataset(
        firm_tensor = X,
        macro_tensor = M,
        labels = y
    )
    
    train_ds, val_ds, seed = dataset.stratified_split(train_fract)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, collate_fn=collate_with_macro)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False, collate_fn=collate_with_macro)

    logger.info(f"Device: {device}")
    
    metrics.to(device)
    train_ds = train_ds.to_device(device)
    val_ds = val_ds.to_device(device)
    
    firm_input_size, macro_input_size = dataset.input_dims()
    
    mlflow.set_tracking_uri('http://127.0.0.1:8080')
    mlflow.set_experiment('bankruptcy-predictions')
    
    mlflow.log_param("seed", seed)
    
    with mlflow.start_run():
        model = GRUModel(
            firm_input_size=firm_input_size,
            macro_input_size=macro_input_size,
            hidden_size=hidden_size,
            output_size=output_size,
            num_layers=num_layers,
            dropout=dropout
        )
        
        model = model.to(device)
        
        pos_weight = dataset.pos_weight()
        loss_fn = BCEWithLogitsLoss(pos_weight=pos_weight)
        
        # Logging hyperparameters
        mlflow.log_param("hidden_size", hidden_size)
        mlflow.log_param("output_size", output_size)
        mlflow.log_param("num_layers", num_layers)
        mlflow.log_param("dropout", dropout)
        mlflow.log_param("lr", lr)

        ih_params = []
        hh_params = []
        other_params = []

        for name, param in model.named_parameters():
            if 'weight_ih' in name:
                ih_params.append(param)
            elif 'weight_hh' in name:
                hh_params.append(param)
            else:
                other_params.append(param)
        
        optimizer = Adam([
                {'params': ih_params, 'weight_decay': decay_ih},
                {'params': hh_params, 'weight_decay': decay_hh},
                {'params': other_params, 'weight_decay': decay_other},
            ], lr=lr
        )
        scheduler=CustomReduceLROnPlateau(
            optimizer=optimizer,
            mode="min",
            factor=scheduler_factor,
            patience=scheduler_patience,
            min_lr=min_lr
        )
        
        train_model(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            loss_fn=loss_fn,
            optimizer=optimizer,
            scheduler=scheduler,
            device=device,
            epochs=epochs,
            metrics=metrics
        )
        
        mlflow.pytorch.log_model(model, f"model_{datetime.datetime.now()}")
        torch.save(obj = model.state_dict(), f = f"model_{datetime.datetime.now()}.pth")
    
    return model

In [12]:
cfg

TrainConfig(firm_data='data/demo_data.xlsx', macro_data=['insee/serie_000857176_04042025/valeurs_mensuelles.csv', 'insee/serie_000857180_04042025/valeurs_mensuelles.csv', 'insee/serie_001763782_04042025/valeurs_mensuelles.csv'], bankruptcy_col='Status date', company_col='Company name Latin alphabet', revenue_cap=3000, num_classes=2, batch_size=32, epochs=100, lr='1e-3', hidden_size=64, num_layers=2, dropout=0.4, threshold=0.4, scheduler_factor=0.85, scheduler_patience=50, min_lr=0.0, decay_ih='1e-5', decay_hh='1e-4', decay_other='1e-5', train_fract=0.8, seed=2025, device='mps', metrics=['f1', 'accuracy', 'auroc', 'matthews'])

In [26]:
train_model_from_config(cfg)

INFO:src.data.loaders:Reading file: data/demo_data.xlsx
INFO:src.data.loaders:Dropping high-revenue outliers...
INFO:src.data.loaders:Loading 3 macroeconomic series...
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/h1/hrjhnsw55w3fh7wq8fc7_bcm0000gn/T/tmpb0h5k1st/v3to4gc7.json
DEBUG:cmdstanpy:input tempfile: /var/folders/h1/hrjhnsw55w3fh7wq8fc7_bcm0000gn/T/tmpb0h5k1st/dtv1fhag.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/guillaumedecina-halmi/miniforge3/lib/python3.12/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=15692', 'data', 

Exception: Run with UUID 06653c474f224c5086793f7ff21e61b6 is already active. To start a new run, first end the current run with mlflow.end_run(). To start a nested run, call start_run with nested=True

In [27]:
mlflow.end_run()

🏃 View run painted-wren-983 at: http://127.0.0.1:8080/#/experiments/387584985157093548/runs/06653c474f224c5086793f7ff21e61b6
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/387584985157093548
