In [48]:
import sys
from pathlib import Path

ROOT = Path.cwd().parent
sys.path.append(str(ROOT))

import numpy as np
import yaml

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit

from src.data.pipeline import IngestionPipeline
from src.utils.utils import TrainConfig

In [49]:
with open("../config/model_config.yml") as stream:
        config=yaml.safe_load(stream)
        config=TrainConfig(**config)

macro_paths = ["../" + path for path in config.macro_data]

pipeline = IngestionPipeline(
    company_path = "../" + config.firm_data,
    macro_paths = macro_paths,
    company_col = config.company_col,
    bankruptcy_col = config.bankruptcy_col
)

In [50]:
pipeline.run()
X, M, y = pipeline.get_tensors()

INFO:src.data.loaders:Reading file: ../data/demo_data.xlsx
INFO:src.data.loaders:Dropping high-revenue outliers...
INFO:src.data.loaders:Loading 3 macroeconomic series...
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
  df["Date"]=pd.to_datetime(df["Date"], errors="coerce")
INFO:prophet:Disabling weekly seasonality. Run prophet with weekly_seasonality=True to override this.
INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /var/folders/h1/hrjhnsw55w3fh7wq8fc7_bcm0000gn/T/tmprrdkzvtf/5ce_78n9.json
DEBUG:cmdstanpy:input tempfile: /var/folders/h1/hrjhnsw55w3fh7wq8fc7_bcm0000gn/T/tmprrdkzvtf/c5bhgqen.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/Users/guillaumedecina-halmi/miniforge3/lib/python3.12/site-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=77904', 'data

In [76]:
X = X.numpy().reshape(X.shape[0], -1)

In [77]:
train_fract=0.8

splitter = StratifiedShuffleSplit(n_splits=1, train_size=train_fract)
train_idx, val_idx = next(splitter.split(X, y))

In [78]:
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[val_idx], y[val_idx]

In [79]:
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [80]:
X_train

array([[ 0.03897084, -0.8744176 , -0.61292404, ..., -0.07124566,
        -0.3768401 , -0.26937994],
       [ 2.1605031 ,  2.7355533 ,  0.7877937 , ...,  8.599379  ,
         2.721751  ,  5.474661  ],
       [ 0.01651829,  0.542785  ,  1.9950451 , ..., -0.23679025,
         1.6432627 , -0.18814312],
       ...,
       [ 0.5735824 ,  0.25557426,  1.4074199 , ...,  2.153363  ,
         2.2307642 ,  1.3750393 ],
       [-0.14426002,  0.13396592,  0.3086626 , ...,  0.6034317 ,
         0.60357577,  0.43246192],
       [ 0.06948314,  0.24853243, -0.13493608, ...,  0.26337755,
        -0.3342412 ,  0.03833745]], dtype=float32)

In [81]:
preds = lr.predict(X_test)

In [82]:
f1_score(y_test, preds)

0.022222222222222223