In [1]:
from data import get_recessions, RecessionDatasetBuilder

# Per-indicator settings handed to the dataset builder. In the preview of
# `data`, a value of 3 shows up as three lagged columns ("(t-2)", "(t-1)",
# "(t-0)") and a value of 2 as two ("(t-1)", "(t-0)").
features_config = {
    "Real GDP": 3,
    "Unemployment Rate": 3,
    "Inflation": 2,
}

builder = RecessionDatasetBuilder()
# NOTE(review): the meaning of `window` is defined in the project's `data`
# module and is not visible here — confirm before changing it.
data = builder.create_data(features_config=features_config, window=6)

# Presumably the recession periods from the builder's start date onward —
# verify against `get_recessions` in the `data` module.
recessions = get_recessions(builder.start_date)

In [8]:
# Show the assembled dataset: lagged indicator columns plus the "Recession"
# label. The rendered output elides the middle rows of the frame.
data

Unnamed: 0,Real GDP (t-2),Real GDP (t-1),Real GDP (t-0),Unemployment Rate (t-2),Unemployment Rate (t-1),Unemployment Rate (t-0),Inflation (t-1),Inflation (t-0),Recession
1968-02-01,0.946076,0.753853,2.039271,-0.1,-0.1,0.1,3.651861,3.673819,0
1968-03-01,0.946076,0.753853,2.039271,-0.1,0.1,-0.1,3.673819,4.142164,0
1968-04-01,0.753853,2.039271,1.670373,0.1,-0.1,-0.2,4.142164,4.155828,0
1968-05-01,0.753853,2.039271,1.670373,-0.1,-0.2,0.0,4.155828,4.088245,0
1968-06-01,0.753853,2.039271,1.670373,-0.2,0.0,0.2,4.088245,4.545569,0
...,...,...,...,...,...,...,...,...,...
2024-08-01,0.404802,0.738980,0.759510,0.1,0.1,0.0,4.179707,4.112096,0
2024-09-01,0.404802,0.738980,0.759510,0.1,0.0,-0.1,4.112096,4.012724,0
2024-10-01,0.738980,0.759510,0.607065,0.0,-0.1,0.0,4.012724,3.963395,0
2024-11-01,0.738980,0.759510,0.607065,-0.1,0.0,0.1,3.963395,3.876177,0


In [20]:
from sklearn.utils import compute_sample_weight

# Separate the label column from the remaining feature columns.
y = data["Recession"]
X = data.drop("Recession", axis=1)

# Per-row weights using scikit-learn's "balanced" scheme (inversely
# proportional to class frequency); used later for the weighted metrics.
sample_weights = compute_sample_weight("balanced", y)

In [10]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd

def tune_model(
    X: pd.DataFrame,
    y: pd.Series,
    model: Pipeline,
    param_grid: dict,
    n_splits: int = 5,
    scoring: str = "average_precision",
) -> Pipeline:
    """Grid-search `model` over `param_grid` with time-ordered cross-validation.

    The folds come from `TimeSeriesSplit`, so each validation fold lies after
    the rows used to train on it; the data is never shuffled.

    Args:
        X: Feature matrix, with rows in chronological order.
        y: Target values aligned with `X`.
        model: Estimator (typically a pipeline) to tune.
        param_grid: Parameter grid passed through to `GridSearchCV`.
        n_splits: Number of `TimeSeriesSplit` folds. Defaults to 5.
        scoring: Scorer name passed to `GridSearchCV` and used to rank the
            candidates. Defaults to "average_precision".

    Returns:
        The `best_estimator_` of the fitted search, i.e. the best-scoring
        parameter combination (refit on all of `X`, `y` under GridSearchCV's
        default `refit=True`).
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    search = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring=scoring,
        cv=tscv,
    )
    search.fit(X, y)

    return search.best_estimator_

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features, then fit an L1-penalised, class-balanced logistic
# regression. The grid keys use the step name that make_pipeline derives from
# the estimator class ("logisticregression").
lr = tune_model(
    X,
    y,
    model=make_pipeline(
        StandardScaler(),
        LogisticRegression(
            solver="liblinear",
            penalty="l1",
            class_weight="balanced",
            random_state=42,
        ),
    ),
    param_grid={
        "logisticregression__tol": [1e-5, 1e-4, 1e-3],
        "logisticregression__max_iter": [100, 500, 1000],
    },
)

In [25]:
from sklearn.metrics import average_precision_score, roc_auc_score, fbeta_score, accuracy_score, precision_score, recall_score
import numpy as np

# training metrics
# In-sample evaluation: the pipeline is fitted on (X, y) and scored on the
# same rows, so these numbers describe fit quality, not generalisation.
lr.fit(X, y)
lr_proba = lr.predict_proba(X)[:, 1]

# Threshold-free ranking metrics on the predicted probabilities.
print(f"Average precision: {average_precision_score(y_true=y, y_score=lr_proba)}")
print(f"Weighted average precision: {average_precision_score(y_true=y, y_score=lr_proba, sample_weight=sample_weights)}")
print(f"ROC AUC: {roc_auc_score(y_true=y, y_score=lr_proba)}")

# Sweep 100 evenly spaced cut-offs in [0, 1] and keep the one with the best
# F2 score (beta=2 weights recall more heavily than precision).
thresholds = np.linspace(0, 1, num=100)
lr_f2_scores = []
for candidate in thresholds:
    lr_f2_scores.append(fbeta_score(y_true=y, y_pred=lr_proba >= candidate, beta=2))
lr_threshold = thresholds[np.argmax(lr_f2_scores)]

print(f"Threshold: {lr_threshold}")

# Hard predictions at the selected cut-off.
lr_pred = lr_proba >= lr_threshold

print(f"Accuracy: {accuracy_score(y_true=y, y_pred=lr_pred)}")
print(f"Precision: {precision_score(y_true=y, y_pred=lr_pred)}")
print(f"Weighted precision: {precision_score(y_true=y, y_pred=lr_pred, sample_weight=sample_weights)}")
print(f"Recall: {recall_score(y_true=y, y_pred=lr_pred)}")
print(f"Weighted recall: {recall_score(y_true=y, y_pred=lr_pred, sample_weight=sample_weights)}")

Average precision: 0.20951442917501995
Weighted average precision: 0.72258105138959
ROC AUC: 0.7350763271815902
Threshold: 0.4747474747474748
Accuracy: 0.726207906295754
Precision: 0.18357487922705315
Weighted precision: 0.7157104235506154
Recall: 0.6785714285714286
Weighted recall: 0.6785714285714288
