In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from utils import (
    ROOTDIR,
    AdjacencyMethod,
    DistanceTransformation,
    Normalizer,
    encode_variables_to_filename,
    read_dataset,
    setup_plotting,
)

setup_plotting()

In [15]:
# Metric columns used as regressors in the OLS cells of this notebook, in the
# order they are selected into the design matrix.
# The trailing `# xN` tags give each metric's 1-based position in that matrix;
# presumably they match the x1..x11 labels shown in the regression summaries,
# since the regressors are passed as an unnamed ndarray — TODO confirm.
METRIC_COLUMNS = [
    "global_efficiency",  # x1
    "entropy",  # x2
    "hurst_rs",  # x3
    "fractal",  # x4
    "modularity",  # x5
    "assortativity",  # x6
    "estrada_index",  # x7
    "avg_katz",  # x8
    "avg_closeness",  # x9
    "avg_betweenness",  # x10
    "avg_laplacian",  # x11
]

## Choose configuration

Each configuration below selects a different dataset file; we load the dataset that matches the chosen configuration.

In [13]:
def get_dataset_filename(
    event_filename: str = "all.txt",
    input_data: bool = False,
    use_threshold: bool = False,
) -> pd.DataFrame:
    """Load the dataset that corresponds to the given configuration.

    Despite its name, this function returns the loaded dataset, not a
    filename: the three options are encoded into a filename, which is then
    read from the ``data`` directory under ``ROOTDIR``.

    Args:
        event_filename: Event file the dataset was built from.
        input_data: Forwarded to ``encode_variables_to_filename`` under its
            ``imput_data`` keyword.
        use_threshold: Forwarded to ``encode_variables_to_filename``.

    Returns:
        Whatever ``read_dataset`` returns for the encoded path (annotated as
        a ``pd.DataFrame``).
    """
    # NOTE(review): the keyword is spelled ``imput_data`` on the helper side;
    # presumably that matches the helper's signature — verify in ``utils``.
    filename_options = {
        "event_filename": event_filename,
        "imput_data": input_data,
        "use_threshold": use_threshold,
    }
    encoded_name = encode_variables_to_filename(**filename_options)
    return read_dataset(ROOTDIR / "data" / encoded_name)

### Configuration 1

* Default data per event: `event_filename = "all.txt"`
* Without imputing data: `input_data = False`
* Drop any column that contains NaNs: `use_threshold = False`

In [None]:
# Configuration 1 settings. They are defined once and passed to the loader, so
# the values listed here are the ones actually used to pick the dataset file.
event_filename = "all.txt"
input_data = False
use_threshold = False

dataset_df_conf1 = get_dataset_filename(
    event_filename=event_filename,
    input_data=input_data,
    use_threshold=use_threshold,
)

In [22]:
# Fit an OLS regression of storm intensity on the metric columns for one
# transformation / normalization / adjacency combination of Configuration 1.
distance_transformation = DistanceTransformation.EXPONENTIAL
norm_method = Normalizer.Z_SCORE
adjacency_method = AdjacencyMethod.MANHATTAN

# Ordinal encoding of the intensity labels; combined labels such as "G4/G5"
# sit halfway between their two grades.
# NOTE(review): "Unknown" and missing intensities are scored 0, so they enter
# the regression as if they were a grade below G1 — confirm this is intended.
intensity_scores = {
    "G1": 1,
    "G2": 2,
    "G3": 3,
    "G4": 4,
    "G5": 5,
    "Unknown": 0,
    "G4/G5": 4.5,
    "G3/G4": 3.5,
}

# Keep only the rows of the selected combination, then the regressors + target.
selected_rows = (
    (dataset_df_conf1["transformation"] == distance_transformation.value)
    & (dataset_df_conf1["normalization"] == norm_method.value)
    & (dataset_df_conf1["adjacency_method"] == adjacency_method.value)
)
df0 = dataset_df_conf1[selected_rows].sort_values(by="intensity")[
    [*METRIC_COLUMNS, "intensity"]
]

x0 = df0.drop(columns=["intensity"]).astype(float).values

# Missing intensities are treated like "Unknown" before the lookup. An explicit
# mapping (instead of a parallel-list `replace`) keeps label and score side by
# side and avoids pandas' deprecated implicit downcasting in `replace`.
intensity_labels = df0["intensity"].astype(object)
intensity_labels = intensity_labels.where(intensity_labels.notna(), "Unknown")
intensity_values = intensity_labels.map(intensity_scores)

# Fail loudly on labels the mapping does not know, rather than letting NaNs
# flow into the regression.
unexpected_labels = intensity_labels[intensity_values.isna()].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected intensity labels: {sorted(map(str, unexpected_labels))}"
    )
y = intensity_values.astype(float).values

x0 = sm.add_constant(x0)  # Adds a constant term to the predictor
model = sm.OLS(y, x0).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.404
Model:                            OLS   Adj. R-squared:                  0.131
Method:                 Least Squares   F-statistic:                     1.480
Date:                Mon, 09 Feb 2026   Prob (F-statistic):              0.203
Time:                        00:58:17   Log-Likelihood:                -48.283
No. Observations:                  36   AIC:                             120.6
Df Residuals:                      24   BIC:                             139.6
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         51.4512    108.306      0.475      0.6

### Configuration 2

* Previously imputed data: `event_filename = "all.imp.txt"`
* Without imputing data on my own: `input_data = False`
* Drop any column that contains NaNs: `use_threshold = False`

In [8]:
# Load the Configuration 2 dataset. Only the event file differs from
# Configuration 1 ("all.imp.txt" instead of "all.txt"); both flags stay False.
dataset_df_conf2 = get_dataset_filename(
    event_filename="all.imp.txt",
    input_data=False,
    use_threshold=False,
)

In [23]:
# Fit an OLS regression of storm intensity on the metric columns for one
# transformation / normalization / adjacency combination of Configuration 2.
distance_transformation = DistanceTransformation.EXPONENTIAL
norm_method = Normalizer.Z_SCORE
adjacency_method = AdjacencyMethod.MANHATTAN

# Ordinal encoding of the intensity labels; combined labels such as "G4/G5"
# sit halfway between their two grades.
# NOTE(review): "Unknown" and missing intensities are scored 0, so they enter
# the regression as if they were a grade below G1 — confirm this is intended.
intensity_scores = {
    "G1": 1,
    "G2": 2,
    "G3": 3,
    "G4": 4,
    "G5": 5,
    "Unknown": 0,
    "G4/G5": 4.5,
    "G3/G4": 3.5,
}

# Keep only the rows of the selected combination, then the regressors + target.
selected_rows = (
    (dataset_df_conf2["transformation"] == distance_transformation.value)
    & (dataset_df_conf2["normalization"] == norm_method.value)
    & (dataset_df_conf2["adjacency_method"] == adjacency_method.value)
)
df0 = dataset_df_conf2[selected_rows].sort_values(by="intensity")[
    [*METRIC_COLUMNS, "intensity"]
]

x0 = df0.drop(columns=["intensity"]).astype(float).values

# Missing intensities are treated like "Unknown" before the lookup. An explicit
# mapping (instead of a parallel-list `replace`) keeps label and score side by
# side and avoids pandas' deprecated implicit downcasting in `replace`.
intensity_labels = df0["intensity"].astype(object)
intensity_labels = intensity_labels.where(intensity_labels.notna(), "Unknown")
intensity_values = intensity_labels.map(intensity_scores)

# Fail loudly on labels the mapping does not know, rather than letting NaNs
# flow into the regression.
unexpected_labels = intensity_labels[intensity_values.isna()].unique()
if len(unexpected_labels) > 0:
    raise ValueError(
        f"Unexpected intensity labels: {sorted(map(str, unexpected_labels))}"
    )
y = intensity_values.astype(float).values

x0 = sm.add_constant(x0)  # Adds a constant term to the predictor
model = sm.OLS(y, x0).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.397
Model:                            OLS   Adj. R-squared:                  0.120
Method:                 Least Squares   F-statistic:                     1.435
Date:                Mon, 09 Feb 2026   Prob (F-statistic):              0.221
Time:                        00:58:29   Log-Likelihood:                -48.506
No. Observations:                  36   AIC:                             121.0
Df Residuals:                      24   BIC:                             140.0
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        118.5142     73.474      1.613      0.1