In [1]:
%%capture
# Suppress this cell's output, then turn on autoreload (mode 2) so edits to
# imported modules such as `utils` are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from utils import (
    ROOTDIR,
    AdjacencyMethod,
    DistanceTransformation,
    Normalizer,
    encode_variables_to_filename,
    read_dataset,
    setup_plotting,
)

# Project-level plotting setup imported from `utils`; what it configures is
# not visible in this notebook.
setup_plotting()

In [3]:
# Metric columns used as regressors, in the order they enter the design matrix.
# The regression cells pass a bare NumPy array to statsmodels, so the OLS
# summary labels the regressors x1..x11 instead of using these names; the
# trailing comments record that mapping (`const` is the added intercept).
METRIC_COLUMNS = [
    "global_efficiency",  # x1
    "entropy",  # x2
    "hurst_rs",  # x3
    "fractal",  # x4
    "modularity",  # x5
    "assortativity",  # x6
    "estrada_index",  # x7
    "avg_katz",  # x8
    "avg_closeness",  # x9
    "avg_betweenness",  # x10
    "avg_laplacian",  # x11
]

## Choose configuration

The dataset that gets loaded depends on the chosen configuration: `get_dataset_filename` below encodes the configuration into a filename and reads that file.

In [4]:
def get_dataset_filename(
    event_filename: str = "all.txt",
    input_data: bool = False,
    use_threshold: bool = False,
) -> pd.DataFrame:
    """Load the dataset that corresponds to one configuration.

    Despite its name, this returns the loaded dataset, not a filename: the
    three options are encoded into a filename with
    ``encode_variables_to_filename`` and that file is read from
    ``ROOTDIR / "data"`` with ``read_dataset``.

    Args:
        event_filename: Name of the event list the dataset was built from.
        input_data: Forwarded to the encoder as its ``imput_data`` flag
            (presumably "impute data" -- confirm against ``utils``).
        use_threshold: Forwarded unchanged to the encoder.

    Returns:
        Whatever ``read_dataset`` returns for the encoded path (annotated as
        a ``pandas.DataFrame``).
    """
    encoded_name = encode_variables_to_filename(
        event_filename=event_filename,
        imput_data=input_data,
        use_threshold=use_threshold,
    )
    return read_dataset(ROOTDIR / "data" / encoded_name)

### Configuration 1

* Default data per event: `event_filename = "all.txt"`
* Without imputing data: `input_data = False`
* If a column has NaNs, drop it: `use_threshold = False`

In [15]:
# Configuration 1 options. They are passed to the loader by name so these
# variables are the single source of truth (previously they were defined but
# the call repeated the literals, so editing them had no effect).
event_filename = "all.txt"
input_data = False
use_threshold = False

dataset_df_conf1 = get_dataset_filename(
    event_filename=event_filename,
    input_data=input_data,
    use_threshold=use_threshold,
)

In [20]:
# Plain-text preview of the Configuration 1 frame; pandas elides the middle
# rows and wraps the wide frame into several column groups.
print(dataset_df_conf1)

      event_date   drop intensity    dst transformation    normalization  \
0     2024-05-10  15.32        G5 -412.0           none          min_max   
1     2024-05-10  15.32        G5 -412.0           none          min_max   
2     2024-05-10  15.32        G5 -412.0           none          z_score   
3     2024-05-10  15.32        G5 -412.0           none          z_score   
4     2024-05-10  15.32        G5 -412.0           none           robust   
...          ...    ...       ...    ...            ...              ...   
1075  2005-09-11  12.25        G3 -139.0    exponential           robust   
1076  2005-09-11  12.25        G3 -139.0    exponential  decimal_scaling   
1077  2005-09-11  12.25        G3 -139.0    exponential  decimal_scaling   
1078  2005-09-11  12.25        G3 -139.0    exponential             none   
1079  2005-09-11  12.25        G3 -139.0    exponential             none   

     adjacency_method  global_efficiency  estrada_index   entropy   fractal  \
0       

In [None]:
# Configuration 1: OLS of the encoded `intensity` label on the metric columns
# for one (transformation, normalization, adjacency method) combination.
distance_transformation = DistanceTransformation.EXPONENTIAL
norm_method = Normalizer.Z_SCORE
adjacency_method = AdjacencyMethod.MANHATTAN

# Rows of the dataset produced with the combination chosen above.
is_selected = (
    dataset_df_conf1["transformation"].eq(distance_transformation.value)
    & dataset_df_conf1["normalization"].eq(norm_method.value)
    & dataset_df_conf1["adjacency_method"].eq(adjacency_method.value)
)
df0 = dataset_df_conf1.loc[is_selected].sort_values(by="intensity")[
    [*METRIC_COLUMNS, "intensity"]
]

# Numeric encoding of the intensity labels; mixed classes get the midpoint.
# NOTE(review): "Unknown" and missing labels are coded as 0 and stay in the
# fit as if 0 were a real intensity level -- confirm this is intended.
intensity_codes = {
    "G1": 1,
    "G2": 2,
    "G3": 3,
    "G4": 4,
    "G5": 5,
    "Unknown": 0,
    np.nan: 0,
    "G4/G5": 4.5,
    "G3/G4": 3.5,
}

# Design matrix: metric columns in METRIC_COLUMNS order plus the intercept
# column added by add_constant (shown as `const` in the summary).
x0 = sm.add_constant(df0.drop(columns=["intensity"]).astype(float).values)
y = df0["intensity"].replace(intensity_codes).astype(float).values

model = sm.OLS(y, x0).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.275
Model:                            OLS   Adj. R-squared:                 -0.058
Method:                 Least Squares   F-statistic:                    0.8262
Date:                Mon, 09 Feb 2026   Prob (F-statistic):              0.617
Time:                        11:23:56   Log-Likelihood:                -51.824
No. Observations:                  36   AIC:                             127.6
Df Residuals:                      24   BIC:                             146.7
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         30.5570     89.008      0.343      0.7

### Configuration 2

* Imputed data previously: `event_filename = "all.imp.txt"`
* Without imputing data on my own: `input_data = False`
* If a column has NaNs, drop it: `use_threshold = False`

In [5]:
# Configuration 2 loader options: the previously imputed event file, with no
# extra imputation and no threshold applied here.
conf2_options = {
    "event_filename": "all.imp.txt",
    "input_data": False,
    "use_threshold": False,
}
dataset_df_conf2 = get_dataset_filename(**conf2_options)

In [6]:
# Configuration 2: OLS of the encoded `intensity` label on the metric columns
# for one (transformation, normalization, adjacency method) combination.
distance_transformation = DistanceTransformation.EXPONENTIAL
norm_method = Normalizer.Z_SCORE
adjacency_method = AdjacencyMethod.MINKOWSKI

# Rows of the dataset produced with the combination chosen above.
is_selected = (
    dataset_df_conf2["transformation"].eq(distance_transformation.value)
    & dataset_df_conf2["normalization"].eq(norm_method.value)
    & dataset_df_conf2["adjacency_method"].eq(adjacency_method.value)
)
df0 = dataset_df_conf2.loc[is_selected].sort_values(by="intensity")[
    [*METRIC_COLUMNS, "intensity"]
]

# Numeric encoding of the intensity labels; mixed classes get the midpoint.
# NOTE(review): "Unknown" and missing labels are coded as 0 and stay in the
# fit as if 0 were a real intensity level -- confirm this is intended.
intensity_codes = {
    "G1": 1,
    "G2": 2,
    "G3": 3,
    "G4": 4,
    "G5": 5,
    "Unknown": 0,
    np.nan: 0,
    "G4/G5": 4.5,
    "G3/G4": 3.5,
}

# Design matrix: metric columns in METRIC_COLUMNS order plus the intercept
# column added by add_constant (shown as `const` in the summary).
x0 = sm.add_constant(df0.drop(columns=["intensity"]).astype(float).values)
y = df0["intensity"].replace(intensity_codes).astype(float).values

model = sm.OLS(y, x0).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.516
Model:                            OLS   Adj. R-squared:                  0.294
Method:                 Least Squares   F-statistic:                     2.322
Date:                Mon, 09 Feb 2026   Prob (F-statistic):             0.0409
Time:                        11:58:26   Log-Likelihood:                -44.557
No. Observations:                  36   AIC:                             113.1
Df Residuals:                      24   BIC:                             132.1
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        190.1928     77.360      2.459      0.0

In [7]:
# Inspect the encoded intensity target. `y` is whatever the most recently run
# regression cell assigned (Configuration 2, going by the execution counts).
y

array([2. , 3. , 3. , 3. , 3. , 3. , 3. , 3. , 3. , 3. , 3.5, 4. , 4. ,
       4. , 4. , 4. , 4. , 4. , 4. , 4. , 4. , 4. , 4. , 4. , 4.5, 5. ,
       5. , 5. , 5. , 5. , 5. , 5. , 5. , 5. , 0. , 0. ])

In [27]:
# Pairwise correlation between the metric columns of the current `df0`
# (pandas' default method is Pearson). Large off-diagonal values indicate
# strongly collinear regressors in the OLS fit above.
df0.drop(columns=["intensity"]).corr()

Unnamed: 0,global_efficiency,entropy,hurst_rs,fractal,modularity,assortativity,estrada_index,avg_katz,avg_closeness,avg_betweenness,avg_laplacian
global_efficiency,1.0,-0.772833,0.274574,0.324925,-0.780694,-0.144407,-0.712417,0.768031,0.933455,0.025886,0.803475
entropy,-0.772833,1.0,-0.144287,0.20724,0.925154,0.233505,0.959461,-0.969014,-0.533329,-0.594264,-0.930193
hurst_rs,0.274574,-0.144287,1.0,0.249878,-0.236727,0.165672,-0.186299,0.15765,0.31495,-0.151638,0.184276
fractal,0.324925,0.20724,0.249878,1.0,0.137385,0.189448,0.27697,-0.21617,0.569329,-0.756687,-0.15771
modularity,-0.780694,0.925154,-0.236727,0.137385,1.0,0.177673,0.924817,-0.927476,-0.556661,-0.520991,-0.920655
assortativity,-0.144407,0.233505,0.165672,0.189448,0.177673,1.0,0.239021,-0.259351,-0.001636,-0.268552,-0.285952
estrada_index,-0.712417,0.959461,-0.186299,0.27697,0.924817,0.239021,1.0,-0.985843,-0.451041,-0.676532,-0.961058
avg_katz,0.768031,-0.969014,0.15765,-0.21617,-0.927476,-0.259351,-0.985843,1.0,0.508541,0.647026,0.987402
avg_closeness,0.933455,-0.533329,0.31495,0.569329,-0.556661,-0.001636,-0.451041,0.508541,1.0,-0.318701,0.552372
avg_betweenness,0.025886,-0.594264,-0.151638,-0.756687,-0.520991,-0.268552,-0.676532,0.647026,-0.318701,1.0,0.593942
