In [21]:
%%capture
%load_ext autoreload
%autoreload 2

In [None]:
import itertools
from typing import Any, Dict

import numpy as np
import pandas as pd
import statsmodels.api as sm

from utils import (
    METRIC_COLUMNS,
    ROOTDIR,
    AdjacencyMethod,
    DistanceTransformation,
    Normalizer,
    encode_variables_to_filename,
    get_dataset_filename,
    setup_plotting,
)

setup_plotting()

## Choose configuration

Depending on the desired configuration (distance transformation, normalization and adjacency method), we select the matching subset of the dataset.

In [None]:
def h0_condition(p_values: pd.Series, alpha: float = 0.09) -> pd.Series:
    """Flag the p-values that fall strictly below the significance level.

    Args:
        p_values: p-values to test, one entry per coefficient.
        alpha: significance threshold; a p-value equal to ``alpha`` is
            not flagged.

    Returns:
        Boolean series aligned with ``p_values``: ``True`` where the
        p-value is strictly smaller than ``alpha``.
    """
    below_threshold = p_values < alpha
    return below_threshold


def get_summary_statistics(
    dataset_conf: pd.DataFrame,
    transformation: DistanceTransformation,
    norm_method: Normalizer,
    adjacency_method: AdjacencyMethod,
    event_filename: str,
    input_data: bool,
    use_threshold: bool,
) -> Dict[str, Any]:
    """Fit an OLS regression of intensity on the metric columns for one configuration.

    The rows of ``dataset_conf`` matching the given transformation,
    normalization and adjacency method are selected and sorted by
    ``intensity``. The ``METRIC_COLUMNS`` are used as predictors (plus an
    intercept) and the numerically encoded intensity as the response.

    Intensity labels are encoded as ``G1``..``G5`` -> 1..5,
    ``G3/G4`` -> 3.5, ``G4/G5`` -> 4.5, and ``Unknown`` / missing -> 0.

    Args:
        dataset_conf: table holding the ``transformation``,
            ``normalization``, ``adjacency_method`` and ``intensity``
            columns together with every column in ``METRIC_COLUMNS``.
        transformation: configuration value matched against the
            ``transformation`` column (through ``.value``).
        norm_method: configuration value matched against the
            ``normalization`` column (through ``.value``).
        adjacency_method: configuration value matched against the
            ``adjacency_method`` column (through ``.value``).
        event_filename: only used to name the dataset in error messages.
        input_data: only used to name the dataset in error messages.
        use_threshold: only used to name the dataset in error messages.

    Returns:
        A dict with ``"model"`` (the fitted OLS results) and ``"count"``
        (how many non-constant coefficients satisfy ``h0_condition``).

    Raises:
        Exception: whatever the fitting step raised, re-raised unchanged
            after the failing configuration has been printed.
    """
    df0 = dataset_conf[
        (dataset_conf["transformation"] == transformation.value)
        & (dataset_conf["normalization"] == norm_method.value)
        & (dataset_conf["adjacency_method"] == adjacency_method.value)
    ].sort_values(by="intensity")[[*METRIC_COLUMNS, "intensity"]]

    x0 = df0.drop(columns=["intensity"]).astype(float).values
    # Encode the categorical intensity labels as numbers; intermediate
    # labels get the midpoint, unknown/missing intensities become 0.
    y = (
        df0["intensity"]
        .replace(
            ["G1", "G2", "G3", "G4", "G5", "Unknown", np.nan, "G4/G5", "G3/G4"],
            [1, 2, 3, 4, 5, 0, 0, 4.5, 3.5],
        )
        .astype(float)
        .values
    )

    try:
        # Keep the raw predictors in ``x0``; the design matrix gets its own name.
        design = sm.add_constant(x0)  # Adds a constant term to the predictor
        model = sm.OLS(y, design).fit()
        # p-values of every coefficient except the intercept.
        h0s = model.summary2().tables[1]["P>|t|"].drop("const")
        h0_count = h0s[h0_condition(h0s)].count()
    except Exception as e:
        # Positional call, as at the other call sites in this notebook, so the
        # handler cannot fail on a keyword-name mismatch and hide the real error.
        dataset_name = encode_variables_to_filename(
            event_filename, input_data, use_threshold
        )
        print(f"Error processing dataset: {dataset_name} -- {e}")
        print(
            f"transformation={transformation.value}, normalization={norm_method.value}, "
            f"adjacency_method={adjacency_method.value}"
        )

        # Bare ``raise`` re-raises the active exception with its traceback intact.
        raise

    return {
        "model": model,
        "count": h0_count,
    }


### Best configurations?

We run a grid search over all the combinations to find out which one is best. In this case, the best combinations are those with the largest number of variables whose p-value is < 0.09; the results are saved to a .csv file so they can be read later.

In [None]:
# Dataset variants: every event file, with imputation and thresholding
# switched on or off together.
dataset_combinations = [
    (filename, flag)
    for filename in ("all.txt", "all.original.txt", "all.imp.txt")
    for flag in (False, True)  # input_data == use_threshold
]
# Per-dataset grid: transformation x normalization x adjacency method.
combinations_per_dataset = [
    (transformation_option, normalizer_option, adjacency_option)
    for transformation_option in DistanceTransformation
    for normalizer_option in Normalizer
    for adjacency_option in (AdjacencyMethod.MANHATTAN, AdjacencyMethod.MINKOWSKI)
]

# Column-oriented accumulator: one list per output column, in CSV column order.
summary_columns = (
    "event_filename",
    "input_data",
    "use_threshold",
    "transformation",
    "normalization",
    "adjacency_method",
    "count",
)
dataset = {column_name: [] for column_name in summary_columns}

for event_filename, input_data in dataset_combinations:
    use_threshold = input_data
    dataset_conf = get_dataset_filename(
        event_filename=event_filename,
        input_data=input_data,
        use_threshold=use_threshold,
    )
    dataset_name = encode_variables_to_filename(
        event_filename, input_data, use_threshold
    )
    print(f"Processing dataset: {dataset_name}")

    for transformation, norm_method, adjacency_method in combinations_per_dataset:
        results = get_summary_statistics(
            dataset_conf=dataset_conf,
            transformation=transformation,
            norm_method=norm_method,
            adjacency_method=adjacency_method,
            event_filename=event_filename,
            input_data=input_data,
            use_threshold=use_threshold,
        )
        h0_count = results["count"]

        # One value per column, in the same order as ``summary_columns``.
        row_values = (
            event_filename,
            input_data,
            use_threshold,
            transformation.value,
            norm_method.value,
            adjacency_method.value,
            int(h0_count),
        )
        for column_name, cell_value in zip(summary_columns, row_values):
            dataset[column_name].append(cell_value)

dataset_df = pd.DataFrame(dataset)
summary_csv_path = ROOTDIR / "data" / "dataset_summary.csv"
dataset_df.to_csv(summary_csv_path, index=False)

Processing dataset: dataset_all_imput-False_threshold-False.csv
Processing dataset: dataset_all_imput-True_threshold-True.csv
Processing dataset: dataset_all.original_imput-False_threshold-False.csv
Processing dataset: dataset_all.original_imput-True_threshold-True.csv
Processing dataset: dataset_all.imp_imput-False_threshold-False.csv
Processing dataset: dataset_all.imp_imput-True_threshold-True.csv


## Read best configurations

In [60]:
# Load the saved grid-search summary and rank the configurations by the
# number of significant variables, highest first.
summary_csv_path = ROOTDIR / "data" / "dataset_summary.csv"
summary_unsorted = pd.read_csv(summary_csv_path, index_col=False)
dataset_df = summary_unsorted.sort_values(by="count", ascending=False)

dataset_df

Unnamed: 0,event_filename,input_data,use_threshold,transformation,normalization,adjacency_method,count
140,all.imp.txt,False,False,exponential,min_max,manhattan,4
170,all.imp.txt,True,True,exponential,min_max,manhattan,4
68,all.original.txt,False,False,none,none,manhattan,4
52,all.txt,True,True,exponential,z_score,manhattan,4
84,all.original.txt,False,False,exponential,robust,manhattan,3
...,...,...,...,...,...,...,...
167,all.imp.txt,True,True,log,decimal_scaling,minkowski,0
175,all.imp.txt,True,True,exponential,robust,minkowski,0
172,all.imp.txt,True,True,exponential,z_score,manhattan,0
177,all.imp.txt,True,True,exponential,decimal_scaling,minkowski,0


In [67]:
# Re-fit the model for the five top-ranked configurations and print the
# full OLS report of each one.
for row in dataset_df.head(5).itertuples():
    # Cast the CSV values back to plain Python types.
    event_filename, input_data, use_threshold = (
        str(row.event_filename),
        bool(row.input_data),
        bool(row.use_threshold),
    )
    transformation, normalization, adjacency_method = (
        row.transformation,
        row.normalization,
        row.adjacency_method,
    )

    # The same three arguments identify the dataset for both helper calls.
    dataset_kwargs = {
        "event_filename": event_filename,
        "input_data": input_data,
        "use_threshold": use_threshold,
    }
    dataset_conf = get_dataset_filename(**dataset_kwargs)

    model = get_summary_statistics(
        dataset_conf=dataset_conf,
        transformation=DistanceTransformation(transformation),
        norm_method=Normalizer(normalization),
        adjacency_method=AdjacencyMethod(adjacency_method),
        **dataset_kwargs,
    )["model"]

    dataset_name = encode_variables_to_filename(
        event_filename, input_data, use_threshold
    )
    print(f"Dataset name: {dataset_name}")
    print(
        f"Transformation: {transformation}, Normalization: {normalization}, "
        f"Adjacency Method: {adjacency_method}"
    )
    print(model.summary())

    print("\n\n")

Dataset name: dataset_all.imp_imput-False_threshold-False.csv
Transformation: exponential, Normalization: min_max, Adjacency Method: manhattan
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     1.129
Date:                Mon, 09 Feb 2026   Prob (F-statistic):              0.386
Time:                        19:05:10   Log-Likelihood:                -34.631
No. Observations:                  34   AIC:                             93.26
Df Residuals:                      22   BIC:                             111.6
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025 