In [None]:
# Re-import modified modules automatically before each cell executes, so edits
# to local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [7]:
from typing import List

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

from utils import NAN_THRESHOLD, AdjacencyMethod, GraphEvent, load_events

In [2]:
# Load all events. Later cells iterate `events.items()` and read the per-event
# "raw", "intensity" and "graphs" entries.
events = load_events()

## Preprocess DataFrames

Some events only have 2-minute resolution (even though we request 1-minute data when downloading), so we standardize all events to 2-minute resolution. In addition, the data apparently cannot contain any NaN values, so stations with missing samples are dropped.

In [3]:
def invalid_stations(station_df: pd.DataFrame, use_threshold: bool) -> List[str]:
    """List the stations (columns) of ``station_df`` that should be discarded.

    The fraction of NaN entries is computed for every column.

    Args:
        station_df: Frame with one column per station.
        use_threshold: When True, a station is invalid if its NaN fraction is
            greater than or equal to ``NAN_THRESHOLD``. When False, a station
            is invalid as soon as it contains a single NaN.

    Returns:
        The labels of the invalid columns, in column order.
    """
    nan_fraction = station_df.isna().mean()

    # Boolean mask over the columns, selected by the requested strictness.
    is_invalid = nan_fraction >= NAN_THRESHOLD if use_threshold else nan_fraction > 0

    return nan_fraction[is_invalid].index.tolist()

In [4]:
# Preprocess every event's raw frame: remove incomplete stations, then
# downsample to a common 2-minute resolution.
for event_name, data in events.items():
    # use_threshold=False: any station with at least one NaN is discarded.
    nan_stations = invalid_stations(data["raw"], use_threshold=False)
    df = data["raw"].drop(columns=nan_stations)

    # Each 2-minute bin takes the median of the samples that fall inside it;
    # the result replaces the event's "raw" frame.
    events[event_name]["raw"] = df.resample("2min").median()

## Create Graphs

Based on [001_tests_with_graphs.ipynb](./001_tests_with_graphs.ipynb), the best-performing adjacency methods are `MANHATTAN` and `MINKOWSKI`, so only those two are used here.

In [17]:
# Adjacency methods for which a graph is built for every event.
valid_methods = (AdjacencyMethod.MANHATTAN, AdjacencyMethod.MINKOWSKI)

In [20]:
# Build one record of global graph metrics per (event, adjacency method) pair.
dataset = []

for event_date, data in events.items():
    for method in valid_methods:
        # Replace the time index with a positional one; the frame is rebuilt
        # for every method so each GraphEvent receives its own copy.
        df = data["raw"].reset_index(drop=True)
        graph_event = GraphEvent(
            data=df,
            metadata={},
        )

        graph = graph_event.get_graph_networkx(method)
        events[event_date]["graphs"][method] = graph

        # The component count drives three of the metrics below, so it is
        # computed once instead of re-traversing the graph for each of them.
        n_components = nx.number_connected_components(graph)

        if n_components == 1:
            # Diameter and radius are the max / min node eccentricity; compute
            # the eccentricities once and share them between both calls.
            eccentricity = nx.eccentricity(graph)
            diameter = nx.diameter(graph, e=eccentricity)
            radius = nx.radius(graph, e=eccentricity)
        else:
            # Distances are undefined across components: report infinity.
            diameter = radius = float("inf")

        # NOTE(review): if the graph carries edge weights, confirm that the
        # default `weight` arguments of the two community calls below agree.
        communities = list(
            nx.algorithms.community.greedy_modularity_communities(graph)
        )

        # Global graph metrics
        dataset.append({
            "event_date": event_date,
            "adjacency_method": method,
            "intensity": data["intensity"],
            "diameter": diameter,
            "radius": radius,
            "global_efficiency": nx.global_efficiency(graph),
            "average_clustering": nx.average_clustering(graph),
            "transitivity": nx.transitivity(graph),
            # Stored under "connectivity", but the value is the number of
            # connected components (1 means the graph is connected).
            "connectivity": n_components,
            "modularity": nx.algorithms.community.modularity(graph, communities),
        })

In [None]:
# Show the collected metric records (one dict per event / adjacency method).
dataset

[{'event_date': '2023-04-23',
  'adjacency_method': <AdjacencyMethod.MANHATTAN: 'manhattan'>,
  'intensity': 'G4',
  'diameter': 1,
  'radius': 1,
  'global_efficiency': 1.0,
  'average_clustering': 1.0,
  'transitivity': 1.0,
  'connectivity': 1,
  'modularity': 0.0},
 {'event_date': '2023-04-23',
  'adjacency_method': <AdjacencyMethod.MINKOWSKI: 'minkowski'>,
  'intensity': 'G4',
  'diameter': 1,
  'radius': 1,
  'global_efficiency': 1.0,
  'average_clustering': 1.0,
  'transitivity': 1.0,
  'connectivity': 1,
  'modularity': 1.1102230246251565e-16},
 {'event_date': '2024-03-24',
  'adjacency_method': <AdjacencyMethod.MANHATTAN: 'manhattan'>,
  'intensity': 'G4',
  'diameter': 1,
  'radius': 1,
  'global_efficiency': 1.0,
  'average_clustering': 1.0,
  'transitivity': 1.0,
  'connectivity': 1,
  'modularity': 0.0},
 {'event_date': '2024-03-24',
  'adjacency_method': <AdjacencyMethod.MINKOWSKI: 'minkowski'>,
  'intensity': 'G4',
  'diameter': 1,
  'radius': 1,
  'global_efficiency': 1

: 