In [2]:
import pandas as pd
from pathlib import Path

from typing import TypedDict

from utils import read_metrics_file
from utils.constants import WINDOW_SIZE, Events, EWM_ALPHA, METRICS, datetimes

import multiprocessing as mp

import warnings


# No category or module filter is given, so every warning raised after this
# point is suppressed (including deprecation notices from pandas).
warnings.filterwarnings("ignore")

In [3]:
class MetricsSummary(TypedDict):
    """
    One summary record: the index label selected for a single column.

    Keys:
        event (str): Name of the event the record belongs to.
        date (str): Date key of the event.
        station (str): Station identifier.
        metric (str): Name of the column the record was computed for.
        index (str): String form of the index label selected for that column.
    """

    # Identification of the record
    event: str
    date: str
    station: str
    # Column name and the (stringified) index label picked for it
    metric: str
    index: str


# --- Run configuration -------------------------------------------------------
# Event whose data folder is processed by the cells below.
EVENT: Events = "Forbush Decrease"
# Samples per date
MAX_SAMPLES: int = 10
# If True, it will repeat stations already calculated
REPETITION: bool = True
# When truthy (together with EWM_ALPHA), the "-ewm_alpha_<alpha>" file suffix
# is used when reading metrics files and naming the summary CSV.
EWM: bool = True

# Event name with its spaces removed; used as the folder name under ./data.
event_replace: str = "".join(EVENT.split(" "))

In [4]:
# Map every event date to the station codes that have a CSV file in
# ./data/<event>/<date>/. Codes are de-duplicated through a set and then sorted:
# iterating a set of strings has a hash-dependent order, which would make the
# station order (and the row order of everything derived from it) differ
# between kernel restarts.
plot_stations: dict[str, list[str]] = {
    date: sorted(
        {
            # Station code = filename text before the first underscore, upper-cased
            csv_path.name.strip().split("_", 1)[0].upper()
            for csv_path in Path(f"./data/{event_replace}/{date}").glob("*.csv")
        }
    )
    for date in datetimes
}

## Summary

In [5]:
def valid_interval(
    event: Events,
    date: str,
    station: str,
    data: pd.DataFrame = None,
) -> pd.DataFrame:
    """
    Keep only the rows of a station's metrics that lie in the valid interval.

    A row is kept when its ``window_shape`` equals ``WINDOW_SIZE`` and its
    index is not later than the upper bound configured in ``datetimes``: the
    station-specific bound when one is set for ``station``, otherwise the
    general bound of ``date``.

    Args:
        event: Event name, forwarded to ``read_metrics_file``.
        date: Key into ``datetimes`` identifying the event date.
        station: Station identifier.
        data: Metrics frame already indexed by ``datetime``. When ``None`` the
            metrics file is read and indexed by ``datetime`` here.

    Returns:
        The filtered frame.
    """
    if data is None:
        suffix = ""
        if EWM_ALPHA and EWM:
            suffix = f"-ewm_alpha_{EWM_ALPHA}"
        loaded = read_metrics_file(
            event,
            date,
            station,
            WINDOW_SIZE,
            datetime_cols={"datetime": None},
            suffix=suffix,
        )
        data = loaded.set_index("datetime")

    # Prefer the per-station bounds; fall back to the bounds of the whole date
    station_bounds = datetimes[date]["stations"]
    if station in station_bounds and station_bounds[station]:
        upper_limit = station_bounds[station][1]
    else:
        upper_limit = datetimes[date]["bounds"][1]

    has_full_window = data["window_shape"] == WINDOW_SIZE
    within_bound = data.index <= upper_limit
    return data[has_full_window & within_bound]


def process_derivatives(
    event: Events, date: str, station: str, percentil: float = 0.95
) -> list[MetricsSummary]:
    """
    Find, for every metric column, where its first difference is largest.

    The station's metrics file is read, the first difference of every metric
    column (plus ``value``) is taken, and the differences are restricted to
    the index labels returned by ``valid_interval``. For each column, the rows
    at or above the ``percentil`` quantile are selected and the index label of
    their maximum is reported.

    Args:
        event: Event name, forwarded to ``read_metrics_file``.
        date: Date key of the event.
        station: Station identifier.
        percentil: Quantile in the open interval (0, 1) used as the threshold.

    Returns:
        One ``MetricsSummary`` per column that has at least one row at or above
        its quantile. Columns with no such row (for example, a derivative that
        is entirely NaN inside the valid interval) are skipped.

    Raises:
        AssertionError: If ``percentil`` is not strictly between 0 and 1.
    """
    assert 0 < percentil < 1.0, "Percentil must be between 0.0 and 1.0"

    suffix = f"-ewm_alpha_{EWM_ALPHA}" if EWM_ALPHA and EWM else ""
    data = read_metrics_file(
        event,
        date,
        station,
        WINDOW_SIZE,
        datetime_cols={"datetime": None},
        suffix=suffix,
    ).set_index("datetime")

    metrics_columns = [col for col in data.columns if col in METRICS]
    metrics_columns.append("value")

    # Differentiate over the full series first, then keep only the valid rows,
    # so the first valid row still has a derivative whenever a row precedes it.
    valid_indexes = valid_interval(event, date, station, data).index
    diff = data[metrics_columns].diff()
    interest_df = diff[diff.index.isin(valid_indexes)]
    quantiles = interest_df.quantile(percentil)

    results: list[MetricsSummary] = []
    for col in metrics_columns:
        quantil = quantiles[col]
        points = interest_df[interest_df[col] >= quantil][col]
        # The original guard `len(points) < 0` could never be true, so an empty
        # selection reached idxmax() and raised. A NaN quantile (all-NaN
        # column) makes every comparison False and yields exactly that case.
        if points.empty:
            continue

        interest_index = points.idxmax()  # Maybe this operation can be changed
        results.append(
            {
                "event": event,
                "date": date,
                "station": station,
                "metric": col,
                "index": str(interest_index),
            }
        )

    return results

In [6]:
# Quantile threshold handed to every process_derivatives call
percentil: float = 0.9

# One (event, date, station, percentil) tuple per station found on disk.
# Built with a plain comprehension: the previous map(lambda, *zip(*pairs))
# form raised TypeError when no station files existed (map() got no
# iterables), and it hard-coded the event name instead of using EVENT.
arguments: list[tuple[Events, str, str, float]] = [
    (EVENT, date, station, percentil)
    for date, stations in plot_stations.items()
    for station in stations
]

# Each worker returns the list of summaries for one (date, station) pair
with mp.Pool(processes=mp.cpu_count()) as pool:
    results = pool.starmap(
        process_derivatives,
        arguments,
    )

In [15]:
# `results` holds one list of summary dicts per (date, station) pair. Flatten
# it and build the frame once instead of concatenating inside a loop, which
# re-copies the accumulated frame on every iteration.
summary_records = [record for station_records in results for record in station_records]

# Explicit columns keep the original layout (with "event" last) and give an
# empty frame the expected columns when there are no records at all.
df = pd.DataFrame(
    summary_records,
    columns=["date", "station", "metric", "index", "event"],
)

df["index"] = pd.to_datetime(df["index"])
df

Unnamed: 0,date,station,metric,index,event
0,2023-04-23,AATB,entropy,2023-04-23 01:05:00,Forbush Decrease
1,2023-04-23,AATB,sampen,2023-04-23 07:26:00,Forbush Decrease
2,2023-04-23,AATB,permutation_entropy,2023-04-23 20:51:00,Forbush Decrease
3,2023-04-23,AATB,shannon_entropy,2023-04-23 02:20:00,Forbush Decrease
4,2023-04-23,AATB,spectral_entropy,2023-04-23 17:01:00,Forbush Decrease
...,...,...,...,...,...
475,2024-05-10,IRK3,katz_fd,2024-05-10 12:52:00,Forbush Decrease
476,2024-05-10,IRK3,petrosian_fd,2024-05-10 22:03:00,Forbush Decrease
477,2024-05-10,IRK3,lepel_ziv,2024-05-10 10:41:00,Forbush Decrease
478,2024-05-10,IRK3,corr_dim,2024-05-10 13:55:00,Forbush Decrease


In [20]:
# Persist the summary; the file name carries the EWM suffix when EWM is active
suffix = ""
if EWM_ALPHA and EWM:
    suffix = f"-ewm_alpha_{EWM_ALPHA}"

summary_csv = f"./data/{event_replace}/summary_derivatives{suffix}.csv"
df.to_csv(summary_csv, index=False)

## Inspecting the results

In [21]:
# Reload the saved summary and turn the "index" column back into datetimes
suffix = ""
if EWM_ALPHA and EWM:
    suffix = f"-ewm_alpha_{EWM_ALPHA}"

df = pd.read_csv(f"./data/{event_replace}/summary_derivatives{suffix}.csv").assign(
    index=lambda frame: pd.to_datetime(frame["index"])
)
df

Unnamed: 0,date,station,metric,index,event
0,2023-04-23,AATB,entropy,2023-04-23 01:05:00,Forbush Decrease
1,2023-04-23,AATB,sampen,2023-04-23 07:26:00,Forbush Decrease
2,2023-04-23,AATB,permutation_entropy,2023-04-23 20:51:00,Forbush Decrease
3,2023-04-23,AATB,shannon_entropy,2023-04-23 02:20:00,Forbush Decrease
4,2023-04-23,AATB,spectral_entropy,2023-04-23 17:01:00,Forbush Decrease
...,...,...,...,...,...
475,2024-05-10,IRK3,katz_fd,2024-05-10 12:52:00,Forbush Decrease
476,2024-05-10,IRK3,petrosian_fd,2024-05-10 22:03:00,Forbush Decrease
477,2024-05-10,IRK3,lepel_ziv,2024-05-10 10:41:00,Forbush Decrease
478,2024-05-10,IRK3,corr_dim,2024-05-10 13:55:00,Forbush Decrease


In [17]:
# Narrow the summary down to a single (date, station) pair
date = "2023-04-23"
station = "PWNK"

matches_date = df["date"] == date
matches_station = df["station"] == station
df_event = df.loc[matches_date & matches_station].reset_index(drop=True)
df_event

Unnamed: 0,date,station,metric,index,event
0,2023-04-23,PWNK,entropy,2023-04-23 01:05:00,Forbush Decrease
1,2023-04-23,PWNK,sampen,2023-04-23 09:27:00,Forbush Decrease
2,2023-04-23,PWNK,permutation_entropy,2023-04-23 19:07:00,Forbush Decrease
3,2023-04-23,PWNK,shannon_entropy,2023-04-23 08:11:00,Forbush Decrease
4,2023-04-23,PWNK,spectral_entropy,2023-04-23 18:50:00,Forbush Decrease
5,2023-04-23,PWNK,app_entropy,2023-04-23 16:56:00,Forbush Decrease
6,2023-04-23,PWNK,hurst,2023-04-23 11:36:00,Forbush Decrease
7,2023-04-23,PWNK,dfa,2023-04-23 03:49:00,Forbush Decrease
8,2023-04-23,PWNK,mfhurst_b,2023-04-23 19:23:00,Forbush Decrease
9,2023-04-23,PWNK,higuchi_fd,2023-04-23 08:07:00,Forbush Decrease
