In [1]:
import pandas as pd
from pathlib import Path
import glob

# Name of the timestamp column: used as the group-by key when loading the
# satellite CSVs and as the left join key when merging in the solar tables.
time_column: str = "Time"
# Satellite identifier: selects the input directory under ../data/satellites/
# and names the files written to / read from artifacts/.
satellite_name: str = "grifex"


def read_satellite_data(
    path: Path, threshold: float = 0.001, time_col: "str | None" = None
) -> pd.DataFrame:
    """Load every ``*.csv`` in ``path`` into one frame averaged per timestamp.

    Rows from all files are concatenated, grouped by the time column and
    averaged, so each distinct raw timestamp yields one row. The time column
    is then parsed to datetimes and truncated to midnight, and sparse columns
    are dropped.

    Args:
        path: Directory containing the satellite CSV files.
        threshold: A column is kept only if its non-null count is at least
            ``int(threshold * number_of_rows)``.
        time_col: Name of the timestamp column. Defaults to the module-level
            ``time_column`` when omitted.

    Returns:
        A frame with the time column plus the numeric/bool columns that pass
        the non-null threshold.

    Raises:
        ValueError: If ``path`` contains no CSV files.
    """
    time_key = time_column if time_col is None else time_col

    # glob returns files in arbitrary order; sort so repeated runs
    # concatenate (and therefore average) the rows in the same order.
    csv_files = sorted(glob.glob(f"{path}/*.csv"))
    if not csv_files:
        # pd.concat would otherwise fail with "No objects to concatenate".
        raise ValueError(f"No CSV files found in {path!r}")

    df = (
        pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
        .groupby(time_key, as_index=False)
        # numeric_only=True: text columns cannot be averaged and make
        # mean() raise on pandas >= 2.0; they are discarded below anyway.
        .mean(numeric_only=True)
    )
    # Normalizing after the group-by keeps one row per raw timestamp while
    # reducing the key to a calendar date.
    df[time_key] = pd.to_datetime(df[time_key]).dt.normalize()
    df = df.select_dtypes(include=["number", "bool", "datetime"])

    min_non_null_count = int(threshold * len(df))
    return df.loc[:, df.notnull().sum() >= min_non_null_count]


satellite_data = read_satellite_data(f"../data/satellites/{satellite_name}")
# .tolist() so the value really is the list[str] the annotation promises
# (DataFrame.columns is a pandas Index), matching how solar_columns is built.
satellite_columns: list[str] = (
    satellite_data.drop(time_column, axis=1).columns.tolist()
)

# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
satellite_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 757510 entries, 0 to 757509
Data columns (total 47 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   Time              757510 non-null  datetime64[ns]
 1   x-axis            2614 non-null    float64       
 2   y-axis            2614 non-null    float64       
 3   y-axis.1          2614 non-null    float64       
 4   Battery Bus       2627 non-null    float64       
 5   Battery           2631 non-null    float64       
 6   5V                2627 non-null    float64       
 7   3.3V              2627 non-null    float64       
 8   C1 Input          2610 non-null    float64       
 9   C1 Output         2610 non-null    float64       
 10  C2 Input          2610 non-null    float64       
 11  C2 Output         2610 non-null    float64       
 12  C3 Input          2610 non-null    float64       
 13  C3 Output         2610 non-null    float64       
 14  C4 I

In [2]:
def read_solar_data(file_path: Path, date_column: str) -> pd.DataFrame:
    """Read a JSON table and return it with ``date_column`` parsed as datetimes.

    Args:
        file_path: Location of the JSON file handed to ``pd.read_json``.
        date_column: Name of the column to convert with ``pd.to_datetime``.

    Returns:
        The loaded frame with ``date_column`` replaced by its datetime form;
        all other columns are left as ``pd.read_json`` produced them.
    """
    raw_frame = pd.read_json(file_path)
    parsed_dates = pd.to_datetime(raw_frame[date_column])
    # assign() swaps the existing column in place, so column order is kept.
    return raw_frame.assign(**{date_column: parsed_dates})


# DataFrame.info() prints its summary and returns None; it is called directly
# (not through print) so the cell output has no stray "None" lines.
swpc_observed_ssn = read_solar_data(
    "../data/solar/swpc/swpc_observed_ssn.json", "Obsdate"
)
swpc_observed_ssn.info()

swpc_observed_solar_cycle_indicies = read_solar_data(
    "../data/solar/swpc/observed-solar-cycle-indices.json", "time-tag"
)
swpc_observed_solar_cycle_indicies.info()

# The third table is a CSV, so it is read directly and its date column is
# parsed the same way read_solar_data does for the JSON sources.
swpc_dgd = pd.read_csv("../data/solar/swpc/dgd.csv")
swpc_dgd["Date"] = pd.to_datetime(swpc_dgd["Date"])
swpc_dgd.info()

# Every solar measurement column, i.e. all columns except each table's date key.
solar_columns: list[str] = (
    swpc_observed_ssn.columns.drop("Obsdate").tolist()
    + swpc_observed_solar_cycle_indicies.columns.drop("time-tag").tolist()
    + swpc_dgd.columns.drop("Date").tolist()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9137 entries, 0 to 9136
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Obsdate   9137 non-null   datetime64[ns]
 1   swpc_ssn  9137 non-null   int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 142.9 KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3309 entries, 0 to 3308
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   time-tag           3309 non-null   datetime64[ns]
 1   ssn                3309 non-null   float64       
 2   smoothed_ssn       3309 non-null   float64       
 3   observed_swpc_ssn  3309 non-null   float64       
 4   smoothed_swpc_ssn  3309 non-null   float64       
 5   f10.7              3309 non-null   float64       
 6   smoothed_f10.7     3309 non-null   float64       
dtypes: datetime64[ns](1), float64(6)
memory 

In [4]:
# Left-join every solar table onto the satellite rows by date. Each table's
# own date column is dropped after the join because it duplicates time_column
# wherever a match exists.
# NOTE(review): the cycle-indices table has far fewer rows than the daily SSN
# table (3309 vs 9137 in the saved output), so it looks monthly. An exact-date
# join would then only fill satellite rows that fall on its time-tag dates;
# confirm this is intended.
dynamics = satellite_data
for solar_frame, solar_date_column in (
    (swpc_observed_ssn, "Obsdate"),
    (swpc_observed_solar_cycle_indicies, "time-tag"),
    (swpc_dgd, "Date"),
):
    dynamics = pd.merge(
        dynamics,
        solar_frame,
        left_on=time_column,
        right_on=solar_date_column,
        how="left",
    ).drop(columns=[solar_date_column])

# info() prints the summary itself and returns None; no print() wrapper needed.
dynamics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 757510 entries, 0 to 757509
Data columns (total 81 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Time                    757510 non-null  datetime64[ns]
 1   x-axis                  2614 non-null    float64       
 2   y-axis                  2614 non-null    float64       
 3   y-axis.1                2614 non-null    float64       
 4   Battery Bus             2627 non-null    float64       
 5   Battery                 2631 non-null    float64       
 6   5V                      2627 non-null    float64       
 7   3.3V                    2627 non-null    float64       
 8   C1 Input                2610 non-null    float64       
 9   C1 Output               2610 non-null    float64       
 10  C2 Input                2610 non-null    float64       
 11  C2 Output               2610 non-null    float64       
 12  C3 Input                2610 n

In [5]:
import plotly.graph_objects as go


def plot_general_dynamic(df: pd.DataFrame, title: "str | None" = None) -> None:
    """Plot every column of ``df`` except the time column as a line over time.

    Args:
        df: Frame holding the module-level ``time_column`` plus the series to
            draw; one trace is added per remaining column.
        title: Figure title. When omitted it is built from the module-level
            ``satellite_name`` so the caption names the satellite actually
            being analysed rather than a fixed one.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    if title is None:
        title = (
            f"Dynamics of Solar Parameters and {satellite_name.capitalize()} "
            "Values Over Time"
        )

    fig = go.Figure()

    # Keep the original column labels for lookups; only the legend/hover text
    # needs a string form.
    value_columns = [column for column in df.columns if column != time_column]

    for column in value_columns:
        label = str(column)
        fig.add_trace(
            go.Scatter(
                x=df[time_column],
                y=df[column],
                mode="lines",
                name=label,
                hoverinfo="text",
                text=label,
            )
        )

    fig.update_layout(
        title=title,
        title_x=0.5,
        title_y=0.01,
        xaxis_title="Time",
        yaxis_title="Values",
        xaxis_tickangle=-45,
        legend=dict(
            title="Parameters",
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1,
            itemsizing="constant",
        ),
        template="plotly_white",
        margin=dict(l=40, r=40, t=40, b=100),
    )

    fig.show()


def get_normalized_dataframe(df: pd.DataFrame, exclude_column: str) -> pd.DataFrame:
    """Return a copy of ``df`` with every column but one standardized.

    Each column other than ``exclude_column`` has its mean subtracted and is
    divided by its standard deviation (as computed by ``DataFrame.std``).
    The input frame is not modified.

    Args:
        df: Frame to standardize.
        exclude_column: Column copied through unchanged.

    Returns:
        A new frame with the same columns as ``df``.
    """
    target_columns = df.columns[df.columns != exclude_column]
    features = df[target_columns]

    result = df.copy()
    # sub/div align the per-column statistics on column labels, exactly as
    # the arithmetic operators do.
    result[target_columns] = features.sub(features.mean()).div(features.std())
    return result


normalized_dynamics = get_normalized_dataframe(dynamics, exclude_column=time_column)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (a fresh checkout may not have it).
artifacts_dir = Path("artifacts")
artifacts_dir.mkdir(parents=True, exist_ok=True)
normalized_dynamics.to_csv(
    artifacts_dir / f"{satellite_name}.csv", index=False, encoding="utf-8"
)

# Plot call left disabled, as in the original notebook; uncomment to inspect.
# plot_general_dynamic(normalized_dynamics)

# Dependency Graph Based on Highest Correlation

In this analysis, we compute the correlation between different variables in a dataset and visualize the relationships using a dependency graph. The following mathematical concepts are involved in this process:

## 1. Correlation Coefficient

The correlation coefficient quantifies the degree to which two variables are related. It is calculated using the formula:

$$
r_{xy} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
$$


Where:
- $r_{xy}$ is the correlation coefficient between variables $X$ and $Y$.
- $\text{Cov}(X, Y)$ is the covariance between $X$ and $Y$.
- $\sigma_X$ is the standard deviation of variable $X$.
- $\sigma_Y$ is the standard deviation of variable $Y$.

## 2. Covariance

Covariance measures how much two random variables vary together. It is defined as:

$$
\text{Cov}(X, Y) = E\left[(X - E[X])(Y - E[Y])\right]
$$


Where:
- $E[X]$ is the expected value (mean) of $X$.
- $E[Y]$ is the expected value (mean) of $Y$.

## 3. Maximum Correlation

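For each variable in the dataset, we find the variables with which it has the highest correlation (the code below keeps the `top_n = 5` largest coefficients per variable). For the single best match, this is represented mathematically as:

$$
\text{max\_corr}(X) = \arg\max_{Y \neq X} r_{XY}
$$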


Where:
- $X$ is a variable from the dataset.
- $Y$ represents all other variables in the dataset.
- $r_{XY}$ is the correlation coefficient between $X$ and $Y$.

## 4. Graph Representation

The relationships are represented as a graph where:
- Each node represents a variable.
- Each edge represents a relationship based on maximum correlation.

### Nodes
Each unique variable is added as a node:

$$
\text{nodes} = \{\, \{\text{name}: X\},\ \{\text{name}: Y\},\ \dots \,\}
$$


### Edges
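An edge is created from each variable to each of its most correlated variables, carrying the correlation coefficient as its value:

$$
\text{edges} = \{\, \{\text{source}: X,\ \text{target}: Y,\ \text{value}: r_{XY}\},\ \dots \,\}
$$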


## Conclusion

The resulting graph visualizes how each variable relates to its most strongly correlated counterparts, providing insight into dependencies within the dataset.


In [6]:
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Graph

# Correlations are computed between measurements only, so the timestamp
# column is removed first.
non_time_measurements = normalized_dynamics.drop(columns=time_column)

def _top_correlation_links(
    df: pd.DataFrame, top_n: int
) -> tuple[list[dict], list[dict]]:
    """Build node and link dicts from each column's ``top_n`` largest correlations."""
    nodes: list[dict] = []
    edges: list[dict] = []
    # Set of names already emitted: constant-time membership instead of
    # scanning the node list for every candidate.
    seen_names: set = set()

    def add_node(name) -> None:
        if name not in seen_names:
            seen_names.add(name)
            nodes.append({"name": name})

    correlation_matrix = df.corr()

    for column in df.columns:
        # Drop the self-correlation and pairs whose coefficient is undefined.
        correlations = correlation_matrix[column].drop(column).dropna()

        # nlargest keeps the most positive coefficients; strongly negative
        # correlations are therefore not selected. An empty series simply
        # yields no iterations.
        for corr_col, corr_value in correlations.nlargest(top_n).items():
            add_node(column)
            add_node(corr_col)
            edges.append(
                {"source": column, "target": corr_col, "value": corr_value}
            )

    return nodes, edges


def render_dependency_graph(df: pd.DataFrame, top_n: int = 5) -> Graph:
    """Build a circular graph linking each column to its most correlated columns.

    Args:
        df: Frame whose pairwise column correlations (``DataFrame.corr``) are
            visualized.
        top_n: Number of largest correlation coefficients kept per column.

    Returns:
        A pyecharts ``Graph`` with one node per column that takes part in at
        least one link and one directed link per (column, top-correlated
        column) pair; the link value is the correlation coefficient.
    """
    nodes, edges = _top_correlation_links(df, top_n)

    return (
        Graph()
        .add("", nodes=nodes, links=edges, layout="circular", is_rotate_label=True)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Dependency Graph Based on Highest Correlations"
            ),
            tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{c}"),
        )
    )

# Draw the graph using the five largest correlations of every measurement.
# As the last expression of the cell, the value returned by render_notebook()
# is what the notebook displays.
render_dependency_graph(non_time_measurements, top_n=5).render_notebook()

## Mathematical Representation

The objective function can be represented mathematically as:

$$
L(y, \hat{y}) = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2
$$


where:
- $y_i$ is the true value of the $i$-th instance,
- $\hat{y}_i$ is the predicted value for the $i$-th instance, and
- $N$ is the number of instances.

This formula represents the mean squared error (MSE), which measures the average of the squares of the errors—that is, the average squared difference between the estimated values ($\hat{y}$) and the actual value ($y$).


# Cross-Correlation Calculation in Polaris-ML

This document describes the mathematical representation of cross-correlation as implemented in Polaris-ML, referencing the configuration provided for XGBoost.

## Mathematical Representation

Cross-correlation measures the similarity between two signals as a function of the time-lag applied to one of them. For discrete signals $ A $ and $ B $, the cross-correlation $ R_{AB}(\tau) $ can be defined as:

$$
R_{AB}(\tau) = \sum_{n=-\infty}^{\infty} A[n] B[n + \tau]
$$


where:
- $ R_{AB}(\tau) $ is the cross-correlation function at lag $ \tau $,
- $ A[n] $ is the signal $ A $ at time index $ n $,
- $ B[n + \tau] $ is the signal $ B $ shifted by $ \tau $.

In practical applications, when dealing with finite-length sequences, this can be expressed as:

$$
R_{AB}(\tau) = \sum_{n=0}^{N-1} A[n] B[n + \tau]
$$


for $ -(N-1) \le \tau \le M-1 $ (terms whose index $ n + \tau $ falls outside $ [0, M-1] $ are taken as zero), where:
- $ N $ is the length of signal $ A $,
- $ M $ is the length of signal $ B $.

### Normalization

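To remove the influence of the signals' overall amplitude (the analogue of brightness and contrast changes in image processing, where this technique is also used), normalization is often applied. The normalized cross-correlation can be defined as: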

$$
C_{AB}(\tau) = \frac{R_{AB}(\tau)}{\sqrt{R_{AA}(0) R_{BB}(0)}}
$$


where:
- $ R_{AA}(0) = R_{AA}(\tau = 0) = \sum_{n=0}^{N-1} A[n]^2 $
- $ R_{BB}(0) = R_{BB}(\tau = 0) = \sum_{n=0}^{M-1} B[n]^2 $

This normalization ensures that the cross-correlation values are bounded between -1 and 1, allowing for a more interpretable measure of similarity.

In [9]:
from modules.learn.analysis import cross_correlate

# Run the project's cross-correlation step on the normalized CSV written
# earlier in this notebook, indexed by the timestamp column.
# NOTE(review): presumably this writes the graph JSON that the next cell
# loads from output_graph_file; confirm in modules.learn.analysis.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so
# the graph file on disk may be stale or missing; re-run to completion before
# relying on the plot below.
cross_correlate(
    input_file=f"artifacts/{satellite_name}.csv",
    output_graph_file=f"artifacts/{satellite_name}_graph.json",
    index_column=time_column,
    xcorr_configuration_file="../cfg/model.json",
    dropna=True
)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Cannot find RMS Error for +Y External
Cannot find RMS Error for +Y Internal
Cannot find RMS Error for +X External
Cannot find RMS Error for +X Internal
Cannot find RMS Error for +X
Cannot find RMS Error for +Y


KeyboardInterrupt: 

In [8]:
import json
from pyecharts import options as opts
from pyecharts.charts import Graph

# Load the graph description from the artifacts folder. JSON is read as
# UTF-8 explicitly so the result does not depend on the platform's default
# encoding.
with open(f"artifacts/{satellite_name}_graph.json", "r", encoding="utf-8") as file:
    loaded_json = json.load(file)

data = loaded_json["graph"]

links = []
# Number of links touching each node, keyed by node name. A dict keeps
# first-seen order, so the node list below is the same on every run
# (iterating a set of strings is not).
bound_counts: dict = {}

for link in data["links"]:
    source, target = link["source"], link["target"]
    links.append({"source": source, "target": target, "value": link["value"]})

    bound_counts[source] = bound_counts.get(source, 0) + 1
    # A self-loop is still a single link, so count it once for its node.
    if target != source:
        bound_counts[target] = bound_counts.get(target, 0) + 1

# Kept as a set of node names, as before.
nodes = set(bound_counts)

# One pass over the links above replaces re-scanning every link per node.
node_list = [
    {
        "name": node,
        "symbolSize": 20,
        "value": f"{node} - {bound_count} bound(s)",
    }
    for node, bound_count in bound_counts.items()
]

graph = (
    Graph()
    .add("", node_list, links, repulsion=8000)
    .set_global_opts(
        title_opts=opts.TitleOpts(title="2D Dependency Graph"),
        tooltip_opts=opts.TooltipOpts(trigger="item", formatter="{b}: {c}"),
    )
)

graph.render_notebook()