In [None]:
import pandas as pd
from pathlib import Path
import glob


def read_satellite_data(path: Path) -> pd.DataFrame:
    """Load every CSV in a directory and average rows sharing a timestamp.

    Args:
        path: Directory that holds the satellite ``*.csv`` files. A plain
            string works as well as a ``Path`` because the value is only
            interpolated into a glob pattern.

    Returns:
        The concatenation of all files, grouped by the ``Time`` column with
        every other column averaged, and ``Time`` converted to datetime
        truncated to midnight.

    Raises:
        ValueError: If the directory contains no ``*.csv`` files.
    """
    # glob yields files in arbitrary order; sort so runs are reproducible.
    csv_files = sorted(glob.glob(f"{path}/*.csv"))
    if not csv_files:
        # Without this guard pd.concat fails with the much less helpful
        # "No objects to concatenate".
        raise ValueError(f"No CSV files found in {str(path)!r}")

    combined = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)
    df = combined.groupby("Time", as_index=False).mean()

    # Grouping is done on the raw timestamp; the time of day is discarded only
    # afterwards, so several rows may end up sharing one calendar date.
    df["Time"] = pd.to_datetime(df["Time"]).dt.normalize()
    return df


# Load the LightSail-2 CSV files; the path is relative to the working
# directory. The resulting frame is reused by later cells.
lightsail2_data = read_satellite_data("data/satellites/lightsail-2")

print(lightsail2_data)

In [None]:
def read_solar_data(file_path: Path, date_column: str) -> pd.DataFrame:
    """Read a JSON file into a DataFrame and parse one column as datetimes.

    Args:
        file_path: Location of the JSON file, passed straight to
            ``pd.read_json``.
        date_column: Name of the column to convert with ``pd.to_datetime``.

    Returns:
        The loaded table with ``date_column`` holding datetime values; all
        other columns are left as ``pd.read_json`` produced them.
    """
    records = pd.read_json(file_path)
    parsed_dates = pd.to_datetime(records[date_column])
    # Assigning to an existing column name keeps that column's position.
    return records.assign(**{date_column: parsed_dates})


# Observed sunspot-number table; its date column is named "Obsdate".
swpc_observed_ssn = read_solar_data(
    "data/solar/swpc/swpc_observed_ssn.json", "Obsdate"
)
print(swpc_observed_ssn)

# Observed solar-cycle indices; its date column is named "time-tag".
# (The "indicies" spelling of the variable is kept because later cells use it.)
swpc_observed_solar_cycle_indicies = read_solar_data(
    "data/solar/swpc/observed-solar-cycle-indices.json", "time-tag"
)
print(swpc_observed_solar_cycle_indicies)

In [None]:
def merge_dataframes(
    df1: pd.DataFrame, df2: pd.DataFrame, left_on: str, right_on: str
) -> pd.DataFrame:
    """Left-join ``df2`` onto ``df1`` using differently named key columns.

    Args:
        df1: Left table; every one of its rows is kept.
        df2: Right table supplying the additional columns.
        left_on: Key column in ``df1``.
        right_on: Key column in ``df2``.

    Returns:
        The joined table. Both key columns are present in the result.
    """
    return df1.merge(df2, how="left", left_on=left_on, right_on=right_on)


# Left-join the observed-SSN table onto the satellite data by date, then
# discard the right-hand key column, which duplicates "Time" on matched rows.
dynamics = merge_dataframes(
    lightsail2_data, swpc_observed_ssn, "Time", "Obsdate"
).drop(columns=["Obsdate"])

# Same again for the solar-cycle indices, keyed on "time-tag".
dynamics = merge_dataframes(
    dynamics, swpc_observed_solar_cycle_indicies, "Time", "time-tag"
).drop(columns=["time-tag"])

print(dynamics)

In [None]:
import plotly.graph_objects as go


def plot_general_dynamic(df: pd.DataFrame) -> None:
    """Show an interactive line chart with one trace per non-``Time`` column.

    Args:
        df: Table with a ``Time`` column used as the x-axis; every other
            column is drawn as its own line against it.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    figure = go.Figure()

    for label in df.columns:
        if label == "Time":
            continue
        # Column labels are stringified before being used for lookup and as
        # the trace name.
        series_name = f"{label}"
        trace = go.Scatter(
            x=df["Time"],
            y=df[series_name],
            mode="lines",
            name=series_name,
            # Hover content is restricted to the `text` value, i.e. the
            # column name.
            hoverinfo="text",
            text=series_name,
        )
        figure.add_trace(trace)

    # Horizontal legend anchored just above the top-right of the plot area.
    legend_options = dict(
        title="Parameters",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1,
        itemsizing="constant",
    )
    # The enlarged bottom margin pairs with title_y=0.01 and the rotated
    # x tick labels.
    margin_options = dict(l=40, r=40, t=40, b=100)

    figure.update_layout(
        title="Dynamics of Solar Parameters and Lightsail Values Over Time",
        title_x=0.5,
        title_y=0.01,
        xaxis_title="Time",
        yaxis_title="Values",
        xaxis_tickangle=-45,
        legend=legend_options,
        template="plotly_white",
        margin=margin_options,
    )

    figure.show()


# Plot the merged data on its original (un-normalized) scale.
plot_general_dynamic(dynamics)

In [None]:
def get_normalized_dataframe(df: pd.DataFrame, exclude_column: str) -> pd.DataFrame:
    """Return a copy of ``df`` with every column but one standardized.

    Each column other than ``exclude_column`` is replaced by
    ``(value - column mean) / column std``, using pandas' default ``mean()``
    and ``std()``. The input frame is not modified.

    Args:
        df: Source table.
        exclude_column: Column to carry over unchanged.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    targets = [name for name in df.columns if name != exclude_column]
    subset = df[targets]

    result = df.copy()
    result[targets] = (subset - subset.mean()) / subset.std()
    return result


# Standardize every column except "Time" so that series with different
# magnitudes can share one y-axis, then plot the result.
normalized_dynamics = get_normalized_dataframe(dynamics, exclude_column="Time")

plot_general_dynamic(normalized_dynamics)

# Dependency Graph Based on Highest Correlation

In this analysis, we compute the correlation between different variables in a dataset and visualize the relationships using a dependency graph. The following mathematical concepts are involved in this process:

## 1. Correlation Coefficient

The correlation coefficient quantifies the degree to which two variables are related. It is calculated using the formula:

$$
r_{XY} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
$$


Where:
- $r_{XY}$ is the correlation coefficient between variables $X$ and $Y$.
- $\text{Cov}(X, Y)$ is the covariance between $X$ and $Y$.
- $\sigma_X$ is the standard deviation of variable $X$.
- $\sigma_Y$ is the standard deviation of variable $Y$.

## 2. Covariance

Covariance measures how much two random variables vary together. It is defined as:

$$
\text{Cov}(X, Y) = E\left[(X - E[X])(Y - E[Y])\right]
$$


Where:
- $E[X]$ is the expected value (mean) of $X$.
- $E[Y]$ is the expected value (mean) of $Y$.

## 3. Maximum Correlation

For each variable in the dataset, we find the variable with which it has the highest correlation. This is represented mathematically as:

$$
\text{max\_corr}(X) = \arg\max_{Y \neq X} r_{XY}
$$


Where:
- $X$ is a variable from the dataset.
- $Y$ represents all other variables in the dataset.
- $r_{XY}$ is the correlation coefficient between $X$ and $Y$.

## 4. Graph Representation

The relationships are represented as a graph where:
- Each node represents a variable.
- Each edge represents a relationship based on maximum correlation.

### Nodes
Each unique variable is added as a node:

$$
\text{nodes} = \{\, \{\text{"name"}: X\},\ \{\text{"name"}: Y\},\ \dots \,\}
$$


### Edges
An edge is created from each variable to its maximum correlated variable:

$$
\text{edges} = \{\, \{\text{"source"}: X,\ \text{"target"}: \text{max\_corr}(X)\},\ \dots \,\}
$$


## Conclusion

The resulting graph visualizes how each variable relates to its most correlated counterpart, providing insights into dependencies within the dataset.


In [None]:
from pyecharts import options as opts
from pyecharts.charts import Graph


def render_dependency_graph(df: pd.DataFrame) -> Graph:
    """Build a pyecharts graph linking each column to its top-correlated peer.

    For every column of ``df`` the other column with the largest (signed)
    correlation coefficient is looked up in ``df.corr()`` and an edge
    ``column -> peer`` is added. Columns whose correlations with all other
    columns are NaN contribute no edge.

    Args:
        df: Table whose columns are the variables to relate.

    Returns:
        A configured ``Graph`` chart. The caller decides how to render it.
    """
    nodes: list[dict] = []
    edges: list[dict] = []
    # Names already emitted as nodes. A set lookup replaces the original
    # scan over a list of dicts while keeping first-seen node order.
    seen_names: set = set()

    correlation_matrix = df.corr()

    for column in df.columns:
        # Drop the column's correlation with itself (always the maximum)
        # and any undefined coefficients.
        correlations = correlation_matrix[column].drop(column).dropna()
        if correlations.empty:
            continue

        # idxmax returns the *label* of the best-correlated column.
        max_corr = correlations.idxmax()

        for name in (column, max_corr):
            if name not in seen_names:
                seen_names.add(name)
                nodes.append({"name": name})

        edges.append({"source": column, "target": max_corr})

    return (
        Graph()
        .add("", nodes=nodes, links=edges)
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="Dependency Graph Based on Highest Correlation"
            ),
            tooltip_opts=opts.TooltipOpts(trigger="item"),
        )
    )


# Build the graph for the merged data and hand it to pyecharts' notebook
# renderer; as the last expression of the cell its result is what is displayed.
render_dependency_graph(dynamics).render_notebook()

In [None]:
import graphviz


def render_graphviz_dependency_graph(df: pd.DataFrame) -> None:
    """Draw the highest-correlation dependency graph with Graphviz.

    Every column becomes a node. Each column that has at least one defined
    correlation with another column gets a directed edge to the column with
    the largest (signed) coefficient. The graph is rendered under the name
    ``dependency_graph`` in PNG format and the image
    ``dependency_graph.png`` is then displayed with IPython.

    Args:
        df: Table whose columns are the variables to relate.
    """
    graph = graphviz.Digraph(format="png")
    corr = df.corr()

    for name in df.columns:
        graph.node(name)

        # Exclude the self-correlation and undefined coefficients.
        candidates = corr[name].drop(name).dropna()
        if candidates.empty:
            continue

        graph.edge(name, candidates.idxmax())

    # NOTE(review): cleanup=True presumably removes the intermediate DOT
    # source after rendering - confirm against the graphviz docs.
    graph.render("dependency_graph", cleanup=True)

    from IPython.display import Image, display

    display(Image(filename="dependency_graph.png"))
    
# Same dependency graph as a static Graphviz image.
render_graphviz_dependency_graph(dynamics)

In [None]:
import matplotlib.pyplot as plt


def plot_columns(df: pd.DataFrame, cols1: list[str]) -> None:
    """Plot each column in ``cols1`` against every other column over time.

    One figure is produced per pair. The ``cols1`` column is drawn in red
    on the left y-axis and the other column in blue on a twin right y-axis,
    both against the ``Time`` column.

    Args:
        df: Table containing a ``Time`` column plus the data columns.
        cols1: Columns to use as the left-axis (red) series.

    Returns:
        None. Figures are shown with ``plt.show()`` and a caption line is
        printed after each one.
    """
    time_col: str = "Time"
    cols2_set: list[str] = df.columns.drop(time_col).tolist()

    for col1 in cols1:
        for col in cols2_set:
            if col == col1:
                # A column plotted against itself is two identical curves
                # and carries no information, so skip the self-pair.
                continue

            fig, ax1 = plt.subplots(figsize=(12, 6))

            ax1.plot(df[time_col], df[col1], label=col1, marker="o", color="red")
            ax1.set_xlabel(time_col)
            ax1.set_ylabel(col1, color="red")
            ax1.tick_params(axis="y", labelcolor="red")

            # Second y-axis sharing the same x-axis for the comparison series.
            ax2 = ax1.twinx()
            ax2.plot(df[time_col], df[col], label=col, marker="x", color="blue")
            ax2.set_ylabel("Values", color="blue")
            ax2.tick_params(axis="y", labelcolor="blue")

            plt.title(f"{col1} and {col} over time")
            plt.xticks(rotation=45)

            # Merge the handles of both axes into a single legend.
            lines, labels = ax1.get_legend_handles_labels()
            lines2, labels2 = ax2.get_legend_handles_labels()
            ax2.legend(lines + lines2, labels + labels2)

            plt.grid()
            fig.tight_layout()
            plt.show()
            print(f"{col1} and {col} Over Time")


# Names of all solar-data columns: every column of the two solar tables
# except their respective date columns.
solar_columns: list[str] = (
    swpc_observed_ssn.columns.drop("Obsdate").tolist()
    + swpc_observed_solar_cycle_indicies.columns.drop("time-tag").tolist()
)

# Use the standardized data so both series of each pair have comparable scale.
plot_columns(normalized_dynamics, solar_columns)

In [None]:
def print_correlations(df: pd.DataFrame, target_column: str) -> None:
    """Print the correlations of every column with ``target_column``.

    The correlation matrix is computed three times (Pearson, Kendall and
    Spearman). For each method the ``target_column`` column of the matrix
    is printed in descending order with NaN entries removed. The row
    labelled ``Time`` is left out when present.

    Args:
        df: Table to correlate.
        target_column: Column whose correlations are reported.

    Returns:
        None. Results are written to standard output.
    """
    methods: list[str] = ["pearson", "kendall", "spearman"]

    for method in methods:
        correlations = (
            df.corr(method=method)
            # errors="ignore": the matrix has no "Time" row when the frame
            # has no such column; that must not raise KeyError.
            .drop("Time", errors="ignore")[target_column]
            .sort_values(ascending=False)
            .dropna()
        )
        print(f"Correlations using {method}:\n{correlations}\n")

# Report correlations of every column with each solar parameter in turn.
for solar_column in solar_columns:
    print_correlations(dynamics, solar_column)

# Statistical Analysis Interpretation

In statistical analysis, particularly when examining the relationship between two variables, several key metrics provide insights into the nature and strength of that relationship. Here’s a breakdown of these metrics:

## 1. Correlation Coefficient
- **Definition**: The correlation coefficient quantifies the degree to which two variables are related.
- **Interpretation**:
  - A value close to **+1** indicates a strong positive correlation (as one variable increases, the other tends to increase).
  - A value close to **-1** indicates a strong negative correlation (as one variable increases, the other tends to decrease).
  - A value around **0** suggests little to no linear relationship.

## 2. Regression Equation
- **Definition**: The regression equation represents the fitted line from the regression analysis.
- **Form**: 
  $$
  y = b_0 + b_1x
  $$
  Where:
  - $y$ is the dependent variable.
  - $b_0$ is the intercept (expected value of $y$ when $x = 0$).
  - $b_1$ is the slope (change in $y$ for a one-unit increase in $x$).
- **Interpretation**: The slope indicates how much change in the dependent variable is expected for each unit change in the independent variable.

## 3. R-squared (R²)
- **Definition**: R-squared measures how well the independent variable(s) explain the variability of the dependent variable.
- **Interpretation**:
  - An R² value close to **1** means that a large proportion of the variance in $y$ can be explained by changes in $x$.
  - An R² value close to **0** indicates that changes in $x$ do not explain much of the variance in $y$.

## 4. Standard Error of the Slope
- **Definition**: This statistic provides an estimate of how much variability exists in the slope estimate.
- **Interpretation**: A smaller standard error relative to the slope suggests greater confidence in the slope estimate.

## 5. P-value
- **Definition**: The p-value tests the null hypothesis that there is no relationship between the independent and dependent variables.
- **Interpretation**:
  - A small p-value (typically < 0.05) suggests strong evidence against the null hypothesis, indicating a statistically significant relationship between variables.

## 6. Confidence Interval (CI)
- **Definition**: The confidence interval provides a range within which we expect the true parameter (e.g., slope) to fall with a certain level of confidence (typically 95%).
- **Interpretation**: If the confidence interval does not include zero, it supports that there is a statistically significant relationship between variables.

In [None]:
import statsmodels.api as sm
import seaborn as sns


def plot_regression(dynamics: pd.DataFrame, solar_param: str) -> None:
    """Plot an OLS regression of every other column on ``solar_param``.

    For each column other than ``solar_param`` and ``Time``, rows where
    either value is missing are dropped, an ordinary-least-squares model
    ``column ~ const + solar_param`` is fitted, and a scatter plot with a
    regression line is shown. The plot is annotated with the fitted
    equation, R² and the confidence interval of the slope as returned by
    ``model.conf_int()``.

    Args:
        dynamics: Table holding ``solar_param``, ``Time`` and the columns
            to regress.
        solar_param: Column used as the independent variable (x-axis).

    Returns:
        None. Figures are shown with ``plt.show()`` and a caption line is
        printed after each one.
    """
    # Index.difference returns an Index of the remaining column labels.
    columns_to_plot = dynamics.columns.difference([solar_param, "Time"])

    for column in columns_to_plot:
        valid_data = dynamics[[column, solar_param]].dropna()

        if valid_data.empty:
            # With no overlapping observations the fit would raise and abort
            # the remaining plots; skip this pair instead.
            print(f"Skipping {solar_param} vs {column}: no overlapping data")
            continue

        model = sm.OLS(
            valid_data[column], sm.add_constant(valid_data[solar_param])
        ).fit()

        plt.figure(figsize=(10, 6))
        sns.scatterplot(data=valid_data, x=solar_param, y=column)
        sns.regplot(
            data=valid_data, x=solar_param, y=column, scatter=False, color="red"
        )

        # params is ordered (constant, solar_param) because add_constant
        # places the constant first.
        intercept, slope = model.params
        r_squared = model.rsquared
        # Row of the interval table for the slope; its bounds are labelled
        # 0 and 1.
        conf_int = model.conf_int().loc[solar_param]

        textstr = (
            f"y = {intercept:.2f} + {slope:.2f}x\n"
            f"R²: {r_squared:.2f}\n"
            f"95% CI: [{conf_int[0]:.2f}, {conf_int[1]:.2f}]"
        )

        axes = plt.gca()
        # Annotation box in the top-left corner, in axes-relative coordinates.
        axes.text(
            0.05,
            0.95,
            textstr,
            transform=axes.transAxes,
            fontsize=12,
            verticalalignment="top",
            bbox=dict(boxstyle="round", facecolor="white", alpha=0.5),
        )

        plt.title(f"Scatter Plot of {solar_param} vs {column}")
        plt.xlabel(solar_param)
        plt.ylabel(column)
        plt.grid()

        plt.show()
        print(f"Scatter Plot of {solar_param} vs {column}")


# One set of regression plots per solar parameter, on the un-normalized data.
for solar_column in solar_columns:
    plot_regression(dynamics, solar_column)