In [None]:
import json
import logging
import os
from typing import Union

import numpy as np
import pandas as pd
from matplotlib.colors import Normalize
from pandas import Timestamp

In [None]:
# Root logging configuration: DEBUG level, records rendered as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Logger used by this notebook; its own threshold is raised to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [None]:
def remove_values(values: list, to_remove: list) -> list:
    """
    Returns the elements of ``values`` that do not occur in ``to_remove``.

    The relative order of the surviving elements and any duplicates among
    them are preserved. Neither input list is modified.

    Args:
        values: The list from which elements will be removed.
        to_remove: The list containing the values to be removed.

    Returns:
        A new list with the elements of ``values`` that are not present in
        ``to_remove``.
    """
    try:
        # Hash-based lookup avoids rescanning ``to_remove`` for every element
        # of ``values``.
        excluded = set(to_remove)
        return [value for value in values if value not in excluded]
    except TypeError:
        # An element of either list is unhashable (e.g. a list or a dict):
        # fall back to the plain equality scan over ``to_remove``.
        return [value for value in values if value not in to_remove]

In [None]:
# Relative path of the cached, normalized telemetry frames.
satellite_json: str = "../data/LightSail-2/cache/normalized_frames.json"

# Read the whole file as text. The encoding is pinned so decoding does not
# depend on the platform's default locale.
with open(satellite_json, 'r', encoding='utf-8') as file:
    json_stream: str = file.read()

json_data: dict = json.loads(json_stream)

if not json_data:
    logger.error("Json data is empty")
    # Raise instead of calling exit(): exit() is an interactive helper that,
    # inside a notebook, asks the kernel to shut down rather than reporting
    # the failure in the cell output.
    raise ValueError("Json data is empty")


class Satellite:
    """Plain record describing a satellite and how many observations it has."""

    def __init__(self, norad_id: int, name: str, observations_amount: int):
        """
        Stores the given values unchanged as instance attributes.

        Args:
            norad_id: NORAD identifier of the satellite.
            name: Name of the satellite.
            observations_amount: Number of observations available for it.
        """
        self.norad_id = norad_id
        self.name = name
        self.observations_amount = observations_amount


# Everything that describes the data set (as opposed to the frames themselves).
metadata: dict = json_data['metadata']

# Satellite identity and frame count, taken straight from the metadata.
satellite: Satellite = Satellite(
    norad_id=metadata['satellite_norad'],
    name=metadata['satellite_name'],
    observations_amount=metadata['total_frames'],
)

# In the structure sketches below, '...' stands for further entries of the
# same shape.

# Both lookups below read from the same 'analysis' section of the metadata.
analysis_metadata: dict = metadata['analysis']

# column_tags maps each column name to a tag:
# "column_tags": {
#               "some_variable": "variable",
#               "some_constant": "constant",
#               ...
#               "some_status": "status"
#             }
solar_and_satellite_parameters_names: dict[str, str] = analysis_metadata[
    'column_tags']

# feature_columns is a list of column names:
# "feature_columns": [
#                 "Fredericksburg A",
#                 "Fredericksburg K 0-3"
#                 ]
solar_parameters_names: list[str] = analysis_metadata['feature_columns']

# Every tagged column that is not a feature column counts as a satellite
# parameter. Iterating the dict yields its keys, i.e. the column names.
satellite_parameters_names: list[str] = [
    column_name
    for column_name in solar_and_satellite_parameters_names
    if column_name not in solar_parameters_names
]

# frames look like (fields might contain both satellite and solar parameters):
# "frames": [
#         {
#             "time": "2019-07-02 18:15:08",
#             "measurement": "",
#             "tags": {
#                 "satellite": "",
#                 "decoder": "Lightsail2",
#                 "station": "",
#                 "observer": "",
#                 "source": "",
#                 "version": "1.66.0"
#             },
#             "fields": {
#                 "dest_callsign": {
#                     "value": "N6CP  ",
#                     "unit": null
#                 },
#                 "Fredericksburg A": {
#                     "value": 3.3125,
#                     "unit": "V"
#                 },
#                 ...
#             }
#         },
#         ...
#     ]

# 'frames' is iterated as a sequence of frame objects, each with a 'time'
# entry and a 'fields' mapping.
frames: list[dict] = json_data['frames']

# Value types a single field is expected to carry.
allowed_types = Union[int, float, bool, str]

# One flat record per frame: the parsed timestamp plus every field's value.
observation_data: list[dict] = []

# 'time' is parsed with pd.to_datetime. The sample frame documented above
# shows a "YYYY-MM-DD HH:MM:SS" string rather than an epoch number.
# NOTE(review): confirm the actual format of 'time' in the data file.
for frame in frames:
    observation_time: Timestamp = pd.to_datetime(frame['time'])

    record: dict = {'time': observation_time}
    for field_name, field_payload in frame['fields'].items():
        # Only the value is kept; the field's unit is dropped.
        record[field_name] = field_payload['value']

    observation_data.append(record)

# Index the observations by their timestamp.
indexed_observations: pd.DataFrame = pd.DataFrame(observation_data).set_index(
    'time')

# Drop every row that has a missing value in ANY column.
# NOTE(review): this runs before the non-numeric columns are discarded, so a
# missing value in a text column also removes the row - confirm this is intended.
complete_observations: pd.DataFrame = indexed_observations.dropna()

# Keep numeric columns only.
numeric_observations: pd.DataFrame = complete_observations.select_dtypes(
    include=np.number)

# Keep only the columns with more than 1 unique value.
varying_columns: pd.Series = numeric_observations.nunique() > 1
observation_dataframe: pd.DataFrame = numeric_observations.loc[:,
                                      varying_columns]

In [None]:
# Restrict both name lists to the columns that are actually present in the
# cleaned observation table.
available_columns = observation_dataframe.columns

solar_parameters_names = [
    name for name in solar_parameters_names if name in available_columns
]

satellite_parameters_names = [
    name for name in satellite_parameters_names if name in available_columns
]

In [None]:
def calculate_correlations(df: pd.DataFrame, list1_columns: list[str],
                           list2_columns: list[str]) -> dict[str, float]:
    """
    Calculates Pearson correlation coefficients between every column of the
    first list and every column of the second list.

    All cross pairs (col1, col2) are evaluated, not only columns at matching
    positions. Pairs where both names are identical are skipped.

    Args:
        df (pd.DataFrame): DataFrame containing the observations.
        list1_columns (list): Column names used as the first element of a pair.
        list2_columns (list): Column names used as the second element of a pair.

    Returns:
        dict: Maps "<col1> vs. <col2>" to the correlation coefficient of that
              pair, in the order the pairs are visited (list1 outer, list2
              inner). Empty when there is no pair to evaluate.
    """
    pairs = [(col1, col2)
             for col1 in list1_columns
             for col2 in list2_columns
             if col1 != col2]  # A column is never correlated with itself

    if not pairs:
        return {}

    # Each needed column once, first-list names ahead of second-list names;
    # dict.fromkeys removes duplicates while keeping that order.
    needed_columns = list(dict.fromkeys(
        [col1 for col1, _ in pairs] + [col2 for _, col2 in pairs]))

    # One correlation matrix for all needed columns instead of a separate
    # two-column matrix per pair.
    correlation_matrix = df[needed_columns].corr()

    result: dict[str, float] = {}
    for col1, col2 in pairs:
        result[f"{col1} vs. {col2}"] = correlation_matrix.loc[col1, col2]

    return result


# Correlate every satellite parameter with every solar parameter.
correlations: dict[str, float] = calculate_correlations(
    observation_dataframe,
    satellite_parameters_names,
    solar_parameters_names,
)

# Strongest relationships first, regardless of sign.
ranked_pairs: list[tuple[str, float]] = sorted(
    correlations.items(),
    key=lambda entry: abs(entry[1]),
    reverse=True,
)
sorted_correlations: dict[str, float] = dict(ranked_pairs)

for pair, correlation in ranked_pairs:
    print(f"Correlation between {pair}: {correlation:.4f}")

In [None]:
import matplotlib.pyplot as plt


def plot_sat_vs_solar(df: pd.DataFrame, sat_param_name: str,
                      solar_params_names: list[str],
                      figsize=(12, 8), cmap='viridis',
                      legend_title='Data Comparison',
                      grid_style='--', grid_width=0.5, labelsize=10):
    """
    Draws scatter plots of one satellite parameter against each solar parameter.

    Every solar parameter becomes its own labelled series on a single shared
    axis (x = solar parameter values, y = satellite parameter values). The
    figure is displayed and then closed.

    Args:
        df (pd.DataFrame): DataFrame containing the observations.
        sat_param_name (str): Name of the satellite parameter (y-axis).
        solar_params_names (list): List of solar parameter names (x-axes).
        figsize (tuple, optional): Desired figure size. Defaults to (12, 8).
        cmap (str, optional): Accepted for backward compatibility but currently
            not applied; the series use Matplotlib's default colours.
            Defaults to 'viridis'.
        legend_title (str, optional): Title for the legend. Defaults to 'Data Comparison'.
        grid_style (str, optional): Linestyle for the grid (e.g., '-', '--'). Defaults to '--'.
        grid_width (float, optional): Width of the grid lines. Defaults to 0.5.
        labelsize (int, optional): Font size for labels and ticks. Defaults to 10.
    """
    figure, axis = plt.subplots(figsize=figsize)

    # One scatter series per solar parameter, labelled for the legend.
    for solar_param in solar_params_names:
        axis.scatter(df[solar_param], df[sat_param_name], alpha=0.8,
                     label=solar_param)

    # Customization and informative labels
    axis.set_xlabel("Solar Parameters", fontsize=labelsize)
    axis.set_ylabel(sat_param_name, fontsize=labelsize)
    axis.set_title(f"{sat_param_name} vs. Solar Parameters", fontsize=labelsize)
    axis.grid(True, which='both', linestyle=grid_style, linewidth=grid_width)
    # The legend is anchored outside the plot area, at the upper right edge.
    axis.legend(loc='upper left', fontsize=labelsize, title=legend_title,
                bbox_to_anchor=(1, 1))
    axis.tick_params(bottom=True, top=True, left=True, right=True, which='both',
                     labelsize=labelsize)

    figure.tight_layout()
    plt.show()
    # Close (not just clear) the figure so repeated calls do not accumulate
    # open figures in memory.
    plt.close(figure)


# One figure per satellite parameter, each against all solar parameters.
for sat_param in satellite_parameters_names:
    logger.info(f"Plotting {sat_param} vs. Solar Parameters")
    plot_sat_vs_solar(
        df=observation_dataframe,
        sat_param_name=sat_param,
        # Hand over a fresh list so the shared name list cannot be altered.
        solar_params_names=list(solar_parameters_names),
    )