In [3]:
import numpy as np

# Load a single pickled map file so its contents can be inspected in the following cells.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which can execute
# arbitrary code for a maliciously crafted file — only use it on files from a trusted source.
test_data = np.load("/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/pickled_maps/map_2024_12_1_0_0_0.npy", allow_pickle=True)

In [None]:
test_data

In [None]:
import matplotlib.pyplot as plt

plt.imshow(test_data[0])

In [None]:
test_data[0].shape

In [None]:
np.std(test_data[0])

In [1]:
import pandas as pd

stats_df = pd.read_csv(r"/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/npy_metrics.csv")

In [None]:
sorted(stats_df['filename'])[0]

In [None]:
stats_df['filename'].iloc[0]

In [None]:
# Capture the "Y_M_D_h_m_s" part of every "map_<...>.npy" filename as a string column.
timestamp_pattern = r"map_(\d+_\d+_\d+_\d+_\d+_\d+)\.npy"
stats_df["timestamp_str"] = stats_df["filename"].str.extract(timestamp_pattern, expand=False)


In [None]:
# Parse the underscore-separated timestamp strings into pandas datetimes using an explicit
# format, so the rows can be ordered and plotted chronologically.
stats_df["datetime"] = pd.to_datetime(stats_df["timestamp_str"], format="%Y_%m_%d_%H_%M_%S")


In [None]:
# Chronological order (sort_values sorts ascending by default).
stats_df = stats_df.sort_values("datetime")

In [None]:
# Dataset-level summary of the per-file statistics.
# A bare expression in the middle of a cell is evaluated but never displayed (only the last
# expression is), so the aggregates are printed explicitly.
mean_of_means = stats_df['mean'].mean()
mean_of_stds = stats_df['std'].mean()
# Root-mean-square of the per-file standard deviations, the alternative to the plain
# average that the original comment suggested.
rms_of_stds = (stats_df['std'] ** 2).mean() ** 0.5
print(f"mean of means: {mean_of_means}, mean of stds: {mean_of_stds}, RMS of stds: {rms_of_stds}")

stats_df

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

stats = ['std', 'max', 'min', 'mean']

# Draw one time series per statistic and write each figure to the plots directory.
for value in stats:
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(stats_df['datetime'], stats_df[value], color='black')

    # Labels and formatting
    ax.set_xlabel("Year", fontsize=25)
    ax.set_ylabel(f"{value}", fontsize=25)
    ax.set_title(f"{value}", fontsize=25)
    ax.tick_params(axis='x', labelrotation=45, labelsize=15)
    ax.tick_params(axis='y', labelsize=15)
    ax.grid(True)

    # Lay out and save BEFORE showing: on inline/non-interactive backends plt.show()
    # finalizes the current figure, so a savefig issued afterwards writes an empty canvas.
    fig.tight_layout()
    fig.savefig(f"/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/plots/{value}.png")
    plt.show()
    plt.close(fig)


In [None]:
import sys
sys.path.append("/mnt/nas05/data01/francesco/progetto_simone/ionosphere")  # add current dir, where src/ is located

from src.data.dataset import IonoDataset

In [None]:
# Build the 'train' split of IonoDataset from the directory of pickled maps.
# NOTE(review): the effect of transforms=True and seed=42 is defined in src.data.dataset,
# which is not visible here — presumably normalization/augmentation and split shuffling; confirm.
train_dataset = IonoDataset(
    path="/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/pickled_maps",
    transforms=True,
    split='train',
    seed=42
)

In [None]:
train_dataset[0]

In [None]:
import torch
# Fetch the first sample once. Every train_dataset[0] re-runs the dataset's __getitem__,
# so indexing four times repeats the file load and — if the transforms are stochastic —
# could compute each statistic on a different tensor.
first_map = train_dataset[0][0]
torch.max(first_map), torch.min(first_map), torch.mean(first_map), torch.std(first_map)

In [None]:
import matplotlib.pyplot as plt
plt.imshow(train_dataset[0][0][0].cpu().numpy())
plt.colorbar()

In [None]:
from src.data.dataset import get_data_objects

# Build dataset, sampler and data loader for the 'train' split in one call.
# Note that this rebinds `train_dataset`, replacing the IonoDataset instance created earlier.
# NOTE(review): the meaning of each returned object is defined in src.data.dataset — confirm there.
train_dataset, train_sampler, train_data_loader = get_data_objects(
    path="/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/pickled_maps",
    batch_size=4,
    distributed=False,
    num_data_workers=1,
    split='train',
    seed=42
)

In [None]:
train_dataset[0]

In [None]:
import matplotlib.pyplot as plt
plt.imshow(train_dataset[0][0][0].cpu().numpy())
plt.colorbar()

In [7]:
import pandas as pd

df = pd.read_csv("/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/params.csv")

In [None]:
df

In [None]:
import matplotlib.pyplot as plt

plt.hist(df['float3'])
df['float3'].min()

In [3]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Select the four parameter columns and standardize them (StandardScaler removes the mean
# and scales to unit variance per column) so no single feature dominates the distance-based
# clustering done below.
features = df[["float1", "float2", "float3", "float4"]]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(features)

In [None]:
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance
from sklearn.cluster import KMeans, MeanShift

# Range of cluster counts to scan (both ends inclusive).
min_cluster = 2
max_cluster = 50

# Seed KMeans so the elbow curve is reproducible across runs and uses the same seed as the
# final clustering; an unseeded KMeans starts from different centroids on every run.
model = KMeans(random_state=42)
# The upper end of the k tuple is exclusive, hence the +1 to include max_cluster.
visualizer = KElbowVisualizer(model, k=(min_cluster, max_cluster + 1))
visualizer.fit(X_scaled)  # Fit the data to the visualizer
visualizer.show()

In [4]:
# Final clustering with the chosen number of clusters.
from sklearn.cluster import KMeans
optimal_k = 3  # hard-coded choice; visualizer.elbow_value_ holds the value picked by the elbow search
# Fit on the standardized features and store each row's cluster label on the DataFrame.
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df["cluster"] = kmeans.fit_predict(X_scaled)

In [7]:
df.to_csv("/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/params_withcluster.csv", index=False)

In [4]:
import pandas as pd
df = pd.read_csv("/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/params_withcluster.csv")

In [None]:
import numpy as np
np.unique(df['cluster'], return_counts=True)

In [None]:
import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE
from openTSNE import TSNE

# Project the standardized features to 2-D with openTSNE for visualization.
# (An earlier scikit-learn TSNE variant was replaced by the openTSNE call below.)
# NOTE(review): this cell relies on X_scaled and optimal_k from earlier cells, while df may
# have been re-read from params_withcluster.csv in between — the rows of X_scaled must still
# line up with the rows of df; confirm on a fresh Restart & Run All.
tsne = TSNE(
    n_components=2,
    perplexity=50,
    metric="cosine",
    n_jobs=8,  # number of threads
    random_state=42
)
X_embedded = tsne.fit(X_scaled)

# Add t-SNE coordinates to DataFrame for plotting
df["tsne_1"] = X_embedded[:, 0]
df["tsne_2"] = X_embedded[:, 1]

# Set figure size to match 1920x1080 pixels with dpi=100
fig, ax = plt.subplots(figsize=(19.2, 10.8), dpi=100)

# One scatter call per cluster so each gets its own color and legend entry
for cluster_id in sorted(df["cluster"].unique()):
    subset = df[df["cluster"] == cluster_id]
    ax.scatter(subset["tsne_1"], subset["tsne_2"], s=10, label=f"Cluster {cluster_id}", alpha=0.7)

# Titles and labels
ax.set_title(f"t-SNE Projection of KMeans Clusters (k={optimal_k})", fontsize=16)
ax.set_xlabel("t-SNE 1", fontsize=12)
ax.set_ylabel("t-SNE 2", fontsize=12)
ax.legend(markerscale=2, fontsize="small", loc="best")
ax.grid(True)

# The figure is only displayed; saving to disk is currently disabled (savefig commented out).
plt.tight_layout()
# plt.savefig("tsne_clusters_fullhd.png", dpi=100)  # or dpi=200 for 4K
plt.show()



In [15]:
# One sub-frame per KMeans label (0, 1 and 2).
df_0, df_1, df_2 = (df[df["cluster"] == label] for label in (0, 1, 2))

In [None]:
from datetime import datetime
import matplotlib.pyplot as plt

# Turn every "map_Y_M_D_h_m_s.npy" filename into a datetime.
dates = []
for name in df['filename']:
    stem = name.replace('.npy', '')
    # Everything after the leading "map" token must be exactly six integer fields.
    year, month, day, hour, minute, second = (int(token) for token in stem.split('_')[1:])
    dates.append(datetime(year, month, day, hour, minute, second))

df['dates'] = dates
df['month'] = df['dates'].dt.month

# Rows: calendar month, columns: cluster label, values: number of samples.
monthly_counts = (
    df.groupby(['month', 'cluster'])
    .size()
    .unstack(fill_value=0)
)

ax = monthly_counts.plot(kind='bar', stacked=True, colormap='Set1', figsize=(19.2, 10.8))
ax.set_title("Cluster Distribution by Month", fontsize=25)
ax.set_xlabel("Month", fontsize=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
ax.set_ylabel("Number of Samples", fontsize=20)
ax.legend(title='Cluster', fontsize=15)
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
df_2

In [None]:
# Per-cluster descriptive statistics (mean/std/min/max) of the four parameter columns.
# Requires df to carry float1..float4 and the "cluster" label column.
summary = df.groupby("cluster")[["float1", "float2", "float3", "float4"]].agg(["mean", "std", "min", "max"])

# Flatten the (feature, stat) MultiIndex columns into single names such as "float1_mean"
summary.columns = ['_'.join(col) for col in summary.columns]

# Reset index so "cluster" becomes a regular column
summary = summary.reset_index()

# Long format: one row per (cluster, feature_stat) pair
long_summary = pd.melt(
    summary,
    id_vars="cluster",
    var_name="feature_stat",
    value_name="value"
)

# Split e.g. 'float1_mean' into 'feature' = 'float1' and 'stat' = 'mean'
long_summary[["feature", "stat"]] = long_summary["feature_stat"].str.extract(r"(float\d)_(\w+)")
long_summary = long_summary.drop(columns="feature_stat")

# Pivot back so each (cluster, feature) row has one column per statistic
tidy = long_summary.pivot_table(index=["cluster", "feature"], columns="stat", values="value").reset_index()

# Plain-text print of the tidy table
print(tidy)


In [None]:
import seaborn as sns

# One violin plot per parameter column, split by cluster label, with quartile lines inside.
for col in ("float1", "float2", "float3", "float4"):
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.violinplot(data=df, x="cluster", y=col, inner="quartile", ax=ax)
    ax.set_title(f"{col} violin plot per cluster")
    plt.show()


In [None]:
import seaborn as sns
sns.pairplot(df, vars=["float1", "float2", "float3", "float4"], hue="cluster", corner=True)


In [None]:
# Print the feature-to-feature correlation matrix separately for each cluster.
feature_cols = ["float1", "float2", "float3", "float4"]
for cluster_id in (0, 1, 2):
    in_cluster = df["cluster"] == cluster_id
    corr = df.loc[in_cluster, feature_cols].corr()
    print(f"\nCluster {cluster_id} Correlation:\n{corr}\n")


In [None]:
# Count the rows of cluster 2 per calendar month and show them as a bar chart.
# Requires the "month" column that is added to df in the monthly-distribution cell.
# NOTE(review): the title and y-label call these "missing" samples; nothing in this notebook
# establishes that cluster 2 corresponds to missing data — confirm before relying on the plot.
missing_by_month = df[df["cluster"] == 2].groupby("month").size()
missing_by_month.plot(kind="bar", title="Missing Data Frequency by Month", ylabel="# of Missing Samples")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os

# Parameters
clusters = [0, 1, 2]
num_samples = 10
image_shape = (24, 360)
base_path = "/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/progetto_simone/data/pickled_maps/"

# Grid with one row per cluster and one column per sampled map. The grid size is derived
# from the parameters above so changing them cannot cause an index error; squeeze=False
# keeps `axes` two-dimensional even for a single row or column.
fig, axes = plt.subplots(nrows=len(clusters), ncols=num_samples, figsize=(20, 5),
                         gridspec_kw={'wspace': 0.02, 'hspace': 0.0001}, dpi=600,
                         squeeze=False)

for row_idx, cluster_id in enumerate(clusters):
    cluster_df = df[df["cluster"] == cluster_id]
    # DataFrame.sample raises when asked for more rows than exist, so cap the request.
    sample_df = cluster_df.sample(n=min(num_samples, len(cluster_df)), random_state=42)

    # Hide ticks and frames on every cell of the row, including cells that stay empty.
    # ax.axis("off") is deliberately NOT used: it also hides the y-label, which is what
    # made the "Cluster N" row label invisible before.
    for ax in axes[row_idx]:
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)

    # Row label on the first column.
    axes[row_idx, 0].set_ylabel(f"Cluster {cluster_id}", fontsize=10)

    for col_idx, (_, row) in enumerate(sample_df.iterrows()):
        img_path = os.path.join(base_path, row["filename"])
        ax = axes[row_idx, col_idx]

        try:
            # NOTE(review): allow_pickle=True unpickles file content; trusted files only.
            img = np.load(img_path, allow_pickle=True)[0]
            if img.shape != image_shape:
                print(f"Skipping image with wrong shape: {img_path}")
                continue
        except Exception as e:
            # Best effort: report the unreadable file and leave its cell empty.
            print(f"Failed to load {img_path}: {e}")
            continue

        ax.imshow(img, cmap="viridis", aspect='equal')

plt.tight_layout(pad=0.0)
plt.show()



In [30]:
import sys
sys.path.append("/mnt/nas05/data01/francesco/progetto_simone/ionosphere")
from src.data.dataset import get_sequence_data_objects

# Build the sequence dataset, sampler and data loader for the 'train' split from the
# metrics CSV. This rebinds `train_dataset` and `train_sampler` from the earlier cells.
# NOTE(review): sequence_length=60 is requested, yet the sample inspected below contains 20
# time steps — how sequence_length maps to the returned length is defined in
# src.data.dataset; confirm there.
train_dataset, train_sampler, train_dl = get_sequence_data_objects(
    csv_path="/mnt/nas05/data01/francesco/sdo_img2img/sde_mag2mag_v2/npy_metrics.csv",
    batch_size=2,
    distributed=False,
    num_data_workers=1,
    split='train',
    seed=42,
    sequence_length=60
)

In [31]:
data_test = train_dataset[4]

In [5]:
data_test[2]

[datetime.datetime(2024, 8, 31, 10, 0),
 datetime.datetime(2024, 8, 31, 10, 2),
 datetime.datetime(2024, 8, 31, 10, 4),
 datetime.datetime(2024, 8, 31, 10, 6),
 datetime.datetime(2024, 8, 31, 10, 8),
 datetime.datetime(2024, 8, 31, 10, 10),
 datetime.datetime(2024, 8, 31, 10, 12),
 datetime.datetime(2024, 8, 31, 10, 14),
 datetime.datetime(2024, 8, 31, 10, 16),
 datetime.datetime(2024, 8, 31, 10, 18),
 datetime.datetime(2024, 8, 31, 10, 20),
 datetime.datetime(2024, 8, 31, 10, 22),
 datetime.datetime(2024, 8, 31, 10, 24),
 datetime.datetime(2024, 8, 31, 10, 26),
 datetime.datetime(2024, 8, 31, 10, 28),
 datetime.datetime(2024, 8, 31, 10, 30),
 datetime.datetime(2024, 8, 31, 10, 32),
 datetime.datetime(2024, 8, 31, 10, 34),
 datetime.datetime(2024, 8, 31, 10, 36),
 datetime.datetime(2024, 8, 31, 10, 38)]

In [3]:
data_test[3].shape

torch.Size([20, 1, 24, 360])

In [13]:
import matplotlib.pyplot as plt
import imageio
import numpy as np
import torch

# Render every time step of the image sequence to an RGB frame and write an animated GIF.
# data_test[0] is indexed as [time, channel, height, width] below.
frames = []
data_seq = data_test[0]

# Choose the figure size from the image aspect ratio so the map fills the canvas.
img_h, img_w = data_seq.shape[2], data_seq.shape[3]
aspect = img_w / img_h
base_height = 4  # inches
figsize = (base_height * aspect, base_height)

for t in range(data_seq.shape[0]):
    img = data_seq[t, 0].cpu().numpy()  # single-channel 2-D map for this time step
    fig, ax = plt.subplots(figsize=figsize)
    ax.imshow(img, cmap='viridis', aspect='auto')
    ax.set_title(f"Time step {t}")
    ax.axis('off')
    # Keep a strip at the top for the title; with top=1 the title is drawn outside the
    # canvas and is missing from the captured frame.
    fig.subplots_adjust(left=0, right=1, top=0.9, bottom=0)

    # Convert the rendered canvas to an (H, W, 3) uint8 array.
    # buffer_rgba() replaces tostring_rgb(), which is deprecated and removed in recent
    # Matplotlib releases; it is already shaped, so no manual reshape is needed.
    fig.canvas.draw()
    frame = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()
    frames.append(frame)
    plt.close(fig)

# Save as gif. Recent imageio versions interpret the GIF `duration` in milliseconds
# (as the polar-sequence cell of this notebook assumes), so 1000 means one second per frame.
imageio.mimsave('sequence.gif', frames, duration=1000)
print("GIF saved as sequence.gif")


GIF saved as sequence.gif


In [15]:
data_test[1][0]

tensor([1.0600, 8.0900, 4.0900, 0.0000])

In [28]:
data_test[1].shape

torch.Size([20, 4])

In [32]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import griddata
import imageio

# Animated GIF: polar view of each map in the sequence next to the running time series of
# the four condition values.

cond_names = ['Bx', 'By', 'Bz', 'Vwind']

# Copy the condition rows into a fresh NumPy array and flip the sign of the last column on
# that copy. Flipping data_test[1] in place (as before) made the cell non-idempotent: every
# re-run inverted the sign again.
cond_data_all = np.array([d.numpy() if hasattr(d, 'numpy') else np.array(d) for d in data_test[1]])
cond_data_all[:, -1] *= -1

# Time axis for the line plots (integer step index).
n_steps = data_test[0].shape[0]
timesteps = np.arange(n_steps)

# Figure size in inches.
fig_width = 18
fig_height = 6

# Duration of each GIF frame. Recent imageio versions take the GIF duration in milliseconds.
frame_duration_ms = 1000

# --- Geometry shared by all frames (depends only on the map shape, so it is built once) ---
n_lat, n_lon = data_test[0].shape[2], data_test[0].shape[3]
mag_lat = np.linspace(-90, -66, n_lat)
mag_lon = np.linspace(0, 360, n_lon, endpoint=False)
lon_grid, lat_grid = np.meshgrid(mag_lon, mag_lat)

# Polar coordinates of the source samples: radius is the angular distance from the pole.
r = 90 - np.abs(lat_grid.flatten())
theta = np.deg2rad(lon_grid.flatten())
points = np.column_stack((r, theta))

# Regular polar grid the data is interpolated onto.
r_i = np.linspace(r.min(), r.max(), 200)
theta_i = np.linspace(0, 2 * np.pi, 360)
r_grid, theta_grid = np.meshgrid(r_i, theta_i)

frames = []
for t in range(n_steps):
    data = data_test[0][t, 0, :, :].numpy()
    timestamp = data_test[2][t]

    # Linear interpolation of this frame onto the regular polar grid.
    grid_values = griddata(points, data.flatten(), (r_grid, theta_grid), method='linear')

    # Layout: 1 polar plot + 4 line plots in a single row.
    fig = plt.figure(figsize=(fig_width, fig_height))
    ax0 = plt.subplot2grid((1, 5), (0, 0), colspan=1, projection='polar', fig=fig)
    ax0.pcolormesh(theta_grid, r_grid, grid_values, shading='auto', cmap='plasma')
    ax0.set_theta_zero_location("S")
    ax0.set_theta_direction(-1)
    ax0.axis('off')
    ax0.set_title("Main Data", fontsize=12)

    for i in range(4):
        ax = plt.subplot2grid((1, 5), (0, i + 1), fig=fig)
        ax.set_title(cond_names[i], fontsize=12)
        # History up to and including the current step, with the current point highlighted.
        ax.plot(timesteps[:t + 1], cond_data_all[:t + 1, i], color='tab:blue', marker='o')
        ax.scatter(timesteps[t], cond_data_all[t, i], color='red', zorder=5)
        # Fixed axis limits so the panels do not rescale from frame to frame.
        ax.set_xlim(timesteps[0], timesteps[-1])
        y_min = np.min(cond_data_all[:, i])
        y_max = np.max(cond_data_all[:, i])
        y_pad = 0.1 * (y_max - y_min) if y_max > y_min else 1
        ax.set_ylim(y_min - y_pad, y_max + y_pad)
        ax.grid(True, alpha=0.3)
        ax.tick_params(axis='both', which='both', labelsize=10)
        ax.set_xlabel("Timestep", fontsize=10)
        ax.set_ylabel(cond_names[i], fontsize=10)
        ax.xaxis.set_tick_params(rotation=45)

    # Leave room at the top for the figure-level title.
    fig.subplots_adjust(left=0.03, right=0.99, top=0.90, bottom=0.15, wspace=0.35)
    fig.suptitle(f"Time step {t} | Time: {timestamp}", y=0.98, fontsize=16)

    # Convert the rendered canvas to an (H, W, 3) uint8 array.
    # buffer_rgba() replaces tostring_rgb(), which is deprecated and removed in recent
    # Matplotlib releases.
    fig.canvas.draw()
    frame = np.asarray(fig.canvas.buffer_rgba())[..., :3].copy()
    frames.append(frame)
    plt.close(fig)

# Write the GIF; loop=0 repeats the animation indefinitely.
imageio.mimsave('polar_sequence.gif', frames, duration=frame_duration_ms, loop=0)
# Report only what was actually applied. The previous message formatted `frame_duration`
# and `last_frame_pause`, which are not defined in this notebook and raise NameError on a
# fresh kernel; no longer last-frame pause is applied.
print("GIF saved as polar_sequence.gif (duration per frame set to {:.1f}s)".format(frame_duration_ms / 1000))


  c1 = ax0.pcolormesh(theta_grid, r_grid, grid_values, shading='auto', cmap='plasma')


GIF saved as polar_sequence.gif (duration per frame set to 1.0s, last frame pause 2.0s)
