In [None]:
import numpy as np
from scipy.spatial import Delaunay
from itertools import combinations
import pandas as pd
import os

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.tri as mtri
import matplotlib.cm as cm
# Render high-resolution figures and typeset all figure text with LaTeX.
matplotlib.rcParams['figure.dpi'] = 360
matplotlib.rcParams['text.usetex'] = True

# Make the LaTeX binaries reachable for matplotlib. The directory is prepended
# only when it is not already listed, so re-running this cell leaves PATH unchanged.
# NOTE(review): this looks like a MacTeX install location - adjust on other systems.
TEX_BIN_DIR = '/Library/TeX/texbin'
_current_path = os.environ.get('PATH', '')
if TEX_BIN_DIR not in _current_path.split(os.pathsep):
    os.environ['PATH'] = (
        TEX_BIN_DIR + os.pathsep + _current_path if _current_path else TEX_BIN_DIR
    )

### Data

In [None]:
# Prefix (including the trailing slash) prepended to every catalogue file name
# built from it in this notebook.
# NOTE(review): despite the name this looks like a relative local directory, not a URL - confirm.
base_url = 'create_files/'

In [None]:
# One data catalogue and one random catalogue per rosette index (0 to 19).
rosette_indices = range(20)
data_filenames = [f'{base_url}QSO_{idx}_clustering_data.ecsv' for idx in rosette_indices]
rand_filenames = [f'{base_url}QSO_{idx}_clustering_random.ecsv' for idx in rosette_indices]

### Delaunay triangulation

In [None]:
# Build one DataFrame per rosette holding its data rows followed by its random rows.
df_rosettas = []

for rosette_id, (data_file, rand_file) in enumerate(zip(data_filenames, rand_filenames)):
    catalogues = []
    for path, is_random in ((data_file, False), (rand_file, True)):
        # Lines starting with '#' are skipped; columns are whitespace separated.
        catalogue = pd.read_csv(path, comment='#', sep=r'\s+', engine='python')
        catalogue['RAN'] = is_random  # flag telling random points from data points
        catalogues.append(catalogue)

    df = pd.concat(catalogues, ignore_index=True)
    df['ROSETTE_ID'] = rosette_id
    df_rosettas.append(df)

In [None]:
# For every rosette: a 3-D Delaunay triangulation of (X, Y, Z) and a matplotlib
# 2-D triangulation of (X, Y), both built from all rows (data and random).
tri_3d_list, tri_2d_list = [], []

for df in df_rosettas:
    tri_3d_list.append(Delaunay(df[['X', 'Y', 'Z']].values))
    tri_2d_list.append(mtri.Triangulation(df['X'], df['Y']))

In [None]:
fig, axes = plt.subplots(5, 4, figsize=(25, 25))
axes = axes.ravel()

# One panel per rosette: the (X, Y) triangulation with data and random points on top.
for i, (df, triang, ax) in enumerate(zip(df_rosettas, tri_2d_list, axes)):
    random_rows = df['RAN']
    data_real = df.loc[~random_rows]
    data_rand = df.loc[random_rows]

    # Mesh at the bottom, data points above it, random points on top (see zorder).
    ax.triplot(triang, linewidth=0.2, color='k', zorder=1)
    for subset, size, colour, tag, layer in (
        (data_real, 4, 'r', 'Data', 3),
        (data_rand, 0.5, 'b', 'Random', 5),
    ):
        ax.scatter(subset['X'], subset['Y'], s=size, c=colour, label=tag, zorder=layer)

    ax.set_title(f'Rosette {i}', fontsize=10)
    ax.set(xlabel='X [Mpc]', ylabel='Y [Mpc]', box_aspect=1)

    # Optional legend / zoom, currently disabled:
    #ax.legend(fontsize=6, loc='upper right')
    #ax.set_xlim(-5800, -5400)
    #ax.set_ylim(-80, -0)

fig.tight_layout()
plt.show()


In [None]:
from sklearn.decomposition import PCA

In [None]:
def rotate_pca_3d(df):
    """Add principal-component coordinates of the (X, Y, Z) positions to a catalogue.

    A three-component PCA is fitted on the rows with ``RAN == False`` only, so
    the random points do not influence the fitted axes. The fitted transform
    is then applied to every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``X``, ``Y``, ``Z`` and a boolean ``RAN``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra columns ``PC1``, ``PC2`` and ``PC3``.
    """
    position_cols = ['X', 'Y', 'Z']
    real_rows = ~df['RAN']

    # Fit on the real points only, then transform all points.
    pca = PCA(n_components=3).fit(df.loc[real_rows, position_cols].values)

    rotated = df.copy()
    rotated[['PC1', 'PC2', 'PC3']] = pca.transform(df[position_cols].values)
    # Disabled diagnostic (angle between the first component and the Z axis):
    #rotated['Angle rotation [°]'] = np.degrees(np.arccos(np.clip(pca.components_[0] @ [0, 0, 1], -1, 1)))

    return rotated

In [None]:
df_rosettas_rotated = [rotate_pca_3d(df) for df in df_rosettas]

# Rebuild the 2-D triangulations in the (PC1, PC2) plane. Every row of each
# frame is triangulated, i.e. data and random points together, and the list
# now holds scipy Delaunay objects.
tri_2d_list = [
    Delaunay(rotated[['PC1', 'PC2']].values) for rotated in df_rosettas_rotated
]

In [None]:
fig, axes = plt.subplots(5, 4, figsize=(25, 25))
axes = axes.ravel()

# One panel per rosette: the (PC1, PC2) triangulation with data and random points on top.
for i, (df, triang, ax) in enumerate(zip(df_rosettas_rotated, tri_2d_list, axes)):
    random_rows = df['RAN']
    data_real = df.loc[~random_rows]
    data_rand = df.loc[random_rows]

    # The Delaunay object exposes its vertices and simplices; triplot takes them explicitly.
    vertices = triang.points
    ax.triplot(vertices[:, 0], vertices[:, 1], triang.simplices,
               linewidth=0.2, color='k', zorder=1)

    for subset, size, colour, tag, layer in (
        (data_real, 4, 'r', 'Data', 3),
        (data_rand, 0.5, 'b', 'Random', 5),
    ):
        ax.scatter(subset['PC1'], subset['PC2'], s=size, c=colour, label=tag, zorder=layer)

    ax.set_title(f'Rosette {i}', fontsize=10)
    ax.set(xlabel='PC1 [Mpc]', ylabel='PC2 [Mpc]', box_aspect=1)
    ax.legend(fontsize=6, loc='upper right')

fig.tight_layout()
plt.show()

### Get $\space r$

In [None]:
def compute_r(df):
    """Compute the neighbour-type ratio ``r`` for every point of a catalogue.

    A 3-D Delaunay triangulation is built from the ``X``, ``Y``, ``Z`` columns
    of all rows. For each point, with ``n_data`` / ``n_rand`` the number of its
    Delaunay neighbours whose ``RAN`` flag is False / True::

        r = (n_data - n_rand) / (n_data + n_rand)

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide numeric ``X``, ``Y``, ``Z`` columns and a boolean-like
        ``RAN`` column (True for random points).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an additional float column ``r``.

    Raises
    ------
    ValueError
        If a point has no neighbour in the triangulation.
    """
    coords = df[['X', 'Y', 'Z']].values
    # astype(bool) keeps ``~`` a logical NOT even when RAN is stored as 0/1 integers.
    is_data = ~df['RAN'].values.astype(bool)

    tri = Delaunay(coords)

    # CSR-style adjacency provided by scipy: the neighbours of vertex k are
    # indices[indptr[k]:indptr[k + 1]]. This replaces a Python-level loop
    # over every simplex.
    indptr, indices = tri.vertex_neighbor_vertices
    n_total = np.diff(indptr)

    isolated = np.flatnonzero(n_total == 0)
    if isolated.size:
        raise ValueError(f'No neighbors for point {int(isolated[0])} in the triangulation.')

    # Running count of data-type entries along ``indices``; differencing it at
    # the row boundaries yields the number of data neighbours of each vertex.
    running = np.concatenate(([0], np.cumsum(is_data[indices], dtype=np.int64)))
    n_data = running[indptr[1:]] - running[indptr[:-1]]
    n_rand = n_total - n_data

    out = df.copy()
    out['r'] = (n_data - n_rand) / n_total
    return out

In [None]:
# Total number of rows (data + random) summed over all rotated rosettes.
s = sum(len(rosette) for rosette in df_rosettas_rotated)
print(s)

In [None]:
# Attach the neighbour-type ratio ``r`` to every rosette (one triangulation per rosette).
df_r = [compute_r(rosette) for rosette in df_rosettas_rotated]

### CDF of $r$

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

In [None]:
def compute_cdf(values):
    """Return the sorted sample together with its empirical CDF.

    The i-th smallest of ``n`` values (1-based) is assigned the cumulative
    probability ``i / n``, so the largest value maps to 1 and a one-element
    sample maps to ``[1.0]`` rather than ``[0.0]``.

    Parameters
    ----------
    values : array-like
        One-dimensional sample.

    Returns
    -------
    sorted_vals : numpy.ndarray
        The values in ascending order.
    cdf : numpy.ndarray
        Cumulative probabilities, same length as ``sorted_vals``.
    """
    sorted_vals = np.sort(values)
    n = len(sorted_vals)
    # max(n, 1) only guards the empty sample, for which the result is empty anyway.
    cdf = np.arange(1, n + 1) / max(n, 1)
    return sorted_vals, cdf

In [None]:
n_rosettas = len(df_r)
# plt.get_cmap is used because matplotlib.cm.get_cmap has been deprecated and
# later removed from Matplotlib.
colors = plt.get_cmap('tab20').colors

fig, ax = plt.subplots(figsize=(10, 7))

# One colour per rosette: solid line for the data points, dotted for the randoms.
for i, rosette in enumerate(df_r):
    is_random = rosette['RAN']
    # Wrap around the palette so more rosettes than colours cannot raise IndexError.
    color = colors[i % len(colors)]

    for sample, linestyle, tag in (
        (rosette.loc[~is_random, 'r'], '-', 'Data'),
        (rosette.loc[is_random, 'r'], 'dotted', 'Random'),
    ):
        ecdf = ECDF(sample.values)
        ax.plot(ecdf.x, ecdf.y, color=color, linestyle=linestyle,
                label=f'Rosetta {i} {tag}')

ax.set_xlabel('r')
ax.set_ylabel('CDF')
ax.set_title('QSO')
ax.grid(True)
ax.legend(ncol=2, fontsize='small', loc='upper left')
fig.tight_layout()
plt.show()

### Classify

In [None]:
def classify_r(df):
    """Label every row with a structure type derived from its ``r`` value.

    Intervals (closed on the right; the first one is closed on both sides):

    * ``[-1.0, -0.9]`` -> ``'void'``
    * ``(-0.9, 0.0]``  -> ``'sheet'``
    * ``(0.0, 0.9]``   -> ``'filament'``
    * ``(0.9, 1.0]``   -> ``'knot'``

    Any value matching none of the intervals (outside [-1, 1] or NaN) is
    labelled ``'error'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a numeric column ``r``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an additional column ``TYPE``.
    """
    ratio = df['r'].values

    # Insertion order matters: np.select picks the first matching condition.
    masks_by_type = {
        'void': (ratio >= -1.0) & (ratio <= -0.9),
        'sheet': (ratio > -0.9) & (ratio <= 0.0),
        'filament': (ratio > 0.0) & (ratio <= 0.9),
        'knot': (ratio > 0.9) & (ratio <= 1.0),
    }

    labelled = df.copy()
    labelled['TYPE'] = np.select(
        list(masks_by_type.values()), list(masks_by_type.keys()), default='error'
    )
    return labelled

In [None]:
# Add the structure-type label to every rosette.
df_typed = [classify_r(rosette) for rosette in df_r]

In [None]:
structure_types = ['void', 'sheet', 'filament', 'knot']


def _format_fraction(count, total):
    """Format ``count / total`` as a percentage with its binomial standard error.

    The error is ``sqrt(p * (1 - p) / total)`` with ``p = count / total``.
    A zero count is reported as ``0.00% ± 0.00%``; this also covers an empty
    subset (``total == 0``) without dividing by zero.
    """
    if count == 0:
        frac = 0.0
        std = 0.0
    else:
        frac = count / total
        std = ((frac * (1 - frac)) / total) ** 0.5
    return f"{frac * 100:.2f}% ± {std * 100:.2f}%"


# One two-row table per rosette: fraction of each structure type among the
# data points and among the random points. Rows labelled 'error' by the
# classification are counted in the totals but have no column of their own.
dfs_by_rosetta = []

for i, df in enumerate(df_typed):
    rosetta_rows = []
    rosetta_labels = []

    for source, label in [(False, f"Rosetta {i} data"), (True, f"Rosetta {i} rand")]:
        df_sub = df[df['RAN'] == source]
        total = len(df_sub)

        rosetta_rows.append(
            [_format_fraction(np.sum(df_sub['TYPE'] == t), total) for t in structure_types]
        )
        rosetta_labels.append(label)

    rosetta_df = pd.DataFrame(rosetta_rows, columns=['Voids', 'Sheets', 'Filaments', 'Knots'], index=rosetta_labels)
    dfs_by_rosetta.append(rosetta_df)

for i, rosetta_df in enumerate(dfs_by_rosetta):
    print(f"\nRosetta {i}")
    display(rosetta_df)

### Plot

In [None]:
# Face colour used for each structure type in the per-type scatter plots.
type_colors = dict(
    void='black',
    sheet='blue',
    filament='red',
    knot='green',
)

# For data

In [None]:
# One figure per rosette with one panel per structure type, showing only the
# data points (RAN == False) of that type in the (PC1, PC2) plane.
n_types = len(structure_types)

# Iterating over df_typed directly removes the dependence on a counter
# defined in another cell.
for j, df in enumerate(df_typed):

    # Optional slab selection, currently disabled:
    #df = df[df['PC3'].between(0, 25)]

    coords2d = df[['PC1', 'PC2']].values
    is_real = ~df['RAN'].values

    # The grid is sized from structure_types and its axes are used directly;
    # squeeze=False keeps a 2-D array of axes even for a single type.
    fig, axes = plt.subplots(n_types, 1, figsize=(12, 6), squeeze=False)

    for ax, tp in zip(axes[:, 0], structure_types):
        color = type_colors[tp]

        mask_t = (df['TYPE'] == tp).values
        mask_real = mask_t & is_real

        ax.scatter(
            coords2d[mask_real, 0], coords2d[mask_real, 1],
            s=20, facecolors=color, edgecolors='black', linewidths=0.3, alpha=0.4
        )

        ax.set_title(f' {tp.capitalize()}')
        ax.set_xlabel('PC1 [Mpc]')
        ax.set_ylabel('PC2 [Mpc]')

    fig.suptitle(f'Rosetta {j} - Data', fontsize=16)
    # Leave room at the top for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)


# For random

In [None]:
# One figure per rosette with one panel per structure type, showing only the
# random points (RAN == True) of that type in the (PC1, PC2) plane.
n_types = len(structure_types)

# Iterating over df_typed directly removes the dependence on a counter
# defined in another cell.
for j, df in enumerate(df_typed):

    # Optional slab selection, currently disabled:
    #df = df[df['PC3'].between(0, 25)]

    coords2d = df[['PC1', 'PC2']].values
    is_real = ~df['RAN'].values

    # The grid is sized from structure_types and its axes are used directly;
    # squeeze=False keeps a 2-D array of axes even for a single type.
    fig, axes = plt.subplots(n_types, 1, figsize=(12, 6), squeeze=False)

    for ax, tp in zip(axes[:, 0], structure_types):
        color = type_colors[tp]

        mask_t = (df['TYPE'] == tp).values
        mask_rand = mask_t & ~is_real

        ax.scatter(
            coords2d[mask_rand, 0], coords2d[mask_rand, 1],
            s=20, facecolors=color, edgecolors='black', linewidths=0.3, alpha=0.4
        )

        ax.set_title(f' {tp.capitalize()}')
        ax.set_xlabel('PC1 [Mpc]')
        ax.set_ylabel('PC2 [Mpc]')

    fig.suptitle(f'Rosetta {j} - Random', fontsize=16)
    # Leave room at the top for the suptitle.
    fig.tight_layout(rect=[0, 0, 1, 0.96])
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)