In [None]:
import numpy as np
from scipy.spatial import Delaunay
from sklearn.decomposition import PCA
from itertools import combinations
from tqdm import tqdm
import pandas as pd
import os
import pickle
from astropy.table import Table
from collections import Counter
from scipy.stats import gaussian_kde


import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.tri as mtri
import matplotlib.cm as cm
# Figure defaults: high-resolution rendering and LaTeX-typeset text.
matplotlib.rcParams['figure.dpi'] = 360
matplotlib.rcParams['text.usetex'] = True

# Put the TeX binary directory in front of PATH so the LaTeX text renderer can
# be found. It is prepended only when it is not already listed, so re-running
# this cell does not keep growing PATH.
_TEXBIN_DIR = '/Library/TeX/texbin'
if _TEXBIN_DIR not in os.environ['PATH'].split(os.pathsep):
    os.environ['PATH'] = _TEXBIN_DIR + os.pathsep + os.environ['PATH']

In [None]:
def compute_r(df):
    """Compute the data/random neighbour contrast ``r`` for every point.

    A Delaunay triangulation is built over the ``X``, ``Y``, ``Z`` columns.
    Two points are neighbours when they share at least one simplex, and

        r = (n_data - n_rand) / (n_data + n_rand)

    where ``n_data`` / ``n_rand`` count the neighbours whose ``RAN`` flag is
    false / true respectively.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``X``, ``Y``, ``Z`` and ``RAN``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an extra float column ``r``; ``df`` itself is
        left untouched.

    Raises
    ------
    ValueError
        If some point has no neighbours in the triangulation.
    """
    coords = df[['X', 'Y', 'Z']].to_numpy()
    # Coerce to bool before negating: on an integer or object column `~` is a
    # bitwise NOT (0 -> -1, 1 -> -2), which would silently corrupt the counts.
    is_data = ~df['RAN'].to_numpy(dtype=bool)
    n_points = len(coords)

    tri = Delaunay(coords)

    # Directed edge list (both directions) from every vertex pair of every
    # simplex, built column-wise instead of looping over simplices in Python.
    simplices = tri.simplices.astype(np.int64)
    src_parts, dst_parts = [], []
    for a, b in combinations(range(simplices.shape[1]), 2):
        src_parts += [simplices[:, a], simplices[:, b]]
        dst_parts += [simplices[:, b], simplices[:, a]]
    src = np.concatenate(src_parts)
    dst = np.concatenate(dst_parts)

    # An edge shared by several simplices must be counted once: encode each
    # (src, dst) pair as a single integer, deduplicate, then decode.
    edge_keys = np.unique(src * n_points + dst)
    src, dst = np.divmod(edge_keys, n_points)

    n_nbrs = np.bincount(src, minlength=n_points)
    n_data = np.bincount(src, weights=is_data[dst].astype(float), minlength=n_points)

    isolated = np.flatnonzero(n_nbrs == 0)
    if isolated.size:
        raise ValueError(f'No neighbors for point {isolated[0]} in the triangulation.')

    n_rand = n_nbrs - n_data
    r = (n_data - n_rand) / n_nbrs

    out = df.copy()
    out['r'] = r
    return out

In [None]:
def classify_r(df, thresholds=(-0.9, 0.0, 0.9)):
    """Label every row with a structure type according to its ``r`` value.

    With ``thresholds = (void_max, sheet_max, filament_max)`` the bins are:

    * ``-1 <= r <= void_max``             -> ``'void'``
    * ``void_max < r <= sheet_max``       -> ``'sheet'``
    * ``sheet_max < r <= filament_max``   -> ``'filament'``
    * ``filament_max < r <= 1``           -> ``'knot'``

    Any other value (outside ``[-1, 1]`` or NaN) is labelled ``'error'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric column ``r``.
    thresholds : sequence of three floats, optional
        Upper edges of the void, sheet and filament bins. The default
        ``(-0.9, 0.0, 0.9)`` reproduces the original hard-coded cuts.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the column ``TYPE`` added (or overwritten).

    Raises
    ------
    ValueError
        If the thresholds are not non-decreasing or fall outside ``[-1, 1]``.
    """
    void_max, sheet_max, filament_max = thresholds
    if not (-1.0 <= void_max <= sheet_max <= filament_max <= 1.0):
        raise ValueError("thresholds must be non-decreasing and lie within [-1, 1]")

    r = df['r'].values
    conds = [(r >= -1.0) & (r <= void_max),
             (r > void_max) & (r <= sheet_max),
             (r > sheet_max) & (r <= filament_max),
             (r > filament_max) & (r <= 1.0)]
    choices = ['void', 'sheet', 'filament', 'knot']
    out = df.copy()
    out['TYPE'] = np.select(conds, choices, default='error')
    return out

In [None]:
def reclassify_r(df):
    """Assign a structure type from ``r`` using a void cut at -0.5.

    Bins (upper edge inclusive): ``[-1, -0.5]`` void, ``(-0.5, 0]`` sheet,
    ``(0, 0.9]`` filament, ``(0.9, 1]`` knot. Values outside ``[-1, 1]`` and
    NaN are labelled ``'error'``.

    Returns a copy of ``df`` with the ``TYPE`` column set; the input frame is
    not modified.
    """
    labels = np.array(['void', 'sheet', 'filament', 'knot'])
    upper_edges = np.array([-0.5, 0.0, 0.9])

    r_values = df['r'].values
    # side='left' makes each upper edge belong to the lower bin (r == -0.5 is a void).
    bin_idx = np.searchsorted(upper_edges, r_values, side='left')
    # NaN fails both comparisons, so it ends up in the 'error' bucket as well.
    in_range = (r_values >= -1.0) & (r_values <= 1.0)

    result = df.copy()
    result['TYPE'] = np.where(in_range, labels[bin_idx], 'error')
    return result

### Data

In [None]:
# Read the two QSO clustering catalogues (ECSV files) and convert each astropy
# Table into a pandas DataFrame.
data_ngc1 = Table.read("create_files/QSO_NGC1_clustering_data.ecsv", format="ascii.ecsv").to_pandas()
data_ngc2 = Table.read("create_files/QSO_NGC2_clustering_data.ecsv", format="ascii.ecsv").to_pandas()

In [None]:
# Mark every catalogue row as real data. compute_r reads the RAN column to
# tell data neighbours (False) from random neighbours (True).
data_ngc1['RAN'] = False
data_ngc2['RAN'] = False

In [None]:
# Disabled pipeline: the code below is wrapped in a string literal, so it is
# NOT executed. When enabled it combines each data catalogue with one random
# catalogue, runs compute_r + classify_r and writes the result to parquet,
# skipping outputs that already exist.
# NOTE(review): outputs are written as "*_typed_{j}_TEST.parquet", while later
# cells read "*_typed_{j}.parquet" (no _TEST suffix) — confirm which name is
# the intended one.
"""""
output_dir = "results_ASTRA_100"
os.makedirs(output_dir, exist_ok=True)

for zone_name, data_df in zip(['NGC_1', 'NGC_2'], [data_ngc1, data_ngc2]):
    for j in tqdm(range(100), desc=f"Proccesing {zone_name}"):
        rand_file = f"data_100_random/{zone_name}_random_{j}.ecsv"
        output_file = os.path.join(output_dir, f"{zone_name}_typed_{j}_TEST.parquet")

        if os.path.exists(output_file):
            continue  # ya procesado

        rand_df = Table.read(rand_file, format="ascii.ecsv").to_pandas()
        rand_df['RAN'] = True

        df = pd.concat([data_df, rand_df], ignore_index=True)

        df_r = compute_r(df)
        df_typed = classify_r(df_r)

        df_typed.to_parquet(output_file, index=False)
"""""

In [None]:
def plot_type_xy_by_source(type_name, rand_id, zone,
                           base_path="results_ASTRA_100", point_size=0.1,
                           show_data=False):
    """Scatter the X-Y positions of the points of one structure type.

    Reads ``{base_path}/{zone}_typed_{rand_id}_TEST.parquet`` and plots the
    random points whose ``TYPE`` equals ``type_name``. The data points of the
    same type can optionally be drawn on top.

    Parameters
    ----------
    type_name : str
        Value of the ``TYPE`` column to select.
    rand_id : int
        Index of the random realization (part of the file name).
    zone : str
        Zone label used in the file name and in the title.
    base_path : str, optional
        Directory holding the typed parquet files.
    point_size : float, optional
        Marker size passed to ``scatter``.
    show_data : bool, optional
        If true, also draw the data points (``RAN`` false) in red. The default
        ``False`` draws random points only.
    """
    # NOTE(review): this reads the "_TEST" file variant, whereas the other
    # cells read "{zone}_typed_{j}.parquet" — confirm the intended file name.
    filename = f"{base_path}/{zone}_typed_{rand_id}_TEST.parquet"
    df = pd.read_parquet(filename)

    df_type = df[df['TYPE'] == type_name]
    is_random = df_type['RAN'].astype(bool)
    df_rand = df_type[is_random]

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(df_rand['X'], df_rand['Y'], s=point_size, c='blue', alpha=0.5,
               label=f'Random (file {rand_id})')
    if show_data:
        # Only build the data subset when it is actually drawn.
        df_data = df_type[~is_random]
        ax.scatter(df_data['X'], df_data['Y'], s=point_size, c='red', alpha=0.7,
                   label='Data')

    ax.set_title(f"{type_name} - {zone}", fontsize=13)
    ax.set_xlabel("X [Mpc]", fontsize=12)
    ax.set_ylabel("Y [Mpc]", fontsize=12)
    ax.legend(fontsize=7)
    fig.tight_layout()
    ax.set_aspect('equal', adjustable='box')
    plt.show()

In [None]:
# Example: X-Y map of the points typed 'void' in zone NGC_1, random file 78.
plot_type_xy_by_source('void', 78, zone='NGC_1')

# Type classification

In [None]:
# Structure labels (same order as the table columns built further down) and
# the zone names used in the parquet file names.
structure_types = ['void', 'sheet', 'filament', 'knot']
zones = ['NGC_1', 'NGC_2']

In [None]:
# fractions[zone][type] -> list of per-file fractions (one entry appended per
# realization file), kept separately for data points and random points.
data_fractions = {zone: {t: [] for t in structure_types} for zone in zones}
rand_fractions = {zone: {t: [] for t in structure_types} for zone in zones}

In [None]:
# Start from empty accumulators every time this cell runs. The loop appends,
# so without this reset a second execution of the cell would store every
# realization twice and bias the means/standard deviations computed later.
data_fractions = {zone: {t: [] for t in structure_types} for zone in zones}
rand_fractions = {zone: {t: [] for t in structure_types} for zone in zones}

for zone in zones:
    for j in range(100):
        filepath = f"results_ASTRA_100/{zone}_typed_{j}.parquet"
        df = pd.read_parquet(filepath)

        # Fractions are computed separately for data (RAN False) and random
        # (RAN True) points of the same file.
        for source, container in [(False, data_fractions), (True, rand_fractions)]:
            df_sub = df[df['RAN'] == source]
            total = len(df_sub)

            for t in structure_types:
                count = np.sum(df_sub['TYPE'] == t)
                # An empty subset contributes a fraction of 0 instead of NaN.
                frac = count / total if total > 0 else 0.0
                container[zone][t].append(frac)

In [None]:
# One formatted table per zone ("mean% ± std%" for each structure type), plus
# the raw per-zone means collected for the overall summary.
type_columns = ['Voids', 'Sheets', 'Filaments', 'Knots']

dfs_by_zone = []
mean_data_by_type = {t: [] for t in structure_types}
mean_rand_by_type = {t: [] for t in structure_types}

for zone in zones:
    sources = ((f"{zone} data", data_fractions[zone], mean_data_by_type),
               (f"{zone} rand", rand_fractions[zone], mean_rand_by_type))

    zone_labels = []
    zone_rows = []
    for label, zone_fracs, store in sources:
        cells = []
        for t in structure_types:
            fracs = zone_fracs[t]
            if len(fracs) != 0:
                # Sample standard deviation across realizations (ddof=1).
                mean_frac, std_frac = np.mean(fracs), np.std(fracs, ddof=1)
            else:
                mean_frac, std_frac = 0.0, 0.0

            store[t].append(mean_frac)
            cells.append(f"{mean_frac*100:.2f}% ± {std_frac*100:.2f}%")

        zone_labels.append(label)
        zone_rows.append(cells)

    dfs_by_zone.append(pd.DataFrame(zone_rows, columns=type_columns, index=zone_labels))



In [None]:
# Display the list of per-zone fraction tables.
dfs_by_zone

In [None]:
# Overall summary: for each structure type, the average of the per-zone means
# and their sample standard deviation (one value per zone goes in).
summary_sources = {"Mean data": mean_data_by_type,
                   "Mean rand": mean_rand_by_type}

summary_rows = [
    [f"{np.mean(store[t])*100:.2f}% ± {np.std(store[t], ddof=1)*100:.5f}%"
     for t in structure_types]
    for store in summary_sources.values()
]

df_summary = pd.DataFrame(summary_rows,
                          columns=['Voids', 'Sheets', 'Filaments', 'Knots'],
                          index=list(summary_sources))

In [None]:
# Display the zone-averaged summary table.
df_summary

# Entropy
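$$
H = - \frac{1}{\log_2 4} \sum_{w=1}^4 p_w \log_2(p_w)
$$

where $p_w$ is the fraction of the random realizations in which a given point is classified as structure type $w$ (void, sheet, filament, knot). Terms with $p_w = 0$ contribute zero, and the factor $1/\log_2 4$ normalizes $H$ to the range $[0, 1]$.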

In [None]:
# Map each structure label to its column index in the per-point count matrix.
type_to_index = {t: i for i, t in enumerate(structure_types)}
type_to_index

In [None]:
# Directory containing the "{zone}_typed_{j}.parquet" files read below.
base_path = "results_ASTRA_100"

In [None]:
# Normalized Shannon entropy of the classification of every real point across
# the 100 realizations: 0 when a point always gets the same type, larger when
# its type changes from one realization to another.
entropy_per_zone = []
n_types = len(type_to_index)

for zone in tqdm(zones, desc="Regions"):

    # Reference list of real (non-random) points, taken from realization 0.
    df_real = pd.read_parquet(f"{base_path}/{zone}_typed_0.parquet")
    df_real = df_real[df_real['RAN'] == False].copy().reset_index(drop=True)
    n_points = len(df_real)
    ref_ids = df_real['TARGETID'].to_numpy()

    # rows = real points, columns = structure types
    counts = np.zeros((n_points, n_types), dtype=int)

    for j in tqdm(range(100), desc=f"{zone} - Files"):
        df_j = pd.read_parquet(f"{base_path}/{zone}_typed_{j}.parquet")
        df_j_real = df_j[df_j['RAN'] == False].reset_index(drop=True)

        # Counts are accumulated by row position, so every file must list the
        # same real points in the same order; otherwise the types of different
        # objects would be mixed silently.
        if not np.array_equal(df_j_real['TARGETID'].to_numpy(), ref_ids):
            raise ValueError(
                f"Real points in {zone}_typed_{j}.parquet are not aligned with {zone}_typed_0.parquet"
            )

        types_j = df_j_real['TYPE'].to_numpy()
        # One vectorized comparison per type; labels that are not in
        # type_to_index (e.g. 'error') are simply not counted.
        for t, t_idx in type_to_index.items():
            counts[:, t_idx] += (types_j == t)

    # p_w = fraction of realizations in which the point received type w.
    totals = counts.sum(axis=1, keepdims=True)
    p_w = np.divide(counts, totals, out=np.zeros(counts.shape, dtype=float), where=totals > 0)

    # p * log2(p), taking 0 * log2(0) as 0.
    plogp = np.zeros_like(p_w)
    np.log2(p_w, out=plogp, where=p_w > 0)
    plogp *= p_w
    entropy = -plogp.sum(axis=1) / np.log2(n_types)

    entropy_df = pd.DataFrame({'TARGETID': ref_ids,
                               'ZONE': zone,
                               'ENTROPY': entropy})
    entropy_per_zone.append(entropy_df)

In [None]:
# Load the per-rosette entropy results. The later plotting cell indexes this
# object with integers 0..19 and reads an 'ENTROPY' column from each element.
# NOTE(review): pickle.load can execute arbitrary code — only load this file
# if it was produced by a trusted run of this project.
with open('entropy_per_rosetta.pkl', 'rb') as f:
    entropy_per_rosetta = pickle.load(f)

In [None]:
plt.figure(figsize=(10, 6))

# Every curve uses the same binning so the PDFs are directly comparable.
hist_kwargs = dict(bins=16, range=(0, 0.6), density=True)

# Rosetta: thin grey dashed curves; only the first one carries a legend entry.
for i in range(20):
    density, edges = np.histogram(entropy_per_rosetta[i]['ENTROPY'], **hist_kwargs)
    centers = (edges[:-1] + edges[1:]) / 2
    plt.plot(centers, density,
             label='Rosettas' if i == 0 else None,
             color='grey', linewidth=0.5, linestyle='--')

# Zones: one solid coloured curve per zone.
zone_colors = ['blue', 'red']

for k, zone_df in enumerate(entropy_per_zone):
    zone_name = zone_df['ZONE'].iloc[0]
    density, edges = np.histogram(zone_df['ENTROPY'], **hist_kwargs)
    centers = (edges[:-1] + edges[1:]) / 2
    plt.plot(centers, density, label=f'{zone_name}',
             color=zone_colors[k], linewidth=1.5)

plt.xlabel("Normalized Shannon Entropy", fontsize=14)
plt.ylabel("PDF", fontsize=14)
plt.grid(True)
plt.legend(loc='center left', fontsize=8, bbox_to_anchor=(1, 0.5))
plt.title("QSO", fontsize=15)
plt.tight_layout()
plt.show()

# Groups

In [None]:
from scipy.spatial import cKDTree

In [None]:
def identify_fof_groups(zone, rand_id, type_name, source='data', linking_length=5.0, base_path="results_ASTRA_100"):
    """Group the points of one structure type with a friends-of-friends search.

    Reads ``{base_path}/{zone}_typed_{rand_id}.parquet``, keeps the rows whose
    ``TYPE`` equals ``type_name`` (optionally restricted to data or random
    points) and links every pair of points closer than ``linking_length`` in
    ``(X, Y, Z)``. Connected sets of linked points form one group.

    Parameters
    ----------
    zone : str
        Zone label used in the file name.
    rand_id : int
        Index of the random realization (part of the file name).
    type_name : str
        Value of the ``TYPE`` column to select.
    source : {'data', 'random', 'both'}, optional
        Keep only rows with ``RAN`` false, only rows with ``RAN`` true, or all.
    linking_length : float, optional
        Maximum separation for two points to be linked, in the units of the
        coordinate columns.
    base_path : str, optional
        Directory holding the typed parquet files.

    Returns
    -------
    pandas.DataFrame
        The selected rows (index reset) with an integer ``group_id`` column of
        consecutive labels starting at 0. When no row matches, the frame is
        empty but still carries the ``group_id`` column.

    Raises
    ------
    ValueError
        If ``source`` is not one of the accepted values.
    """
    # Reject a bad `source` before touching the disk.
    if source not in ('data', 'random', 'both'):
        raise ValueError("source debe ser 'data', 'random' o 'both'")

    # 1. Load the file
    filename = f"{base_path}/{zone}_typed_{rand_id}.parquet"
    df = pd.read_parquet(filename)

    # 2. Filter by structure type
    df = df[df['TYPE'] == type_name]

    # 3. Filter by source ('both' keeps data and random rows alike)
    if source == 'data':
        df = df[df['RAN'] == False]
    elif source == 'random':
        df = df[df['RAN'] == True]

    df = df.reset_index(drop=True)

    if len(df) == 0:
        print("No hay puntos con ese criterio.")
        # Keep the output schema identical to the non-empty case so callers
        # can rely on the 'group_id' column being present.
        df['group_id'] = np.empty(0, dtype=np.intp)
        return df

    # 4. Friends-of-friends: all pairs closer than the linking length
    positions = df[['X', 'Y', 'Z']].values
    tree = cKDTree(positions)
    pairs = tree.query_pairs(r=linking_length)

    # 5. Build the groups (connected components) with a union-find structure
    parent = list(range(len(positions)))

    def find(i):
        # Path halving: point each visited node at its grandparent.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    def union(i, j):
        ri, rj = find(i), find(j)
        if ri != rj:
            parent[ri] = rj

    for i, j in pairs:
        union(i, j)

    # Relabel the root indices as consecutive group ids 0..n_groups-1
    group_ids = np.array([find(i) for i in range(len(positions))])
    _, group_ids = np.unique(group_ids, return_inverse=True)
    df['group_id'] = group_ids

    return df


In [None]:
# Friends-of-friends groups of the data points typed 'filament' in NGC_1
# (realization 7), linked with a length of 50 in coordinate units.
df_filament_groups = identify_fof_groups('NGC_1', 7, 'filament', source='data', linking_length=50)

In [None]:
def plot_top_n_groups(df, n=3, point_size=0.01):
    """Scatter the X-Y positions of the ``n`` most populated groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``group_id``, ``X`` and ``Y``.
    n : int, optional
        Number of groups to draw, largest first.
    point_size : float, optional
        Marker size passed to ``scatter``.
    """
    # Ids of the N groups with the most points, largest first
    top_ids = df['group_id'].value_counts().head(n).index.tolist()

    # Index tab20 with integers so consecutive groups get different colours
    # and the palette simply cycles when more groups are drawn than it holds.
    colors = plt.cm.tab20(np.arange(len(top_ids)) % plt.cm.tab20.N)

    plt.figure(figsize=(6, 6))
    for group_id, color in zip(top_ids, colors):
        subset = df[df['group_id'] == group_id]
        # `color=` with a single RGBA tuple applies one colour to the whole group.
        plt.scatter(subset['X'], subset['Y'], s=point_size, color=tuple(color),
                    label=f'Group {group_id}')

    plt.xlabel("X [Mpc]")
    plt.ylabel("Y [Mpc]")
    #plt.legend(fontsize=8)
    #plt.grid(True)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.tight_layout()
    plt.show()

In [None]:
# Number of points that entered the friends-of-friends grouping.
len(df_filament_groups)

In [None]:
# Draw the 500 most populated filament groups. The commented-out variant
# leaves out group 9492 before plotting.
#plot_top_n_groups(df_filament_groups[df_filament_groups['group_id'] != 9492], n=100)
plot_top_n_groups(df_filament_groups, n=500)