In [1]:
from concurrent.futures import ProcessPoolExecutor
import sys
sys.path.append("/home/m/murray/dtolgay/scratch")

import numpy as np 
import pandas as pd 
import os
from scipy.spatial import KDTree
from time import time 
from scipy.interpolate import LinearNDInterpolator, NearestNDInterpolator

from tools import constants

# Global variables
# Tiny positive offset that read_training_data adds to each training property
# before taking log10, so that exact zeros do not turn into -inf.
epsilon = 1e-30



In [6]:
# Run configuration: which galaxy output to process and how many worker processes to use.
galaxy_name = "m12i_res7100_md"
galaxy_type = "zoom_in"
redshift = "0.0"  # kept as a string because it is interpolated into directory names
max_workers = 1


In [11]:
def read_cloudy_gas_particles(cloudy_gas_particles_file_directory):
    """Read ``cloudy_gas_particles.txt`` from a directory and add log10 columns.

    The file is parsed as whitespace-separated text; lines (or line tails)
    starting with ``#`` are ignored. The columns are assigned the fixed names
    listed in ``gas_column_names`` below, in file order.

    Parameters
    ----------
    cloudy_gas_particles_file_directory : str
        Directory that contains ``cloudy_gas_particles.txt``.

    Returns
    -------
    gas_particles_df : pandas.DataFrame
        The raw columns, a helper column ``dummy_radius``
        (``smoothing_length / 2``), and a ``log_<name>`` column holding the
        base-10 logarithm of each property used for interpolation.
    gas_column_names : list of str
        Names of the raw columns as read from the file.
    """
    # Column names, in the order in which they appear in the file.
    gas_column_names = [
        "x",
        "y",
        "z",
        "smoothing_length",
        "mass",
        "metallicity",
        "temperature",
        "vx",
        "vy",
        "vz",
        "hden",
        "radius",
        "sfr",
        "turbulence",
        "density",
        "mu_theoretical",
        "average_sobolev_smoothingLength",
        "index",
        "isrf",
    ]

    gas_particles_file = f"{cloudy_gas_particles_file_directory}/cloudy_gas_particles.txt"

    # sep=r"\s+" is the documented replacement for the deprecated
    # delim_whitespace=True and parses the file the same way.
    gas_particles_df = pd.read_csv(
        gas_particles_file,
        sep=r"\s+",
        comment="#",
        names=gas_column_names,
    )

    gas_particles_df['dummy_radius'] = gas_particles_df['smoothing_length'] / 2 # TODO: Delete

    # isrf of the gas particles can be zero, therefore set them equal to a very small
    # number so that the logarithm below stays finite.
    gas_particles_df.loc[gas_particles_df["isrf"] == 0, "isrf"] = 1e-30

    # Properties whose log10 is stored as an extra "log_<name>" column; the
    # interpolation is done on these logarithmic values.
    columns_to_log = [
        "metallicity",
        "density",
        "turbulence",
        "isrf",
        "hden",
        "radius",
        "smoothing_length",
        "average_sobolev_smoothingLength",
        "dummy_radius", # TODO: Delete
    ]
    for column_name in columns_to_log:
        gas_particles_df[f"log_{column_name}"] = np.log10(gas_particles_df[column_name])

    print(f"{cloudy_gas_particles_file_directory}/cloudy_gas_particles.txt read and dataframe is created!")

    return gas_particles_df, gas_column_names

def read_training_data(base_file_dir, main_directory, file_name, properties_column_names):
    """Load one training table from CSV and convert the target properties to log10.

    Parameters
    ----------
    base_file_dir, main_directory, file_name : str
        Joined as ``{base_file_dir}/{main_directory}/{file_name}`` to locate the CSV.
    properties_column_names : list of str
        Columns whose values are shifted by the module-level ``epsilon`` and then
        replaced by ``log_<name>`` columns.

    Returns
    -------
    train_data_df : pandas.DataFrame
        Rows without NaN, with the original property columns dropped and an
        additional ``log_column_density`` column.
    properties_column_names_with_log : list of str
        The ``log_<name>`` column names, in the order of ``properties_column_names``.
    """
    print("Training data is started to be read.")

    path2TrainingData = f"{base_file_dir}/{main_directory}/{file_name}"
    raw_table = pd.read_csv(path2TrainingData)

    # Shift every requested property by epsilon, then store its log10 next to it.
    properties_column_names_with_log = [f"log_{name}" for name in properties_column_names]
    for name, log_name in zip(properties_column_names, properties_column_names_with_log):
        raw_table[name] = raw_table[name] + epsilon  # Add a very small number
        raw_table[log_name] = np.log10(raw_table[name])

    # Discard every row that contains a NaN, then drop the columns whose log was taken.
    print("Dropping NaN containing lines")
    train_data_df = raw_table.dropna().drop(columns=properties_column_names)

    # NOTE: an explicit NaN/inf double check on train_data_df.values used to live
    # here as commented-out code; it is not executed.

    # Add the column density so that it can be interpolated as well.
    hden_term = 10**train_data_df['log_hden'] / constants.cm2pc**3
    radius_term = 10**train_data_df['log_radius']
    mass_factor = constants.mu_h * constants.proton_mass * constants.kg2Msolar
    train_data_df['log_column_density'] = np.log10(
        hden_term * radius_term * mass_factor
    ) # Msolar / pc^2

    print(f"{path2TrainingData} is read.")

    return train_data_df, properties_column_names_with_log

def split_dataframe(df, max_workers):
    """Split a dataframe into at most ``max_workers`` consecutive row chunks.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows keep their original order and index labels.
    max_workers : int
        Upper bound on the number of chunks. Must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        Consecutive slices that together contain every row exactly once.
        An empty ``df`` yields an empty list.

    Raises
    ------
    ValueError
        If ``max_workers`` is smaller than 1.
    """
    if max_workers < 1:
        raise ValueError(f"max_workers must be a positive integer, got {max_workers}")

    n = len(df)
    # Ceiling division so that every row lands in a chunk. The lower bound of 1
    # keeps range() from failing on a zero step when the frame is empty.
    chunk_size = max(1, -(-n // max_workers))

    # Positional slicing, independent of what the index labels look like.
    return [df.iloc[i : i + chunk_size] for i in range(0, n, chunk_size)]


# Main

In [8]:
start = time()  # Wall-clock start of the run.

# Name of the run directory to read; alternatives are kept for quick switching.
# directory_name = "lichen_voronoi_1e6"
directory_name = "voronoi_1e6"
# directory_name = "voronoi_1e6_improved_wavelength_bin"
# directory_name = "trial1"

print(
    f"------------------------------------------ {galaxy_name} ------------------------------------------"
)

# Directory that holds cloudy_gas_particles.txt for the selected galaxy.
# NOTE: no existence check is done here; a missing file surfaces as an error from pd.read_csv.
cloudy_gas_particles_file_directory = f"/home/m/murray/dtolgay/scratch/post_processing_fire_outputs/skirt/runs_hden_radius/{galaxy_type}/z{redshift}/{galaxy_name}/{directory_name}"
# cloudy_gas_particles_file_directory = f"/home/m/murray/dtolgay/scratch/cloudy_runs/z_3/m12f_res7100_md_test"


# Read gas particles
gas_particles_df, gas_column_names = read_cloudy_gas_particles(cloudy_gas_particles_file_directory)

# Debug subset: only the first 10 particles are processed below.
# TODO: remove this line for a full run.
gas_particles_df = gas_particles_df.iloc[0:10].copy()

# Split the dataframe into several dataframes so that they can be processed in parallel.
gas_particles_df_chunks = split_dataframe(
        df=gas_particles_df,
        max_workers=max_workers,
    )

################ Read training data particles
# Target quantities to interpolate; they are stored as log_<name> in the training frames.
properties_column_names = [
    "fh2",
    "fCO",
]

# 1st set of run
train_data_base_file_dir_1 = "/scratch/m/murray/dtolgay/cloudy_runs/z_0"
train_data_main_directory_1 = "cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_above_minus_2"

train_data_df_1, properties_column_names_with_log = read_training_data(
    base_file_dir = train_data_base_file_dir_1,
    main_directory = train_data_main_directory_1,
    file_name = "other_properties.csv",
    properties_column_names = properties_column_names,
)

# 2nd set of run
train_data_base_file_dir_2 = "/scratch/m/murray/dtolgay/cloudy_runs/z_0"
train_data_main_directory_2 = "cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_minus2_minus3point5"

train_data_df_2, properties_column_names_with_log = read_training_data(
    base_file_dir = train_data_base_file_dir_2,
    main_directory = train_data_main_directory_2,
    file_name = "other_properties.csv",
    properties_column_names = properties_column_names,
)


# Concatenate the two training sets. ignore_index=True gives the combined frame a
# fresh 0..N-1 index; without it the row labels of the two CSVs would be duplicated.
train_data_df = pd.concat([train_data_df_2, train_data_df_1], ignore_index=True)
train_data_file_paths = [f"{train_data_base_file_dir_1}/{train_data_main_directory_1}", f"{train_data_base_file_dir_2}/{train_data_main_directory_2}"]

# Alternative: use only the first training set.
# train_data_file_paths = [f"{train_data_base_file_dir_1}/{train_data_main_directory_1}"]
# train_data_df = train_data_df_1



------------------------------------------ m12i_res7100_md ------------------------------------------
/home/m/murray/dtolgay/scratch/post_processing_fire_outputs/skirt/runs_hden_radius/zoom_in/z0.0/m12i_res7100_md/voronoi_1e6/cloudy_gas_particles.txt read and dataframe is created!
Training data is started to be read.
Dropping NaN containing lines
/scratch/m/murray/dtolgay/cloudy_runs/z_0/cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_above_minus_2/other_properties.csv is read.
Training data is started to be read.
Dropping NaN containing lines
/scratch/m/murray/dtolgay/cloudy_runs/z_0/cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_minus2_minus3point5/other_properties.csv is read.


In [9]:
# Show the gas-particle subset together with the derived dummy_radius and log_* columns.
gas_particles_df

Unnamed: 0,x,y,z,smoothing_length,mass,metallicity,temperature,vx,vy,vz,...,dummy_radius,log_metallicity,log_density,log_turbulence,log_isrf,log_hden,log_radius,log_smoothing_length,log_average_sobolev_smoothingLength,log_dummy_radius
0,8155.11156,2523.78963,295.767503,494.771024,9633.96595,1.543016,1329689.0,-440.727295,103.712943,181.075122,...,247.385512,0.188371,-25.383051,2.441073,0.635261,-1.390178,2.192656,2.694404,2.404975,2.393374
1,8009.17409,2389.75543,246.704012,586.104359,8752.24193,1.722534,2101607.0,-448.361308,143.592455,56.359671,...,293.052179,0.236168,-25.644843,2.31964,0.663333,-1.653066,2.266025,2.767975,2.479325,2.466945
2,7988.0324,2330.81139,410.916168,328.450397,8200.12148,0.994449,40592.1,-109.426567,187.687041,-165.466676,...,164.225199,-0.002417,-24.919246,2.047164,0.663333,-0.941648,2.014726,2.51647,2.247593,2.21544
3,7902.90863,2389.85935,460.785347,276.410299,14864.1929,1.922749,10158.3,-101.605009,281.62639,44.83439,...,138.20515,0.283923,-24.43903,1.822882,0.580017,-0.755093,1.940761,2.441554,2.23615,2.140524
4,7973.95872,2427.62253,399.528474,331.981654,10381.1743,1.691503,47923.6,-124.733355,214.637333,-111.297868,...,165.990827,0.228273,-24.830582,1.745932,0.663333,-0.858713,2.019314,2.521114,2.279911,2.220084
5,7890.54377,2470.0637,602.389166,159.979949,9352.69791,1.740791,9014.615,-110.398699,250.74529,-0.053703,...,79.989975,0.240747,-23.924958,1.295796,0.63356,-0.253746,1.702336,2.204066,2.283109,1.903036
6,7828.08767,2503.68247,570.979823,160.544134,8857.03221,1.449421,10926.57,-111.958055,310.089589,27.370154,...,80.272067,0.161194,-23.953826,1.408569,0.63356,-0.275519,1.704076,2.205594,2.364501,1.904564
7,7754.92834,2410.81187,557.783877,236.760581,11355.5272,2.1583,11678.67,-93.482017,338.936176,60.106515,...,118.380291,0.334112,-24.347536,1.821547,0.580017,-0.662765,1.871285,2.374309,2.145097,2.073279
8,7821.29993,2433.0487,645.13235,135.969381,7259.10054,0.422022,3048.35,-131.429354,230.359092,-4.55572,...,67.98469,-0.374665,-23.824196,1.129041,0.372245,-0.141852,1.632064,2.133441,2.209432,1.832411
9,7789.05919,2432.59306,657.669738,130.570215,9402.2189,1.829502,63.54094,-136.728369,245.432536,5.195918,...,65.285107,0.262333,-23.658503,1.052487,0.372245,0.011751,1.614282,2.115844,2.095744,1.814814


In [23]:

def prepare_interpolator(k, gas, gas_data_column_names, tree, train_data_df, train_data_column_names, target_column_names, interpolator="LinearNDInterpolator"):
    """Build a local interpolator from the training points nearest to one gas particle.

    Parameters
    ----------
    k : int
        Number of nearest training points to use. Values larger than the number
        of points in ``tree`` are clamped to that number.
    gas : pandas.Series
        One gas particle; must contain ``gas_data_column_names``.
    gas_data_column_names : list of str
        Gas columns forming the query point, in the same order as the
        coordinates ``tree`` was built from.
    tree : scipy.spatial.KDTree
        Tree built from ``train_data_df[train_data_column_names]``; the indices it
        returns are used positionally on ``train_data_df``.
    train_data_df : pandas.DataFrame
        Training table.
    train_data_column_names : list of str
        Training columns used as interpolation coordinates.
    target_column_names : list of str
        Training columns used as interpolation values.
    interpolator : str
        ``"LinearNDInterpolator"`` or ``"NearestNDInterpolator"``.

    Returns
    -------
    LinearNDInterpolator, NearestNDInterpolator or None
        The constructed interpolator, or ``None`` for an unknown ``interpolator`` name.
    """
    # Reject unknown names before doing any neighbour search.
    if interpolator not in ("LinearNDInterpolator", "NearestNDInterpolator"):
        return None

    # When k exceeds the number of points, KDTree.query pads the result with the
    # out-of-range index tree.n, which would make iloc fail; clamp k instead.
    k = min(k, tree.n)

    # Query the tree for neighbors
    query_point = gas[gas_data_column_names].to_numpy()
    _, indices = tree.query(query_point, k=k)
    # k == 1 yields a scalar index; always work with an index array.
    indices = np.atleast_1d(indices)

    # Slice the neighbours once and reuse them for both coordinates and values.
    neighbours = train_data_df.iloc[indices]
    points = neighbours[train_data_column_names].to_numpy()
    values = neighbours[target_column_names].to_numpy()

    if interpolator == "LinearNDInterpolator":
        return LinearNDInterpolator(points=points, values=values)
    return NearestNDInterpolator(points, values)

def interpolate_otherProperties(gas_particles_df, train_data_df, properties_column_names_with_log):
    """Interpolate the training properties onto every gas particle of a chunk.

    For each particle, the nearest training points are looked up in a KDTree and
    a linear interpolation in log space is attempted with an increasing number of
    neighbours. If the linear result contains NaN, the next neighbour count is
    tried; from k >= 300 on, a nearest-neighbour interpolation is used instead.

    Parameters
    ----------
    gas_particles_df : pandas.DataFrame
        Gas particles; must contain ``index`` and the ``log_*`` columns listed in
        ``gas_data_column_names`` below.
    train_data_df : pandas.DataFrame
        Training table containing ``train_data_column_names`` and
        ``properties_column_names_with_log``.
    properties_column_names_with_log : list of str
        Training columns (in log10) to interpolate.

    Returns
    -------
    list of numpy.ndarray
        One array per particle: ``[index, 10**property_1, 10**property_2, ...]``.
        An empty input frame yields an empty list.

    Raises
    ------
    RuntimeError
        If no interpolation attempt produced a value for a particle.
    """

    print("I am in the interpolate_otherProperties")

    train_data_column_names = [
        "log_metallicity",
        "log_hden",
        "log_turbulence",
        "log_isrf",
        "log_radius",
    ]

    tree = KDTree(
        train_data_df[train_data_column_names].to_numpy(),
    ) # Create a tree for the training data

    # Gas-side column that is matched against the training "log_radius" coordinate.
    scale_length = [
        "log_average_sobolev_smoothingLength"
    ]

    # Must stay in the same order as train_data_column_names.
    gas_data_column_names = [
        "log_metallicity",
        "log_hden",
        "log_turbulence",
        "log_isrf",
    ] + scale_length

    # Neighbour counts to try, smallest first (identical for every particle).
    k_values = [50, 100, 500, 1000, 2000, 3000, 5000, int(1e4)]
    # Below this k a NaN result only triggers a retry with more neighbours;
    # from this k on the nearest-neighbour interpolator is used as fallback.
    min_k_for_nearest_fallback = 300

    gas_indices_luminosities = []

    # An empty chunk has no first row to inspect and nothing to interpolate.
    if len(gas_particles_df) == 0:
        return gas_indices_luminosities

    intial_index = gas_particles_df.iloc[0]['index']
    for _, gas in gas_particles_df.iterrows():
        # Progress report, only from the chunk that starts at particle index 0.
        if intial_index == 0:
            if (gas['index'] % int(1e5) == 1):
                print(f"{gas['index']} finished. Left {len(gas_particles_df) - gas['index']}")

        query_point = gas[gas_data_column_names].to_numpy(dtype=float)

        # Reset per particle so that a complete failure can never silently reuse
        # the values computed for the previous particle.
        interpolated_Y_values = None
        last_error = None

        for k in k_values:
            try:
                # Get the interpolator
                interpolator = prepare_interpolator(
                        k = k,
                        gas = gas,
                        gas_data_column_names = gas_data_column_names,
                        tree = tree,
                        train_data_df = train_data_df,
                        train_data_column_names = train_data_column_names,
                        target_column_names = properties_column_names_with_log,
                        interpolator="LinearNDInterpolator"
                    )

                # The interpolator returns an array of arrays. That's why [0] is done.
                interpolated_Y_values = 10**interpolator(query_point)[0]

                if not np.isnan(interpolated_Y_values).any():
                    print("LinearNDInterpolator used") # TODO: Delete
                    break  # Linear interpolation succeeded without NaN values

                # NaN with few neighbours: retry with the next, larger k.
                if k < min_k_for_nearest_fallback:
                    continue

                # use nearestNDInterpolator
                interpolator = prepare_interpolator(
                        k = k,
                        gas = gas,
                        gas_data_column_names = gas_data_column_names,
                        tree = tree,
                        train_data_df = train_data_df,
                        train_data_column_names = train_data_column_names,
                        target_column_names = properties_column_names_with_log,
                        interpolator="NearestNDInterpolator"
                    )
                interpolated_Y_values = 10**interpolator(query_point)[0]
                print("NearestNDInterpolator used") # TODO: Delete
                break

            except Exception as e:
                # If it fails with the current k, continue to the next one, but keep
                # the error so that a complete failure can be reported with its cause.
                last_error = e
                continue

        # No attempt produced a value: fail loudly instead of calling exit(), which is
        # not a reliable way to abort from inside a worker process or notebook kernel.
        if interpolated_Y_values is None:
            raise RuntimeError(
                f"Error: interpolator could not be constructed for index: {gas['index']}"
            ) from last_error

        # Append the gas index and its interpolated properties as one row.
        gas_indices_luminosities.append(
            np.concatenate(([gas['index']], interpolated_Y_values))
        )

    return gas_indices_luminosities


In [21]:

########
# Interpolate
# One task per chunk; results are gathered in submission order.
with ProcessPoolExecutor(max_workers=max_workers) as executor:
    futures = [
        executor.submit(
            interpolate_otherProperties,
            gas_particles_df_chunk,
            train_data_df,
            properties_column_names_with_log,
        )
        for gas_particles_df_chunk in gas_particles_df_chunks
    ]
    gas_indices_interpolatedValues_chunks = [future.result() for future in futures]

# Flatten the per-chunk lists into one list with a row per gas particle
print("Flattening the array")
gas_indices_interpolatedValues = []
for chunk_result in gas_indices_interpolatedValues_chunks:
    gas_indices_interpolatedValues.extend(chunk_result)

# Serial alternative (no process pool):
# gas_indices_interpolatedValues = interpolate_otherProperties(
#     gas_particles_df=gas_particles_df,
#     train_data_df=train_data_df,
#     properties_column_names_with_log=properties_column_names_with_log
#     )

# The column names are not the log ones: the values were already raised to the power of 10
# during the interpolation.
column_names = ['index', *properties_column_names]
gas_indices_Yinterpolated = pd.DataFrame(gas_indices_interpolatedValues, columns=column_names)

###
# Merge the interpolated values back onto the gas particles
if len(gas_indices_Yinterpolated) != len(gas_particles_df):
    print("Lengths of luminosities and gas particles are NOT same. Exiting with code 3...")
    exit(3)
else:
    print("Lengths of luminosities and gas particles are the same. Merging can be done.")
    # validate='one_to_one' makes pandas raise if 'index' is not unique on both sides
    merged_df = gas_particles_df.merge(gas_indices_Yinterpolated, how='left', on='index', validate='one_to_one')

I am in the interpolate_otherProperties
LinearNDInterpolator used
1.0 finished. Left 9.0
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
LinearNDInterpolator used
Flattening the array
Lengths of luminosities and gas particles are the same. Merging can be done.


In [22]:
# Interpolated fh2 and fCO (linear scale, not log) for each gas particle.
merged_df[properties_column_names]

Unnamed: 0,fh2,fCO
0,1.190741e-12,8.765854e-25
1,1.836694e-15,1.2192589999999999e-30
2,8.349086e-13,3.956539e-25
3,4.492946e-11,1.255279e-22
4,1.190741e-12,8.765854e-25
5,1.473104e-06,1.263396e-13
6,1.473104e-06,1.263396e-13
7,4.492946e-11,1.255279e-22
8,1.115192e-08,1.792349e-15
9,5.164997e-06,1.468022e-12


In [19]:
# NOTE(review): same expression as the previous cell, but this cell's execution count (19)
# predates the interpolation cell (21) and the values shown differ, so this output looks
# stale. Confirm which result is current and remove the duplicate.
merged_df[properties_column_names]

Unnamed: 0,fh2,fCO
0,1.76581e-13,6.586721000000001e-27
1,4.724033e-14,2.3993280000000002e-27
2,8.997853e-13,3.606018e-25
3,1.918439e-10,4.270764e-21
4,2.260302e-11,6.991438000000001e-23
5,1.594181e-06,1.017638e-14
6,8.543294e-08,4.962646e-16
7,4.587235e-10,2.906478e-20
8,1.136497e-08,4.995571e-16
9,1.554083e-05,3.056528e-12
