In [9]:
import sys 
sys.path.append("/scratch/m/murray/dtolgay")
from tools import constants

import numpy as np
import pandas as pd 

In [3]:

def read_training_data(base_file_dir, main_directory, file_name, base_line_names):

    #################################################
    # Get the trained data

    print("Training data is started to be read.")

    line_names = []
    for line_name in base_line_names:
        line_names.append(f"I_{line_name}")

    column_names = [
        "log_metallicity",
        "log_hden",
        "log_turbulence",
        "log_isrf",
        "log_radius",
    ]  + line_names

    # Read file
    path2TrainingData = f"{base_file_dir}/{main_directory}/{file_name}"
    unprocessed_train_data = pd.DataFrame(
        np.loadtxt(fname=path2TrainingData),
        columns=column_names,
    )

    ############## Process the cloudy data 
    # Discard all nan values 
    print("Dropping NaN containing lines")
    unprocessed_train_data = unprocessed_train_data.dropna()

    # Check if all intensities are positive and set 0 values to epsilon
    print(f"Check if all intensities are positive. Then set 0 values to {epsilon}")
    all_positive_columns = (unprocessed_train_data[line_names] >= 0).all().all()
    if all_positive_columns:
        print(f"All of the intensity values are non-negative. Continuing...")
    else:
        # Set values smaller or equal to zero to epsilon in specified columns
        for col in line_names:
            unprocessed_train_data[col] = unprocessed_train_data[col].map(lambda x: epsilon if x <= 0 else x)
        print(f"Not all intensities are are non-negative. Setting them to epsilon")


    line_names_with_log = []
    for column in line_names:
        unprocessed_train_data[f"log_{column}"] = np.log10(unprocessed_train_data[column])
        line_names_with_log.append(f"log_{column}") # Store the new line names


    train_data_df = unprocessed_train_data[[
        "log_metallicity",
        "log_hden",
        "log_turbulence",
        "log_isrf",
        "log_radius",
        ] + line_names_with_log]  # Only use the log of the line luminosities    

    # # Double check if there is any NaN
    # if (np.isnan(train_data_df.values).any()):
    #     print("Still there are NaN values. Exiting with code 1...")
    #     exit(1)
    # elif (np.isinf(train_data_df.values).any()):
    #     print("Still there are inf values. Exiting with code 2...")
    #     exit(2)

    ######
    # Add the column density data to interpolate that too 
    train_data_df['log_column_density'] = np.log10(
        (10**train_data_df['log_hden'] / constants.cm2pc**3) * (10**train_data_df['log_radius']) * (mu * constants.proton_mass * constants.kg2Msolar)
    ) # Msolar / pc^2

    print(f"{path2TrainingData} is read.")


    return train_data_df, line_names_with_log


In [11]:
epsilon = 1e-30
mu = 1.38 # Krumholz and Gnedin 

base_line_names = [
    "ly_alpha",
    "h_alpha",
    "h_beta",
    "co_10",
    "co_21",
    "co_32",
    "co_43",
    "co_54",
    "co_65",
    "co_76",
    "co_87",
    "13co",
    "c2",
    "o3_88",
    "o3_5006",
    "o3_4958",        
]

# 1st set of run
train_data_base_file_dir_1 = "/scratch/m/murray/dtolgay/cloudy_runs/z_0"
train_data_main_directory_1 = "cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_above_minus_2" 

train_data_df_1, line_names_with_log = read_training_data(
    base_file_dir = train_data_base_file_dir_1, 
    main_directory = train_data_main_directory_1, 
    file_name = "I_line_values_without_reversing.txt", 
    base_line_names = base_line_names
)    

# 2nd set of run
train_data_base_file_dir_2 = "/scratch/m/murray/dtolgay/cloudy_runs/z_0"
train_data_main_directory_2 = "cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_minus2_minus3point5" 

train_data_df_2, line_names_with_log = read_training_data(
    base_file_dir = train_data_base_file_dir_2, 
    main_directory = train_data_main_directory_2, 
    file_name = "I_line_values_without_reversing.txt", 
    base_line_names = base_line_names
)    

Training data is started to be read.
Dropping NaN containing lines
Check if all intensities are positive. Then set 0 values to 1e-30
Not all intensities are are non-negative. Setting them to epsilon


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_df['log_column_density'] = np.log10(


/scratch/m/murray/dtolgay/cloudy_runs/z_0/cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_above_minus_2/I_line_values_without_reversing.txt is read.
Training data is started to be read.
Dropping NaN containing lines
Check if all intensities are positive. Then set 0 values to 1e-30
Not all intensities are are non-negative. Setting them to epsilon
/scratch/m/murray/dtolgay/cloudy_runs/z_0/cr_1_CO87_CII_H_O3/cr_1_CO87_CII_H_O3_metallicity_minus2_minus3point5/I_line_values_without_reversing.txt is read.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data_df['log_column_density'] = np.log10(


In [14]:
train_data_df_2.keys()

Index(['log_metallicity', 'log_hden', 'log_turbulence', 'log_isrf',
       'log_radius', 'log_I_ly_alpha', 'log_I_h_alpha', 'log_I_h_beta',
       'log_I_co_10', 'log_I_co_21', 'log_I_co_32', 'log_I_co_43',
       'log_I_co_54', 'log_I_co_65', 'log_I_co_76', 'log_I_co_87',
       'log_I_13co', 'log_I_c2', 'log_I_o3_88', 'log_I_o3_5006',
       'log_I_o3_4958', 'log_column_density'],
      dtype='object')

In [19]:
columns = [
    "log_metallicity",
    "log_hden",
    "log_turbulence",
    "log_isrf",
    "log_radius"
]

for column in columns: 
    print(f"{column}: {np.unique(train_data_df_1[column])}")
    
print("\n\n")

for column in columns: 
    print(f"{column}: {np.unique(train_data_df_2[column])}")

log_metallicity: [-2.  -1.5 -1.  -0.5  0.   0.5  1. ]
log_hden: [-5. -4. -3. -2. -1.  0.  1.  2.  3.  4.  5.]
log_turbulence: [-3. -2. -1.  0.  1.  2.  3.]
log_isrf: [-5.  -4.5 -4.  -3.5 -3.  -2.5 -2.  -1.5 -1.  -0.5  0.   0.5  1.   1.5
  2.   2.5  3.   3.5  4.   4.5  5. ]
log_radius: [0.  0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]



log_metallicity: [-3.5 -3.  -2.5]
log_hden: [-5. -4. -3. -2. -1.  0.  1.  2.  3.  4.  5.]
log_turbulence: [-3. -2. -1.  0.  1.  2.  3.]
log_isrf: [-5.  -4.5 -4.  -3.5 -3.  -2.5 -2.  -1.5 -1.  -0.5  0.   0.5  1.   1.5
  2.   2.5  3.   3.5  4.   4.5  5. ]
log_radius: [0.  0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]
