<a href="https://colab.research.google.com/github/fisicorj/astrofisica/blob/main/Convers%C3%A3o_datasetcmfgem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re

# Patterns used by extract_section: whitespace-delimited tokens, and the
# numeric literals (plain decimal or scientific notation) accepted as values.
_TOKEN_RE = re.compile(r"\S+")
_NUMBER_RE = re.compile(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?")


def extract_section(data, section_name):
    """Return the numeric values that follow a section header in *data*.

    A header is ``section_name`` followed by a parenthesised annotation,
    e.g. ``"Continuum Frequencies ( 3 )"``.  The whitespace-separated
    numbers after the header are collected until the first token that is
    not a number (typically the next header) or the end of the text.

    Args:
        data: Full text to search.
        section_name: Literal header text; it is escaped, so regex
            metacharacters in the name are matched verbatim.

    Returns:
        The section's values as a list of floats, in file order.  The list
        is empty when the header is missing or no numbers follow it.
    """
    # [^)]* stops at the first closing parenthesis, like a lazy ".*?" would.
    header = re.search(rf"{re.escape(section_name)}\s*\([^)]*\)", data)
    if header is None:
        return []

    values = []
    # Scan tokens lazily from the end of the header.  Deciding per token
    # (instead of "stop at the first letter") keeps exponents such as
    # 1.5E+03 intact rather than truncating the section at the "E".
    for token in _TOKEN_RE.finditer(data, header.end()):
        text = token.group()
        if _NUMBER_RE.fullmatch(text) is None:
            break
        values.append(float(text))
    return values

# Load the whole flux file into memory; every section is parsed from this
# single string.
file_path = '/mnt/data/OBSFLUX'
with open(file_path, 'r') as file:
    file_content = file.read()

# Map each output key to the values found under its header in the file.
sections = {
    key: extract_section(file_content, header)
    for key, header in (
        ('frequencies', "Continuum Frequencies"),
        ('intensity', "Observed intensity"),
        ('luminosity', "Luminosity"),
    )
}

# Length of the longest section; shorter ones are padded up to it later.
max_length = max(len(values) for values in sections.values())

def pad_list(lst, length):
    """Return a new list: *lst* followed by NaNs up to *length* items.

    When *lst* already has *length* or more items, nothing is appended and
    an unchanged copy is returned.
    """
    shortfall = length - len(lst)
    # range() of a non-positive count is empty, so no padding is added then.
    nan_tail = [np.nan for _ in range(shortfall)]
    return lst + nan_tail

# Bring the three sections to a common length so they can share a table.
frequencies_padded, intensities_padded, luminosities_padded = (
    pad_list(sections[key], max_length)
    for key in ('frequencies', 'intensity', 'luminosity')
)

# Assemble the table: one column per section, in the order listed here.
data = pd.DataFrame(
    dict(
        zip(
            ('Continuum Frequencies', 'Observed Intensity (Janskys)', 'Luminosity'),
            (frequencies_padded, intensities_padded, luminosities_padded),
        )
    )
)

# Write the table to CSV without the row index.
csv_file_path = '/mnt/data/processed_spectral_data.csv'
data.to_csv(csv_file_path, index=False)
