In [1]:
import numpy as np
import pandas as pd
from obspy import read
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import os
import glob

### Open Data

In [103]:
def get_data_from_mseed(training_path, csv_file):
    """Build one row per seismic trace from the miniSEED files in a folder.

    Each ``*.mseed`` file directly inside ``training_path`` is matched against
    the catalog by name: the file name without its extension must equal an
    entry of the catalog's ``filename`` column (after any ``.csv`` text has
    been removed from that entry). Files without a catalog entry are skipped
    without being read.

    Parameters
    ----------
    training_path : str
        Directory searched (non-recursively) for ``*.mseed`` files.
    csv_file : str
        Path of the catalog CSV. It must provide the columns ``filename`` and
        ``time_rel(sec)``.

    Returns
    -------
    pandas.DataFrame
        One row per trace with the columns ``file_name``, ``rel_times``,
        ``data``, ``start_time``, ``end_time`` and ``target``, where
        ``target`` is the ``time_rel(sec)`` value of the first catalog row
        matching the file. The frame is empty (no columns) when no file
        matches or when the catalog lacks one of the required columns.
    """
    catalog = pd.read_csv(csv_file)

    # Without both columns nothing can be matched; keep the original
    # best-effort contract of returning an empty frame instead of raising.
    if not {'filename', 'time_rel(sec)'}.issubset(catalog.columns):
        return pd.DataFrame([])

    # Normalise the catalog names once (loop-invariant) and without mutating
    # the frame that was just loaded.
    catalog_names = catalog['filename'].str.replace('.csv', '', regex=False)

    rows = []
    # sorted() makes the row order independent of the filesystem's listing order.
    for mseed_file in sorted(glob.glob(os.path.join(training_path, "*.mseed"))):
        # splitext removes the extension as a suffix. The previous
        # rstrip(".mseed") stripped any trailing '.', 'm', 's', 'e', 'd'
        # characters and corrupted names such as "events.mseed" -> "event".
        filename = os.path.splitext(os.path.basename(mseed_file))[0]

        targets = catalog.loc[catalog_names == filename, 'time_rel(sec)']
        if targets.empty:
            continue
        target = targets.iloc[0]  # first catalog entry wins

        # Only read the waveform once we know the file is in the catalog.
        stream = read(mseed_file)
        rows.extend(
            {
                "file_name": filename,  # File name without path or extension
                "rel_times": trace.times(),
                "data": trace.data,
                "start_time": trace.stats.starttime,  # Absolute start time
                "end_time": trace.stats.endtime,      # Absolute end time
                "target": target,                     # Target value for the filename
            }
            for trace in stream
        )

    return pd.DataFrame(rows)

In [112]:
def get_data(training_folders, catalog_folders):
    """Load and stack the trace tables of several (data folder, catalog) pairs.

    Parameters
    ----------
    training_folders : iterable of str
        Directories containing ``*.mseed`` files.
    catalog_folders : iterable of str
        Catalog CSV paths, paired positionally with ``training_folders``
        (each entry is passed to ``get_data_from_mseed`` as its ``csv_file``).

    Returns
    -------
    pandas.DataFrame
        The per-pair frames stacked row-wise with a fresh 0..n-1 index; an
        empty frame when no pair yields any rows. The frame is also printed.
    """
    frames = [
        get_data_from_mseed(training_folder, catalog_file)
        for training_folder, catalog_file in zip(training_folders, catalog_folders)
    ]
    # Empty frames add no rows; dropping them avoids pandas' deprecated
    # concat-with-empty behaviour.
    frames = [frame for frame in frames if not frame.empty]

    # Concatenate once instead of re-copying a growing frame on every iteration;
    # pd.concat raises on an empty list, so fall back to an empty frame.
    df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
    print(df)
    return df

# Assemble the training table: each data folder is paired positionally with
# the catalog CSV listed at the same position in the second list.
train_data = get_data(
    [
        './data/lunar/training/data/S12_GradeA',
        './data/mars/training/data/',
    ],
    [
        './data/lunar/training/catalogs/apollo12_catalog_GradeA_final.csv',
        './data/mars/training/catalogs/Mars_InSight_training_catalog_final.csv',
    ],
)

                                  file_name  \
0    xa.s12.00.mhz.1970-01-19HR00_evid00002   
1    xa.s12.00.mhz.1970-03-25HR00_evid00003   
2    xa.s12.00.mhz.1970-03-26HR00_evid00004   
3    xa.s12.00.mhz.1970-04-25HR00_evid00006   
4    xa.s12.00.mhz.1970-04-26HR00_evid00007   
..                                      ...   
72   xa.s12.00.mhz.1975-05-04HR00_evid00192   
73   xa.s12.00.mhz.1975-06-24HR00_evid00196   
74   xa.s12.00.mhz.1975-06-26HR00_evid00198   
75  XB.ELYSE.02.BHV.2022-01-02HR04_evid0006   
76  XB.ELYSE.02.BHV.2022-02-03HR08_evid0005   

                                            rel_times  \
0   [0.0, 0.1509433962264151, 0.3018867924528302, ...   
1   [0.0, 0.1509433962264151, 0.3018867924528302, ...   
2   [0.0, 0.1509433962264151, 0.3018867924528302, ...   
3   [0.0, 0.1509433962264151, 0.3018867924528302, ...   
4   [0.0, 0.1509433962264151, 0.3018867924528302, ...   
..                                                ...   
72  [0.0, 0.1509433962264151, 0.3018