In [1]:
# -*- coding: utf-8 -*-
# Notebook Setup Cell

import argparse
import os
import sys
import glob
import time
import numpy as np
import pandas as pd
import yaml
from importlib import reload
from datetime import datetime, timedelta
import logging
from collections import defaultdict
from pickleshare import *

# Astropy imports
from astropy.time import Time, TimeDelta
from astropy.coordinates import SkyCoord, Angle, EarthLocation
import astropy.units as u
from astropy.table import Table
from astropy.io import fits
from astropy.wcs import WCS

# --- Make the 'pipeline' package importable ---
# `pipeline_parent_dir` must be the directory that CONTAINS the 'pipeline' folder.
# It is placed at the front of sys.path (only if not already listed) so the
# `from pipeline import ...` statements resolve wherever this notebook lives.
pipeline_parent_dir = '/data/jfaber/dsa110-contimg/'  # EDIT to match your checkout location
if sys.path.count(pipeline_parent_dir) == 0:
    sys.path.insert(0, pipeline_parent_dir)


In [2]:
from casatasks import (
        clearcal, delmod, rmtables, flagdata, bandpass, ft, mstransform, gaincal, applycal, listobs, split
    )
from casatools import componentlist, msmetadata, imager, ms, table

In [3]:
# Pipeline module imports
# Every pipeline submodule used by this notebook is imported in one statement.
# On failure the current search path is printed (a wrong sys.path entry is the
# usual suspect) and the ImportError is re-raised so the notebook stops here.
try:
    from pipeline import (
        config_parser,
        pipeline_utils,
        ms_creation,
        calibration,
        skymodel,
        imaging,
        mosaicking,
        photometry,
        dsa110_utils,  # Needed for location
    )
except ImportError as e:
    print("ERROR: Failed to import pipeline modules. Check sys.path.")
    print(f"Current sys.path: {sys.path}")
    raise e


In [4]:
# Re-execute pipeline.ms_creation so that edits made to the module file after
# its first import are picked up without restarting the kernel.
reload(ms_creation)

<module 'pipeline.ms_creation' from '/data/jfaber/dsa110-contimg/pipeline/ms_creation.py'>

In [5]:
# pyuvdata provides UVData, which is used to read the HDF5 headers.
# A missing package is recorded in `pyuvdata_available` rather than aborting
# the cell; code that needs UVData checks this flag first.
try:
    from pyuvdata import UVData
except ImportError:
    print("ERROR: pyuvdata is required to read HDF5 metadata.")
    pyuvdata_available = False  # Script will likely fail later
else:
    pyuvdata_available = True

# --- Define Paths and Parameters (modify as needed) ---
# Pipeline YAML configuration. The path is relative, so it is resolved against
# the kernel's current working directory when opened.
CONFIG_PATH = 'config/pipeline_config.yaml' # Relative path from notebook location
# Directory holding the incoming HDF5 data chunks.
# NOTE(review): the later cells visible here read config['paths']['hdf5_incoming']
# rather than this constant — confirm whether HDF5_DIR is still needed.
HDF5_DIR = '/data/incoming/' # Location of your HDF5 data chunks
# Passed to select_bcal_for_test(); None means "pick the brightest candidate".
BCAL_NAME_OVERRIDE = None # Optional: Force a specific BPCAL name for testing, e.g., '3C286', otherwise set to None
# Controls the level applied to the logger created in the logging setup below.
VERBOSE_LOGGING = True # Set True for DEBUG level, False for INFO

# --- Setup Logging ---
# The config file is read here only to discover the log directory; the full
# configuration is loaded later through config_parser.
try:
    with open(CONFIG_PATH, 'r') as f:
        # safe_load returns None for an empty file; normalise to an empty dict
        temp_config_for_log = yaml.safe_load(f) or {}
    # Tolerate a missing or null 'paths' section / 'log_dir' entry by falling
    # back to the default 'logs' directory instead of failing on None.
    log_dir_config = (temp_config_for_log.get('paths') or {}).get('log_dir') or 'logs'

    # A relative log_dir is interpreted relative to pipeline_parent_dir, which
    # this notebook treats as the project root (e.g. '../logs' resolves to
    # <pipeline_parent_dir>/../logs, 'logs/' to <pipeline_parent_dir>/logs/).
    if os.path.isabs(log_dir_config):
        log_dir = log_dir_config
    else:
        log_dir = os.path.abspath(os.path.join(pipeline_parent_dir, log_dir_config))

    os.makedirs(log_dir, exist_ok=True)
    log_level = logging.DEBUG if VERBOSE_LOGGING else logging.INFO

    # The config_name embeds the wall-clock time so repeated runs of this cell
    # get distinct log-file names.
    logger = pipeline_utils.setup_logging(log_dir, config_name=f"notebook_test_{datetime.now().strftime('%H%M%S')}")
    logger.setLevel(log_level)

    # Suppress overly verbose CASA logs if desired (from casatasks import casalog; casalog.filter('INFO'))
    logger.info("Setup cell executed.")
except Exception as e:
    print(f"ERROR during setup: {e}")
    # Stop execution if setup fails; chain the original error so the
    # traceback shows the real cause alongside the generic message.
    raise RuntimeError("Setup failed") from e

2025-05-12 08:08:32 [INFO ] [MainThread] [root] CASA log file set to: /data/jfaber/logs/casa_20250512_080832.log
2025-05-12 08:08:32 [INFO ] [MainThread] [root] Pipeline logging configured. Log file: /data/jfaber/logs/notebook_test_080832_20250512_080832.log
2025-05-12 08:08:32 [INFO ] [MainThread] [root] Setup cell executed.


In [6]:
# Notebook Cell: Helper Function Definitions

from collections import defaultdict 

def collect_files_for_nominal_start_time(nominal_start_time_str, hdf5_dir, config):
    """
    Collect one HDF5 file per expected SPW for a nominal chunk start time.

    Files in ``hdf5_dir`` named ``YYYY-MM-DDTHH:MM:SS_sbXX.hdf5`` are accepted
    when their filename timestamp lies within
    ``config['ms_creation']['same_timestamp_tolerance']`` seconds (default 30.0)
    of the nominal start time, and their ``sbXX`` part is listed in
    ``config['ms_creation']['spws']``.

    Parameters
    ----------
    nominal_start_time_str : str
        Nominal start time in the compact form ``YYYYMMDDTHHMMSS``.
    hdf5_dir : str
        Directory searched (non-recursively) with the glob ``20*.hdf5``.
    config : dict
        Must contain ``config['ms_creation']['spws']``; may contain
        ``config['ms_creation']['same_timestamp_tolerance']``.

    Returns
    -------
    list[str] or None
        Sorted list of file paths (one per expected SPW) when the set is
        complete; ``None`` if the nominal time cannot be parsed or any
        expected SPW has no matching file.
    """
    logger = logging.getLogger(__name__)

    # Format for the user-provided nominal start time string
    nominal_time_format = "%Y%m%dT%H%M%S"
    # Format for timestamps found in the actual HDF5 filenames
    actual_file_time_format = "%Y-%m-%dT%H:%M:%S"

    try:
        nominal_dt_obj = datetime.strptime(nominal_start_time_str, nominal_time_format)
    except (TypeError, ValueError):
        # TypeError covers a non-string argument such as None
        logger.error(f"Invalid nominal_start_time_str format: {nominal_start_time_str}. Expected {nominal_time_format}.")
        return None

    tolerance_sec = config['ms_creation'].get('same_timestamp_tolerance', 30.0)
    expected_spws_set = set(config['ms_creation']['spws'])

    logger.info(f"Collecting files for nominal start time: {nominal_start_time_str} (parsed as {nominal_dt_obj}) in {hdf5_dir} with tolerance {tolerance_sec}s")
    logger.debug(f"Expected SPWs: {sorted(expected_spws_set)}")

    files_for_this_chunk = defaultdict(list)
    all_hdf5_files_in_dir = glob.glob(os.path.join(hdf5_dir, "20*.hdf5"))  # Glob for files starting with "20"
    logger.debug(f"Found {len(all_hdf5_files_in_dir)} total HDF5 files in {hdf5_dir} to check.")

    found_any_for_nominal_time = False
    for f_path in all_hdf5_files_in_dir:
        # Bound before the try so the handlers below can always name the file
        f_name = os.path.basename(f_path)
        try:
            # Assuming filename format YYYY-MM-DDTHH:MM:SS_sbXX.hdf5
            name_parts = f_name.split('_')
            file_dt_obj = datetime.strptime(name_parts[0], actual_file_time_format)
            time_diff_seconds = abs((file_dt_obj - nominal_dt_obj).total_seconds())

            # Files outside the window are skipped silently; logging each one
            # would flood the output for a busy directory.
            if time_diff_seconds > tolerance_sec:
                continue

            found_any_for_nominal_time = True
            base_spw = name_parts[1].replace('.hdf5', '')

            logger.debug(f"  File {f_name}: ActualTS={file_dt_obj}, time_diff={time_diff_seconds:.1f}s, parsed_spw='{base_spw}'")
            if base_spw in expected_spws_set:
                files_for_this_chunk[base_spw].append(f_path)
                logger.debug(f"    -> Matched expected SPW: '{base_spw}'")
            else:
                logger.debug(f"    -> Parsed SPW '{base_spw}' not in expected_spws_set.")
        except (IndexError, ValueError) as e_parse:  # Errors from split indexing or strptime
            logger.debug(f"Could not parse filename or timestamp for {f_name} (format expected: YYYY-MM-DDTHH:MM:SS_sbXX.hdf5): {e_parse}")
            continue
        except Exception as e_gen:  # Anything else is unexpected; keep going but make it visible
            logger.warning(f"Unexpected error processing file {f_name}: {e_gen}")
            continue

    if not found_any_for_nominal_time:
        logger.warning(f"No HDF5 files found whose timestamps were within the {tolerance_sec}s tolerance window for nominal start time {nominal_start_time_str} ({nominal_dt_obj}).")

    collected_files_list = []
    missing_spws = []
    for spw_needed in sorted(expected_spws_set):
        candidates = sorted(files_for_this_chunk.get(spw_needed, []))
        if not candidates:
            missing_spws.append(spw_needed)
            continue
        if len(candidates) > 1:
            # Several files for one SPW fell inside the tolerance window; the
            # lexicographically first path is used, as before, but say so.
            logger.warning(f"{len(candidates)} files matched SPW '{spw_needed}' for nominal time {nominal_start_time_str}; using {os.path.basename(candidates[0])}")
        collected_files_list.append(candidates[0])

    if missing_spws:
        logger.error(f"Incomplete HDF5 set for nominal time {nominal_start_time_str}: Missing SPW(s): {', '.join(map(str, missing_spws))}")
        return None  # Return None if set is not complete

    # Reaching this point means every expected SPW contributed exactly one file.
    logger.info(f"Found complete set of {len(collected_files_list)} files for nominal start time {nominal_start_time_str}")
    return sorted(collected_files_list)

        
def get_obs_declination(config, hdf5_dir):
    """
    Read the observation declination from the metadata of one HDF5 file.

    The first (sorted) file in ``hdf5_dir`` matching ``20*_sb00.hdf5`` is
    opened with pyuvdata (metadata only) and its
    ``extra_keywords['phase_center_dec']`` value is converted from radians
    to degrees.

    Parameters
    ----------
    config : dict
        Accepted for signature compatibility; not read by this function.
    hdf5_dir : str
        Directory searched for ``20*_sb00.hdf5`` files.

    Returns
    -------
    float or None
        Declination in degrees, or ``None`` when pyuvdata is unavailable, no
        file is found, the keyword is missing, the value is not a valid
        declination, or reading fails.
    """
    if not pyuvdata_available:
        return None
    logging.info("Attempting to determine observation declination from HDF5 metadata...")
    metadata_file = None
    try:
        pattern = os.path.join(hdf5_dir, "20*_sb00.hdf5")
        # Sorted so the same file is picked on every run (glob order is arbitrary)
        hdf5_files = sorted(glob.glob(pattern))
        if not hdf5_files:
            raise FileNotFoundError(f"No '*_sb00.hdf5' files found in {hdf5_dir} to read metadata.")
        metadata_file = hdf5_files[0]
        uvd = UVData()
        logging.debug(f"Reading metadata from: {metadata_file}")
        uvd.read(metadata_file, file_type='uvh5', run_check=False, read_data=False)
        fixed_dec_rad = uvd.extra_keywords['phase_center_dec']

        # Declination is a signed angle. Wrapping into [0, 360) alone would
        # turn e.g. -10 deg into 350 deg, so fold the upper half back to
        # negative values. Values already in [0, 180] are left untouched.
        fixed_dec_deg = np.rad2deg(fixed_dec_rad) % 360.0
        if fixed_dec_deg > 180.0:
            fixed_dec_deg -= 360.0

        # Written as a negated range test so NaN is rejected as well
        if not (-90.0 <= fixed_dec_deg <= 90.0):
            logging.error(f"Metadata 'phase_center_dec' in {metadata_file} gives {fixed_dec_deg} degrees, outside [-90, 90]. Cannot determine Dec.")
            return None

        logging.info(f"Determined observation Declination: {fixed_dec_deg:.4f} degrees")
        return fixed_dec_deg
    except KeyError:
        logging.error(f"Metadata key 'phase_center_dec' not found in {metadata_file}. Cannot determine Dec.")
        return None
    except Exception as e:
        logging.error(f"Failed to read HDF5 metadata to determine Declination: {e}", exc_info=True)
        return None

def select_bcal_for_test(config, fixed_dec_deg, bcal_name_override=None):
    """
    Pick one bandpass calibrator (BPCAL) from the candidate catalog.

    The CSV at ``config['calibration']['bcal_candidate_catalog']`` is read,
    restricted to rows whose declination lies within
    ``bcal_search_beam_radius_deg`` (default 1.5) of ``fixed_dec_deg`` and
    whose ``flux_jy`` is numeric. Either the row named by
    ``bcal_name_override`` or, if no override is given, the brightest
    remaining row is selected.

    Parameters
    ----------
    config : dict
        Pipeline configuration; only the 'calibration' section is read.
    fixed_dec_deg : float
        Observation declination in degrees used to centre the Dec filter.
    bcal_name_override : str, optional
        Exact catalog 'name' to select instead of the brightest source.

    Returns
    -------
    dict or None
        Keys 'name', 'ra', 'dec', 'epoch', 'flux_jy', 'ref_freq_ghz',
        'spectral_index'; ``None`` on any failure (missing/empty catalog, no
        candidate in range, override not found, or an unexpected error).
    """
    logger = pipeline_utils.get_logger(__name__)
    cal_config = config['calibration']
    # Path to the *filtered* candidate list
    bcal_catalog_path = cal_config['bcal_candidate_catalog']  # Assumes path is resolved
    # NOTE(review): these limits are only quoted in the "empty catalog" message
    # below; no flux cut is applied here. Confirm whether one was intended.
    min_flux_jy = cal_config.get('bcal_min_flux_jy', 1.0)
    max_flux_jy = cal_config.get('bcal_max_flux_jy', 100.0)

    if not os.path.exists(bcal_catalog_path):
        logger.error(f"BPCAL candidate catalog not found: {bcal_catalog_path}")
        logger.error("Please run the catalog generation script first (e.g., filter_vla_catalog_for_bcal.py).")
        return None

    logger.info(f"Reading BPCAL candidates from: {bcal_catalog_path}")
    try:
        df = pd.read_csv(bcal_catalog_path, na_values=['None', 'NaN', ''])
        if df.empty:
            logger.error(f"BPCAL candidate file '{bcal_catalog_path}' is empty.")
            logger.error(f"Check filtering criteria (Dec={fixed_dec_deg:.2f}, Flux={min_flux_jy}-{max_flux_jy}Jy) or generate the file.")
            return None

        # In case the loaded file wasn't generated for this exact Dec, filter again.
        beam_radius_deg = cal_config.get('bcal_search_beam_radius_deg', 1.5)
        dec_min = fixed_dec_deg - beam_radius_deg
        dec_max = fixed_dec_deg + beam_radius_deg
        # Double quotes are stripped from dec_str before it is parsed as degrees
        df['dec_deg'] = df['dec_str'].apply(lambda x: Angle(x.replace('"', ''), unit=u.deg).deg if pd.notna(x) else np.nan)
        dec_mask = (df['dec_deg'] >= dec_min) & (df['dec_deg'] <= dec_max) & (df['dec_deg'].notna())
        df_filtered = df[dec_mask].copy()  # Use copy to avoid SettingWithCopyWarning later

        if df_filtered.empty:
            logger.error(f"No BPCAL candidates found in '{bcal_catalog_path}' within Dec range [{dec_min:.2f}, {dec_max:.2f}] deg.")
            return None

        # Ensure flux is numeric; rows whose flux cannot be parsed are dropped
        df_filtered['flux_num'] = pd.to_numeric(df_filtered['flux_jy'], errors='coerce')
        df_filtered = df_filtered.dropna(subset=['flux_num'])

        if df_filtered.empty:
            logger.error(f"No BPCAL candidates remain after converting flux to numeric.")
            return None

        # Select calibrator
        selected_cal = None
        if bcal_name_override:
            logger.info(f"Attempting to use specified BPCAL: {bcal_name_override}")
            selection = df_filtered[df_filtered['name'] == bcal_name_override]
            if not selection.empty:
                selected_cal = selection.iloc[0]
            else:
                logger.error(f"Specified BPCAL '{bcal_name_override}' not found in filtered list.")
                return None
        else:
            # Select the brightest one in the filtered list
            selected_cal = df_filtered.loc[df_filtered['flux_num'].idxmax()]
            logger.info(f"Selected brightest available BPCAL: {selected_cal['name']} (L-Flux: {selected_cal['flux_num']:.2f} Jy)")

        # .get() only supplies its default when the column is absent. A column
        # that exists but is blank for this row comes back as NaN (read_csv
        # maps ''/'None' to NaN above), so apply the default explicitly.
        epoch = selected_cal.get('epoch', 'J2000')
        if pd.isna(epoch):
            epoch = 'J2000'

        # Return info as a dictionary matching format needed by skymodel.create_calibrator_component_list
        cal_info = {
            'name': selected_cal['name'],
            'ra': selected_cal['ra_str'],
            'dec': selected_cal['dec_str'],
            'epoch': epoch,
            'flux_jy': selected_cal['flux_num'],
            'ref_freq_ghz': 1.4,  # Assume L-band flux reference
            'spectral_index': None  # Not available
        }
        return cal_info

    except Exception as e:
        logger.error(f"Failed to read or select from BPCAL catalog '{bcal_catalog_path}': {e}", exc_info=True)
        return None

def calculate_next_transit(bcal_info, telescope_loc):
    """
    Estimate the next meridian transit of the selected calibrator.

    The source's current hour angle (apparent local sidereal time minus its
    right ascension, wrapped into [-180, 180) degrees) is converted to a time
    offset with a fixed sidereal rotation rate. If the source has already
    transited, one sidereal rotation is added so the result is always the
    upcoming transit.

    Parameters
    ----------
    bcal_info : dict
        Needs 'ra' (parsed as hour angle), 'dec' (parsed as degrees) and
        'name' (used in log messages).
    telescope_loc : object
        Must expose ``.lon``, which is passed as the longitude for the
        sidereal-time calculation.

    Returns
    -------
    astropy.time.Time or None
        Next transit time, or ``None`` if any step fails.
    """
    logger = pipeline_utils.get_logger(__name__)
    try:
        # Use SkyCoord for robust parsing and calculations
        cal_coord = SkyCoord(ra=bcal_info['ra'], dec=bcal_info['dec'], unit=(u.hourangle, u.deg), frame='icrs')
        logger.debug(f"BPCAL Coordinate: {cal_coord.to_string('hmsdms')}")

        current_time_utc = Time.now()
        # Calculate LST at current time
        current_lst = current_time_utc.sidereal_time('apparent', longitude=telescope_loc.lon)
        # Hour angle of the source now, in [-180, 180) deg: negative = not yet transited
        current_ha = (current_lst - cal_coord.ra).wrap_at(180 * u.deg)

        # Work in plain floats (degrees, days) to keep the unit handling obvious.
        earth_rot_rate_deg_per_day = 360.9856  # sidereal rotation rate
        sidereal_day_in_days = 360.0 / earth_rot_rate_deg_per_day
        days_to_transit = -current_ha.to_value(u.deg) / earth_rot_rate_deg_per_day

        # A negative offset means the transit already happened, so move
        # forward by one sidereal day to get the *next* one. The previous
        # code added TimeDelta(1 jd, scale='tdb') * Quantity(day), which has
        # units of day**2 and a barycentric scale; that expression could not
        # be added to a UTC Time, so this branch always ended in the except.
        if days_to_transit < 0:
            days_to_transit += sidereal_day_in_days

        next_transit_time = current_time_utc + TimeDelta(days_to_transit * u.day)

        logger.info(f"Calculated next transit for {bcal_info['name']} at: {next_transit_time.iso}")
        return next_transit_time
    except Exception as e:
        logger.error(f"Failed to calculate transit time for {bcal_info['name']}: {e}", exc_info=True)
        return None

# Timestamp layouts accepted at the start of an HDF5 filename (the part before
# the first '_'). The colon-separated form is the one seen in this notebook's
# output (e.g. '2025-05-07T00:04:06_sb02.hdf5'); the compact form is the only
# one this function accepted before.
_HDF5_FILENAME_TIME_FORMATS = ("%Y-%m-%dT%H:%M:%S", "%Y%m%dT%H%M%S")


def _parse_hdf5_filename_time(f_name):
    """Return the datetime encoded before the first '_' of ``f_name``; raise ValueError if no known format matches."""
    ts_str = f_name.split('_')[0]
    for fmt in _HDF5_FILENAME_TIME_FORMATS:
        try:
            return datetime.strptime(ts_str, fmt)
        except ValueError:
            continue
    raise ValueError(f"timestamp '{ts_str}' matches none of {_HDF5_FILENAME_TIME_FORMATS}")


def find_hdf5_chunks_around_time(config, hdf5_dir, target_time):
    """
    Find the HDF5 sets for the chunk containing ``target_time`` and the chunk before it.

    Candidate chunk start times are taken from the ``20*_sb00.hdf5`` filenames
    in ``hdf5_dir``. The "transit" chunk is the one whose
    [start, start + ms_chunk_duration_min) window contains ``target_time``,
    or failing that the latest start at or before it. The "preceding" chunk
    is the previous start time in the sorted list. For both, one file per SPW
    in ``config['ms_creation']['spws']`` must exist within
    ``same_timestamp_tolerance`` seconds of the chunk start.

    Parameters
    ----------
    config : dict
        Reads ``config['services'].get('ms_chunk_duration_min', 5)``,
        ``config['ms_creation'].get('same_timestamp_tolerance', 30)`` and
        ``config['ms_creation']['spws']``.
    hdf5_dir : str
        Directory containing the HDF5 files.
    target_time : astropy.time.Time
        Time that must fall inside the transit chunk.

    Returns
    -------
    tuple
        ``(preceding_files, transit_files, preceding_start, transit_start)``
        where the file lists are ordered by sorted SPW name and the start
        times are ``Time`` objects; ``(None, None, None, None)`` on failure.
    """
    logger = pipeline_utils.get_logger(__name__)
    ms_chunk_mins = config['services'].get('ms_chunk_duration_min', 5)
    tolerance_sec = config['ms_creation'].get('same_timestamp_tolerance', 30)  # Use tolerance

    logger.info(f"Searching for HDF5 chunks around target transit time: {target_time.iso}")

    # Find all potential start times from filenames
    all_files = glob.glob(os.path.join(hdf5_dir, "20*_sb00.hdf5"))
    if not all_files:
        logger.error(f"No HDF5 files found in {hdf5_dir} matching pattern.")
        return None, None, None, None

    possible_start_times = []
    for f in all_files:
        try:
            start_dt = _parse_hdf5_filename_time(os.path.basename(f))
            possible_start_times.append(Time(start_dt, format='datetime', scale='utc'))
        except Exception as e:
            logger.warning(f"Could not parse time from {os.path.basename(f)}: {e}")
            continue

    if not possible_start_times:
        logger.error(f"No valid timestamps parsed from HDF5 files found in {hdf5_dir}.")
        return None, None, None, None

    possible_start_times = sorted(list(set(possible_start_times)))  # Unique sorted times
    logger.debug(f"Found {len(possible_start_times)} unique potential start times.")

    # Find the chunk whose [start, start + duration) window contains target_time
    transit_chunk_start_time = None
    for t_start in possible_start_times:
        t_end = t_start + timedelta(minutes=ms_chunk_mins)
        if t_start <= target_time < t_end:
            transit_chunk_start_time = t_start
            logger.info(f"Found transit chunk starting at: {transit_chunk_start_time.iso}")
            break

    # Handle case where target time is not exactly within a chunk (pick closest start time before it)
    if transit_chunk_start_time is None:
        times_before = [t for t in possible_start_times if t <= target_time]
        if not times_before:
            logger.error(f"No HDF5 chunks found starting at or before the target transit time {target_time.iso}.")
            return None, None, None, None
        transit_chunk_start_time = times_before[-1]  # Closest start time <= target time
        logger.warning(f"Target time {target_time.iso} not within a chunk's exact {ms_chunk_mins}min window. Selecting closest preceding chunk: {transit_chunk_start_time.iso}")

    # Find the preceding chunk
    transit_chunk_index = possible_start_times.index(transit_chunk_start_time)
    if transit_chunk_index == 0:
        logger.error("Cannot find a chunk preceding the transit chunk. Need at least two chunks.")
        return None, None, None, None
    preceding_chunk_start_time = possible_start_times[transit_chunk_index - 1]
    logger.info(f"Found preceding chunk starting at: {preceding_chunk_start_time.iso}")

    # Now find the actual complete file sets for these two timestamps using the tolerance
    ts1_dt = preceding_chunk_start_time.datetime
    ts2_dt = transit_chunk_start_time.datetime
    # Compact strings are used purely as internal dictionary keys
    time_format = "%Y%m%dT%H%M%S"
    ts1_str_exact = ts1_dt.strftime(time_format)
    ts2_str_exact = ts2_dt.strftime(time_format)

    files_by_approx_ts = defaultdict(list)
    for f_path in glob.glob(os.path.join(hdf5_dir, "20*.hdf5")):
        try:
            file_dt = _parse_hdf5_filename_time(os.path.basename(f_path))
        except Exception:
            continue  # Ignore files with bad names
        # Group files that are close in time to the target start times;
        # the preceding chunk takes priority if a file is close to both.
        if abs((file_dt - ts1_dt).total_seconds()) <= tolerance_sec:
            files_by_approx_ts[ts1_str_exact].append(f_path)
        elif abs((file_dt - ts2_dt).total_seconds()) <= tolerance_sec:
            files_by_approx_ts[ts2_str_exact].append(f_path)

    # Check completeness for the two target timestamps
    spws_to_include = set(config['ms_creation']['spws'])
    hdf5_sets = {}

    for ts_key in [ts1_str_exact, ts2_str_exact]:
        found_files_for_ts = {}
        # Sorted + setdefault: if several files share an SPW, the
        # lexicographically first path wins regardless of glob order.
        for f_path in sorted(files_by_approx_ts.get(ts_key, [])):
            try:
                f_name = os.path.basename(f_path)
                spw_str = f_name.split('_')[1].replace('.hdf5', '')
                base_spw = spw_str.split('spl')[0]
            except IndexError:
                continue
            if base_spw in spws_to_include:
                found_files_for_ts.setdefault(base_spw, f_path)
        if len(found_files_for_ts) == len(spws_to_include):
            logger.info(f"Found complete set for target time {ts_key}")
            hdf5_sets[ts_key] = [found_files_for_ts[spw] for spw in sorted(spws_to_include)]
        else:
            logger.error(f"Incomplete HDF5 set found for target time {ts_key} ({len(found_files_for_ts)}/{len(spws_to_include)} required SPWs).")
            return None, None, None, None

    return hdf5_sets[ts1_str_exact], hdf5_sets[ts2_str_exact], preceding_chunk_start_time, transit_chunk_start_time



# Emit a message through the root logger so the cell output confirms that the
# helper definitions above were executed.
logging.info("Helper functions defined.")

2025-05-12 08:08:36 [INFO ] [MainThread] [root] Helper functions defined.


In [7]:
# Parse the full pipeline configuration; a falsy result is treated as fatal.
config = config_parser.load_config(CONFIG_PATH)
if not config:
    raise ValueError("Failed to load configuration.")

# Force the HDF5 post-handling option to 'none' for this test run.
config['services']['hdf5_post_handle'] = 'none'
logging.info("Ensuring HDF5 post_handle is set to 'none' for this test run.")

# --- Stage 0: MANUAL HDF5 Chunk Selection by Nominal Start Time ---
logging.info("--- Stage 0: MANUAL HDF5 Chunk Selection by Nominal Start Time ---")

# Directory searched for the chunks (replace with a literal path to override).
HDF5_DIR_MANUAL = config['paths']['hdf5_incoming']

# == NOMINAL start times of the two 5-minute chunks to process ==
# The file timestamps need not match exactly: each nominal value is the centre
# of a search window whose half-width is `same_timestamp_tolerance` seconds, so
# choose it close to the real filename timestamps (e.g. files stamped
# 2025-05-07T00:04:06/07 for the first chunk).
ts1_manual_nominal_str = "20250507T000500"  # first chunk's nominal start time
ts2_manual_nominal_str = "20250507T001000"  # second chunk's nominal start time
# ================================================================

# Collect one file list per nominal time, in the same order as before.
hdf5_files_1, hdf5_files_2 = [
    collect_files_for_nominal_start_time(nominal_ts, HDF5_DIR_MANUAL, config)
    for nominal_ts in (ts1_manual_nominal_str, ts2_manual_nominal_str)
]


2025-05-12 08:08:38 [INFO ] [MainThread] [pipeline.config_parser] Loading configuration from: config/pipeline_config.yaml
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative path 'ms_stage1_dir': /data/jfaber/dsa110-contimg/pipeline/ms_stage1/
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative path 'cal_tables_dir': /data/jfaber/dsa110-contimg/pipeline/cal_tables/
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative path 'skymodels_dir': /data/jfaber/dsa110-contimg/pipeline/skymodels/
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative path 'images_dir': /data/jfaber/dsa110-contimg/pipeline/images/
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative path 'mosaics_dir': /data/jfaber/dsa110-contimg/pipeline/mosaics/
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative path 'photometry_dir': /data/jfaber/dsa110-contimg/pipeline/photometry/
2025-05-12 08:08:38 [DEBUG] [MainThread] [root] Resolved relative pa

In [8]:
# The 'ts1_str' and 'ts2_str' should be the nominal timestamps used for collection,
# as these are used for directory/file naming in subsequent pipeline stages.
ts1_str = ts1_manual_nominal_str
ts2_str = ts2_manual_nominal_str

if not hdf5_files_1 or not hdf5_files_2:
    raise RuntimeError("Manual HDF5 file selection failed for one or both nominal start times. Check logs and HDF5_DIR.")

logging.info(f"Manually selected HDF5 chunk 1 (Nominal Start: {ts1_str}): Files: {list(map(os.path.basename, hdf5_files_1))}")
logging.info(f"Manually selected HDF5 chunk 2 (Nominal Start: {ts2_str}): Files: {list(map(os.path.basename, hdf5_files_2))}")

# --- You still need to select a BPCAL for calibration/imaging metadata ---
# Determine observation declination (can still be automatic or you can hardcode it)
# It will read one of the files from your HDF5_DIR_MANUAL to get the declination
fixed_dec_deg = get_obs_declination(config, HDF5_DIR_MANUAL) 
if fixed_dec_deg is None: 
    logging.warning("Failed to get observation declination automatically. Using a default or you might need to set it manually.")
    fixed_dec_deg = 67.0 # Example default value, adjust as needed
config['calibration']['fixed_declination_deg'] = fixed_dec_deg
logging.info(f"Observation Declination set to: {fixed_dec_deg:.4f} degrees for this run.")

selected_bcal_info = select_bcal_for_test(config, fixed_dec_deg, BCAL_NAME_OVERRIDE) 
if selected_bcal_info is None: 
    logging.warning(f"Failed to select BPCAL for test. Subsequent steps might be affected.")
    # Optionally, provide a default BPCAL dictionary here if needed for the test to proceed
    # selected_bcal_info = {'name': '3C286', 'ra': '13h31m08.288s', 'dec': '+30d30m32.96s', 
    #                       'epoch': 'J2000', 'flux_jy': 14.79, 'ref_freq_ghz': 1.4}


# Store variables for the next cells
%store ts1_str 
%store ts2_str 
%store hdf5_files_1 
%store hdf5_files_2 
%store selected_bcal_info 
%store config

2025-05-12 08:08:46 [INFO ] [MainThread] [root] Manually selected HDF5 chunk 1 (Nominal Start: 20250507T000500): Files: ['2025-05-07T00:04:06_sb02.hdf5', '2025-05-07T00:04:06_sb10.hdf5', '2025-05-07T00:04:07_sb00.hdf5', '2025-05-07T00:04:07_sb01.hdf5', '2025-05-07T00:04:07_sb03.hdf5', '2025-05-07T00:04:07_sb04.hdf5', '2025-05-07T00:04:07_sb05.hdf5', '2025-05-07T00:04:07_sb06.hdf5', '2025-05-07T00:04:07_sb07.hdf5', '2025-05-07T00:04:07_sb08.hdf5', '2025-05-07T00:04:07_sb09.hdf5', '2025-05-07T00:04:07_sb11.hdf5', '2025-05-07T00:04:07_sb12.hdf5', '2025-05-07T00:04:07_sb13.hdf5', '2025-05-07T00:04:07_sb14.hdf5', '2025-05-07T00:04:07_sb15.hdf5']
2025-05-12 08:08:46 [INFO ] [MainThread] [root] Manually selected HDF5 chunk 2 (Nominal Start: 20250507T001000): Files: ['2025-05-07T00:09:16_sb00.hdf5', '2025-05-07T00:09:16_sb01.hdf5', '2025-05-07T00:09:16_sb02.hdf5', '2025-05-07T00:09:16_sb03.hdf5', '2025-05-07T00:09:16_sb04.hdf5', '2025-05-07T00:09:16_sb05.hdf5', '2025-05-07T00:09:16_sb07.hdf5',

In [9]:
# Notebook Cell: MS Creation
%store -r ts1_str ts2_str hdf5_files_1 hdf5_files_2 selected_bcal_info config # Load variables

logging.info("--- Stage 1: MS Creation ---")
ms_path_1 = ms_creation.process_hdf5_set(config, ts1_str, hdf5_files_1)
ms_path_2 = ms_creation.process_hdf5_set(config, ts2_str, hdf5_files_2)

if not ms_path_1 or not ms_path_2:
    raise RuntimeError("MS Creation failed for one or both chunks.")

logging.info(f"Created MS files: {os.path.basename(ms_path_1)}, {os.path.basename(ms_path_2)}")
ms_files_to_process = [ms_path_1, ms_path_2]

# Store paths for next cell
%store ms_files_to_process selected_bcal_info config

no stored variable or alias #
no stored variable or alias Load
no stored variable or alias variables
2025-05-12 08:08:55 [INFO ] [MainThread] [root] --- Stage 1: MS Creation ---
2025-05-12 08:08:55 [INFO ] [MainThread] [pipeline.ms_creation] MS_CREATION - PyUVData version: 3.2.1, Path: /data/jfaber/conda/envs/dsa_contimg/lib/python3.10/site-packages/pyuvdata/__init__.py
2025-05-12 08:08:55 [INFO ] [MainThread] [pipeline.ms_creation] Processing HDF5 set for timestamp: 20250507T000500
2025-05-12 08:08:55 [INFO ] [MainThread] [pipeline.ms_creation] Loading 16 HDF5 files for one time chunk...
2025-05-12 08:08:55 [INFO ] [MainThread] [pipeline.ms_creation] Attempting to read first file: /data/incoming/2025-05-07T00:04:06_sb02.hdf5
2025-05-12 08:08:56 [DEBUG] [MainThread] [pipeline.ms_creation] Successfully executed read command for /data/incoming/2025-05-07T00:04:06_sb02.hdf5
2025-05-12 08:08:56 [DEBUG] [MainThread] [pipeline.ms_creation] Original uvw_array dtype from /data/incoming/2025-05

The uvw_array does not match the expected values given the antenna positions. The largest discrepancy is 1253.2069264236213 meters. This is a fairly common situation but might indicate an error in the antenna positions, the uvws or the phasing.


2025-05-12 08:08:57 [DEBUG] [MainThread] [pipeline.ms_creation] Reading subsequent file 1: /data/incoming/2025-05-07T00:04:06_sb10.hdf5
2025-05-12 08:08:57 [DEBUG] [MainThread] [pipeline.ms_creation] Converting uvw_array for /data/incoming/2025-05-07T00:04:06_sb10.hdf5 to float64.
2025-05-12 08:08:57 [DEBUG] [MainThread] [pipeline.ms_creation] Successfully read and processed subsequent file /data/incoming/2025-05-07T00:04:06_sb10.hdf5
2025-05-12 08:08:57 [DEBUG] [MainThread] [pipeline.ms_creation] Reading subsequent file 2: /data/incoming/2025-05-07T00:04:07_sb00.hdf5
2025-05-12 08:08:58 [DEBUG] [MainThread] [pipeline.ms_creation] Converting uvw_array for /data/incoming/2025-05-07T00:04:07_sb00.hdf5 to float64.
2025-05-12 08:08:58 [DEBUG] [MainThread] [pipeline.ms_creation] Successfully read and processed subsequent file /data/incoming/2025-05-07T00:04:07_sb00.hdf5
2025-05-12 08:08:58 [DEBUG] [MainThread] [pipeline.ms_creation] Reading subsequent file 3: /data/incoming/2025-05-07T00:04

The uvw_array does not match the expected values given the antenna positions. The largest discrepancy is 1253.2069264236213 meters. This is a fairly common situation but might indicate an error in the antenna positions, the uvws or the phasing.


2025-05-12 08:09:31 [DEBUG] [MainThread] [pipeline.ms_creation] Final telescope antenna names in _load_uvh5_file: ['1' '2' '3' '4' '5' '6' '7' '8' '9' '10' '11' '12' '13' '14' '15' '16'
 '17' '18' '19' '20' '21' '22' '23' '24' '25' '26' '27' '28' '29' '30'
 '31' '32' '33' '34' '35' '36' '37' '38' '39' '40' '41' '42' '43' '44'
 '45' '46' '47' '48' '49' '50' '51' '52' '53' '54' '55' '56' '57' '58'
 '59' '60' '61' '62' '63' '64' '65' '66' '67' '68' '69' '70' '71' '72'
 '73' '74' '75' '76' '77' '78' '79' '80' '81' '82' '83' '84' '85' '86'
 '87' '88' '89' '90' '91' '92' '93' '94' '95' '96' '97' '98' '99' '100'
 '101' '102' '103' '104' '105' '106' '107' '108' '109' '110' '111' '112'
 '113' '114' '115' '116' '117']
2025-05-12 08:09:31 [INFO ] [MainThread] [pipeline.ms_creation] Finished loading data. Nbls: 4656, Ntimes: 24, Nfreqs: 768, Nants: 96
2025-05-12 08:09:31 [INFO ] [MainThread] [pipeline.ms_creation] Calculating and setting phase centers for drift scan.
2025-05-12 08:09:31 [INFO ] [M

UVData object contains a mix of baseline conjugation states, which is not uniformly supported in CASA -- forcing conjugation to be "ant2<ant1" on object.


: 

In [None]:
# Notebook Cell: Calibration and Imaging
%store -r ms_files_to_process selected_bcal_info config # Load variables

logging.info("--- Stage 2: Calibration and Imaging ---")
processed_images = []
processed_pbs = []
block_mask_path = None
template_image_path = None
gcal_table_path = None
cl_path_bcal = None
paths_config = config['paths'] # Get paths config

# 2a. Find latest BPCAL table
try:
    cal_tables_dir = paths_config['cal_tables_dir']
    bcal_files = sorted(glob.glob(os.path.join(cal_tables_dir, "*.bcal")))
    if not bcal_files: raise RuntimeError(f"No BPCAL tables (*.bcal) found in {cal_tables_dir}.")
    latest_bcal_table = bcal_files[-1]
    logging.info(f"Using BPCAL table: {os.path.basename(latest_bcal_table)}")
except Exception as e:
    logging.critical(f"Failed to find BPCAL table: {e}. Aborting test.")
    raise e # Stop execution

# 2b. Generate Calibrator Model & Gain Cal Table (using transit chunk only)
try:
    skymodels_dir = paths_config['skymodels_dir']
    cl_bcal_filename = f"bcal_sky_{selected_bcal_info['name']}_test.cl" # Add suffix
    cl_bcal_output_path = os.path.join(skymodels_dir, cl_bcal_filename)
    cl_path_bcal, _ = skymodel.create_calibrator_component_list(config, selected_bcal_info, cl_bcal_output_path)
    if not cl_path_bcal: raise RuntimeError("Failed to create BPCAL sky model.")

    # Use the second MS (transit chunk) for gain cal
    ms_path_transit = ms_files_to_process[1]
    ts_transit = os.path.basename(ms_path_transit).split('_')[1].replace('.ms', '')
    logging.info(f"Performing gain calibration on transit chunk: {os.path.basename(ms_path_transit)}")
    gcal_time_str = f"bcal_test_{ts_transit}"
    gcal_table_path = calibration.perform_gain_calibration(config, [ms_path_transit], cl_path_bcal, gcal_time_str, solint='inf')
    if not gcal_table_path: raise RuntimeError("Gain calibration on BPCAL failed.")
    logging.info(f"Gain table generated: {os.path.basename(gcal_table_path)}")
except Exception as e:
    logging.error(f"Failed during gain calibration setup stage: {e}", exc_info=True)
    logging.warning("Proceeding without gain calibration solutions.")
    gcal_table_path = []

# 2c. Prepare Mask (using BPCAL model, defer creation until template exists)
use_mask_config = config.get('imaging',{}).get('use_clean_mask', False)
mask_output_path = None
if use_mask_config and cl_path_bcal:
    mask_output_path = os.path.join(skymodels_dir, f"mask_bcal_test_{selected_bcal_info['name']}.mask")
    logging.info(f"Will attempt to create mask: {mask_output_path}")
else:
    logging.info("Masking disabled or BPCAL model missing, skipping mask.")
mask_created = False

# 2d. Loop through MS files
images_dir = paths_config['images_dir']
for i, ms_path in enumerate(ms_files_to_process):
    logging.info(f"Processing MS {i+1}/{len(ms_files_to_process)}: {os.path.basename(ms_path)}")
    ms_base = os.path.splitext(os.path.basename(ms_path))[0]
    image_base = os.path.join(images_dir, f"{ms_base}_test")

    try:
        if not calibration.flag_rfi(config, ms_path): raise RuntimeError("RFI Flagging failed.")
        if not calibration.flag_general(config, ms_path): raise RuntimeError("General Flagging failed.")

        gcal_list = [gcal_table_path] if gcal_table_path and isinstance(gcal_table_path, str) else []
        if not calibration.apply_calibration(config, ms_path, latest_bcal_table, gcal_list):
            raise RuntimeError("ApplyCal failed.")

        ms_to_image = ms_path
        current_mask_path = None
        if use_mask_config and mask_output_path:
            if not mask_created:
                if template_image_path:
                    logging.info(f"Creating block mask {mask_output_path} using template {template_image_path}")
                    if imaging.create_clean_mask(config, cl_path_bcal, template_image_path, mask_output_path):
                        mask_created = True
                    else: logging.warning("Failed to create mask. Proceeding without.")
                else: logging.debug("Template image not yet available for mask creation.")
            if mask_created: current_mask_path = mask_output_path

        logging.info("Running tclean...")
        tclean_image_basename = imaging.run_tclean(config, ms_to_image, image_base, cl_path=None, mask_path=current_mask_path)

        if tclean_image_basename:
            img_path = f"{tclean_image_basename}.image"
            pb_path = f"{tclean_image_basename}.pb"
            if os.path.exists(img_path) and os.path.exists(pb_path):
                processed_images.append(img_path); processed_pbs.append(pb_path)
                logging.info(f"Successfully imaged {ms_path}")
                if template_image_path is None: template_image_path = img_path
            else: raise RuntimeError(f"tclean image/pb missing for {tclean_image_basename}")
        else: raise RuntimeError("tclean failed.")

    except Exception as e_ms:
        logging.error(f"Failed processing MS {ms_path}: {e_ms}", exc_info=True)
        raise e_ms # Stop execution on failure

# Store results for next cell
%store processed_images processed_pbs config selected_bcal_info ts1_str ts2_str

In [None]:
# Notebook Cell: Mosaicking
%store -r processed_images processed_pbs config selected_bcal_info ts1_str ts2_str # Load variables

mosaic_img_path = None
if len(processed_images) == 2:
    logging.info("--- Stage 3: Mosaicking ---")
    # Use timestamps from original chunks for naming
    mosaic_basename = f"mosaic_test_{ts1_str}_{ts2_str}"
    try:
        mosaic_img_path, _ = mosaicking.create_mosaic(config, processed_images, processed_pbs, mosaic_basename)
        if not mosaic_img_path: raise RuntimeError("Mosaicking function returned None.")
        logging.info(f"Mosaic created: {mosaic_img_path}")
        # Store for next cell
        %store mosaic_img_path config selected_bcal_info ts1_str ts2_str
    except Exception as e_mosaic:
        logging.error(f"Mosaicking failed: {e_mosaic}", exc_info=True)
        raise e_mosaic # Stop execution
else:
    raise RuntimeError(f"Could not proceed to mosaicking: Only {len(processed_images)} images were created.")

In [None]:
# Notebook Cell: Photometry
%store -r mosaic_img_path config selected_bcal_info ts1_str ts2_str # Load variables

if mosaic_img_path:
    logging.info("--- Stage 4: Photometry ---")
    mosaic_fits_path = f"{os.path.splitext(mosaic_img_path)[0]}.linmos.fits"
    if not os.path.exists(mosaic_fits_path):
         logging.warning(f"Mosaic FITS {mosaic_fits_path} not found, attempting export...")
         mosaic_fits_path = imaging.export_image_to_fits(config, mosaic_img_path, suffix='.linmos')

    if mosaic_fits_path and os.path.exists(mosaic_fits_path):
        logging.info(f"Running photometry on mosaic: {mosaic_fits_path}")
        try:
            targets, references = photometry.identify_sources(config, mosaic_fits_path)
            # Convert to pandas DataFrames for easier handling below
            phot_targets_df = pd.DataFrame(targets) if targets is not None else pd.DataFrame()
            phot_references_df = pd.DataFrame(references) if references is not None else pd.DataFrame()

            # Add BPCAL to targets list if not already there
            if selected_bcal_info and selected_bcal_info['name'] not in phot_targets_df['name'].values:
                 try:
                      bcal_coord = SkyCoord(ra=selected_bcal_info['ra'], dec=selected_bcal_info['dec'], unit=(u.hourangle, u.deg), frame='icrs')
                      with fits.open(mosaic_fits_path) as hdul: wcs = WCS(hdul[0].header).celestial
                      xpix, ypix = wcs.world_to_pixel(bcal_coord)
                      # Create row ensuring necessary columns exist
                      bcal_row_data = {'name': selected_bcal_info['name'], 'source_id': selected_bcal_info['name'],
                                      'RAJ2000': selected_bcal_info['ra'], 'DEC_J2000': selected_bcal_info['dec'],
                                      'xpix': xpix, 'ypix': ypix}
                      for col in phot_targets_df.columns:
                           if col not in bcal_row_data: bcal_row_data[col] = np.nan
                      phot_targets_df = pd.concat([phot_targets_df, pd.DataFrame([bcal_row_data])], ignore_index=True)
                      logging.info(f"Added BPCAL {selected_bcal_info['name']} to target list for photometry.")
                 except Exception as e_add: logging.warning(f"Could not add BPCAL to target list: {e_add}")


            if not phot_targets_df.empty and not phot_references_df.empty:
                phot_table = photometry.perform_aperture_photometry(config, mosaic_fits_path, phot_targets_df, phot_references_df)
                if phot_table is not None:
                    rel_flux_table = photometry.calculate_relative_fluxes(config, phot_table) # Assumes returns DF
                    if rel_flux_table is not None:
                        logging.info("Photometry successful. Relative flux results:")
                        print("\n--- Relative Photometry Results ---")
                        # Display relevant columns using pandas display
                        display_cols = ['source_id', 'relative_flux', 'relative_flux_error', 'median_reference_flux', 'reference_source_ids']
                        # Ensure columns exist before displaying
                        display_cols = [col for col in display_cols if col in rel_flux_table.columns]
                        display(rel_flux_table[display_cols]) # Use IPython display

                        # Save to a test CSV
                        test_output_csv = os.path.join(config['paths']['photometry_dir'], f"test_photometry_{ts1_str}_{ts2_str}.csv")
                        rel_flux_table.to_csv(test_output_csv, index=False, float_format='%.4f', na_rep='NaN')
                        logging.info(f"Saved test photometry results to: {test_output_csv}")
                    else: logging.error("Relative flux calculation failed.")
                else: logging.error("Aperture photometry failed.")
            elif phot_targets_df.empty: logging.warning("No target sources identified/valid for photometry.")
            else: logging.error("Reference source identification failed or references missing.")
        except Exception as e_phot: logging.error(f"Photometry stage failed: {e_phot}", exc_info=True)
    else: logging.error(f"Mosaic FITS file missing: {mosaic_fits_path}. Cannot run photometry.")

logging.info("--- Notebook Test Run Finished ---")