# Goals

Create a file that loads the data onto my local device, so that I do not have to interact with the temperamental SPARCL client.


# Imports

In [4]:
import os
import pandas as pd
from sparcl.client import SparclClient
from dl import queryClient as qc, authClient as ac

In [5]:
# Local working directory and the CSV file inside it that is used as the on-disk cache.
DATA_DIR = '/Users/elicox/Desktop/Mac/Work/Yr4 Work/Project/CNN-auto/'
CSV_PATH = os.path.join(DATA_DIR, 'spectra_data.csv')
# Module-level SPARCL client.
# NOTE(review): constructing it appears to contact the SPARCL server (the recorded output of
# this cell is a connection timeout), so this line can fail when offline — confirm.
client = SparclClient()

ServerConnectionError: [SRVCONER] Could not connect to https://astrosparcl.datalab.noirlab.edu/sparc/version/. HTTPSConnectionPool(host='astrosparcl.datalab.noirlab.edu', port=443): Max retries exceeded with url: /sparc/version/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x15f4c0c70>, 'Connection to astrosparcl.datalab.noirlab.edu timed out. (connect timeout=1.1)'))Did you enable VPN?

In [6]:
import os
import pandas as pd
from sparcl.client import SparclClient
from tqdm import tqdm
import time

# File paths: local working directory and the CSV file inside it used as the on-disk cache.
DATA_DIR = '/Users/elicox/Desktop/Mac/Work/Yr4 Work/Project/CNN-auto/'
CSV_PATH = os.path.join(DATA_DIR, 'spectra_data.csv')
# Create the directory up front (no error if it already exists) so the CSV can be written later.
os.makedirs(DATA_DIR, exist_ok=True)

# Initialize SPARCL client (module-level; the functions below read this global).
# NOTE(review): construction presumably needs network access to the SPARCL server — confirm.
client = SparclClient()

def query_and_save_sparcl_data(csv_path):
    """
    Query the DESI EDR ``zpix`` redshift catalogue and cache the result as CSV.

    The SQL selects primary galaxy entries with more than two coadded spectra,
    ``z <= 0.5`` and ``zwarn = 0`` from ``desi_edr.zpix``.

    Parameters
    ----------
    csv_path : str
        Destination path of the CSV file (written without the index column).

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` if the query or the save failed (the
        error is printed rather than raised, so callers must check for None).
    """
    # SQL queries go through the Astro Data Lab query client; the SPARCL
    # client only serves spectra and is not used for catalogue SQL.
    # Imported locally so this function does not depend on another cell
    # having imported `qc` first.
    from dl import queryClient as qc

    query = """
    SELECT zp.targetid, zp.z, zp.zwarn, zp.coadd_fiberstatus, zp.spectype, 
           zp.mean_fiber_ra, zp.mean_fiber_dec
    FROM desi_edr.zpix AS zp
    WHERE zp.zcat_primary = 't'
      AND zp.zcat_nspec > 2
      AND zp.z <= 0.5
      AND zp.spectype = 'GALAXY'
      AND zp.zwarn = '0'
    """
    try:
        print("Querying SPARCL database...")
        # fmt='table' returns a table object that is converted to pandas below.
        zpix_cat = qc.query(sql=query, fmt='table')
        df = zpix_cat.to_pandas()
        print(f"Retrieved {len(df)} records.")
        df.to_csv(csv_path, index=False)
        print(f"Data saved to {csv_path}.")
        return df
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Error querying SPARCL: {e}")
        return None

def load_or_query_data(csv_path):
    """
    Return the catalogue as a DataFrame, preferring the on-disk CSV cache.

    When ``csv_path`` exists it is read with pandas and returned. Otherwise the
    work is delegated to ``query_and_save_sparcl_data`` and its result (which
    may be ``None`` on failure) is returned unchanged.
    """
    cache_missing = not os.path.exists(csv_path)
    if cache_missing:
        # Nothing cached yet: run the remote query, which also writes the CSV.
        return query_and_save_sparcl_data(csv_path)

    print(f"Loading data from {csv_path}...")
    cached_frame = pd.read_csv(csv_path)
    return cached_frame

# Load the catalogue from the CSV cache, or query it and create the cache if the file is absent.
zpix_cat = load_or_query_data(CSV_PATH)
# load_or_query_data returns None when the query fails; an empty frame is equally unusable,
# so abort here instead of continuing with no data.
if zpix_cat is None or zpix_cat.empty:
    raise RuntimeError("Failed to retrieve or load data.")


KeyboardInterrupt: 

In [None]:
def retrieve_flux_with_retry(targetid, retries=5, delay=3):
    """
    Retrieve flux, wavelength and error for one target, retrying on failure.

    The target ID is passed to ``client.retrieve_by_specid`` as the spectrum ID
    for the ``DESI-EDR`` dataset, and the first returned record flagged
    ``specprimary`` is used. The error array is derived from the inverse
    variance as ``1 / (sqrt(ivar) + 1e-10)``.

    Parameters
    ----------
    targetid : int
        Identifier of the target whose spectrum is requested.
    retries : int
        Maximum number of attempts.
    delay : float
        Base wait in seconds; the wait after failed attempt ``k`` (0-based) is
        ``delay * 2**k``.

    Returns
    -------
    tuple
        ``(flux, wavelength, error)`` on success, or ``(None, None, None)`` if
        every attempt failed or no primary spectrum was returned.
    """
    # 'specprimary' has to be requested explicitly: the filter below reads it,
    # and a field that is not included is not available on the records.
    inc = ['specid', 'flux', 'wavelength', 'ivar', 'specprimary']
    for attempt in range(retries):
        try:
            result = client.retrieve_by_specid([targetid], include=inc, dataset_list=['DESI-EDR'])
            record = next((rec for rec in result.records if rec['specprimary']), None)
            if record is None:
                # The request succeeded but there is no primary spectrum;
                # repeating the same request would not change that.
                print(f"No primary spectrum found for target {targetid}.")
                return None, None, None
            flux, wavelength, ivar = record['flux'], record['wavelength'], record['ivar']
            # The small constant keeps the division finite where ivar is 0.
            error = 1 / (ivar ** 0.5 + 1e-10)
            return flux, wavelength, error
        except Exception as e:
            print(f"Retry {attempt + 1}/{retries} failed for target {targetid}: {e}")
            # Exponential backoff, but do not wait after the final attempt:
            # there is nothing left to retry.
            if attempt < retries - 1:
                time.sleep(delay * (2 ** attempt))
    return None, None, None

def process_flux_data(zpix_cat, batch_size=20):
    """
    Retrieve flux, wavelength and error arrays for every target in the catalogue.

    Nothing is written to disk; the results are only returned.

    Parameters
    ----------
    zpix_cat : pandas.DataFrame
        Catalogue with a ``targetid`` column.
    batch_size : int
        Number of targets per progress-bar step. Targets are still requested
        one at a time, so this only affects progress reporting.

    Returns
    -------
    tuple of lists
        ``(all_fluxes, all_wavelengths, all_errors)``. Targets whose retrieval
        failed are skipped, so the lists can be shorter than ``zpix_cat`` and
        are not aligned with its rows.
    """
    all_fluxes, all_wavelengths, all_errors = [], [], []
    # Iterate the ID column directly instead of using iterrows(): iterrows()
    # builds one Series per row and does not preserve dtypes, which on an
    # all-numeric frame would turn 64-bit integer IDs into floats.
    target_ids = zpix_cat['targetid']
    for start in tqdm(range(0, len(zpix_cat), batch_size), desc="Retrieving flux data"):
        for targetid in target_ids.iloc[start:start + batch_size]:
            flux, wavelength, error = retrieve_flux_with_retry(targetid)
            if flux is not None:
                all_fluxes.append(flux)
                all_wavelengths.append(wavelength)
                all_errors.append(error)
    return all_fluxes, all_wavelengths, all_errors


In [None]:
# Load data (skip SPARCL query on re-runs)
zpix_cat = load_or_query_data(CSV_PATH)
# load_or_query_data returns None when the query fails; stop with a clear error
# here rather than failing later inside process_flux_data.
if zpix_cat is None or zpix_cat.empty:
    raise RuntimeError("Failed to retrieve or load data.")

# Retrieve fluxes and related arrays for every target in the catalogue
fluxes, wavelengths, errors = process_flux_data(zpix_cat)
