In [6]:
import os
import re
import shutil
from glob import glob
from pathlib import Path

import pandas as pd

In [2]:
def extract_dates_from_file(file_path):
    """
    Extract the reference and secondary dates from the input text file.

    The file is scanned for burst identifiers of the form
    ``S1_<digits>_IW<digits>_<YYYYMMDD>T...``. The reference date comes from the
    first identifier found anywhere in the file; the secondary date comes from the
    first identifier found after the first ``--secondary`` marker.

    Parameters:
    - file_path: Path to the input text file.

    Returns:
    - A tuple containing the reference date and secondary date as strings (YYYYMMDD).

    Raises:
    - ValueError: If the ``--secondary`` marker is missing, or if no burst
      identifier can be found for the reference or the secondary date.
    """
    # Captures the 8-digit date that precedes the "T" of the burst timestamp.
    burst_date_pattern = re.compile(r"S1_\d+_IW\d+_(\d{8})T")

    with open(file_path, "r") as file:
        content = file.read()

    # partition() never raises: a missing marker yields an empty separator, which
    # is reported as the documented ValueError instead of an IndexError from
    # indexing the result of split().
    _, marker, after_secondary = content.partition("--secondary")
    if not marker:
        raise ValueError("Could not extract reference or secondary date from the file.")

    ref_date_match = burst_date_pattern.search(content)
    sec_date_match = burst_date_pattern.search(after_secondary)

    if not ref_date_match or not sec_date_match:
        raise ValueError("Could not extract reference or secondary date from the file.")

    return ref_date_match.group(1), sec_date_match.group(1)


# -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --


def search_safe_files(storage_dir, ref_date, sec_date):
    """
    Search for SAFE files corresponding to the reference and secondary dates in the storage directory.

    Every entry of ``storage_dir`` is tested by substring match against each date.
    Entries are visited in sorted order, and when several entries contain the same
    date the last one in that order is returned.

    Parameters:
    - storage_dir: Directory where SAFE files are stored.
    - ref_date: Reference date as string (YYYYMMDD).
    - sec_date: Secondary date as string (YYYYMMDD).

    Returns:
    - A tuple containing the paths to the reference and secondary SAFE files.

    Raises:
    - FileNotFoundError: If no entry matches the reference date or no entry
      matches the secondary date.
    """
    ref_safe_file = None
    sec_safe_file = None

    # os.listdir() returns entries in arbitrary order; sorting makes the choice
    # among multiple matching entries reproducible.
    for safe_file in sorted(os.listdir(storage_dir)):
        # The two checks are independent on purpose: with an if/elif chain an entry
        # that matches the reference date could never also serve as the secondary
        # (e.g. when both dates are equal or one name contains both dates).
        if ref_date in safe_file:
            ref_safe_file = safe_file
        if sec_date in safe_file:
            sec_safe_file = safe_file

    if not ref_safe_file or not sec_safe_file:
        raise FileNotFoundError("Could not find both reference and secondary SAFE files.")

    return Path(os.path.join(storage_dir, ref_safe_file)), Path(os.path.join(storage_dir, sec_safe_file))

In [4]:
# Example usage: collect the SAFE files referenced by every pair script.

# Folder holding the input text files (replace with your input file path).
script_fld = r"K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\scripts"
# script_fld = "/mnt/hgfs/SharedFolder/ISCE_Mintpy/temp2/"  # alternative input folder

# Directory containing SAFE files.
storage_dir = r"K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\DOWNLOAD_SAFE"

# Gather the VV input files one year at a time (2014 through 2024); each
# year's matches are sorted before being appended.
files2process = []
for select_year in range(2014, 2025):
    year_pattern = os.path.join(script_fld, f"*_{select_year}*_*_VV*.txt")
    files2process += sorted(glob(year_pattern))

# Resolve each input file to its reference/secondary SAFE paths. A file that
# fails is reported (path, then error) and skipped so the rest still run.
cache = []
for select_fpath in files2process:
    try:
        ref_date, sec_date = extract_dates_from_file(select_fpath)
        ref_safe_file, sec_safe_file = search_safe_files(storage_dir, ref_date, sec_date)
        cache += [str(ref_safe_file), str(sec_safe_file)]
    except Exception as e:
        print(select_fpath)
        print(e)

# Drop duplicates (a SAFE file can be collected from several input files) and sort.
cache = sorted(set(cache))

K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\scripts\S1AA_20240901_20241019_VVP048.txt
Could not find both reference and secondary SAFE files.
K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\scripts\S1AA_20240913_20241019_VVP036.txt
Could not find both reference and secondary SAFE files.
K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\scripts\S1AA_20240925_20241019_VVP024.txt
Could not find both reference and secondary SAFE files.
K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\scripts\S1AA_20241007_20241019_VVP012.txt
Could not find both reference and secondary SAFE files.
K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\CRFP_desc\scripts\S1AA_20241019_20241031_VVP012.txt
Could not find both reference and secondary SAFE files.


In [9]:
# Load the burst download list and collect the distinct values of its "date" column.
burst_csv_path = r"K:\PROJECT\001_CHOUSHUI_RIVER_BASIN\200_CRFP_S1A_HYP3\burst_to_download_20241111.csv"
df = pd.read_csv(burst_csv_path)

date_values = df["date"].tolist()
unique_date = sorted(set(date_values))

# Preview the first rows.
df.head(5)

Unnamed: 0,date,burst
0,20230227,S1_224317_IW2_20230227T215252_VV_6C09-BURST
1,20230227,S1_224318_IW2_20230227T215255_VV_9F75-BURST
2,20230227,S1_224318_IW3_20230227T215256_VV_9F75-BURST
3,20230227,S1_224319_IW2_20230227T215257_VV_9F75-BURST
4,20230227,S1_224319_IW3_20230227T215258_VV_9F75-BURST


In [15]:
# Distinct date strings taken from the names of the *.SAFE entries in storage_dir.
# NOTE(review): assumes the sixth "_"-separated field of every name is a
# "<date>T<time>" timestamp; a name with fewer fields raises IndexError — confirm.
safe_paths = glob(os.path.join(storage_dir, "*.SAFE"))
downloaded_files = sorted({os.path.basename(safe_path).split("_")[5].split("T")[0] for safe_path in safe_paths})
downloaded_files[:3]

['20141105', '20141129', '20141223']

In [4]:
# Number of entries in `cache`, i.e. the de-duplicated SAFE paths collected so far.
len(cache)

72

In [5]:
# Write the base name of every path in `cache` to a text file, one per line.
# NOTE(review): the file name says 2014_2018, but the content is whatever `cache`
# currently holds — confirm the name matches the years that were processed.
# with open(f"SAFE_files_to_process_{select_year}.txt", "w") as file:
with open("SAFE_files_to_process_2014_2018.txt", "w") as file:
    file.writelines(f"{os.path.basename(fpath)}\n" for fpath in cache)