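# Full working version: build an rclone filter list from CSV observation dates, then download the matching MSWX files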

In [None]:
import os
import subprocess
import csv
from datetime import datetime, timedelta
from tqdm import tqdm


# Accepted date layouts, tried in order; the first layout that parses wins.
# Because month-first layouts precede day-first ones, an ambiguous value such
# as "3/2/2022" is read as March 2nd; the day-first layouts only match when
# the leading number cannot be a month (> 12).
_OBSERVATION_DATE_FORMATS = (
    "%Y-%m-%d",      # 2022-03-02
    "%m/%d/%Y",      # 2/3/2022
    "%d/%m/%Y",      # 3/2/2022
    "%Y/%m/%d",      # 2022/03/02
    "%m-%d-%Y",      # 2-3-2022
    "%d-%m-%Y",      # 3-2-2022
)


def _parse_observation_date(date_str):
    """Return ``date_str`` parsed as a datetime, or None if no known layout matches."""
    for fmt in _OBSERVATION_DATE_FORMATS:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return None


def get_unique_dates_from_csv(csv_file_path, limit_rows=None):
    """Extract the unique observation dates found in a CSV file.

    Reads the ``observed_on`` column, keeps only the part before the first
    space (dropping any time-of-day suffix) and parses it with the layouts in
    ``_OBSERVATION_DATE_FORMATS``. Rows with a missing or empty value are
    skipped silently; values that match no layout are reported on stdout and
    skipped.

    Args:
        csv_file_path: Path of the CSV file; it must have a header row.
        limit_rows: If given, only the first ``limit_rows`` data rows are read.

    Returns:
        A sorted list of unique ``datetime`` objects (midnight of each date).
    """
    unique_dates = set()

    # newline='' is what the csv module expects so that newlines embedded in
    # quoted fields are handed to the parser untouched.
    with open(csv_file_path, 'r', newline='') as csvfile:
        # Count physical lines (minus the header) on the same handle, purely to
        # size the progress bar; this avoids opening a second, unclosed file.
        total_rows = max(sum(1 for _ in csvfile) - 1, 0)
        csvfile.seek(0)

        if limit_rows is None:
            progress_total = total_rows
        else:
            # Never advertise more rows than the file actually holds.
            progress_total = min(max(limit_rows, 0), total_rows)

        reader = csv.DictReader(csvfile)
        for i, row in enumerate(tqdm(reader, total=progress_total, desc="Reading CSV")):
            if limit_rows is not None and i >= limit_rows:
                break

            raw_value = row.get('observed_on')
            if not raw_value:
                continue

            # Tolerate surrounding whitespace, then drop any time component.
            date_str = raw_value.strip().split(" ")[0]
            date_obj = _parse_observation_date(date_str)

            if date_obj is None:
                print(f"⚠️ Could not parse date: {raw_value}")
                continue
            unique_dates.add(date_obj)

    return sorted(unique_dates)


def generate_file_structure_from_csv(csv_file_path, output_file_path, limit_rows=None,
                                     window_days=15, variables=None):
    """Write an rclone filter file listing the daily files needed for a CSV.

    For every unique observation date in the CSV, the window covers the
    observation day itself plus the ``window_days - 1`` days before it. For
    each day in the window and each variable, two include rules are emitted,
    one under ``/Past/`` and one under ``/NRT/``, named by year and
    day-of-year (``%Y%j``). A final ``- *`` rule excludes everything else.

    NOTE(review): the rules use the spelling ``/Past/``; rclone filters match
    case-sensitively unless told otherwise, so confirm this matches the
    folder name on the remote.

    Args:
        csv_file_path: CSV file with an ``observed_on`` column.
        output_file_path: Where the filter rules are written (overwritten).
        limit_rows: If given, only the first ``limit_rows`` CSV rows are used.
        window_days: Number of days per observation, counting the observation
            day itself. Defaults to 15.
        variables: Iterable of variable folder names. Defaults to the eight
            variables this notebook has always requested.

    Returns:
        The list of written rules: the sorted include rules followed by the
        trailing ``- *`` exclude rule.
    """
    if variables is None:
        variables = ['Wind', 'P', 'Pres', 'RelHum', 'SpecHum', 'Tmin', 'Tmax', 'Temp']
    else:
        # Materialise once so a generator argument survives the nested loops.
        variables = list(variables)

    unique_dates = get_unique_dates_from_csv(csv_file_path, limit_rows=limit_rows)
    print(f"📅 Found {len(unique_dates)} unique observation dates")

    # A set de-duplicates the overlapping windows of nearby observation dates.
    all_files = set()

    for obs_date in tqdm(unique_dates, desc="Building file list"):
        for offset in range(window_days):
            target_date = obs_date - timedelta(days=offset)
            date_string = target_date.strftime("%Y%j")

            for variable in variables:
                all_files.add(f"+ /Past/{variable}/Daily/{date_string}.nc")
                all_files.add(f"+ /NRT/{variable}/Daily/{date_string}.nc")

    file_list = sorted(all_files)
    file_list.append("- *")  # exclude everything else; must stay the last rule

    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.writelines(line + '\n' for line in file_list)

    print(f"📝 Generated {len(file_list)-1} unique file entries")
    print(f"✅ File structure saved to: {output_file_path}")
    return file_list


def run_rclone_sync_fixed(filter_file_path, dest_folder="climate_data",
                          rclone_path=None, remote="google:/MSWX_V100"):
    """Run a single ``rclone sync`` restricted by a filter file.

    One invocation covers both the Past and NRT trees because the filter
    rules are rooted at ``remote``. Failures are reported on stdout rather
    than raised.

    Args:
        filter_file_path: File passed to rclone's ``--filter-from``.
        dest_folder: Local destination directory of the sync.
        rclone_path: Filesystem path of the rclone executable. Defaults to
            ``../docker/rclone.exe`` relative to the current working
            directory.
        remote: rclone source (remote name plus root folder). Defaults to
            ``google:/MSWX_V100``.

    Returns:
        None in every case, whether the sync succeeded, failed, or was not
        attempted because the executable was missing.
    """
    if rclone_path is None:
        rclone_path = os.path.join("..", "docker", "rclone.exe")

    if not os.path.exists(rclone_path):
        print(f"❌ Error: rclone not found at {rclone_path}")
        return

    command = [
        rclone_path, "sync", "-v",
        "--filter-from", filter_file_path,
        "--drive-shared-with-me",
        remote,  # root folder the filter rules are relative to
        dest_folder
    ]

    print(f"▶️ Running command: {' '.join(command)}")
    try:
        # Argument list (no shell), so paths with spaces need no quoting.
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Error running rclone: {e}")
    except FileNotFoundError:
        # Covers the executable disappearing between the check and the call.
        print("❌ Error: rclone executable not found.")
    else:
        print("✅ rclone sync command executed successfully.")
        print(f"📂 Climate data downloaded to: {dest_folder}")

def download_climate_data_from_csv_fixed(csv_file_path, dest_folder="..//climate_data", limit_rows=None):
    """Build the rclone filter file for a CSV, then sync the listed files.

    The filter rules are written to ``climate_files_from_csv.txt`` in the
    current working directory, and that path is returned once the sync step
    has been attempted.
    """
    rules_path = "climate_files_from_csv.txt"

    # Stage 1: derive the include/exclude rules from the observation dates.
    print("Step 1: Generating file structure from CSV...")
    generate_file_structure_from_csv(
        csv_file_path,
        rules_path,
        limit_rows=limit_rows,
    )

    # Stage 2: hand the rules to rclone, which does the actual download.
    print("Step 2: Downloading climate data files...")
    run_rclone_sync_fixed(filter_file_path=rules_path, dest_folder=dest_folder)

    return rules_path


# Entry point: run the full workflow (filter-file generation + rclone sync)
# for the negative-samples CSV, using the default destination folder.
if __name__ == "__main__":
    # Path is relative to the current working directory.
    csv_file_path = "..//data/negative_samples_within_land_10k_with_coords_topography.csv"
    download_climate_data_from_csv_fixed(csv_file_path)


Step 1: Generating file structure from CSV...


Reading CSV: 100%|██████████| 5403/5403 [00:00<00:00, 173094.09it/s]


📅 Found 2744 unique observation dates


Building file list: 100%|██████████| 2744/2744 [00:00<00:00, 20204.64it/s]


📝 Generated 140032 unique file entries
✅ File structure saved to: climate_files_from_csv.txt
Step 2: Downloading climate data files...
▶️ Running command: ..\docker\rclone.exe sync -v --filter-from climate_files_from_csv.txt --drive-shared-with-me google:/MSWX_V100 ..//climate_data
