# Downloading GCM data through Python

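This notebook uses `esgf-pyclient` to search the ESGF catalogue for CMIP6 files and the `wget` command-line tool to download them.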

## 1. Construct ESGF Search Query:

In [1]:
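# Run in a terminal before using this notebook:
# pip install esgf-pyclient
# pip install wget
# Note: the download step runs the `wget` command-line program, which has to be
# installed separately (e.g. through the system package manager).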

In [2]:
import os

# esgf-pyclient checks this variable; setting it suppresses the library's
# warning about searches that request every facet ("facets=*").
os.environ.update({'ESGF_PYCLIENT_NO_FACETS_STAR_WARNING': '1'})

In [3]:
# Show where the kernel is running: the relative paths configured below
# are resolved against this directory.
working_dir = os.getcwd()
print("Current working directory:", working_dir)

Current working directory: /home/zeus/Documents/UNIZAR/Projects/2025/ERA5_Record_Algorithm/Code/1-Data_Retrieval/1-3_GCMs


In [4]:
from pathlib import Path

# === CONFIGURATION ===
# Years of interest, as four-digit values. Defined first because the name of
# the URL list file is derived from them.
target_years = ['1960']

# Folder (relative to the working directory) that receives both the URL list
# and the downloaded NetCDF files.
download_folder = "cmip6_ta"

# Text file that stores the matched download URLs. The year tag is built from
# target_years so that changing the years cannot leave a stale "1960" in the
# file name; for the single year above the name is unchanged.
year_tag = "_".join(str(year) for year in target_years)
url_list_file = Path(download_folder) / f"cmip6_ta_{year_tag}_urls.txt"

In [5]:
# === STEP 1: Ensure the download folder exists ===
# Missing parent directories are created too; an already existing folder is not an error.
download_dir = Path(download_folder)
download_dir.mkdir(exist_ok=True, parents=True)

In [6]:
# === STEP 2: Connect to ESGF and search dataset ===
from pyesgf.search import SearchConnection

# 2.1. Open a connection to the ESGF search service.
# distrib=True is forwarded to the client; the recorded results list the same
# dataset on several data nodes.
conn = SearchConnection('https://esgf-node.llnl.gov/esg-search', distrib=True)

# 2.2. Constraints that pin down the dataset of interest
search_constraints = {
    'project': 'CMIP6',
    'source_id': 'EC-Earth3-AerChem',
    'experiment_id': 'historical',
    'variable_id': 'ta',
    'frequency': 'day',
    'table_id': 'day',
    'member_id': 'r1i1p1f1',
}

# Build the search context from the constraints, restricted to the latest
# dataset version and to an explicit facet list.
ctx = conn.new_context(
    facets='project,experiment_id,source_id,variable_id,table_id,member_id',
    latest=True,
    **search_constraints,
)

# 2.3. Run the query
results = ctx.search()

In [7]:
# Report how many datasets the search returned
print(f"Found {len(results)} datasets.")

# List the dataset IDs, one per line, but only when there is something to list
if len(results) > 0:
    print("\nDataset IDs:", *(result.dataset_id for result in results), sep="\n")

Found 5 datasets.

Dataset IDs:
CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.historical.r1i1p1f1.day.ta.gr.v20200624|esgf-data1.llnl.gov
CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.historical.r1i1p1f1.day.ta.gr.v20200624|esgf-data04.diasjp.net
CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.historical.r1i1p1f1.day.ta.gr.v20200624|esgf.ceda.ac.uk
CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.historical.r1i1p1f1.day.ta.gr.v20200624|esg-dn2.nsc.liu.se
CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.historical.r1i1p1f1.day.ta.gr.v20200624|esgf.nci.org.au


In [8]:
# Collapse replicas: the part of dataset_id before '|' is the dataset itself,
# the part after it is the data node hosting that copy.
unique_datasets = {}
for result in results:
    # setdefault only stores the first replica seen for each base ID
    unique_datasets.setdefault(result.dataset_id.partition('|')[0], result)

# From here on `results` holds exactly one replica per dataset
results = [*unique_datasets.values()]

print(f"Filtered to {len(results)} unique datasets.")

# Show which replica was kept for each dataset
if results:
    print("\nDataset IDs:")
    for result in results:
        print(result.dataset_id)

Filtered to 1 unique datasets.

Dataset IDs:
CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.historical.r1i1p1f1.day.ta.gr.v20200624|esgf-data1.llnl.gov


In [11]:
import re
import time

# === Step 4: Extract matching file URLs ===

# CMIP6 file names end in "_<start>-<end>.nc" (optionally with "-clim" before
# ".nc"), and each date stamp begins with a four-digit year.
_TIME_RANGE_RE = re.compile(r"_(\d{4})\d*-(\d{4})\d*(?:-clim)?\.nc$")


def _covers_any_year(filename, years):
    """Return True if the file's time range includes at least one of `years`.

    Parameters:
        filename: file name, e.g. "ta_day_..._19600101-19601231.nc".
        years: iterable of four-digit years (strings or ints).

    The start and end years are parsed from the trailing time range, so a
    multi-year file that spans a target year is selected, and a year that
    merely appears as a substring of a date (e.g. "2012" inside "19201231")
    is not. File names without a recognisable time range fall back to the
    plain substring test.
    """
    span = _TIME_RANGE_RE.search(filename)
    if span is None:
        return any(str(year) in filename for year in years)
    first_year, last_year = int(span.group(1)), int(span.group(2))
    return any(first_year <= int(year) <= last_year for year in years)


start_time = time.time()  # Start timer

download_urls = []

for dataset in results:
    # One file search per dataset
    files = dataset.file_context().search()
    for file_result in files:
        if _covers_any_year(file_result.filename, target_years):
            download_urls.append(file_result.download_url)

end_time = time.time()  # End timer
elapsed = end_time - start_time

# This cell only collects URLs; the actual download happens in a later step.
print(f"\nURL extraction step completed in {elapsed:.2f} seconds.")


Download step completed in 6.77 seconds.


In [12]:
# Display the collected download URLs as a plain Python list
print(str(download_urls))

['https://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/EC-Earth-Consortium/EC-Earth3-AerChem/historical/r1i1p1f1/day/ta/gr/v20200624/ta_day_EC-Earth3-AerChem_historical_r1i1p1f1_gr_19600101-19601231.nc']


In [13]:
# === Step 5: Save URLs to file ===
# One URL per line; opening with "w" overwrites any list from a previous run.
with open(url_list_file, "w") as url_file:
    url_file.writelines(url + "\n" for url in download_urls)

print(f"\nSaved {len(download_urls)} URL(s) to {url_list_file}")


Saved 1 URL(s) to cmip6_ta/cmip6_ta_1960_urls.txt


In [14]:
# === Step 6: Download files with wget ===
import subprocess
import time

start_time = time.time()  # Start timer

# URLs whose download did not finish successfully
failed_urls = []

for url in download_urls:
    print(f"Downloading: {url}")
    # Pass the arguments as a list so that neither the folder name nor the
    # remote-supplied URL is ever interpreted by a shell.
    # -q: quiet, -c: resume partial downloads, -P: target directory.
    # NOTE(review): --no-check-certificate disables TLS certificate
    # verification. It is kept to preserve the existing behaviour; drop it if
    # the data node's certificate validates.
    wget_cmd = [
        "wget", "-q", "--no-check-certificate", "-c",
        "-P", str(download_folder),
        url,
    ]
    try:
        exit_code = subprocess.run(wget_cmd, check=False).returncode
    except FileNotFoundError:
        raise RuntimeError(
            "The 'wget' command-line program was not found on PATH; "
            "install it before running the download step."
        ) from None
    if exit_code != 0:
        # Keep going with the remaining files, but do not hide the failure
        failed_urls.append(url)
        print(f"  wget exited with status {exit_code}")

end_time = time.time()  # End timer
elapsed = end_time - start_time

print(f"\nDownload step completed in {elapsed:.2f} seconds.")

if failed_urls:
    print(f"{len(failed_urls)} of {len(download_urls)} download(s) failed:")
    for url in failed_urls:
        print(f"  {url}")


Downloading: https://esgf-data1.llnl.gov/thredds/fileServer/css03_data/CMIP6/CMIP/EC-Earth-Consortium/EC-Earth3-AerChem/historical/r1i1p1f1/day/ta/gr/v20200624/ta_day_EC-Earth3-AerChem_historical_r1i1p1f1_gr_19600101-19601231.nc

Download step completed in 217.84 seconds.
