In [2]:
import os
import numpy as np
import xarray as xr
import pandas as pd
import pyesgf as pyesgf
from pyesgf.search import SearchConnection
os.environ["ESGF_PYCLIENT_NO_FACETS_STAR_WARNING"] = "on"
from geopy.geocoders import Nominatim
from pyesgf.search import SearchConnection
from dask.diagnostics import ProgressBar
from utils import *

# Working directory of the notebook; the data folders below are created inside it.
wd = os.getcwd()
print(wd)

# Output folders for the per-model files and for the reference files.
models_dir = os.path.join(wd, 'data/models')
reference_dir = os.path.join(wd, 'data/reference')

# exist_ok=True creates the folder (and any missing parents) and does nothing
# when it is already there, so the cell is safe to re-run and there is no gap
# between an "exists?" check and the creation.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(reference_dir, exist_ok=True)

/home/minh/Documents/CI_2023


In [7]:
# City to extract data for; the same string is reused in the output file names.
city = 'Nairobi'
# get_coords is not defined in this notebook — presumably it is provided by
# `from utils import *`. Its result is unpacked as (latitude, longitude).
# NOTE(review): looks like it geocodes the city name (geopy's Nominatim is
# imported above) and prints the resolved location — confirm in utils.
latitude, longitude = get_coords(city)

Location, (lat, lon):  Nairobi, Kenya (-1.3026148499999999, 36.82884201813725)


In [8]:
# %%
# Reference data: download any of the W5E5 reanalysis .nc datasets from
# https://data.isimip.org/datasets/96369b63-4fbf-4b90-8b58-79e5f50a385a/
# and place it in the current working directory before running this cell.
W5E5 = (
    # Every .nc file found directly in the working directory is opened as one dataset.
    xr.open_mfdataset(os.path.join(wd, '*.nc'), engine='netcdf4')
    # Keep only the grid cell closest to the selected city.
    .sel(lat=latitude, lon=longitude, method='nearest')
    # Put the time axis on the "noleap" calendar.
    .convert_calendar("noleap")
)

In [10]:
# --- ESGF search facets ------------------------------------------------------
# Comma-separated values are passed to the search context as given.
project = 'CMIP6'
models = 'GFDL-ESM4,IPSL-CM6A-LR,MPI-ESM1-2-HR,MRI-ESM2-0'#,UKESM1-0-LL'
variable_id = 'tas'
table_id = 'day'
experiment_id = 'historical'
member_id = 'r1i1p1f1,r1i1p1f2' # f1 not available for UKESM1-0-LL

# --- Query the ESGF index ----------------------------------------------------
connection = SearchConnection('https://esgf-data.dkrz.de/esg-search', distrib=True)
query = connection.new_context(
    latest=True,
    project=project,  # use the facet defined above instead of repeating the literal
    source_id=models,
    experiment_id=experiment_id,
    variable_id=variable_id,
    table_id=table_id,
    member_id=member_id,
    data_node='esgf.ceda.ac.uk')

print("Number of search results:", query.hit_count)

# --- Collect one record per file of every matching dataset -------------------
results = query.search()
file_records = []
for result in results:
    print("Retrieving search results: ", result.dataset_id)
    #print(result.json)
    file_records.extend(
        {
            # 'source_id' is a list in the file JSON; read its last entry
            # without mutating the search result (the original used .pop()).
            'model': f.json['source_id'][-1],
            'dataset_id': result.dataset_id,
            'filename': f.filename,
            'url': f.opendap_url,
        }
        for f in result.file_context().search()
    )

# drop_duplicates returns a new frame, so its result must be kept; otherwise
# a file listed more than once would be opened more than once downstream.
files = pd.DataFrame(file_records).drop_duplicates('filename')

# One row per model, with list-valued columns (dataset ids, filenames, urls).
grouped_files = files.groupby('model', as_index=False).agg(list)
grouped_files

Number of search results: 4
Retrieving search results:  CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.day.tas.gr.v20190614|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i1p1f1.day.tas.gr1.v20190726|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r1i1p1f1.day.tas.gn.v20190603|esgf.ceda.ac.uk
Retrieving search results:  CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i1p1f1.day.tas.gn.v20190710|esgf.ceda.ac.uk


Unnamed: 0,model,dataset_id,filename,url
0,GFDL-ESM4,[CMIP6.CMIP.NOAA-GFDL.GFDL-ESM4.historical.r1i...,[tas_day_GFDL-ESM4_historical_r1i1p1f1_gr1_185...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...
1,IPSL-CM6A-LR,[CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p...,[tas_day_IPSL-CM6A-LR_historical_r1i1p1f1_gr_1...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...
2,MPI-ESM1-2-HR,[CMIP6.CMIP.MPI-M.MPI-ESM1-2-HR.historical.r1i...,[tas_day_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...
3,MRI-ESM2-0,[CMIP6.CMIP.MRI.MRI-ESM2-0.historical.r1i1p1f1...,[tas_day_MRI-ESM2-0_historical_r1i1p1f1_gn_185...,[https://esgf.ceda.ac.uk/thredds/dodsC/esg_cmi...


In [11]:
# Load all files from the url list of each model into an xarray multi-file
# dataset, reduced to the grid cell nearest to the selected city.

# Time span of the W5E5 reference; the model data is cut to this window.
start_time = W5E5.time[0].values
end_time = W5E5.time[-1].values

data = {}
# Each row of grouped_files holds one model name and the list of its file urls.
for model, urls in zip(grouped_files.model, grouped_files.url):
    print("Selecting location...")
    print("Loading dataset: ", model)

    data[model] = (
        xr.open_mfdataset(urls, chunks={'time': 120})
        .sel(lat=latitude, lon=longitude, method='nearest')
        # Fill value for dates missing after the conversion must be the float
        # NaN; the original passed the string 'NaN', which is not a number.
        .convert_calendar('noleap', align_on='year', missing=np.nan)
        .sel(time=slice(start_time, end_time))
        # NOTE(review): no `dim` is given here. Dataset.interpolate_na appears
        # to fill gaps only along an explicitly named dimension, so this call
        # may leave the data unchanged — confirm, and pass dim='time' if gap
        # filling along time is intended (that may also need a single chunk
        # along time).
        .interpolate_na(method='nearest')
    )

Selecting location...
Loading dataset:  GFDL-ESM4
Selecting location...
Loading dataset:  IPSL-CM6A-LR
Selecting location...
Loading dataset:  MPI-ESM1-2-HR
Selecting location...
Loading dataset:  MRI-ESM2-0


In [12]:
os.chdir(wd)
for model in data:
    dataset = data[model]
    print("Saving ", model, "for selected city ", city)

    # File names follow "<model>_<city>_<year>.nc".
    identifier = model + '_' + city

    # Split the dataset into one sub-dataset per calendar year.
    years, y_datasets = zip(*dataset.groupby("time.year"))
    fns = [f'{identifier}_{y}.nc' for y in years]
    paths = [os.path.join(models_dir, fn) for fn in fns]

    with ProgressBar():
        # Saving only the last 2 years of data for the demo.
        xr.save_mfdataset(y_datasets[-2:], paths[-2:], mode="w")

        # For the purpose of inspecting the dataset as a pandas dataframe.
        df = dataset.to_dataframe()
        print("Dataframe for (", model, ",", city, "):", df)


Saving  GFDL-ESM4 for selected city  Nairobi
[########################################] | 100% Completed | 102.48 ms
[########################################] | 100% Completed | 101.73 ms
[########################################] | 100% Completed | 99.05 s
[########################################] | 100% Completed | 2.22 sms
                          lat     lon  height  lat_bnds  lon_bnds         tas  \
bnds time                                                                       
1.0  1979-01-01 12:00:00 -1.5  36.875     2.0      -2.0     36.25  293.577148   
     1979-01-02 12:00:00 -1.5  36.875     2.0      -2.0     36.25  294.044708   
     1979-01-03 12:00:00 -1.5  36.875     2.0      -2.0     36.25  292.982666   
     1979-01-04 12:00:00 -1.5  36.875     2.0      -2.0     36.25  294.250641   
     1979-01-05 12:00:00 -1.5  36.875     2.0      -2.0     36.25  294.484741   
...                       ...     ...     ...       ...       ...         ...   
2.0  1980-12-26 12:00:

In [16]:
os.chdir(wd)

# Cut the reference at the earliest "last timestamp" found among the models.
end_time = min(dataset.time[-1].values for dataset in data.values())
reference = W5E5.sel(time=slice(start_time, end_time))

print("Saving reference for selected city ", city)

# File names follow "W5E5_<city>_<year>.nc".
identifier = 'W5E5' + '_' + city

# Split the reference into one sub-dataset per calendar year.
years, y_datasets = zip(*reference.groupby("time.year"))
fns = [f'{identifier}_{y}.nc' for y in years]
paths = [os.path.join(reference_dir, fn) for fn in fns]

with ProgressBar():
    # Only the last 2 yearly files are written.
    xr.save_mfdataset(y_datasets[-2:], paths[-2:], mode="w")

Saving reference for selected city  Nairobi
[                                        ] | 0% Completed | 272.58 us

[########################################] | 100% Completed | 4.27 ss
[########################################] | 100% Completed | 3.24 ss


In [None]:
# Optional check (disabled): reopen one yearly model file and print it as a
# pandas DataFrame.
# NOTE(review): the path below is absolute and machine-specific; it looks like
# a file under the notebook's data/models folder — adjust before enabling.
# ds = xr.open_dataset('/home/minh/Documents/CI_2023/data/models/GFDL-ESM4_Nairobi_1979.nc')
# df = ds.to_dataframe()
# print(df)


                          lat     lon  height  lat_bnds  lon_bnds         tas  \
bnds time                                                                       
1.0  1979-01-01 12:00:00 -1.5  36.875     2.0      -2.0     36.25  293.577148   
     1979-01-02 12:00:00 -1.5  36.875     2.0      -2.0     36.25  294.044708   
     1979-01-03 12:00:00 -1.5  36.875     2.0      -2.0     36.25  292.982666   
     1979-01-04 12:00:00 -1.5  36.875     2.0      -2.0     36.25  294.250641   
     1979-01-05 12:00:00 -1.5  36.875     2.0      -2.0     36.25  294.484741   
...                       ...     ...     ...       ...       ...         ...   
2.0  1979-12-27 12:00:00 -1.5  36.875     2.0      -1.0     37.50  291.411499   
     1979-12-28 12:00:00 -1.5  36.875     2.0      -1.0     37.50  290.447174   
     1979-12-29 12:00:00 -1.5  36.875     2.0      -1.0     37.50  290.641022   
     1979-12-30 12:00:00 -1.5  36.875     2.0      -1.0     37.50  290.487762   
     1979-12-31 12:00:00 -1.