# ESGF Data Search
This notebook demonstrates the most basic search capabilities. Please refer to the [documentation](https://esgf.github.io/esg-search/ESGF_Search_RESTful_API.html) for additional details.

In [None]:
import os

import pandas
import requests
from IPython.display import display
from ipywidgets import widgets

# The compute token is a credential, so it is read from the COMPUTE_TOKEN
# environment variable instead of being hardcoded in the notebook (see the
# getting_started notebook for how to obtain one). It is None when unset.
# NOTE(review): a token was previously committed on this line; it should be
# revoked and reissued.
compute_token = os.environ.get('COMPUTE_TOKEN')

# Lift pandas' display limits so no column or row of a result is elided.
pandas.set_option('display.max_columns', None)
pandas.set_option('display.max_rows', None)

In [None]:
# Endpoint of the ESGF search service; the requests in this notebook are sent here.
base_url = 'https://esgf-node.llnl.gov/esg-search/search'

In [None]:
# Search configuration: query parameters sent with the file search request.
params = dict(
    format='application/solr+json',
    query='*',
    latest='true',
    type='File',
    limit='10000',
    # Everything below appears as facets in CoG
    variable='tas',
    data_node='aims3.llnl.gov',
    project='CMIP6',
    frequency='mon',
    experiment_id='hist-piNTCF',
)

In [None]:
# NOTE(review): this was marked "not working" because it parsed the response
# but never returned it. It now returns the facet counts; the response layout
# is assumed to be standard Solr JSON - confirm against a live response.
def get_facets(project='CMIP6'):
    """Fetch the facets known to the search service together with their counts.

    Args:
        project: Restrict the facets to those of one project. Pass ``None``
            to leave the project constraint out of the request.

    Returns:
        A dict mapping each facet name to a ``{value: count}`` dict, or
        ``None`` when the request fails (the server's response body is
        displayed in that case).
    """
    facet_params = {
        'format': 'application/solr+json',
        'facets': '*',
        # Only the facet counts are wanted, so request zero documents.
        'limit': '0',
    }
    if project is not None:
        facet_params['project'] = project

    result = requests.get(base_url, params=facet_params, timeout=5*60)

    if not result.ok:
        display(widgets.HTML(result.text))
        return None

    output = result.json()
    # Solr reports each facet as a flat [value, count, value, count, ...] list
    # under facet_counts.facet_fields; pair the entries up into dicts.
    facet_fields = output.get('facet_counts', {}).get('facet_fields', {})
    return {
        name: dict(zip(flat[::2], flat[1::2]))
        for name, flat in facet_fields.items()
    }

In [None]:
# Columns of the search result that are kept for display, in display order.
filter_columns = [
    'id',
    'version',
    'activity_id',
    'datetime_start',
    'datetime_stop',
    'frequency',
    'grid',
    'experiment_id',
    'source_id',
    'institution_id',
    'url',
    'variant_label',
    'retracted',
]

def get_opendap_url(x):
    """Pick the OPeNDAP URL out of one search result's access entries.

    Args:
        x: Iterable of access strings; the part before the first ``|`` of an
            entry is taken as its URL. A single string is treated as a
            one-entry list. Missing values (``None``, NaN) are accepted.

    Returns:
        The URL of the first entry containing "opendap" (case-insensitive),
        with a trailing ``.html`` removed, or ``None`` if there is no such
        entry.
    """
    if isinstance(x, str):
        x = [x]
    try:
        entries = iter(x)
    except TypeError:
        # Non-iterable input (None, NaN) carries no access URLs.
        return None

    for item in entries:
        if not isinstance(item, str) or 'opendap' not in item.lower():
            continue
        url = item.split('|')[0]
        # Strip only the suffix: a blanket replace would also remove any
        # '.html' that occurs earlier in the path.
        if url.endswith('.html'):
            url = url[:-len('.html')]
        return url
    return None
        
def format_column(x):
    """Flatten one column of the search results for display.

    Args:
        x: A pandas Series holding one column of the results.

    Returns:
        For the ``url`` column, a list with the OPeNDAP URL of each row. For
        a column containing list values, a list with the first element of
        each list (non-list and empty-list values are kept as they are).
        Any other column is returned unchanged.
    """
    if x.name == 'url':
        return [get_opendap_url(y) for y in x]
    # Inspect the values themselves rather than x[0]: that is a label lookup,
    # which fails on an empty column or when the index has no label 0.
    if any(isinstance(y, list) for y in x):
        return [y[0] if isinstance(y, list) and y else y for y in x]
    return x
        
# Run the file search. The timeout matches the one used for the facet request
# so that an unresponsive server cannot hang the cell indefinitely.
result = requests.get(base_url, params=params, timeout=5*60)

if not result.ok:
    # Show the server's error page instead of raising.
    display(widgets.HTML(result.text))
else:
    output = result.json()
    docs = output['response']['docs']
    # Build the table, keep the columns we're interested in, then flatten
    # the values in each column.
    df = (
        pandas.DataFrame(docs)
        .filter(filter_columns)
        .apply(format_column)
    )
    display(df)

In [None]:
# Narrow the search results to the files we're interested in:
# the MIROC6 model, variant r1i1p1f1.
is_miroc6 = df['source_id'] == 'MIROC6'
is_wanted_variant = df['variant_label'] == 'r1i1p1f1'
miroc6 = df[is_miroc6 & is_wanted_variant]
miroc6

In [None]:
# Open the sources so we can inspect the time axis
import cdms2
files = [cdms2.open(x) for x in miroc6.url]
try:
    # Collect every time step of 'tas' across all files, as strings.
    time = pandas.Series([str(y) for x in files for y in x['tas'].getTime().asComponentTime()])
finally:
    # The handles are only needed to read the time axes; release them.
    for opened in files:
        opened.close()
time.index = pandas.DatetimeIndex(time)
# Slicing by date needs a monotonic index, and the files may not be listed
# in chronological order.
time = time.sort_index()
subset = time['1850-01-16':'1859-12-16']
# Positional access must go through .iloc: with a DatetimeIndex, plain integer
# keys are label lookups (the positional fallback is deprecated in pandas).
print('Start: ', subset.iloc[0], 'Stop: ', subset.iloc[-1], 'Length: ', len(subset))

In [None]:
# Aggregate and subset the data on remote compute resources
import cwt

# One input variable ('tas') per selected file, and the region to keep
inputs = [cwt.Variable(url, 'tas') for url in miroc6.url]
domain = cwt.Domain(time=('1850-1-1', '1860-1-16'), lat=(-45, 45))

client = cwt.WPSClient('https://aims2.llnl.gov/wps/', compute_token=compute_token)

# Configure the aggregation process
aggregate = client.process_by_name('CDAT.aggregate')
aggregate.add_inputs(*inputs)
aggregate.set_domain(domain)

# Submit the process, then wait until it completes (prints status messages)
client.execute(aggregate)
aggregate.wait(sleep=1)

print('Output', aggregate.output.uri, aggregate.output.var_name)

In [None]:
# Inspect the aggregated result with CDMS2
import cdms2

# The process output names both where the result lives and which variable to read
f = cdms2.open(aggregate.output.uri)
output_var = f[aggregate.output.var_name]

# Last expression of the cell: the shape of the result
output_var.shape