# Pulls data from the Copernicus Data Provider (European Commission) available at cds.climate.copernicus.eu
The output is a csv file containing the requested time series.

Downloading the complete (daily) data set is currently not supported due to size limitations; only sampled data for every 10th day is available.

To use the data set legally it is required to create an account here: https://cds.climate.copernicus.eu/
Then obtain your API key from your user profile and provide it as a parameter to this component.

WARNING: This component currently only supports local execution (not Kubeflow/Airflow)

Future work  
[ ] Download the complete data set by creating multiple requests and then merging the results

In [None]:
!pip3 install xarray==0.17.0 netcdf4==1.5.6 cdsapi==0.5.1 wget==3.2

In [None]:
# @param api key in form UID:APIKey obtained from
# https://cds.climate.copernicus.eu/
# @param data_dir temporary data storage for local execution
# @param file_name csv file name
# @param start_year of data
# @param end_year of data

In [None]:
import wget
# Download the shared helper module claimed_utils.py from the elyra-ai
# component-library repository; a later cell imports `unzip` from it.
claimed_utils_url = ''.join([
    'https://raw.githubusercontent.com/elyra-ai/',
    'component-library/master/claimed_utils.py',
])
wget.download(claimed_utils_url)

In [None]:
import cdsapi
from claimed_utils import unzip
import glob
import os
import pandas as pd
import xarray as xr

In [None]:
# Component parameters, read from environment variables (with defaults
# where one exists).
#
# The API key is looked up under 'apikey' first: a variable name that
# contains a space cannot be exported from most shells. The original
# name 'api key' is still honoured as a fallback.
apikey = os.environ.get('apikey', os.environ.get('api key'))
# Name of the merged CSV output file.
file_name = os.environ.get('file_name', 'data.csv')
# Directory used for the download, intermediate files and the output.
data_dir = os.environ.get('data_dir', '../../data/')
# First and last year to request (inclusive), kept as strings here.
# NOTE(review): the original comment read "up to 1978" - presumably the
# earliest year that can be requested; confirm against the data set docs.
start_year = os.environ.get('start_year', '2017')
end_year = os.environ.get('end_year', '2019')

In [None]:
# The remaining cells are skipped when the output file is already present
# in the data directory.
skip = os.path.exists(data_dir + file_name)

In [None]:
# Write the CDS API client configuration file (~/.cdsapirc), unless the
# output file already exists. Any existing ~/.cdsapirc is overwritten.
if not skip:
    if not apikey:
        # Fail with a clear message before touching the config file,
        # instead of the TypeError raised by concatenating None below.
        raise ValueError(
            'No API key provided; expected a value of the form UID:APIKey'
        )
    with open(os.path.expanduser('~/.cdsapirc'), "w") as myfile:
        myfile.write("url: https://cds.climate.copernicus.eu/api/v2\n")
        myfile.write("key: " + apikey + "\n")
        # NOTE(review): "verify: 0" presumably disables TLS certificate
        # verification in the client - confirm this is intended.
        myfile.write("verify: 0")

In [None]:
# Every year from start_year to end_year (inclusive), as strings.
first_year, last_year = int(start_year), int(end_year)
year_range = [str(year) for year in range(first_year, last_year + 1)]

In [None]:
# Request parameters passed to the CDS client for the
# 'satellite-soil-moisture' data set: the first day of every month of
# each requested year, delivered as a zip archive.
query = {
    'variable': 'volumetric_surface_soil_moisture',
    'type_of_sensor': 'passive',
    'time_aggregation': 'month_average',
    'year': year_range,
    # '01' .. '12'
    'month': ['{:02d}'.format(month) for month in range(1, 13)],
    'day': '01',
    'type_of_record': 'cdr',
    'version': 'v201912.0.0',
    'format': 'zip',
}

In [None]:
# Request the data from the CDS API and store the result as download.zip
# in the data directory.
if not skip:
    download_target = data_dir + 'download.zip'
    client = cdsapi.Client()
    client.retrieve('satellite-soil-moisture', query, download_target)

In [None]:
# Delete any NetCDF files already present in the data directory before
# the fresh archive is extracted.
if not skip:
    stale_files = glob.glob(data_dir + '*.nc')
    for stale_file in stale_files:
        os.remove(stale_file)

In [None]:
# Unpack the downloaded archive using the claimed_utils helper.
# NOTE(review): presumably extracts download.zip into data_dir - confirm
# the argument order against claimed_utils.unzip.
if not skip:
    archive_path = data_dir + 'download.zip'
    unzip(data_dir, archive_path)

In [None]:
# Convert every NetCDF (.nc) file in the data directory into a CSV file of
# the same base name, then delete the NetCDF file. Prints one "." per
# processed file and one "c" per file whose CSV already existed; set
# debug = True for verbose messages instead.
debug = False

if not skip:
    for filename in os.listdir(data_dir):
        if not filename.endswith(".nc"):
            continue
        if debug:
            print('Starting to process {}...'.format(filename))
        else:
            print(".", end="")
        filename_csv = filename.split('.nc')[0] + '.csv'
        if not os.path.exists(data_dir + filename_csv):
            # Close the dataset before the source file is removed below;
            # to_series() has already copied the values into pandas.
            with xr.open_dataset(os.path.join(data_dir, filename)) as dset:
                df = pd.DataFrame(dset['sm'].to_series())
            # Turn the index levels into ordinary columns.
            df.reset_index(inplace=True)
            df.to_csv(data_dir + filename_csv, index=False)
        elif debug:
            s1 = 'CSV file {} already '
            s2 = 'present, skipping...'
            # Format the joined message: the placeholder lives in s1.
            print((s1 + s2).format(filename_csv))
        else:
            print("c", end="")
        os.remove(data_dir + filename)

In [None]:
# Merge the per-file CSVs into the single output file data_dir + file_name:
# one header line, followed by the data rows of every
# C3S-SOILMOISTURE-*.csv file in name order. The target is the same path
# the skip check looks at, and an existing output is left untouched.
if not skip:
    part_files = sorted(glob.glob(data_dir + 'C3S-SOILMOISTURE-*.csv'))
    with open(data_dir + file_name, 'w') as merged:
        merged.write('time,lat,lon,sm\n')
        for part_file in part_files:
            with open(part_file) as part:
                # Drop the header line of each part, keep the rest.
                next(part, None)
                merged.writelines(part)