# GDPS Assimilation

Given a small sample of GDPS GRIB2 files, assimilate them into a format we like.
This assimilation step will then be packaged into a crawler that assimilates all incoming GDPS files.

In [None]:
import cfgrib
import os
import pathlib
import xarray as xr

In [None]:
# All paths hang off the DATA_DIR environment variable so that no local path is
# hardcoded in the notebook. Fail early with an explicit message when it is not
# set, instead of the opaque TypeError that pathlib.Path(None) would raise.
_data_dir_env = os.getenv('DATA_DIR')
if not _data_dir_env:
    raise RuntimeError(
        "The DATA_DIR environment variable must point to the data directory "
        "before running this notebook.")

DATA_DIR = pathlib.Path(_data_dir_env)
# Directory holding the sample of incoming GRIB2 files read by this notebook.
GDPS_SAMPLE_PATH = DATA_DIR / '2021-01-28-incoming-sample'
# Second directory under DATA_DIR (not referenced by the cells visible here).
DATABASE_PATH = DATA_DIR / '2021-01-28-test-assimilation'

In [None]:
# Sort the matches: glob yields entries in filesystem-dependent order, and both
# the 20-file sample and the order in which the files are later concatenated
# depend on this list. Sorting makes reruns pick the same files in the same order.
# GDPS_SAMPLE_PATH is already a pathlib.Path, so it can be globbed directly.
gdps_files = sorted(GDPS_SAMPLE_PATH.glob('*.grib2'))[:20]

In [None]:
# Open one file of the sample (the second in the list) through cfgrib, keeping
# only the messages whose keys match the filter: surface level, instantaneous step type.
gdps_sample = xr.open_dataset(
    gdps_files[1],
    engine='cfgrib',
    backend_kwargs={
        'filter_by_keys': {
            'typeOfLevel': 'surface',
            'stepType': 'instant',
        },
    },
)

In [None]:
# Inspect the dataset opened in the previous cell (surface level, instantaneous step type).
gdps_sample

In [None]:
# Re-open the whole file list as a single dataset restricted to the 'gh' field.
# With combine='nested', the files are concatenated along `step` in list order.
gdps_sample = xr.open_mfdataset(
    gdps_files,
    engine='cfgrib',
    combine='nested',
    concat_dim='step',
    parallel=True,
    backend_kwargs={
        'filter_by_keys': {'shortName': 'gh'},
    },
)

In [None]:
# Restrict the view to the pressure levels (1000, 925, 850, 700, 500 hPa) that
# appear in the UMOS predictor list further down.
gdps_sample.sel(indexers={'isobaricInhPa': [1000, 925, 850, 700, 500]})

In [None]:
# gdps_sample was opened with filter_by_keys={'shortName': 'gh'}, so 'gh' is the
# variable it holds; accessing `.t` fails on a fresh kernel. Check the gh array shape.
gdps_sample.gh.shape

In [None]:
# gdps_sample only holds the 'gh' field (see the shortName filter used to open it),
# so `t2m` is not available here. Plot 'gh' for the first step at a single level
# to get a 2-D map.
# NOTE(review): assumes isobaricInhPa is a dimension that contains the 700 hPa
# level, as the earlier .sel(isobaricInhPa=[...]) cell also does -- confirm.
gdps_sample.gh.isel(step=0).sel(isobaricInhPa=700).plot();

The variables I wish to grab (at least at first) are the ones used by UMOS:

```
000       MOS 059253   1
004        NT 020010   1
008        PR 059174   1
009        PC 059175   1
012   TT12000 059047   1
013    TT1000 059048   1
014     TT925 059049   1
015     TT850 059050   1
016     TT700 059051   1
017     TT500 059052   1
018   TX12000 059059   1
019    TX1000 059060   1
022     TX700 059063   1
024   TY12000 059065   1
025    TY1000 059066   1
032     ES925 059055   1
033     ES850 059056   1
048   UU12000 059035   1
049    UU1000 059036   1
054   VV12000 059041   1
096    WW1000 059020   1
097     WW925 059021   1
099     WW700 059023   1
104     GZ700 059018   1
122   UV12000 059124   1
123    UV1000 059125   1
125     UV850 059127   1
134   AT12000 059151   1
135    AT1000 059152   1
136     AT925 059153   1
137     AT850 059154   1
138     AT700 059155   1
140   AD12000 059157   1
146   HR12000 059163   1
147    HR1000 059164   1
148     HR925 059165   1
149     HR850 059166   1
150     HR700 059167   1
152    UG1000 059130   1
157    VG1000 059135   1
158     VG925 059136   1
169        EC 059008   1
170        ED 059009   1
176        KI 059005   1
177    SUNFAC 059178   1
179 PERS SPOT 012004   1
```


# The easy ones

In [None]:
# Open the sample once per field: each pass keeps a single shortName on isobaric
# levels and stacks the files along `step`. The name is printed as a progress marker.
datasets = []
for short_name in ('gh', 'q', 't', 'u', 'v'):
    print(short_name)
    level_filter = {
        'shortName': short_name,
        'typeOfLevel': 'isobaricInhPa',
    }
    field_dataset = xr.open_mfdataset(
        gdps_files,
        engine='cfgrib',
        concat_dim='step',
        combine='nested',
        parallel=True,
        backend_kwargs={'filter_by_keys': level_filter},
    )
    datasets.append(field_dataset)


In [None]:
# Inspect the first dataset of the list (the one opened for 'gh').
datasets[0]

In [None]:
# Combine the per-field datasets into a single dataset.
merged = xr.merge(datasets)

In [None]:
# Inspect the merged dataset.
merged

In [None]:
# GDPS_SAMPLE_PATH is a pathlib.Path: build the output path with `/`
# (`Path + str` raises TypeError). Writes the merged dataset uncompressed.
merged.to_netcdf(GDPS_SAMPLE_PATH / '_filtered_117.nc')

In [None]:
compression_options = {var: {'zlib': True, 'complevel': 9} for var in ['gh', 'q', 't', 'u', 'v']}

In [None]:
# Inspect the per-variable encoding dictionary built in the previous cell.
compression_options

In [None]:
# Write a single step (index 1) of the merged dataset with the per-variable
# compression settings, to compare against the uncompressed output.
# GDPS_SAMPLE_PATH is a pathlib.Path, so the file name is joined with `/`
# (`Path + str` raises TypeError).
merged.isel(step=1).to_netcdf(
    GDPS_SAMPLE_PATH / '_filtered_117_for_real_comp_9.nc',
    encoding=compression_options,
)