### Read MISO-BOB IMD BUFR file

Read the file using pdbufr, format the table, and write it out as NetCDF using xarray.

In [1]:
import pdbufr
import os
# import pandas as pd
import xarray as xr

In [2]:
"Read IMD high-resolution BUFR sounding time series filein, write out NetCDF fileout."
def bufr_to_nc(filein, fileout="snd.nc"):
    """Convert an IMD high-resolution BUFR sounding to a NetCDF file.

    The sounding is read with ``pdbufr.read_bufr``, the vertically varying
    columns are kept, the latitude/longitude displacements are turned into
    absolute positions by adding the station latitude/longitude, and the
    resulting table is written with xarray.

    Parameters
    ----------
    filein
        Path of the BUFR file; handed unchanged to ``pdbufr.read_bufr``.
    fileout
        Path of the NetCDF file to write (opened with ``mode="w"``).
        Defaults to ``"snd.nc"``.

    Returns
    -------
    None
        The only result is the file written to ``fileout``.

    Notes
    -----
    The WMO station id is stored as the dataset attribute ``WMO_station_id``.
    Reading/parsing the BUFR file dominates the run time (about 10 s).
    """
    # Keys with a single value per station.
    station_keys = ["WMO_station_id", "latitude", "longitude"]
    # Keys that vary along the ascent; these become the NetCDF variables.
    profile_keys = ["pressure", "nonCoordinateGeopotentialHeight",
                    "latitudeDisplacement", "longitudeDisplacement",
                    "airTemperature", "dewpointTemperature",
                    "windDirection", "windSpeed"]
    # timePeriod is still requested from pdbufr, but it is not written out.
    fields = station_keys + ["timePeriod"] + profile_keys

    # read sounding vector into pandas dataframe; pdbufr query takes ~10 s
    df = pdbufr.read_bufr(filein, columns=tuple(fields))

    # Station scalars: take the first row by position, not by index label.
    latitude = df["latitude"].iloc[0]
    longitude = df["longitude"].iloc[0]
    WMO_station_id = df["WMO_station_id"].iloc[0]

    # Select the profile columns by name. pdbufr does not return the columns
    # in the order they were requested, so a positional slice is fragile.
    vert_vars = df[profile_keys].rename(
        columns={"latitudeDisplacement": "latitude",
                 "longitudeDisplacement": "longitude"}
    )
    # add station lat, lon to displacements
    vert_vars["latitude"] = vert_vars["latitude"] + latitude
    vert_vars["longitude"] = vert_vars["longitude"] + longitude

    ds = vert_vars.to_xarray()  # --> xarray dataset
    # add WMO station id as an attribute
    ds.attrs["WMO_station_id"] = WMO_station_id

    # write out netcdf
    ds.to_netcdf(path=fileout, mode="w")

# takes about 10s, almost all to read/parse the bufr file.

In [13]:
# Directory holding the IMD Kolkata sounding; edit this one line to point at your copy of the data.
SOUNDING_DIR = "/Users/sdeszoek/Data/cruises/MISOBOB_2019/SR1911/radiosonde/Soundings from IMD/KOLKATA/2019071923/BUFR"
filein = os.path.join(SOUNDING_DIR, "20190719230018023412_Bufr_309052_all.bufr")
# Output goes to the working directory under the input's base name with a .nc
# extension, so the two names cannot drift apart.
fileout = os.path.join(".", os.path.splitext(os.path.basename(filein))[0] + ".nc")

bufr_to_nc(filein, fileout)

In [3]:
# Read the sounding with pdbufr.
# example at https://pdbufr.readthedocs.io/en/latest/examples/radiosonde.html

# The path is bound to `filein`, the name read_bufr uses below, so this cell
# does not rely on a value left over from another cell.
filein = "/Users/sdeszoek/Data/cruises/MISOBOB_2019/SR1911/radiosonde/Soundings from IMD/KOLKATA/2019071923/BUFR/20190719230018023412_Bufr_309052_all.bufr"

# keys requested from the BUFR message
fields = ["WMO_station_id", "latitude", "longitude", "timePeriod",
          "pressure", "nonCoordinateGeopotentialHeight",
          "latitudeDisplacement", "longitudeDisplacement",
          "airTemperature", "dewpointTemperature",
          "windDirection", "windSpeed"]

# read sounding vector into pandas dataframe; pdbufr query takes ~10 s
# (a filters={"timePeriod": slice(0, None)} argument is not needed)
df = pdbufr.read_bufr(filein, columns=tuple(fields))
# show the columns in the order pdbufr returned them
df.columns

Index(['latitude', 'longitude', 'timePeriod', 'pressure',
       'nonCoordinateGeopotentialHeight', 'latitudeDisplacement',
       'longitudeDisplacement', 'airTemperature', 'dewpointTemperature',
       'windDirection', 'windSpeed', 'WMO_station_id'],
      dtype='object')

In [4]:
# format data table for netcdf

# Separate the station scalars from the sounding dataframe.
# Take the first row by position rather than by index label.
latitude = df["latitude"].iloc[0]
longitude = df["longitude"].iloc[0]
WMO_station_id = df["WMO_station_id"].iloc[0]

# Make a subset with the vertically-varying variables, selected by name so the
# result does not depend on the column order pdbufr returns. These are the same
# columns the positional slice df.columns[3:-1] picked for the order shown by
# df.keys(); timePeriod is left out, as it was before.
u = df[["pressure", "nonCoordinateGeopotentialHeight",
        "latitudeDisplacement", "longitudeDisplacement",
        "airTemperature", "dewpointTemperature",
        "windDirection", "windSpeed"]]
# Rename the displacement columns to latitude, longitude. rename returns a new
# frame, which gives predictable behavior when the columns are modified below.
vert_vars = u.rename(columns={"latitudeDisplacement": "latitude",
                              "longitudeDisplacement": "longitude"})
# df.to_xarray() shows how the netcdf will be formatted

# add station lat, lon to the displacements
vert_vars["latitude"] = vert_vars["latitude"] + latitude
vert_vars["longitude"] = vert_vars["longitude"] + longitude
vert_vars

Unnamed: 0,pressure,nonCoordinateGeopotentialHeight,latitude,longitude,airTemperature,dewpointTemperature,windDirection,windSpeed
0,100240.0,6,22.65082,88.43830,300.55,298.19,0,0.0
1,100180.0,11,22.65084,88.43829,300.65,298.38,139,0.1
2,100130.0,16,22.65086,88.43827,300.74,298.57,139,0.1
3,100070.0,21,22.65088,88.43826,300.84,298.75,139,0.2
4,100000.0,27,22.65091,88.43824,300.95,298.97,139,0.2
...,...,...,...,...,...,...,...,...
6613,560.0,35077,22.73042,87.35694,234.52,211.11,85,29.7
6614,570.0,35056,22.73040,87.35666,234.52,211.07,85,29.7
6615,570.0,35034,22.73038,87.35638,234.52,211.03,85,29.7
6616,570.0,35013,22.73037,87.35610,234.55,211.01,85,29.7


In [5]:
# Build the NetCDF file from the formatted profile table.
# Convert the table to an xarray dataset and attach the WMO station id
# as a dataset attribute in one expression.
ds = vert_vars.to_xarray().assign_attrs(WMO_station_id=WMO_station_id)

# Write the dataset to disk, then display it as the cell output.
ds.to_netcdf("./snd.nc", mode="w")
ds

### Try with metview

In [6]:
import metview as mv

# Same Kolkata sounding path as the one used in the pdbufr cells.
filein = "/Users/sdeszoek/Data/cruises/MISOBOB_2019/SR1911/radiosonde/Soundings from IMD/KOLKATA/2019071923/BUFR/20190719230018023412_Bufr_309052_all.bufr"
# Load the file with metview; the result is queried with mv.obsfilter in the cells below.
mbu = mv.read(filein)

# NOTE: mv.read doesn't read Vaisala BUFR files; it reports them as BINARY. The error could be due to reversed byte order.



In [7]:
"get the number of times by querying extendedDelayedDescriptorReplicationFactor"
def get_nt(mbu):
    """Return the number of times in the sounding as an int.

    The count is obtained by querying
    extendedDelayedDescriptorReplicationFactor with ``mv.obsfilter`` and
    rounding the first returned value.

    Parameters
    ----------
    mbu
        BUFR data as returned by ``mv.read``; passed to ``mv.obsfilter`` as ``data``.

    Returns
    -------
    int
        The first replication-factor value, rounded to the nearest integer.
    """
    nt_ = mv.obsfilter(parameter="extendedDelayedDescriptorReplicationFactor", output="geopoints", data=mbu)
    # The value is stored as a float (e.g. 6618.0); int() makes the result a
    # plain Python int whatever float type round() is given.
    return int(round(nt_['value'][0]))
# Number of times for the loaded file; the bare name displays it as the cell output.
nt = get_nt(mbu)
nt

6618

In [8]:
# Same replication-factor query that get_nt makes, keeping the whole
# geopoints result so that a full record can be inspected.
ed = mv.obsfilter(
    data=mbu,
    parameter="extendedDelayedDescriptorReplicationFactor",
    output="geopoints",
)
# display the first record
ed[0]

{'latitude': 22.6508,
 'longitude': 88.4383,
 'height': 0.0,
 'date': 20190719.0,
 'time': 2311.0,
 'value': 6618.0,
 'value_missing': 0.0,
 'value2': 0.0,
 'value2_missing': 0.0}

In [9]:
# Query timePeriod as geopoints and display the first record.
p = mv.obsfilter(
    data=mbu,
    parameter="timePeriod",
    output="geopoints",
)
p[0]

{'latitude': 22.6508,
 'longitude': 88.4383,
 'height': 0.0,
 'date': 20190719.0,
 'time': 2311.0,
 'value': 0.0,
 'value_missing': 0.0,
 'value2': 0.0,
 'value2_missing': 0.0}

In [10]:
# Load airTemperature (BUFR element 012101) for several timePeriod values.
# A single obsfilter call only returns one timePeriod (what T[0] holds), so the
# query is repeated for each value; looping with a comprehension is slow.
T = [
    mv.obsfilter(
        data=mbu,
        parameter=["timePeriod", "pressure", "airTemperature"],
        level="descriptor_value",
        level_descriptor="timePeriod",
        first_level=period,
        second_level=period,
        output="ncols",
    )
    for period in range(5)
]

T
# TODO: find a pythonic way to unpack this vector of geopoints.

[<metview.bindings.Geopoints at 0x10dff4810>,
 <metview.bindings.Geopoints at 0x10e075450>,
 <metview.bindings.Geopoints at 0x10e0776d0>,
 <metview.bindings.Geopoints at 0x10e077710>,
 <metview.bindings.Geopoints at 0x10e077750>]