In [1]:
import numpy as np
import libaarhusxyz
import pandas as pd
import dill
def read_em_data_from_dat_xyz(em_data, name_gate_times='gate times', dat_type='dat'):
    gate_times = np.array(em_data.info[name_gate_times])
    df_em = em_data.flightlines
    group_em = df_em.groupby('record')
    records = list(group_em.groups.keys())
    df_em_collar = group_em[['line_no','utmx','utmy','elevation','altitude_[m]']].mean()

    dat_names = ['data_{:d}'.format(ii+1) for ii in range(gate_times.size)]
    df_em[dat_names] = em_data.layer_data['data']
    if dat_type == 'dat':    
        dat_std_names = ['datastd_{:d}'.format(ii+1) for ii in range(gate_times.size)]
        df_em[dat_std_names] = em_data.layer_data['datastd']    

    em_data_nan = em_data.layer_data['data']
    em_data_nan = em_data_nan.replace(9999, np.nan)
    n_ch1_active = np.round((~np.isnan(em_data_nan.loc[df_em[df_em.segments==1].index]).values).sum(axis=1).mean())
    n_ch2_active = np.round((~np.isnan(em_data_nan.loc[df_em[df_em.segments==2].index]).values).sum(axis=1).mean())
    print (f"Active # of channels: Ch1={n_ch1_active:.0f}, Ch2={n_ch2_active:.0f}")

    inds_lm = (~np.isnan(em_data_nan.loc[df_em[df_em.segments==1].index]).values).sum(axis=0) > 0.
    inds_hm = (~np.isnan(em_data_nan.loc[df_em[df_em.segments==2].index]).values).sum(axis=0) > 0.

    data_lm = []
    data_hm = []
    if dat_type == 'dat':
        datastd_lm = []
        datastd_hm = []
    records_inv = []
    for i_record in records:
        df_tmp = group_em.get_group(i_record)
        values = df_tmp[dat_names].values
        if dat_type == 'dat':
            values_std = df_tmp[dat_std_names].values
        if df_tmp.shape[0] == 2:
            i_lm = np.argwhere(df_tmp['segments'].values==1)[0][0]                
            i_hm = np.argwhere(df_tmp['segments'].values==2)[0][0]
            data_lm.append(values[i_lm,inds_lm])
            data_hm.append(values[i_hm,inds_hm])
            if dat_type == 'dat':
                datastd_lm.append(values_std[i_lm,inds_lm])
                datastd_hm.append(values_std[i_hm,inds_hm])        
            records_inv.append(i_record)
    data_lm = np.vstack(data_lm)
    data_hm = np.vstack(data_hm)
    if dat_type == 'dat':
        datastd_lm = np.vstack(datastd_lm)
        datastd_hm = np.vstack(datastd_hm)
    records_inv = np.hstack(records_inv)
    df_em_inv = df_em_collar.loc[records_inv]
    times_lm_inv = gate_times[inds_lm]
    times_hm_inv = gate_times[inds_hm]    

    ch1_names = ['dbdt_ch1gt_{:d}'.format(ii+1) for ii in range(data_lm.shape[1])]
    ch2_names = ['dbdt_ch2gt_{:d}'.format(ii+1) for ii in range(data_hm.shape[1])]
    if dat_type == 'dat':    
        std_ch1_names = ['dbdt_std_ch1gt_{:d}'.format(ii+1) for ii in range(data_lm.shape[1])]
        std_ch2_names = ['dbdt_std_ch2gt_{:d}'.format(ii+1) for ii in range(data_hm.shape[1])]    

    df_em_inv[ch1_names] = data_lm
    df_em_inv[ch2_names] = data_hm
    if dat_type == 'dat':    
        df_em_inv[std_ch1_names] = datastd_lm
        df_em_inv[std_ch2_names] = datastd_hm
    
    meta_data_dict = {}
    meta_data_dict['ch1_names'] = ch1_names
    meta_data_dict['ch2_names'] = ch2_names
    if dat_type == 'dat':    
        meta_data_dict['std_ch1_names'] = std_ch1_names
        meta_data_dict['std_ch2_names'] = std_ch2_names
    meta_data_dict['ch1_times'] = times_lm_inv
    meta_data_dict['ch2_times'] = times_hm_inv
    df_em_inv = df_em_inv.replace(9999, np.nan)
    return df_em_inv, meta_data_dict

In [2]:
work_dir = "../data/"
fname = work_dir+"GMD4_SCI02_MOD_dat.xyz"
aem_data = libaarhusxyz.XYZ(fname)
fname = work_dir+"GMD4_SCI02_MOD_syn.xyz"
aem_data_syn = libaarhusxyz.XYZ(fname)
fname = work_dir+"GMD4_SCI02_MOD_inv.xyz"
aem_model = libaarhusxyz.XYZ(fname)

In [3]:
df_em_inv, meta_data_dict = read_em_data_from_dat_xyz(aem_data, name_gate_times='gate times (s)')
df_em_syn, _ = read_em_data_from_dat_xyz(aem_data_syn, name_gate_times='gate times (s)', dat_type='syn')

  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']
  df_em[dat_std_names] = em_data.layer_data['datastd']


Active # of channels: Ch1=20, Ch2=27
Active # of channels: Ch1=20, Ch2=27


In [4]:
dill.dump(meta_data_dict, open(work_dir+'gmd_4_meta_data.pik', 'wb'))
df_em_inv.to_parquet(work_dir+'gmd_4_inv.parquet')
df_em_syn.to_parquet(work_dir+'gmd_4_syn.parquet')

In [5]:
# !pip install pyarrow fastparquet