In [7]:
import os
import sys

from datetime import datetime
import pandas as pd
import numpy as np
import h5py as h5

import dask.dataframe as dd
from dask.delayed import delayed
from dask.diagnostics import ProgressBar

# Register a dask progress bar globally so later dask computations in this
# notebook (e.g. the .compute() call in the conversion loop) report progress.
progress_bar = ProgressBar()
progress_bar.register()

Get a list of all files to be converted, grouped by directory. Because of the data volume, each stage is kept separate and written to its own file.

In [8]:
# Filename prefix a file must start with to be converted; the empty string
# matches every filename.
DESC = ""
# Extension of the files to convert.
FTYPE = ".las"

In [9]:
raw_path = r'C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS'

# Walk every entry directly under raw_path and build a mapping
#   {directory path: [matching file paths in that directory]}.
# Directories without matching files are kept with an empty list.
dir_list = os.listdir(raw_path)
file_paths = {}

for directory in dir_list:
    for main_dir, sub_dir, filenames in os.walk(os.path.join(raw_path, directory)):
        # A file is selected when it starts with DESC, ends with FTYPE, and its
        # stem does not end in ')' (presumably duplicate copies such as
        # "name (1).las" -- TODO confirm).
        # The suffix test uses len(FTYPE) rather than a fixed 4 characters so
        # the filter keeps working if FTYPE is changed to another extension.
        file_paths[main_dir] = [
            os.path.join(main_dir, f)
            for f in filenames
            if f.startswith(DESC)
            and f.endswith(FTYPE)
            and not f[:len(f) - len(FTYPE)].endswith(')')
        ]
        

In [10]:
@delayed
def read_das_las(f_path):
    """Read one DAS ``.las`` file into a DataFrame (wrapped in ``dask.delayed``).

    The acquisition timestamp is parsed from the file name, the header is
    scanned for curve definitions (properties whose mnemonic starts with
    ``RMS`` or ``FreqBand``), and the data section after ``~A Log Data`` is
    loaded with pandas.

    Parameters
    ----------
    f_path : str
        Path to the file. The last ``_``-separated token of the file name
        (before the first ``.``) must be a ``YYMMDDhhmmss`` timestamp.

    Returns
    -------
    pandas.DataFrame
        One row per curve, each row labelled with the file timestamp; one
        column per data row of the file (column labels are the file's index
        values divided by 3.28084), plus a ``BANDS`` column holding the header
        value of each curve.

    Raises
    ------
    ValueError
        If the file contains no ``~A Log Data`` line.
    """

    def tstamp_from_f_path(f_path):
        """Helper function to get timestamp from file path"""
        # Accept both '\\' and '/' as separators so the helper does not depend
        # on the paths having been built on Windows.
        f_name = f_path.replace('\\', '/').rsplit('/', 1)[-1]
        tstring = f_name.split('_')[-1].split('.')[0]

        # Two-digit year in the file name; the century is assumed to be 20xx.
        year = int("20" + tstring[:2])
        month = int(tstring[2:4])
        day = int(tstring[4:6])
        hour = int(tstring[6:8])
        minute = int(tstring[8:10])
        second = int(tstring[10:])

        return datetime(year = year, month = month, day = day, hour = hour, minute = minute, second = second)

    # file_properties will contain all metadata from the file header
    file_properties = {}

    # get datetime from filepath
    file_properties['DTIME'] = tstamp_from_f_path(f_path)

    curve_ids = []

    # Scan the header. After the loop, `index` is the number of lines that
    # precede the '~A Log Data' marker.
    with open(f_path, 'r') as f:
        for index, line in enumerate(f):
            if line.startswith('~A Log Data'):
                break

            # Skip blank lines, comments ('#') and section headers ('~').
            if not line.strip() or line[0] in '#~':
                continue

            prop = line.split('.')[0]
            if prop.startswith('FreqBand'):
                curve_ids.append(prop)
                # The band description is the last space-separated token.
                value = line.split(' ')[-1]
            else:
                if prop.startswith('RMS'):
                    curve_ids.append(prop)
                # Everything after the first space, up to the first ':'.
                value = line.split(' ', maxsplit = 1)[-1].split(':')[0]

            file_properties[prop] = value
        else:
            # Reached end of file without finding the data section.
            raise ValueError("No '~A Log Data' section found in {0}".format(f_path))

    # Every data column is labelled with the file timestamp, so after the
    # transpose below each row (one per curve) is indexed by that timestamp.
    columns = [file_properties['DTIME']] * len(curve_ids)

    # Read data to dataframe, skipping the header and the '~A Log Data' line.
    frame = pd.read_csv(f_path, skiprows=index + 1, names = columns, index_col=0, dtype='str', delimiter=' ')
    # NOTE(review): with dtype='str' the index may be sorted as text rather
    # than numerically -- confirm this gives the intended order.
    frame = frame.sort_index()

    df = frame.T
    # Presumably converts the index from feet to metres -- TODO confirm units.
    df.columns = pd.to_numeric(df.columns, errors = 'coerce') / 3.28084
    df['BANDS'] = [file_properties[curve] for curve in curve_ids]

    return df

In [11]:
def write_fbe_hdf(hdf_file, band_no, fbe_band, start_freq, end_freq, fbe_id = "RiceEnergy-Silixa 2"):
    """Write one frequency band to an HDF5 file, creating the layout on first use.

    On the first call for a given file the ``Acquisition/Processed/Fbe[1]``
    group hierarchy, its attributes and the shared ``FbeDataTime`` dataset are
    created from ``fbe_band``. Every call then adds a ``FbeData[band_no]``
    dataset holding the band values as float32.

    Parameters
    ----------
    hdf_file : h5py.File (or compatible group-like object)
        Open, writable file.
    band_no : int
        Number used in the dataset name ``FbeData[band_no]``.
    fbe_band : pandas.DataFrame
        Band data; the index must hold timestamps (``strftime`` is called on
        its first and last entries) and the columns numeric positions.
    start_freq, end_freq : str or float
        Band limits, stored as float attributes.
    fbe_id : str, optional
        Value of the ``FbeId`` attribute (only written on the first call).

    Returns
    -------
    None
    """
    if "Acquisition" not in hdf_file:

        acq_group = hdf_file.create_group('Acquisition')

        acq_group.attrs['AcquisitionId'] = ""
        # NOTE(review): datetime.now() is local time but the format appends a
        # literal 'Z' -- confirm whether UTC was intended.
        acq_group.attrs['Creation'] = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ%Z')
        acq_group.attrs.create('NumberOfLoci', len(fbe_band.columns), dtype = np.dtype(np.int32))
        acq_group.attrs.create('SpatialSamplingInterval', np.diff(fbe_band.columns).mean(), dtype = np.dtype(np.float32))
        acq_group.attrs.create('StartLocusIndex', int(fbe_band.columns[0]), dtype = np.dtype(np.int32))

        proc_group = acq_group.create_group('Processed')
        fbe_group = proc_group.create_group('Fbe[1]')

        fbe_group.attrs['FbeId'] = fbe_id
        fbe_group.attrs['NumberOfLoci'] = len(fbe_band.columns)
        fbe_group.attrs['OutputDataRate'] = float(1)
        fbe_group.attrs.create('StartLocusIndex', int(fbe_band.columns[0]), dtype = np.dtype(np.int32))

        # Numeric form of the time index divided by 1000 (presumably
        # nanoseconds -> microseconds since the epoch -- TODO confirm).
        fbe_time = fbe_group.create_dataset('FbeDataTime', data = pd.to_numeric(fbe_band.index)/1000, dtype = 'int64', chunks = None)
        fbe_time.attrs['EndTime'] = fbe_band.index[-1].strftime('%Y-%m-%dT%H:%M:%S.%fZ%Z')
        fbe_time.attrs['StartTime'] = fbe_band.index[0].strftime('%Y-%m-%dT%H:%M:%S.%fZ%Z')

    fbe_group = hdf_file["/Acquisition/Processed/Fbe[1]"]

    n_rows, n_cols = fbe_band.shape
    # h5py rejects a chunk shape larger than the dataset, so clamp the
    # 10-row chunk for bands with fewer than 10 time samples.
    chunk_rows = max(1, min(10, n_rows))

    # .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
    fbe_data = fbe_group.create_dataset('FbeData[{0}]'.format(band_no),
                                       data = fbe_band.values, shuffle = True,
                                       chunks = (chunk_rows, n_cols), compression = 'gzip',
                                       dtype = 'float32')
    fbe_data.attrs.create('Dimensions', ['Time', 'Loci'], dtype = np.dtype('S5'))
    fbe_data.attrs["EndFrequency"] = float(end_freq)
    fbe_data.attrs["StartFrequency"] = float(start_freq)

In [13]:
master_path = r'C:\Users\george.crowther\Dropbox (OptaSense OFS)\Customer Projects\Rice Energy\Belmont Project\data'
completed = []

# Convert every source directory that has matching files into one HDF5 file,
# written to a directory of the same name under master_path.
for key, fpaths in file_paths.items():
    print(key)
    data_path = key.split('\\')[-1]
    write_path = os.path.join(master_path, data_path)

    # Every output file is written with this same placeholder FbeId.
    # NOTE(review): confirm whether a unique id per stage is wanted.
    fbe_id = -9999.25

    if len(fpaths) == 0:
        continue

    # An existing output directory marks the stage as already converted.
    # NOTE(review): a failed run leaves the directory behind, so it must be
    # deleted by hand before the stage can be retried.
    if os.path.exists(write_path):
        continue
    os.makedirs(write_path)

    print("{0} - {1}".format(len(fpaths), data_path))

    # One delayed read per file, combined into a dask dataframe and computed.
    dfs = [read_das_las(n) for n in fpaths]
    df = dd.from_delayed(dfs)

    bands = df.compute()

    fname = "FBE_" + bands.index[0].strftime('%Y%m%dT%H%M%S') + ".h5"

    # The context manager closes the file even if writing a band fails.
    with h5.File(os.path.join(write_path, fname), 'w') as f:
        sys.stdout.write('Writing [')
        band_no = 0
        for band_label, group in bands.groupby('BANDS'):
            group = group.drop('BANDS', axis = 1)

            # Shift the whole time index by the microsecond part of its first
            # timestamp.
            a = pd.to_datetime(group.index)
            group.index = a - pd.Timedelta(a[0].microsecond, unit='us')
            group.columns = pd.to_numeric(group.columns)

            group = group.astype('float32')

            # Labels starting with 'RMS' or 'from' are space-separated: the
            # start frequency is the second token and the end frequency the
            # second-to-last. Any other label is split on '_'.
            if band_label.startswith(('RMS', 'from')):
                tokens = band_label.split(' ')
                start_freq = tokens[1]
                end_freq = tokens[-2]
            else:
                tokens = band_label.split('_')
                start_freq = tokens[0]
                end_freq = tokens[-1]

            sys.stdout.write('#{0}#'.format(band_no))
            write_fbe_hdf(f, band_no, group, start_freq, end_freq, str(fbe_id))
            band_no += 1
            f.flush()

        sys.stdout.write(']\n')

    completed.append(key)

C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 08
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 30
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 06 with All Freq Bands
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 21
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 19
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 01 Offset
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 35
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 27
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice - OptaSense Project\1.2 Fracturing DAS\Stage 42
C:\Users\george.crowther\Dropbox (OptaSense OFS)\Rice -

In [6]:
# Scratch calculation; its result is not used by any other cell shown here.
2**7

128