In [None]:
import os
import h5py
import numpy as np
from scipy.signal import decimate
import os
import numpy as np
from datetime import datetime
import shutil
import h5py
from scipy.signal import resample_poly, decimate, resample
from tqdm import tqdm

In [2]:
# Setup source and target folders.
# source_dir holds the original .h5 files that process_files reads (and deletes as it
# goes); target_dir receives the downsampled copies that process_files writes.
source_dir = "/scratch/ddordevic/FORGE/downsample_test_source"
target_dir = "/scratch/ddordevic/FORGE/downsample_test_target"

# Create the target folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(target_dir, exist_ok=True)

In [3]:
def timestamp2datetime(timestamp):
    """Parse a compact timestamp string into a ``datetime``.

    Args:
        timestamp: String in ``YYYYMMDDTHHMMSS`` form, e.g. ``"20240407T072128"``.

    Returns:
        The corresponding naive ``datetime`` object.

    Raises:
        ValueError: If ``timestamp`` does not match the expected format.
    """
    compact_format = "%Y%m%dT%H%M%S"
    parsed = datetime.strptime(timestamp, compact_format)
    return parsed

def timestampFromFilename(filename):
    """Extract the timestamp portion of a strain-rate file name.

    The timestamp is the text that follows the first ``"StrainRate_"`` marker, cut
    at the first ``"+"`` (or at the next marker, should the marker occur again).

    Args:
        filename: File name such as
            ``"16B_StrainRate_20240407T072128+0000_34573.h5"``.

    Returns:
        The timestamp substring, e.g. ``"20240407T072128"``.

    Raises:
        IndexError: If ``filename`` does not contain ``"StrainRate_"``.
    """
    pieces = filename.split("StrainRate_")
    after_marker = pieces[1]
    stamp, _, _ = after_marker.partition("+")
    return stamp

In [4]:
# Go through files, take 3 consecutive files, merge them and downsample

def process_files(source_dir, target_dir):
    """Downsample every matching HDF5 file in ``source_dir`` by a factor of two.

    Files whose names start with ``16B`` and end with ``.h5`` are ordered by the
    timestamp embedded in their names. Each file is copied to ``target_dir`` and the
    ``'Acoustic'`` dataset of the copy is replaced by a half-length version. The
    resampling is done on the file concatenated with its chronological neighbours
    (previous and next, where they exist) and only the rows belonging to the file
    itself are kept -- presumably to limit edge effects at file boundaries.

    WARNING: this function is destructive. Each source file is deleted from
    ``source_dir`` once it is no longer needed as a neighbour, so after a successful
    run all processed source files are gone.

    Args:
        source_dir: Folder containing the original files.
        target_dir: Existing folder that receives the downsampled copies.

    Returns:
        None. Does nothing if no file in ``source_dir`` matches. A single matching
        file is downsampled on its own, without neighbours.
    """
    # The timestamps are fixed-width YYYYMMDDTHHMMSS strings, so sorting them
    # lexicographically is the same as sorting chronologically.
    files = sorted(
        (f for f in os.listdir(source_dir) if f.endswith('.h5') and f.startswith('16B')),
        key=timestampFromFilename,
    )
    if not files:
        return

    source_paths = [os.path.join(source_dir, name) for name in files]
    last_index = len(files) - 1

    for i, name in enumerate(tqdm(files, desc="Downsampling files:")):
        prev_path = source_paths[i - 1] if i > 0 else None
        next_path = source_paths[i + 1] if i < last_index else None

        target_path = os.path.join(target_dir, name)
        shutil.copy2(source_paths[i], target_path)
        _downsample_copy(target_path, prev_path, next_path)

        # The previous source file was only kept around to serve as context for this
        # one; the current source file is still needed by the next iteration.
        if prev_path is not None:
            os.remove(prev_path)

    os.remove(source_paths[-1])


def _read_acoustic(path):
    """Load the whole 'Acoustic' dataset of the HDF5 file at ``path`` into memory."""
    with h5py.File(path, 'r') as handle:
        return handle['Acoustic'][...]


def _downsample_copy(target_path, prev_path=None, next_path=None):
    """Halve, in place, the 'Acoustic' dataset of the HDF5 file at ``target_path``.

    The dataset is concatenated along axis 0 with the 'Acoustic' data of the optional
    neighbour files, resampled to half the number of rows, and the rows that correspond
    to the target file are written back. The sampling attributes of the dataset are
    updated to match the new rate.
    """
    # The context manager guarantees the file is closed even if resampling or the
    # sanity check below raises.
    with h5py.File(target_path, 'r+') as target:
        acoustic = target['Acoustic']
        n_rows, n_cols = acoustic.shape[0], acoustic.shape[1]

        parts = []
        n_prev = 0
        if prev_path is not None:
            prev_data = _read_acoustic(prev_path)
            n_prev = prev_data.shape[0]
            parts.append(prev_data)
        parts.append(acoustic[...])
        if next_path is not None:
            parts.append(_read_acoustic(next_path))

        merged = np.concatenate(parts, axis=0)
        resampled = resample(merged, num=merged.shape[0] // 2, axis=0)

        # Rows of the target file start right after the downsampled previous file and
        # span exactly half of the original row count (same rule for first, middle and
        # last file).
        start = n_prev // 2
        halved = resampled[start:start + n_rows // 2]

        assert halved.shape[0] == n_rows // 2 and halved.shape[1] == n_cols

        acoustic.resize(halved.shape)
        acoustic[...] = halved
        # Half as many samples over the same time span: interval doubles, rate halves.
        acoustic.attrs.modify('TimeSamplingInterval(seconds)', acoustic.attrs['TimeSamplingInterval(seconds)']*2)
        acoustic.attrs.modify('InterrogationRate(Hz)', acoustic.attrs['InterrogationRate(Hz)']/2)


In [5]:
# Test run
# NOTE: process_files deletes the files in source_dir as it processes them, so the
# test inputs must be restored in source_dir before this cell is run again.
process_files(source_dir, target_dir)

Downsampling files:: 100%|██████████| 3/3 [00:51<00:00, 17.04s/it]


In [12]:
# Validation of results
def compare_files(file1_path, file2_path):
    """Report whether two HDF5 files hold identical 'Acoustic' data.

    Both datasets are read fully into memory and compared with ``np.array_equal``
    (same shape and exactly equal elements). The verdict is printed.

    Args:
        file1_path: Path of the first HDF5 file.
        file2_path: Path of the second HDF5 file.

    Returns:
        None.
    """
    with h5py.File(file1_path, 'r') as first, h5py.File(file2_path, 'r') as second:
        first_data = first['Acoustic'][...]
        second_data = second['Acoustic'][...]

    if not np.array_equal(first_data, second_data):
        print("The data in the two files is different.")
        return
    print("The data in the two files is the same.")

# Example usage
# Compares one output of process_files (in downsample_test_target) against a file from
# example_data with a "_method2" suffix in its name -- presumably the same recording
# downsampled by a different method; confirm the provenance of that reference file.
file1_path = "/scratch/ddordevic/FORGE/example_data/16B_StrainRate_20240407T072128+0000_34573_method2.h5"
file2_path = "/scratch/ddordevic/FORGE/downsample_test_target/16B_StrainRate_20240407T072128+0000_34573.h5"
compare_files(file1_path, file2_path)

The data in the two files is the same.


In [2]:
import h5py
# Print the shape of the 'Acoustic' dataset of one validation file.
with h5py.File("/scratch/ddordevic/FORGE/validation_data/16B_1_StrainRate_20240421T214608+0000_32.h5", 'r') as f:
    d1 = f['Acoustic']
    # Dataset.shape comes from the file metadata, so there is no need to load the
    # whole array into memory (d1[...]) just to look at its dimensions.
    print(d1.shape)

(120000, 1496)
