## Convert .dat file to .csv file

In [1]:
import struct
import pandas as pd
import os

def read_binary_data(binary_file, start_offset, ele_offsets):
    """
    Reads fixed-size binary measurement records from a file into a DataFrame.

    File layout as consumed by this function:
     - bytes 0-7: number of measurements (unsigned long long). It is printed
       and used for a sanity check only; it does not limit how much is parsed.
     - from ``start_offset`` to the end of the file: consecutive records, each
       ``ele_offsets[-1]`` bytes long.

    Field layout inside one record. The byte ranges below are relative to the
    start of the record and correspond to
    ``ele_offsets = [0, 4, 8, 12, 16, 24, 32, 40, 44, ...]``; with
    ``start_offset = 8`` the first record's absolute file offsets are these
    values plus 8:
     - local0          0-3    (float)
     - local1          4-7    (float)
     - var_local0      8-11   (float)
     - var_local1      12-15  (float)
     - geometry_id     16-23  (unsigned long long)
     - measurement_id  24-31  (unsigned long long)
     - cluster_link    32-39  (unsigned long long)
     - meas_dim        40-43  (unsigned int)

    All values are decoded with the platform's native byte order
    (``struct`` formats without a byte-order prefix).

    Args:
        binary_file (str): Path to the binary file.
        start_offset (int): File offset of the first record.
        ele_offsets (list): ``ele_offsets[k]`` .. ``ele_offsets[k + 1]`` is the
            byte range of field ``k`` (k = 0..7) inside a record; the last
            element is the record size (stride between records).

    Returns:
        pd.DataFrame: One row per parsed record, with the eight columns listed
        above (also present when no record could be parsed).

    Raises:
        ValueError: If ``ele_offsets`` has fewer than 9 elements or its last
            element (the record size) is not positive.
        struct.error: If the file is too short to contain the 8-byte header.
    """
    # (column name, struct format) of every field, in record order.
    field_layout = (
        ("local0", "f"),
        ("local1", "f"),
        ("var_local0", "f"),
        ("var_local1", "f"),
        ("geometry_id", "Q"),
        ("measurement_id", "Q"),
        ("cluster_link", "Q"),
        ("meas_dim", "I"),
    )
    column_names = [name for name, _ in field_layout]

    # Validate the layout before touching the file so that a bad configuration
    # is reported as such, not masked by an I/O error.
    if len(ele_offsets) < 9:
        raise ValueError("ele_offsets must contain at least 9 elements for parsing.")
    record_size = ele_offsets[-1]
    if record_size <= 0:
        raise ValueError("ele_offsets[-1] (the record size) must be a positive number of bytes.")

    # Read the header count and then everything from start_offset to EOF.
    with open(binary_file, "rb") as bin_file:
        expected_count = struct.unpack("Q", bin_file.read(8))[0]
        print("number of measurements:", expected_count)

        bin_file.seek(start_offset)
        raw_bytes = bin_file.read()

    # (begin, end) byte range of each of the eight fields inside a record.
    field_bounds = list(zip(ele_offsets[:8], ele_offsets[1:9]))

    # Decode record by record; collect plain dicts and build the frame once.
    data = []
    for i in range(0, len(raw_bytes), record_size):
        try:
            data.append({
                name: struct.unpack(fmt, raw_bytes[i + begin: i + end])[0]
                for (name, fmt), (begin, end) in zip(field_layout, field_bounds)
            })
        except struct.error as e:
            # A slice whose length does not match the field width lands here:
            # typically a truncated trailing record, or offsets that do not
            # match the field sizes. Keep what was parsed so far.
            print(f"Error unpacking at index {i}: {e}")
            break

    # Surface a disagreement between the header and what was actually parsed
    # instead of leaving the header value unused.
    if len(data) != expected_count:
        print(f"Warning: header announces {expected_count} measurements but {len(data)} records were parsed")

    # Passing the columns explicitly keeps the schema (and the CSV header)
    # intact even when no record was parsed.
    df = pd.DataFrame(data, columns=column_names)

    return df

# Configuration
binary_file = "test_measurements/event000000001-measurements.dat"
# The first 8 bytes of the file hold the measurement count, so records start at byte 8.
start_offset = 8
# Byte boundaries of the eight fields inside one record (0..44), followed by
# the record size used as the stride between records (64).
ele_offsets = [0, 4, 8, 12, 16, 24, 32, 40, 44, 64]

# Read the binary data into a DataFrame
df = read_binary_data(binary_file, start_offset, ele_offsets)

# Save the DataFrame to a CSV file (index=False: no extra index column)
csv_file = "test_measurements/measurements.csv"
df.to_csv(csv_file, index=False)

print(f"{binary_file} saved to {csv_file}")


number of measurements: 253988
test_measurements/event000000001-measurements.dat saved to test_measurements/measurements.csv


## Comparison between the dat2csv output and the original CSV file (within 3 decimal places)

In [2]:
import csv
from tqdm import tqdm
import numpy as np

# Function to compare two floats within three decimal places
def compare_within_three_decimals(val1, val2):
    """
    Tell whether two values, read as floats, differ by less than 1e-3.

    Args:
        val1: First value; anything ``float()`` accepts (e.g. a CSV string).
        val2: Second value, same requirements as ``val1``.

    Returns:
        True-like if ``|val1 - val2| < 1e-3``; False if the difference is
        larger or if either value cannot be converted (``ValueError``).
    """
    tolerance = 1e-3
    try:
        first, second = float(val1), float(val2)
    except ValueError:
        # Non-numeric text counts as a mismatch.
        return False
    return np.abs(first - second) < tolerance

# Function to compare specific elements in CSV files
# Walks both files row by row; there is no row limit

def compare_specific_elements(file1, file2, element1, element2):
    """
    Compare two columns of two CSV files row by row and print every discrepancy.

    Row N of ``file1`` is compared with row N of ``file2``. For each pair the
    values of ``element1`` are compared first and, only if they agree, the
    values of ``element2``; values agree when
    ``compare_within_three_decimals`` says so. If one file ends before the
    other, this is reported and the comparison stops there instead of silently
    ignoring the extra rows.

    Args:
        file1 (str): Path to the first CSV file (must have a header row).
        file2 (str): Path to the second CSV file (must have a header row).
        element1 (str): Name of the first column to compare.
        element2 (str): Name of the second column to compare.

    Returns:
        None. All findings are printed.
    """
    # Local import so this function does not depend on a change to the
    # notebook's import lines.
    from itertools import zip_longest

    # newline='' is how the csv module expects its input files to be opened.
    with open(file1, 'r', newline='') as f1, open(file2, 'r', newline='') as f2:
        reader1 = csv.DictReader(f1)
        reader2 = csv.DictReader(f2)

        # zip_longest pads the shorter file with None, which makes a row-count
        # difference detectable (plain zip would just stop early).
        for row_num, (row1, row2) in enumerate(tqdm(zip_longest(reader1, reader2)), start=1):
            if row1 is None or row2 is None:
                shorter = file1 if row1 is None else file2
                print(f"Row count mismatch: {shorter} has no row {row_num}; remaining rows were not compared.")
                break

            first_a = row1.get(element1, None)
            second_a = row1.get(element2, None)
            first_b = row2.get(element1, None)
            second_b = row2.get(element2, None)

            if first_a is None or first_b is None or second_a is None or second_b is None:
                print(f"Missing element at row {row_num}: {element1} or {element2} not found.")
            elif not compare_within_three_decimals(first_a, first_b): # compare element1 values
                print(f"Mismatch at row {row_num}: {element1} = {first_a}, {element1} = {first_b}")
            elif not compare_within_three_decimals(second_a, second_b): # compare element2 values
                print(f"Mismatch at row {row_num}: {element2} = {second_a}, {element2} = {second_b}")

        print(f"Comparsion between {file1} and {file2} is completed")

# Check the local0/local1 columns of the original CSV against the CSV converted from the .dat file
compare_specific_elements('test_measurements/event000000001-measurements.csv', 'test_measurements/measurements.csv', 'local0', 'local1')



253988it [00:01, 136239.98it/s]

Comparsion between test_measurements/event000000001-measurements.csv and test_measurements/measurements.csv is completed





## Create surface_link and geometry_id map (based on event000000001)

In [3]:
file1_path = 'test_measurements/event000000001-measurements.csv' # Traccc output
file2_path = 'test_measurements/measurements.csv' # dat2csv output

df1 = pd.read_csv(file1_path)
df2 = pd.read_csv(file2_path)

# The map pairs row i of one file with row i of the other. With different row
# counts, concat(axis=1) would pad the shorter column with NaN and produce
# bogus pairs, so refuse to continue.
if len(df1) != len(df2):
    raise ValueError(
        f"Row count mismatch: {file1_path} has {len(df1)} rows, {file2_path} has {len(df2)} rows"
    )

df1_selected = df1['surface_link']
df2_selected = df2['geometry_id']

combined_df = pd.concat([df1_selected, df2_selected], axis=1)
# Keep each distinct (surface_link, geometry_id) pair once
unique_map = combined_df.drop_duplicates()

# drop_duplicates only removes repeated pairs; it does not by itself guarantee
# a one-to-one mapping, so check that explicitly and report any ambiguity.
if not (unique_map['surface_link'].is_unique and unique_map['geometry_id'].is_unique):
    print('Warning: surface_link to geometry_id mapping is not one-to-one')

output_path = 'test_measurements/surface_link_geometry_map.csv'
unique_map.to_csv(output_path, index=False)

print(f'surface_link to geometry_id map is saved to {output_path}')

surface_link to geometry_id map is saved to test_measurements/surface_link_geometry_map.csv
