In [22]:
import pandas as pd
import librosa
import soundfile as sf
import os

In [23]:
# Recording sessions to process. Each entry pairs one WAV recording ("audio")
# with the CSV log ("lidar_log") that is cut against it further down.
_SOURCE_DIR = "../../data"
_SESSIONS = (
    ("audio_22092022.WAV", "lidar_data_with_audio_timestamps_22_09_22.csv"),
    ("audio_27092022_1.WAV", "lidar_data_with_audio_timestamps_27_09_22_1.csv"),
    ("audio_27092022_2.WAV", "lidar_data_with_audio_timestamps_27_2.csv"),
    ("audio_21102022_1.WAV", "lidar_data_with_audio_timestamps_oct_21_1.csv"),
    ("audio_21102022_2.WAV", "lidar_data_with_audio_timestamps_oct_21_2.csv"),
)
datasets = [
    {"audio": f"{_SOURCE_DIR}/{wav_name}", "lidar_log": f"{_SOURCE_DIR}/{log_name}"}
    for wav_name, log_name in _SESSIONS
]

# Output directories for the generated clips and their index CSVs.
TRAIN_DATA_PATH = "./data/train"
VALID_DATA_PATH = "./data/valid"

In [24]:
# Clear the output directories so every run starts from an empty train/valid set.
import glob  # `os` is already imported in the first cell

for data_dir in (TRAIN_DATA_PATH, VALID_DATA_PATH):
    # Create the directory if it is missing (e.g. on a fresh checkout);
    # otherwise the clip export further down has nowhere to write.
    os.makedirs(data_dir, exist_ok=True)
    for path in glob.glob(f'{data_dir}/*'):
        # os.remove() raises on directories, so only delete plain files and
        # symlinks and leave any sub-directory untouched.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)

In [25]:
# Cut every recording into one clip per log row and write the clips, plus one
# index CSV each, into the train and validation directories.
#
# Within each recording the first TRAIN_FRACTION of the rows (in log order) go
# to train and the remainder to validation. Clip numbers keep counting across
# recordings, so file names are unique over the whole run.
TRAIN_FRACTION = 0.8

start_index = 0
train_parts = []
valid_parts = []
for dataset in datasets:
    print("Generating dataset for", dataset["lidar_log"], dataset["audio"])
    audio_file = dataset["audio"]
    lidar_log = pd.read_csv(dataset["lidar_log"])
    # DataFrame.drop returns a new frame, so the result must be assigned back.
    # errors="ignore" tolerates logs that were saved without this column.
    lidar_log = lidar_log.drop(columns=["Unnamed: 0"], errors="ignore")

    y, sr = librosa.load(audio_file, sr=None)  # sr=None: keep the file's own sample rate
    audio_length = librosa.get_duration(y=y, sr=sr)

    # Keep only rows whose time span lies completely inside the recording.
    in_range = (lidar_log["audio_start_s"] >= 0) & (lidar_log["audio_end_s"] <= audio_length)
    lidar_log = lidar_log[in_range].reset_index(drop=True)

    n_rows = len(lidar_log)
    filenames = []
    train_flags = []
    clip_spans = zip(lidar_log["audio_start_s"], lidar_log["audio_end_s"])
    for i, (start_s, end_s) in enumerate(clip_spans):
        # Seconds -> sample offsets into y.
        start = round(start_s * sr)
        end = round(end_s * sr)
        file_index = i + start_index

        is_train = i / n_rows < TRAIN_FRACTION
        if is_train:
            filename = f"train_{file_index}.wav"
            wavpath = f"{TRAIN_DATA_PATH}/{filename}"
        else:
            filename = f"valid_{file_index}.wav"
            wavpath = f"{VALID_DATA_PATH}/{filename}"

        # The filter above is inclusive at both ends, so a clip may start at
        # sample 0 and may end exactly on the last sample (end == len(y)).
        assert start >= 0, f"clip {file_index} starts before the recording"
        assert end <= len(y), f"clip {file_index} ends after the recording"
        sf.write(wavpath, y[start:end], sr, 'PCM_24')

        filenames.append(filename)
        # Stored as the strings "True"/"False", which is what the selection
        # below compares against.
        train_flags.append("True" if is_train else "False")

    lidar_log["filename"] = filenames
    lidar_log["train"] = train_flags

    train = lidar_log[lidar_log["train"] == "True"]
    valid = lidar_log[lidar_log["train"] == "False"].reset_index(drop=True)

    train_parts.append(train)
    valid_parts.append(valid)

    start_index = start_index + len(lidar_log.index)
    print("Finished dataset for", dataset["lidar_log"], "Generated total", start_index, "samples")

# Concatenate once at the end instead of growing a frame inside the loop.
# pd.concat raises on an empty list, hence the fallback to an empty frame.
all_train = pd.concat(train_parts) if train_parts else pd.DataFrame()
all_valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

all_train.to_csv(f"{TRAIN_DATA_PATH}/train.csv", header=True)
all_valid.to_csv(f"{VALID_DATA_PATH}/valid.csv", header=True)

Generating dataset for ../../data/lidar_data_with_audio_timestamps_22_09_22.csv ../../data/audio_22092022.WAV
Finished dataset for ../../data/lidar_data_with_audio_timestamps_22_09_22.csv Generated total 534 samples
Generating dataset for ../../data/lidar_data_with_audio_timestamps_27_09_22_1.csv ../../data/audio_27092022_1.WAV
Finished dataset for ../../data/lidar_data_with_audio_timestamps_27_09_22_1.csv Generated total 974 samples
Generating dataset for ../../data/lidar_data_with_audio_timestamps_27_2.csv ../../data/audio_27092022_2.WAV
Finished dataset for ../../data/lidar_data_with_audio_timestamps_27_2.csv Generated total 1798 samples
Generating dataset for ../../data/lidar_data_with_audio_timestamps_oct_21_1.csv ../../data/audio_21102022_1.WAV
Finished dataset for ../../data/lidar_data_with_audio_timestamps_oct_21_1.csv Generated total 2330 samples
Generating dataset for ../../data/lidar_data_with_audio_timestamps_oct_21_2.csv ../../data/audio_21102022_2.WAV
Finished dataset for