In [1]:
# Import Libraries needed
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from os import listdir
from os.path import join
from datetime import datetime, timedelta

In [2]:
# Set file paths for mechanical readings csv and wav file location
DATA_PATH = "/kaggle/input/rain-data-master-8k-10-sec"
MECH_FILE_PATH = "/kaggle/input/rain-data-master-8k-10-sec/davis_label_10_sec.csv"
NON_MECH_PATH = "/kaggle/input/rain-data-master-8k-10-sec/rainfall_sound_8k_10_sec"
Fs = 8000
MAX_LEN = 80000

In [3]:
# Read mechanical raingauge readings as csv using pandas
mech_data = pd.read_csv(join(DATA_PATH, MECH_FILE_PATH))

# Convert time column to pandas time datatype
mech_data["time"]= pd.to_datetime(mech_data["time"])

# Get list of wav files and sort them in ascending order
wave_files = sorted(listdir(NON_MECH_PATH))

# Get total count of wav files available
N = len(wave_files)

In [4]:
# Function to parse date and time components from wav filenames
def filename_parser(filename):
    year, month, day, hour, minute, second, _ = map(int, filename.split(".")[0].split("_"))
    return datetime(year, month, day, hour, minute, second)

# Get the fist starting time available for wav files
start_time = filename_parser(wave_files[0])

# Get the last time available for wav files
end_time = filename_parser(wave_files[-1])

In [5]:
# Find overlapping time instances where mechanical readings and wav files are available
row_overlap = (mech_data["time"]>start_time)&(mech_data["time"]<end_time)
mech_data = mech_data[row_overlap]

In [6]:
mech_data["time_delta"] = mech_data["time"].diff()
mech_data["time_delta"] = mech_data["time_delta"].dt.total_seconds()

In [7]:
# To find the number of unique days where we have recording available
mech_data["time"] = mech_data["time"].astype('str')
mech_data[["YMD", "HMS"]] = mech_data["time"].str.split(" ",expand=True)
mech_data["YMD"].value_counts().keys()

Index(['2024-04-12', '2024-04-13', '2024-04-11', '2024-04-15', '2024-04-14'], dtype='object', name='YMD')

In [8]:
mech_data["time"]= pd.to_datetime(mech_data["time"])
mech_data = mech_data[["time", "rainfall"]]

In [9]:
# mech_data = mech_data[mech_data["rainfall"]!=0.0]

In [10]:
mech_data

Unnamed: 0,time,rainfall
0,2024-04-11 22:20:59.757439,0.0
1,2024-04-11 22:21:09.757439,0.0
2,2024-04-11 22:21:19.757438,0.0
3,2024-04-11 22:21:29.757440,0.0
4,2024-04-11 22:21:39.757440,0.0
...,...,...
282,2024-04-15 06:00:19.766818,0.2
283,2024-04-15 06:00:29.757440,0.0
284,2024-04-15 06:00:39.757441,0.0
285,2024-04-15 06:00:49.757439,0.0


In [11]:
# Function to load a wav file using librosa and get its sampling rate and duration
def load_wav(file_path, Fs=Fs):
    audio, Fs = librosa.load(file_path, sr=Fs)
    duration = librosa.get_duration(y=audio, sr=Fs)
    return audio, Fs, duration

In [12]:
mech_data

Unnamed: 0,time,rainfall
0,2024-04-11 22:20:59.757439,0.0
1,2024-04-11 22:21:09.757439,0.0
2,2024-04-11 22:21:19.757438,0.0
3,2024-04-11 22:21:29.757440,0.0
4,2024-04-11 22:21:39.757440,0.0
...,...,...
282,2024-04-15 06:00:19.766818,0.2
283,2024-04-15 06:00:29.757440,0.0
284,2024-04-15 06:00:39.757441,0.0
285,2024-04-15 06:00:49.757439,0.0


In [13]:
def get_fname_time(file_name):
    file_name_short = file_name.split("_")[:-1]
    year, month, day, hour, minute, second = map(int, file_name_short)
    fname_time = datetime(year, month, day, hour, minute, second)
    return fname_time

def ftime_to_fname(ftime):
    ftime_prev = ftime.strftime('%Y_%m_%d_%H_%M_%S')
    return ftime_prev

def get_previous_file(file_name, wave_files):
    fname_time = get_fname_time(file_name)
    fname_time_prev = fname_time-timedelta(seconds=10)
    prev_file_name = ftime_to_fname(fname_time_prev)
    prev_file_to_read = None
    for wave_file in wave_files:
        wave_file_short = "_".join(wave_file.split("_")[:-1])
        if wave_file_short == prev_file_name:
            prev_file_to_read = wave_file
            break
        else:
            continue
    return prev_file_to_read

def evaluate_duration(duration):
    sec_difference = duration.total_seconds()
    if sec_difference<0:
        selection_flag = False
        partial = False
        current_file_init_secs = 0
        prev_file_end_secs = 0
    elif sec_difference==0:
        selection_flag = True
        partial = False
        current_file_init_secs = 10
        prev_file_end_secs = 0
    elif sec_difference>9:
        selection_flag = False
        partial = False
        current_file_init_secs = 0
        prev_file_end_secs = 0
    elif sec_difference<=9:
        selection_flag = True
        partial = True
        current_file_init_secs = 10-sec_difference
        prev_file_end_secs  = sec_difference
    return selection_flag, partial, current_file_init_secs, prev_file_end_secs

def read_and_merge_files(file_path_1, file_path_2, current_file_init_secs, prev_file_end_secs):
    audio_1, _, _ = load_wav(file_path_1)
    audio_2, _, _ = load_wav(file_path_2)
    audio_1 = audio_1[:int(current_file_init_secs*Fs)]
    audio_2 = audio_2[int((10-prev_file_end_secs)*Fs):]
    audio = np.concatenate((audio_1, audio_2), axis=None)
    return audio

def file_flagger(file_name, check_point, rainfall):
    check_point = check_point.replace(microsecond=0)
    fname_time = get_fname_time(file_name).replace(microsecond=0)
    fname_end_time = fname_time+timedelta(seconds=10)
    duration = fname_end_time-check_point
    selection_flag, partial, current_file_init_secs, prev_file_end_secs = evaluate_duration(duration)
    if selection_flag and not partial:
        file_path = join(NON_MECH_PATH, file_name)
        audio, Fs, duration = load_wav(file_path)
        print("Reading the full file: ", file_name)
    elif selection_flag and partial:
        prev_file_to_read = get_previous_file(file_name, wave_files)
        if prev_file_to_read:
            file_path_1 = join(NON_MECH_PATH, file_name)
            file_path_2 = join(NON_MECH_PATH, prev_file_to_read)
            print("Reading initial of {} and end of {}".format(file_name, prev_file_to_read))
            audio = read_and_merge_files(file_path_1, file_path_2, current_file_init_secs, prev_file_end_secs)
        else:
            audio = np.zeros(MAX_LEN)
    elif not selection_flag:
        audio = np.zeros(MAX_LEN)
    return audio, selection_flag

In [14]:
data_basic = pd.DataFrame()
target = np.array([])
for idx, row in mech_data.iterrows():
    check_point = row["time"]
    rainfall = row["rainfall"]
    for wave_file in wave_files:
        audio_sample, selection_flag = file_flagger(wave_file, check_point, rainfall)
        if selection_flag:
            with open("audio_{}.npy".format(idx), "wb") as f:
                np.save(f, audio_sample)
            data_row = {"checkpoint": check_point,
                        "fname": "audio_{}.npy".format(idx), 
                        "target": row["rainfall"]}
            data_basic = pd.concat([data_basic, pd.DataFrame([data_row])], ignore_index=True)
        else:
            continue

Reading initial of 2024_04_11_22_21_01_206302.wav and end of 2024_04_11_22_20_51_032425.wav
Reading initial of 2024_04_11_22_21_11_377146.wav and end of 2024_04_11_22_21_01_206302.wav
Reading initial of 2024_04_11_22_21_21_550275.wav and end of 2024_04_11_22_21_11_377146.wav
Reading initial of 2024_04_11_22_21_31_724199.wav and end of 2024_04_11_22_21_21_550275.wav
Reading initial of 2024_04_11_22_21_41_895893.wav and end of 2024_04_11_22_21_31_724199.wav
Reading initial of 2024_04_11_22_22_02_239525.wav and end of 2024_04_11_22_21_52_066773.wav
Reading initial of 2024_04_11_22_22_12_413496.wav and end of 2024_04_11_22_22_02_239525.wav
Reading initial of 2024_04_11_22_22_22_586632.wav and end of 2024_04_11_22_22_12_413496.wav
Reading initial of 2024_04_11_22_22_32_760900.wav and end of 2024_04_11_22_22_22_586632.wav
Reading initial of 2024_04_11_22_43_27_627611.wav and end of 2024_04_11_22_43_17_453961.wav
Reading initial of 2024_04_11_22_43_37_799816.wav and end of 2024_04_11_22_43_27

In [15]:
data_basic

Unnamed: 0,checkpoint,fname,target
0,2024-04-11 22:20:59.757439,audio_0.npy,0.0
1,2024-04-11 22:21:09.757439,audio_1.npy,0.0
2,2024-04-11 22:21:19.757438,audio_2.npy,0.0
3,2024-04-11 22:21:29.757440,audio_3.npy,0.0
4,2024-04-11 22:21:39.757440,audio_4.npy,0.0
...,...,...,...
266,2024-04-14 06:01:59.757438,audio_275.npy,0.2
267,2024-04-15 06:00:29.757440,audio_283.npy,0.0
268,2024-04-15 06:00:39.757441,audio_284.npy,0.0
269,2024-04-15 06:00:49.757439,audio_285.npy,0.0


In [16]:
data_basic.to_csv("data_basic.csv")
mech_data.to_csv("mech_data.csv")

In [17]:
mech_data

Unnamed: 0,time,rainfall
0,2024-04-11 22:20:59.757439,0.0
1,2024-04-11 22:21:09.757439,0.0
2,2024-04-11 22:21:19.757438,0.0
3,2024-04-11 22:21:29.757440,0.0
4,2024-04-11 22:21:39.757440,0.0
...,...,...
282,2024-04-15 06:00:19.766818,0.2
283,2024-04-15 06:00:29.757440,0.0
284,2024-04-15 06:00:39.757441,0.0
285,2024-04-15 06:00:49.757439,0.0
