In [1]:
# Import Libraries needed
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from os import listdir
from os.path import join
from collections import Counter
from scipy.io.wavfile import write
from datetime import datetime, timedelta



In [2]:
!mkdir data 

In [3]:
# Locations of the dataset root, the mechanical rain-gauge CSV and the folder of wav recordings
DATA_PATH = "/kaggle/input/rain-data-master-8k"
MECH_FILE_PATH = "/kaggle/input/rain-data-master-8k/rain_data_mechanical_master.csv"
NON_MECH_PATH = "/kaggle/input/rain-data-master-8k/rainfall_sound_8k"

# Audio parameters
Fs = 8000  # sampling rate in samples/sec
MAX_LEN = 171 * Fs  # 1368000 samples = 171 sec at 8k samples/sec

# Length (seconds) of the audio window collected before each mechanical reading
DAVIS_INTERVAL = 180

In [4]:
# Load the mechanical rain-gauge log. MECH_FILE_PATH is already an absolute path, so it is
# passed as-is (os.path.join drops DATA_PATH when the second argument is absolute anyway).
mech_data = pd.read_csv(MECH_FILE_PATH)

# Parse the "Time" column into pandas datetimes
mech_data["Time"] = pd.to_datetime(mech_data["Time"])

# Name-sorted list of the wav recordings (ascending order)
wave_files = listdir(NON_MECH_PATH)
wave_files.sort()

# Number of wav recordings available
N = len(wave_files)

In [5]:
# Peek at the first two rows to check the raw column layout
mech_data.head(2)

Unnamed: 0,Time,device_frmpayload_data_rainfall
0,2023-11-22 18:27:26,800 µm
1,2023-11-22 18:30:26,1.40 mm


In [6]:
# Function to parse date and time components from wav filenames
def filename_parser(filename):
    """Return the datetime encoded in a wav filename.

    The part of the name before the first "." must consist of exactly seven
    "_"-separated integers: year, month, day, hour, minute, second and one
    trailing field that is ignored.
    """
    stem = filename.split(".")[0]
    fields = [int(part) for part in stem.split("_")]
    year, month, day, hour, minute, second, _ = fields
    return datetime(year, month, day, hour, minute, second)

# Timestamps of the first and the last wav file (wave_files is sorted ascending)
first_wav, last_wav = wave_files[0], wave_files[-1]
start_time = filename_parser(first_wav)
end_time = filename_parser(last_wav)

In [7]:
# Keep only the mechanical readings taken strictly between the first and the last wav file.
row_overlap = (mech_data["Time"] > start_time) & (mech_data["Time"] < end_time)

# .copy() makes the filtered result an independent frame, so the column assignments done on
# mech_data in later cells do not raise pandas' SettingWithCopyWarning.
mech_data = mech_data[row_overlap].copy()

In [8]:
# Gap to the previous reading, in seconds (NaN for the first row)
mech_data["time_delta"] = mech_data["Time"].diff().dt.total_seconds()

In [9]:
# List the distinct calendar days for which readings are available.
# "Time" is turned into text so it can be split into a date part and a clock part.
mech_data["Time"] = mech_data["Time"].astype('str')
date_and_clock = mech_data["Time"].str.split(" ", expand=True)
mech_data[["YMD", "HMS"]] = date_and_clock
mech_data["YMD"].value_counts().index

Index(['2023-12-17', '2023-11-23', '2024-05-20', '2024-05-29', '2024-05-28',
       '2023-11-22', '2024-05-21', '2024-01-05', '2023-12-02', '2024-05-26',
       '2024-05-17', '2024-04-23', '2024-05-24', '2024-05-15', '2024-05-25',
       '2024-05-22', '2024-05-27', '2024-05-11', '2023-12-16', '2024-04-24',
       '2024-04-30', '2023-12-18', '2023-12-08', '2024-05-16', '2024-04-27',
       '2024-05-14', '2024-05-12'],
      dtype='object', name='YMD')

In [10]:
# Keep only the timestamp and the rainfall reading, with "Time" converted back from text
# to pandas datetimes.
kept_columns = ["Time", "device_frmpayload_data_rainfall"]
mech_data = mech_data[kept_columns].assign(
    Time=lambda frame: pd.to_datetime(frame["Time"])
)

In [11]:
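# Disabled filter for dropping zero-rainfall readings. NOTE: mech_data has no "rainfall"
# column at this point (the readings are in "device_frmpayload_data_rainfall" and are still
# unit strings), so the line below must be adapted before it is re-enabled.
# mech_data = mech_data[mech_data["rainfall"]!=0.0]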

In [12]:
# Inspect the readings that remain after the overlap filter and column selection
mech_data

Unnamed: 0,Time,device_frmpayload_data_rainfall
0,2023-11-22 18:27:26,800 µm
1,2023-11-22 18:30:26,1.40 mm
2,2023-11-22 18:33:26,1 mm
3,2023-11-22 18:36:26,1.40 mm
4,2023-11-22 18:39:26,200 µm
...,...,...
1342,2024-05-29 16:38:56,0.6 mm
1343,2024-05-29 16:47:56,0.6 mm
1344,2024-05-29 17:02:56,0.6 mm
1345,2024-05-29 17:05:56,0.6 mm


In [13]:
# Function to load a wav file using librosa and get its sampling rate and duration
def load_wav(file_path, Fs=Fs):
    """Load one audio file through librosa.

    Parameters
    ----------
    file_path : path of the audio file to read.
    Fs : sampling rate handed to ``librosa.load`` as ``sr``
        (defaults to the notebook-level ``Fs``).

    Returns
    -------
    (samples, sampling_rate, duration) where ``sampling_rate`` is the rate
    reported by ``librosa.load`` and ``duration`` is the value of
    ``librosa.get_duration`` for the loaded samples.
    """
    samples, sample_rate = librosa.load(file_path, sr=Fs)
    length_sec = librosa.get_duration(y=samples, sr=sample_rate)
    return samples, sample_rate, length_sec

In [14]:
# Display the mechanical readings again (the frame has not changed since it was last shown)
mech_data

Unnamed: 0,Time,device_frmpayload_data_rainfall
0,2023-11-22 18:27:26,800 µm
1,2023-11-22 18:30:26,1.40 mm
2,2023-11-22 18:33:26,1 mm
3,2023-11-22 18:36:26,1.40 mm
4,2023-11-22 18:39:26,200 µm
...,...,...
1342,2024-05-29 16:38:56,0.6 mm
1343,2024-05-29 16:47:56,0.6 mm
1344,2024-05-29 17:02:56,0.6 mm
1345,2024-05-29 17:05:56,0.6 mm


In [15]:
def get_fname_time(file_name):
    """Return the datetime encoded in a wav filename.

    Every "_"-separated field except the last one is read as an integer; there
    must be exactly six of them: year, month, day, hour, minute, second.
    """
    date_fields = [int(field) for field in file_name.split("_")[:-1]]
    year, month, day, hour, minute, second = date_fields
    return datetime(year, month, day, hour, minute, second)

In [16]:
def ftime_to_fname(ftime):
    """Format a datetime as the "YYYY_MM_DD_HH_MM_SS" text used in wav filenames."""
    return ftime.strftime('%Y_%m_%d_%H_%M_%S')

In [17]:
def filter_files(filenames, start_time, timestamp):
    """Select the files whose name-encoded time is near the window.

    A file is kept when its time (microseconds zeroed) lies within
    [start_time - 3 min, timestamp + 3 min], both ends inclusive.
    The input order of ``filenames`` is preserved.
    """
    margin = timedelta(minutes=3)
    earliest = start_time - margin
    latest = timestamp + margin
    return [
        name
        for name in filenames
        if earliest <= get_fname_time(name).replace(microsecond=0) <= latest
    ]

In [18]:
def combine_audio(timestamp, filenames):
    """Concatenate the audio recorded in the DAVIS_INTERVAL seconds before ``timestamp``.

    The window is [timestamp - DAVIS_INTERVAL seconds, timestamp]. Candidate files are
    pre-selected with ``filter_files``; each one is loaded with ``load_wav`` and clipped
    to the part that falls inside the window before being appended.

    Parameters
    ----------
    timestamp : datetime marking the end of the window.
    filenames : iterable of wav file names located in NON_MECH_PATH; pieces are
        concatenated in the order the names appear here.

    Returns
    -------
    numpy.ndarray holding the concatenated samples (an empty array when no
    recording overlaps the window).
    """
    start_time = timestamp - timedelta(seconds=DAVIS_INTERVAL)
    pieces = []
    for filename in filter_files(filenames, start_time, timestamp):
        fname_time = get_fname_time(filename).replace(microsecond=0)
        # A recording that starts at/after the end of the window contributes nothing,
        # so skip it before paying for the decode.
        if fname_time >= timestamp:
            continue

        audio_data, sampling_rate, audio_duration = load_wav(join(NON_MECH_PATH, filename))
        fname_end_time = fname_time + timedelta(seconds=audio_duration)
        # A recording that ends at/before the start of the window contributes nothing.
        if fname_end_time <= start_time:
            continue

        first = 0
        last = len(audio_data)
        if fname_time < start_time:
            # Drop the leading part recorded before the window opens.
            overlap_secs = (fname_end_time - start_time).total_seconds()
            first = int((audio_duration - overlap_secs) * sampling_rate)
        if fname_end_time > timestamp:
            # Drop the trailing part recorded after the window closes. This is checked
            # independently of the start trim so that a recording which begins at or
            # before the window start AND runs past its end is clipped on both sides.
            overhang_secs = (fname_end_time - timestamp).total_seconds()
            last = int((audio_duration - overhang_secs) * sampling_rate)
        pieces.append(audio_data[first:last])

    if not pieces:
        return np.array([])
    # One concatenate call instead of extending a Python list sample by sample.
    return np.concatenate(pieces)

In [19]:
# Build one fixed-length audio sample per mechanical reading.
# For every reading, the audio of the preceding window is assembled; samples with at
# least MAX_LEN values are cut to their last MAX_LEN values, saved under data/ as .npy,
# and listed in data_basic together with the checkpoint time and the rainfall reading.
target = np.array([])  # not used by this cell; kept so the notebook namespace is unchanged
audio_lens = []        # length (in samples) of the combined audio of EVERY reading
records = []           # one dict per sample that was long enough to be saved
for idx, row in tqdm(mech_data.iterrows(), total=mech_data.shape[0]):
    check_point = row["Time"]
    rainfall = row["device_frmpayload_data_rainfall"]
    audio_sample = combine_audio(check_point, wave_files)
    audio_len = audio_sample.shape[0]
    audio_lens.append(audio_len)
    if audio_len < MAX_LEN:
        # Too little audio around this reading: skip it.
        continue
    audio_sample = audio_sample[-MAX_LEN:]
    with open(join("data", "audio_{}.npy".format(idx)), "wb") as f:
        np.save(f, audio_sample)
    # NOTE(review): the recorded name ends in ".wav" while the file written above is
    # ".npy" - confirm what downstream readers expect before changing either side.
    records.append({"checkpoint": check_point,
                    "fname": "audio_{}.wav".format(idx),
                    "target": rainfall})

# Build the frame once instead of concatenating row by row (which is quadratic).
data_basic = pd.DataFrame(records, columns=["checkpoint", "fname", "target"])

100%|██████████| 1347/1347 [16:24<00:00,  1.37it/s]


In [20]:
# Function to trim unit from target column and convert all values to millimeters
# unit -> (multiplier, divisor) that turns a value in that unit into millimeters.
# The micrometer entry uses a divisor so that results match float(value)/(10**3) exactly.
_RAINFALL_UNIT_TO_MM = {
    "\u00b5m": (1, 10**3),  # micro sign + m
    "\u03bcm": (1, 10**3),  # Greek small letter mu + m (looks identical to the above)
    "um": (1, 10**3),
    "mm": (1, 1),
    "cm": (10, 1),
    "m": (10**3, 1),
}


def format_rainfall(rain_fall):
    """Convert a rainfall reading such as "800 µm" or "1.40 mm" to millimeters.

    Parameters
    ----------
    rain_fall : str of the form "<number> <unit>" (any whitespace as separator), or a
        value that is already numeric. Numeric input is returned as ``float`` so that
        applying the conversion twice to the same column is harmless.

    Returns
    -------
    float : the reading in millimeters.

    Raises
    ------
    ValueError : if the text is not "<number> <unit>", the number cannot be parsed,
        or the unit is not one of the known length units. (Previously an unknown
        unit silently returned the numeric text as a string.)
    """
    if not isinstance(rain_fall, str):
        return float(rain_fall)

    parts = rain_fall.split()
    if len(parts) != 2:
        raise ValueError("expected '<number> <unit>', got {!r}".format(rain_fall))
    value, unit = parts
    if unit not in _RAINFALL_UNIT_TO_MM:
        raise ValueError("unknown rainfall unit {!r} in {!r}".format(unit, rain_fall))
    multiplier, divisor = _RAINFALL_UNIT_TO_MM[unit]
    return float(value) * multiplier / divisor

In [21]:
# Convert the rainfall readings of both frames from unit strings to millimeters
for frame, rain_column in (
    (data_basic, "target"),
    (mech_data, "device_frmpayload_data_rainfall"),
):
    frame[rain_column] = frame[rain_column].apply(format_rainfall)

In [22]:
# How often each combined-audio length (in samples) occurred across all readings
length_histogram = Counter(audio_lens)
print(length_histogram)

Counter({1392000: 291, 1384000: 161, 1424000: 154, 1376000: 151, 1368000: 141, 0: 74, 1400000: 72, 1360000: 53, 1408000: 46, 1416000: 34, 1320000: 30, 1328000: 28, 1344000: 26, 1336000: 26, 1352000: 21, 1432000: 16, 1312000: 5, 1296000: 4, 8000: 2, 1208000: 1, 16000: 1, 1304000: 1, 848000: 1, 1192000: 1, 1184000: 1, 248000: 1, 1160000: 1, 880000: 1, 288000: 1, 272000: 1, 96000: 1})


In [23]:
# Write the sample metadata and the converted mechanical readings to the working directory
for frame, csv_name in ((data_basic, "data_basic.csv"), (mech_data, "mech_data.csv")):
    frame.to_csv(csv_name)

In [24]:
# Mechanical readings after the rainfall column was converted to millimeters
mech_data

Unnamed: 0,Time,device_frmpayload_data_rainfall
0,2023-11-22 18:27:26,0.8
1,2023-11-22 18:30:26,1.4
2,2023-11-22 18:33:26,1.0
3,2023-11-22 18:36:26,1.4
4,2023-11-22 18:39:26,0.2
...,...,...
1342,2024-05-29 16:38:56,0.6
1343,2024-05-29 16:47:56,0.6
1344,2024-05-29 17:02:56,0.6
1345,2024-05-29 17:05:56,0.6


In [25]:
# Metadata of the saved audio samples: checkpoint time, recorded file name, rainfall target
data_basic

Unnamed: 0,checkpoint,fname,target
0,2023-11-22 18:27:26,audio_0.wav,0.8
1,2023-11-22 18:30:26,audio_1.wav,1.4
2,2023-11-22 18:33:26,audio_2.wav,1.0
3,2023-11-22 18:36:26,audio_3.wav,1.4
4,2023-11-22 18:39:26,audio_4.wav,0.2
...,...,...,...
1061,2024-05-29 16:20:56,audio_1339.wav,0.6
1062,2024-05-29 16:23:56,audio_1340.wav,0.6
1063,2024-05-29 16:26:56,audio_1341.wav,0.6
1064,2024-05-29 17:05:56,audio_1345.wav,0.6
