In [None]:
import pandas as pd
import torchaudio
import shutil
import os
import math
from src.datasets import get_data_path, DATASET_PATH

In [None]:
# Name of the dataset whose annotation file and audio directory are resolved below.
DATASET_NAME = "hesitation_test"
annotations_path, data_path = get_data_path(DATASET_NAME)

In [None]:
# Read the annotation table from disk and preview its first rows.
annotations = pd.read_csv(filepath_or_buffer=annotations_path)
annotations.head()  # head() shows five rows by default

In [None]:
# Number of rows in the annotation table.
annotations.shape[0]

In [None]:
def get_audio_duration(file_path):
	"""Return the length of an audio clip in whole seconds, rounded up.

	`file_path` is joined onto the notebook-level `data_path`, the file is
	decoded with `torchaudio.load`, and the size of the waveform's last axis
	is divided by the reported sample rate before applying `math.ceil`.
	"""
	full_path = os.path.join(data_path, file_path)
	samples, rate = torchaudio.load(full_path)
	frame_count = samples.shape[-1]
	return math.ceil(frame_count / rate)
# Compute the duration of every annotated clip (decodes each file once) and
# summarise the resulting distribution.
annotations['duration(sec)'] = annotations['file_path'].map(get_audio_duration)
annotations['duration(sec)'].describe()

In [None]:
# Keep only the clips whose computed duration is at least 3 seconds.
annotations = annotations.loc[annotations['duration(sec)'].ge(3)]
annotations['duration(sec)'].describe()

In [None]:
# Keep rows with zero noise/low-voice votes; the previous index is preserved
# in an "original_idx" column.
filtered_annotations = (
	annotations
	.loc[annotations['votes_for_noise_or_low_voice'].eq(0)]
	.reset_index(names=["original_idx"])
)
filtered_annotations.shape[0]

In [None]:
# A clip is flagged (1) when either vote column is positive, otherwise 0.
filtered_annotations["has_hesitation"] = (
	filtered_annotations[['votes_for_hesitation', 'votes_for_filled_pause']]
	.gt(0)
	.any(axis=1)
	.astype(int)
)
filtered_annotations[["has_hesitation"]].value_counts()

In [None]:
# Distinct label values and the size of the smallest class, which is the
# per-class sample size used for balancing.
classes = filtered_annotations["has_hesitation"].unique()
class_sizes = filtered_annotations[["has_hesitation"]].value_counts()
less_class_count = class_sizes.min()
(classes, less_class_count)

In [None]:
# Draw the same number of rows from every class (fixed seed), restore the
# original row order, and renumber the index.
balanced_data = (
	pd.concat([
		filtered_annotations[filtered_annotations['has_hesitation'] == label]
		.sample(n=less_class_count, random_state=1)
		for label in classes
	])
	.sort_index()
	.reset_index(drop=True)
)
# -999 is a placeholder value for the column that will hold the new label.
balanced_data['NEW_HAS_HESITATION'] = -999
balanced_data['has_hesitation'].value_counts()

In [None]:
# Destination directory (under DATASET_PATH) that receives the copied audio
# files and the exported annotation CSV.
data_to_label_dir = os.path.join(DATASET_PATH, "data_to_label")
def copy_audio(path):
	"""Copy one audio file from `data_path` into `data_to_label_dir`.

	The relative `path` is preserved under the destination, and any missing
	intermediate directories are created first.

	Args:
		path: File path relative to the notebook-level `data_path`.

	Raises:
		OSError: If the source file is missing or the destination cannot be
			created or written.
	"""
	new_path = os.path.join(data_to_label_dir, path)
	# os.path.dirname keeps the root of an absolute path. Splitting on os.sep
	# and re-joining dropped the leading separator, so the directory was
	# created relative to the working directory and the copy then failed.
	new_dir = os.path.dirname(new_path)
	if new_dir:
		# exist_ok makes repeated calls (and re-running the cell) safe.
		os.makedirs(new_dir, exist_ok=True)
	shutil.copy(os.path.join(data_path, path), new_path)
    
# Copy every selected clip into the labeling directory. A plain loop is used
# because copy_audio is called only for its side effect.
for audio_path in balanced_data['file_path']:
	copy_audio(audio_path)

new_annotations_path = os.path.join(data_to_label_dir, "annotations_to_label.csv")
# Drop the vote, metadata and existing-label columns (presumably so labelers
# do not see the prior labels -- confirm), then shuffle the rows. The shuffle
# is seeded so that Restart & Run All writes the same CSV every time.
balanced_data.drop(
	columns=["task", "variety", "accent", "speech_genre", "speech_style", "up_votes", "down_votes", "votes_for_hesitation", "votes_for_filled_pause", "votes_for_noise_or_low_voice", "votes_for_second_voice", "votes_for_no_identified_problem", "text", "duration(sec)", "has_hesitation"],
).sample(frac=1, random_state=1).to_csv(new_annotations_path, index=False)