In [1]:
import pandas as pd
import torchaudio
import shutil
import os
import math
from src.datasets import get_data_path, DATASET_PATH

In [2]:
# Look up the annotations CSV and the audio root folder registered under the
# "hesitation_test" key (get_data_path is a project helper from src.datasets).
annotations_path, data_path = get_data_path("hesitation_test")

In [3]:
# Load the annotation table: one row per audio clip (relative file path,
# vote counts, transcript and duration).
# NOTE(review): the CSV carries an unnamed leading column that shows up as
# "Unnamed: 0"; it looks like a saved index -- confirm before dropping it.
annotations = pd.read_csv(annotations_path)
# Preview the first rows to check the available columns.
annotations.head(5)

Unnamed: 0,file_path,task,variety,dataset,accent,speech_genre,speech_style,up_votes,down_votes,votes_for_hesitation,votes_for_filled_pause,votes_for_noise_or_low_voice,votes_for_second_voice,votes_for_no_identified_problem,text,duration(sec)
0,test/sp/42881_sp_.wav,annotation_and_transcription,pt_br,SP2010,São Paulo (cap.),Conversation or Interview or Reading,Spontaneous and Read Speech,2,1,0.0,0.0,1.0,0.0,2.0,certo e aí quanto tempo você mora aqui ne ness...,4
1,test/sp/42883_sp_.wav,annotation_and_transcription,pt_br,SP2010,São Paulo (cap.),Conversation or Interview or Reading,Spontaneous and Read Speech,2,1,0.0,0.0,1.0,0.0,2.0,ah legal faz tempão já,2
2,test/sp/42885_sp_.wav,annotation_and_transcription,pt_br,SP2010,São Paulo (cap.),Conversation or Interview or Reading,Spontaneous and Read Speech,2,1,0.0,0.0,1.0,0.0,2.0,no jardins e aqui eu moro há vinte anos,4
3,test/sp/42889_sp_.wav,annotation_and_transcription,pt_br,SP2010,São Paulo (cap.),Conversation or Interview or Reading,Spontaneous and Read Speech,2,1,0.0,0.0,1.0,0.0,1.0,e eu nasci aqui eu e meus irmãos,3
4,test/sp/42893_sp_.wav,annotation_and_transcription,pt_br,SP2010,São Paulo (cap.),Conversation or Interview or Reading,Spontaneous and Read Speech,3,0,0.0,0.0,2.0,0.0,2.0,gosto bastante,2


In [4]:
# Number of annotated clips before any filtering is applied.
len(annotations)

12676

In [5]:
def get_audio_duration(file_path):
	"""Return the length of an audio clip in whole seconds, rounded up.

	``file_path`` is joined onto the notebook-level ``data_path`` before the
	file is loaded with torchaudio.
	"""
	audio_file = os.path.join(data_path, file_path)
	samples, rate = torchaudio.load(audio_file)
	# The last axis of the loaded tensor is used as the number of frames.
	frame_count = samples.shape[-1]
	return math.ceil(frame_count / rate)
# Recompute every clip's duration from the audio file itself and overwrite the
# 'duration(sec)' column with it.
annotations['duration(sec)'] = annotations['file_path'].map(get_audio_duration)
# Summary statistics of the recomputed durations.
annotations['duration(sec)'].describe()

count    12676.000000
mean         3.683181
std          2.767088
min          1.000000
25%          2.000000
50%          3.000000
75%          4.000000
max         40.000000
Name: duration(sec), dtype: float64

In [6]:
# Keep only the clips whose 'duration(sec)' is at least 3.
annotations = annotations.loc[annotations['duration(sec)'].ge(3)]
# Summary statistics of the remaining durations.
annotations['duration(sec)'].describe()

count    7421.000000
mean        5.070610
std         2.879919
min         3.000000
25%         3.000000
50%         4.000000
75%         6.000000
max        40.000000
Name: duration(sec), dtype: float64

In [7]:
# Discard every clip that received at least one "noise or low voice" vote.
# The previous index is preserved in a new 'original_idx' column.
filtered_annotations = (
	annotations
	.loc[annotations['votes_for_noise_or_low_voice'].eq(0)]
	.reset_index(names=["original_idx"])
)
# Number of clips left after the filter.
filtered_annotations.shape[0]

2801

In [8]:
# Binary flag: 1 when a clip has any vote for hesitation or for filled pause,
# 0 otherwise.
filtered_annotations["has_hesitation"] = (
	filtered_annotations[['votes_for_hesitation', 'votes_for_filled_pause']]
	.gt(0)
	.any(axis=1)
	.astype(int)
)
# Class distribution of the new flag.
filtered_annotations[["has_hesitation"]].value_counts()

has_hesitation
0                 2093
1                  708
Name: count, dtype: int64

In [9]:
# Distinct flag values, and the size of the least frequent class (used as the
# per-class sample size when balancing).
classes = filtered_annotations["has_hesitation"].unique()
less_class_count = filtered_annotations["has_hesitation"].value_counts().min()
(classes, less_class_count)

(array([0, 1]), np.int64(708))

In [10]:
# Build a class-balanced subset: draw `less_class_count` rows from each class
# (seeded independently with random_state=1), restore the pre-sampling row
# order, renumber the rows, and append the three empty columns to be filled in
# by the annotators.
balanced_data = (
	pd.concat(
		[
			filtered_annotations
			.loc[filtered_annotations['has_hesitation'].eq(label)]
			.sample(n=less_class_count, random_state=1)
			for label in classes
		]
	)
	.sort_index()
	.reset_index(drop=True)
	# dict.fromkeys maps every new column name to None, in the listed order.
	.assign(**dict.fromkeys(
		['tem_hesitacao(0-1)', 'nivel_de_hesitacao(0-5)', 'duvida(0-1)'], None
	))
)
# Sanity check: both classes should now have the same number of rows.
balanced_data['has_hesitation'].value_counts()

has_hesitation
0    708
1    708
Name: count, dtype: int64

In [11]:
# Folder that receives the copied audio clips and the spreadsheet to be labelled.
data_to_label_dir = os.path.join(DATASET_PATH, "data_to_label")
def copy_audio(path):
	"""Copy one audio clip from ``data_path`` into ``data_to_label_dir``.

	The clip keeps its relative location, so ``path`` is appended to both the
	source root and the destination root. Missing destination folders are
	created on the fly.

	Args:
		path: Location of the clip relative to ``data_path``.
	"""
	new_path = os.path.join(data_to_label_dir, path)
	# os.path.dirname keeps the root of an absolute path. Splitting on os.sep
	# and re-joining the pieces drops the leading separator, which turns an
	# absolute destination into a path relative to the working directory, and
	# fails outright when the path has no parent component.
	new_dir = os.path.dirname(new_path)
	if new_dir:
		# exist_ok avoids the separate isdir check and the race it implies.
		os.makedirs(new_dir, exist_ok=True)
	shutil.copy(os.path.join(data_path, path), new_path)
    
# Make sure the output folder exists even if no clip ends up being copied;
# otherwise writing the spreadsheet below would fail.
os.makedirs(data_to_label_dir, exist_ok=True)

# Copy every selected clip; apply is used only for copy_audio's side effect.
balanced_data['file_path'].apply(copy_audio)

new_annotations_path = os.path.join(data_to_label_dir, "annotations_to_label.xlsx")
# Drop the listed columns (vote counts, transcript, metadata and the derived
# has_hesitation flag) and shuffle the rows before writing the sheet --
# presumably so annotators do not see the original labels or ordering (confirm).
# The shuffle is seeded with the same value as the class-balancing sample so
# that re-running the notebook produces the same spreadsheet.
balanced_data.drop(
	columns=["task", "variety", "accent", "speech_genre", "speech_style", "up_votes", "down_votes", "votes_for_hesitation", "votes_for_filled_pause", "votes_for_noise_or_low_voice", "votes_for_second_voice", "votes_for_no_identified_problem", "text", "duration(sec)", "has_hesitation"],
).sample(frac=1, random_state=1).to_excel(new_annotations_path, index=False)