In [1]:
from datasets import load_dataset
from huggingface_hub import login
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import shutil
import librosa
import os
from IPython.display import Audio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Common Voice is a gated dataset, so a Hugging Face login is required before
# downloading it. `load_dotenv()` makes variables from a local `.env` file
# available in `os.environ`; the token itself is never hard-coded here.
load_dotenv()
login(token=os.environ['HF_TOKEN'])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [75]:
def get_duration(file_path, sr):
    """Return the duration, in seconds, of the audio file at `file_path`.

    The file is decoded with `librosa.load` at the requested sampling rate
    and the duration is derived from the decoded samples.

    Args:
        file_path: Path of the audio file to decode.
        sr: Sampling rate passed to `librosa.load`.

    Returns:
        The value reported by `librosa.get_duration` for the decoded signal.
    """
    samples, sample_rate = librosa.load(file_path, sr=sr)
    return librosa.get_duration(y=samples, sr=sample_rate)

In [None]:
# Common Voice configuration (language code) to download.
LANG = 'pt'

# Fetch every available split of Common Voice 17.0 for the chosen language.
cv_17 = load_dataset(path="mozilla-foundation/common_voice_17_0", name=LANG)


In [96]:
def _with_duration(example):
    """Build the `duration` and `sr` columns for a single dataset row."""
    sampling_rate = example['audio']['sampling_rate']
    return {
        'duration': get_duration(example['path'], sampling_rate),
        'sr': sampling_rate,
    }


# Keep only the columns needed downstream, attach the clip duration and its
# sampling rate, and drop the `audio` column once it has been used.
transf_dataset = (
    cv_17['test']
    .select_columns(['path', 'sentence', 'audio'])
    .map(_with_duration, remove_columns=['audio'])
)

Map: 100%|██████████| 9467/9467 [00:48<00:00, 194.55 examples/s]


In [111]:
def load_audio(idx):
    """Print the transcript and duration of one clip and show an audio player.

    Args:
        idx: Row index into `transf_dataset`.
    """
    example = transf_dataset[idx]

    # Look the fields up by name instead of unpacking `.values()` positionally,
    # so a change in the dataset's column order cannot silently swap them.
    print(example['sentence'])
    print(example['duration'], 'seconds')
    display(Audio(example['path']))

# Spot-check a single clip; 913 is just an arbitrary row of the test split.
load_audio(idx=913)

Um galo atacará outros machos que entrarem em seu território.
8.496 seconds


In [169]:
def generate_dataset(n_examples):
    """Export the `n_examples` longest clips of `transf_dataset` into `data/`.

    The `data/` directory is rebuilt from scratch on every call. For each
    selected clip the audio file is copied to `data/audio/<index>.mp3` and its
    sentence is written to `data/description/<index>.txt`, where `<index>` is
    the row index in `transf_dataset` zero-padded to five digits. A manifest
    of the selected rows, sorted by the new audio path, is saved to
    `data/audios.csv`.

    Args:
        n_examples: Number of clips to keep, chosen by largest `duration`.
    """
    data_dir = Path('data')
    # Start from a clean slate, but tolerate a first run where `data/` does
    # not exist yet (a bare `shutil.rmtree` raises FileNotFoundError then).
    if data_dir.exists():
        shutil.rmtree(data_dir)
    data_dir.mkdir(exist_ok=True)

    df = transf_dataset.to_pandas().nlargest(n_examples, 'duration').copy()
    # Remember where each clip currently lives before `path` is repointed to
    # its destination inside `data/audio/`.
    df['orig_path'] = df['path']
    df['path'] = df.index.map(lambda x: f"data/audio/{x:>05}.mp3")

    for row in df.itertuples():
        orig_path = Path(row.orig_path)
        new_path = Path(row.path)
        sentence_file = Path(f"data/description/{row.Index:>05}.txt")

        sentence_file.parent.mkdir(parents=True, exist_ok=True)
        # Sentences contain non-ASCII characters; pin the encoding so the
        # output does not depend on the platform's default locale.
        sentence_file.write_text(row.sentence, encoding='utf-8')

        new_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(orig_path, new_path)

    # `orig_path` is only needed for the copy step; keep it out of the manifest.
    df.drop('orig_path', axis=1).sort_values('path').to_csv('data/audios.csv', index=False)
    print('Saved csv file in `data/audios.csv`.')

In [170]:
# Export the 100 longest clips, then read the manifest back from disk to
# confirm what was actually written.
generate_dataset(n_examples=100)
df = pd.read_csv(filepath_or_buffer='data/audios.csv')
df

Saved csv file in `data/audios.csv`.


Unnamed: 0,path,sentence,duration,sr
0,data/audio/00010.mp3,Já é uma sorte que você é poupado de um acidente.,9.648,48000
1,data/audio/00076.mp3,"esquadrão da morte, jogo do bicho, máfia do ap...",9.756,48000
2,data/audio/00083.mp3,"burnout, anorexia nervosa, bulimia, periódica,...",9.972,48000
3,data/audio/00084.mp3,Duas mulheres e uma menina levantam com troféus.,10.176,48000
4,data/audio/00092.mp3,Mesa de sinuca no bar,10.620,48000
...,...,...,...,...
95,data/audio/08771.mp3,Gerencie as configurações de segurança do usuá...,10.512,48000
96,data/audio/09190.mp3,"deliberação, publicação, veredicto, abstenção,...",10.008,48000
97,data/audio/09196.mp3,"agorafobia, pânico, generalizada, social, obse...",9.720,48000
98,data/audio/09345.mp3,"Eletrobras Furnas, Embelleze, Embratel, Enel, ...",10.332,48000
