In [1]:
import requests
import pandas as pd

In [2]:

def download_tsv_from_google_sheet(sheet_url):
    # Modify the Google Sheet URL to export it as TSV
    tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
    
    # Send a GET request to download the TSV file
    response = requests.get(tsv_url)
    response.encoding = 'utf-8'
    # Check if the request was successful
    if response.status_code == 200:
        # Read the TSV content into a pandas DataFrame
        from io import StringIO
        tsv_content = StringIO(response.text)
        df = pd.read_csv(tsv_content, sep='\t', encoding='utf-8')
        return df
    else:
        print("Failed to download the TSV file.")
        return None
    

In [3]:
import azure.cognitiveservices.speech as speechsdk
import os

   
def generate_synth_speech(voice_name, prompt, output_file="output.wav"):
     # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
    speech_config = speechsdk.SpeechConfig(subscription=os.environ.get('SPEECH_KEY'), region=os.environ.get('SPEECH_REGION'))
    speech_config.speech_synthesis_voice_name=voice_name

    # supported voices: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/language-support?tabs=stt#prebuilt-neural-voices
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_file)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    speech_synthesis_result = speech_synthesizer.speak_text_async(prompt).get()

    if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(prompt))
    elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_synthesis_result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")

    print("Speech synthesis status: ", speech_synthesis_result.reason)
    print("Speech synthesized for text [{}]".format(prompt))
    print(f'Audio content written to file "{output_file}"')

    print(speech_synthesis_result)
    del speech_synthesizer


In [23]:
# where generated files will be saved
out_dir_root = "bigos-synth-release"

# name of the subset - corresponds to the name of the promptset
subset_name = "amu-medical_test-24"

# subset - for synth consider also validation split, but rather test only will make more sense
split = "test"

# promptset source
promptset_source = "https://docs.google.com/spreadsheets/d/1Dk-uXWbrXIg59xGd8MnHRljUjrHPW4xKfmunrqMuBRw/edit#gid=0"
promptset_type = "google_sheet"

# Azure voices - Polish - consider adding more synthetic generation engines and voices in the future
tts_engine = "azure"
azure_voices_pl = {"0001":"pl-PL-AgnieszkaNeural", "0002":"pl-PL-MarekNeural", "0003":"pl-PL-ZofiaNeural"}
sampling_rate = 16000 # default for Azure TTS

# read prompts
# df_prompts = pd.read_csv("promptset_source", sep='\t', encoding='utf-8')
df_prompts = download_tsv_from_google_sheet(promptset_source)

# validate prompts 
#TODO

# for testing purposes, select only 5 prompts
df_prompts = df_prompts.head(10)

# header bigos format
header_bigos_str = "audioname split dataset speaker_id samplingrate_orig sampling_rate ref_orig audiopath_bigos"
header_bigos = header_bigos_str.split()

df_header = header_bigos + ['prompt_id', ' tts_engine', 'tts_voice']

out_df_split = pd.DataFrame([], columns=df_header)

# iterate over multiple voices
out_dir_split = os.path.join(out_dir_root, subset_name, split)
for speaker_id, voice_name in azure_voices_pl.items():
    out_dir_spk =  os.path.join(out_dir_split, speaker_id)
    os.makedirs(out_dir_spk, exist_ok=True)
    print(f"Generating speech for voice: {voice_name}")
    out_df_spk = pd.DataFrame([], columns=df_header)
     
    # iterate over multiple columns per row
    for index, row in df_prompts.iterrows():

        prompt_set = row['prompt_set_id'] # in case multiple prompts sets are in single input file
        #print("Processing prompt set: ", prompt_set)
        #print("Subset name from config: ", subset_name)

        assert(prompt_set == subset_name)
        dataset = subset_name
        # prompt id is contactenated from promptset_id and prompt_index making it unique across multiple promptsets
        prompt_id = row['prompt_id']
        

        # id is used as the audio file id
        prompt_index = row['prompt_index']
        
        # pad audio_file_id to 5 digits
        audio_file_id = str(prompt_index).zfill(5)

        prompt = row['prompt']


        # prepare output in BIGOS format
        audioname = str.join("-",[prompt_set, split, speaker_id, audio_file_id])

        audiopath_bigos = "{}.wav".format(audioname)

        out_fp = os.path.join(out_dir_spk, audiopath_bigos)

        #print("Generating speech for prompt: ", prompt)
        if (os.path.exists(out_fp)):
            print("File already exists, skipping!\n", out_fp)
        else:
            generate_synth_speech(voice_name, prompt, out_fp)
            #print("Saving results to: ", out_fp)
        
        df_row = pd.DataFrame([[audioname, split, dataset, speaker_id, sampling_rate, sampling_rate, prompt, audiopath_bigos, prompt_id, tts_engine, voice_name]], columns=df_header)
        print(df_row)
        out_df_spk = pd.concat([out_df_spk, df_row], axis=0)

    # save the results for the speaker
    out_fp_spk = os.path.join(out_dir_spk, f"{speaker_id}.tsv")
    out_df_spk.to_csv(out_fp_spk, sep='\t', index=False)
    print("Saved results to: ", out_fp_spk)
    print("Done!")
    out_df_split = pd.concat([out_df_split, out_df_spk], axis=0)

# create TSV file for all speakers
out_fp_split = os.path.join(out_dir_split, f"{split}.tsv")
out_df_split.to_csv(out_fp_split, sep='\t', index=False)
print("Saved results to: ", out_fp_split)
print("Done!")




Generating speech for voice: pl-PL-AgnieszkaNeural
File already exists, skipping!
 bigos-synth-release/amu-medical_test-24/test/0001/amu-medical_test-24-test-0001-00001.wav
                             audioname split              dataset speaker_id  \
0  amu-medical_test-24-test-0001-00001  test  amu-medical_test-24       0001   

   samplingrate_orig  sampling_rate               ref_orig  \
0              16000          16000  Nadciśnienie tętnicze   

                           audiopath_bigos              prompt_id  tts_engine  \
0  amu-medical_test-24-test-0001-00001.wav  amu-medical_test-24-1       azure   

               tts_voice  
0  pl-PL-AgnieszkaNeural  
File already exists, skipping!
 bigos-synth-release/amu-medical_test-24/test/0001/amu-medical_test-24-test-0001-00002.wav
                             audioname split              dataset speaker_id  \
0  amu-medical_test-24-test-0001-00002  test  amu-medical_test-24       0001   

   samplingrate_orig  sampling_rate  ref_

In [4]:

import glob
import os
import tarfile

# Directory containing the audio files
audio_directory = "/home/michal/Development/github/pl-asr-bigos-tools/scripts/tts-based-eval/bigos-synth-release/amu-medical_test-24/test/"

archive_name = "/home/michal/Development/github/pl-asr-bigos-tools/scripts/tts-based-eval/bigos-synth-release/amu-medical_test-24/test/test.tar.gz"

# Search for all audio files ending with ".wav"
os.chdir(audio_directory)
audio_files = glob.glob("*/*.wav")

with tarfile.open(archive_name, "w:gz") as tar:
    # Add each audio file to the archive without preserving the directory structure
    for audio_file in audio_files:
        tar.add(audio_file, arcname=os.path.basename(audio_file))

print("Audio files added to", archive_name)

Audio files added to /home/michal/Development/github/pl-asr-bigos-tools/scripts/tts-based-eval/bigos-synth-release/amu-medical_test-24/test/test.tar.gz


In [5]:
import os
import shutil

def update_subset_to_hf_repo(subset_name, subset_dir_hf_release, bigos_hf_repo, hf_repo_url, overwrite=False, secret_repo=False):
    # current directory
    print("Current directory: ", os.getcwd())
    cur_dir = os.getcwd()
    if not subset_name:
        print("Please provide subset name as the first argument")
        return
    if not subset_dir_hf_release:
        print("Please provide subset directory as the second argument")
        return
    if not bigos_hf_repo:
        print("Please provide path to HF subset repository as the third argument")
        return
    if not hf_repo_url:
        print("Please provide HF subset repository URL as the fourth argument")
        return

    if not secret_repo:
        secret_repo = False
        print("Secret is not provided, using default value FALSE")

    # Copy files for the subset to be updated
    os.makedirs(os.path.join(bigos_hf_repo, 'data', subset_name), exist_ok=True)

    # If secret repo, copy only test tsv file
    # if already exists in the repo, skip
    if secret_repo:
        if overwrite == False & os.path.exists(os.path.join(bigos_hf_repo, 'data', subset_name, f'{subset_name}_test.tsv')):
            print(f"TSV file for {subset_name} already exists in the repository, skipping...")
        else:
            shutil.copy2(os.path.join(subset_dir_hf_release, f'{subset_name}_test.tsv'), os.path.join(bigos_hf_repo, 'data', subset_name), force=True)
        if overwrite == False & os.path.exists(os.path.join(bigos_hf_repo, 'data', subset_name, 'test.tar.gz')):
            print(f"Tar file for {subset_name} already exists in the repository, skipping...")
        else:
            shutil.copy2(os.path.join(subset_dir_hf_release, 'test.tar.gz'), os.path.join(bigos_hf_repo, 'data', subset_name), force=True)


    if not secret_repo:
        # If not secret repo, copy all files
        for file in os.listdir(subset_dir_hf_release):
            if file.endswith('.tsv'):
                shutil.copy2(os.path.join(subset_dir_hf_release, file), os.path.join(bigos_hf_repo, 'data', subset_name))
            elif file.endswith('.tar.gz'):
                shutil.copy2(os.path.join(subset_dir_hf_release, file), os.path.join(bigos_hf_repo, 'data', subset_name))

    # Change working directory to HF subset repository
    os.chdir(bigos_hf_repo)
    os.system('git add .')
    os.system('git commit -m "Update subset"') 
    os.system('git push')
    os.chdir(cur_dir)


In [6]:
subset_name = "amu-medical_test-24"
subset_dir_hf_release = "/home/michal/Development/github/pl-asr-bigos-tools/scripts/tts-based-eval/bigos-synth-release/amu-medical_test-24/test"
bigos_hf_repo = "/home/michal/Development/hugging-face/amu-cai/pl-asr-bigos-synth"
hf_repo_url = "https://huggingface.co/datasets/amu-cai/pl-asr-bigos-synth"
secret_repo = False
update_subset_to_hf_repo(subset_name, subset_dir_hf_release, bigos_hf_repo, hf_repo_url, overwrite=True, secret_repo=False)

Current directory:  /home/michal/Development/github/pl-asr-bigos-tools/scripts/tts-based-eval/bigos-synth-release/amu-medical_test-24/test
Secret is not provided, using default value FALSE
[main 2301a8f] Update subset
 1 file changed, 2 insertions(+), 2 deletions(-)
Uploading LFS objects: 100% (1/1), 943 KB | 0 B/s, done.


To https://huggingface.co/datasets/amu-cai/pl-asr-bigos-synth
   049aa98..2301a8f  main -> main


In [5]:
# test hf dataset

from datasets import load_dataset

# Load the dataset
dataset = load_dataset("amu-cai/pl-asr-bigos-synth", "amu-med_terms_adolesc_comm-24", download_mode="force_redownload")
print(dataset)

Downloading builder script:   0%|          | 0.00/8.85k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

Downloading and preparing dataset pl-asr-bigos-synth/amu-med_terms_adolesc_comm-24 to /home/michal/.cache/huggingface/datasets/amu-cai___pl-asr-bigos-synth/amu-med_terms_adolesc_comm-24/1.0.0/2f6597072545e408676fe2edefa7369a28363814e6cae2237fd3882269656e59...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.41M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/44.8k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating  examples
Dataset pl-asr-bigos-synth downloaded and prepared to /home/michal/.cache/huggingface/datasets/amu-cai___pl-asr-bigos-synth/amu-med_terms_adolesc_comm-24/1.0.0/2f6597072545e408676fe2edefa7369a28363814e6cae2237fd3882269656e59. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['audioname', 'split', 'dataset', 'speaker_id', 'ref_orig', 'audio', 'samplingrate_orig', 'sampling_rate', 'audiopath_bigos', 'audiopath_local'],
        num_rows: 200
    })
})
