In [6]:
import asyncio
from elevenlabs.client import AsyncElevenLabs
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment.
load_dotenv()
# The ElevenLabs API key is read from PETTER_ELEVENLABS (None when the variable is unset).
api_key = os.getenv("PETTER_ELEVENLABS")

# Async client used by print_models() in this cell.
eleven = AsyncElevenLabs(api_key=api_key)

async def print_models() -> None:
    """Print name, labels and voice id for every voice returned by the API.

    Despite the function name, this lists *voices*: it awaits
    ``eleven.voices.get_all()`` and prints one line per entry of ``.voices``.
    """
    response = await eleven.voices.get_all()
    for entry in response.voices:
        print(entry.name, entry.labels, entry.voice_id)

# In Jupyter, use this instead of asyncio.run():
# (top-level await of the coroutine; it prints one line per voice)
await print_models()


Aria {'accent': 'american', 'description': 'expressive', 'age': 'middle_aged', 'gender': 'female', 'use_case': 'social media'} 9BWtsMINqrJLrRacOk9x
Roger {'accent': 'american', 'description': 'confident', 'age': 'middle_aged', 'gender': 'male', 'use_case': 'social media'} CwhRBWXzGAHq8TQ4Fs17
Sarah {'accent': 'american', 'description': 'soft', 'age': 'young', 'gender': 'female', 'use_case': 'news'} EXAVITQu4vr4xnSDxMaL
Laura {'accent': 'american', 'description': 'upbeat', 'age': 'young', 'gender': 'female', 'use_case': 'social media'} FGY2WhTYpPnrIDTdsKH5
Charlie {'accent': 'australian', 'description': 'natural', 'age': 'middle_aged', 'gender': 'male', 'use_case': 'conversational'} IKne3meq5aSn9XLyUdCD
George {'accent': 'british', 'description': 'warm', 'age': 'middle_aged', 'gender': 'male', 'use_case': 'narration'} JBFqnCBsd6RMkjVDRZzb
Callum {'accent': 'american', 'description': 'intense', 'age': 'middle_aged', 'gender': 'male', 'use_case': 'characters'} N2lVS1w4EtoT3dr4eOWO
River {

In [15]:
import asyncio
from itertools import cycle
import os
import re
import ast
import pandas as pd
from pydub import AudioSegment
from dotenv import load_dotenv
from elevenlabs.client import AsyncElevenLabs
import soundfile as sf
from glob import iglob

# Read the API key from a local .env file and build the async ElevenLabs client.
load_dotenv()
api_key = os.getenv("PETTER_ELEVENLABS")
client = AsyncElevenLabs(api_key=api_key)

# At most CONCURRENCY_LIMIT text-to-speech jobs hold the semaphore at once
# (it is acquired inside text_to_audio_and_save).
CONCURRENCY_LIMIT = 3
semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

def get_audio_duration(file_path):
    """Return the length of the audio file at ``file_path`` in seconds.

    The value is the file's frame count divided by its sample rate, both read
    through ``soundfile.SoundFile``.
    """
    with sf.SoundFile(file_path) as audio_file:
        frame_count = audio_file.frames
        rate = audio_file.samplerate
    return frame_count / rate

async def text_to_audio_and_save(text, speaker_id, index, save_path="./elevenlabs_audio_files"):
    """Synthesize ``text`` with one voice and store it as MP3 plus a 16 kHz WAV.

    Args:
        text: Sentence to synthesize.
        speaker_id: ElevenLabs voice id, passed to the API as ``voice_id``.
        index: Row identifier embedded in the output file names.
        save_path: Directory for the MP3 file. The resampled WAV is written to
            the sibling directory ``f"{save_path}_resampled"``.

    Returns:
        Tuple ``(wav_path, duration_s)``: path of the resampled WAV file and
        its duration in seconds.
    """
    resampled_dir = f"{save_path}_resampled"
    output_mp3_path = f"{save_path}/audio_speaker_{speaker_id}_{index}.mp3"
    output_wav_path = f"{resampled_dir}/audio_speaker_{speaker_id}_{index}.wav"

    # Create both output directories up front; writing into a missing
    # directory would otherwise fail with FileNotFoundError.
    os.makedirs(save_path, exist_ok=True)
    os.makedirs(resampled_dir, exist_ok=True)

    def _resample_and_measure():
        # Blocking pydub/soundfile work: decode the MP3, resample to 16 kHz,
        # write the WAV and measure its duration.
        audio = AudioSegment.from_file(output_mp3_path, format="mp3")
        resampled_audio = audio.set_frame_rate(16000)
        resampled_audio.export(output_wav_path, format="wav")
        return get_audio_duration(output_wav_path)

    async with semaphore:
        audio_stream = client.text_to_speech.convert(
            text=text,
            voice_id=speaker_id,
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
        )

        # Write the MP3 chunks as they arrive
        with open(output_mp3_path, "wb") as mp3_file:
            async for chunk in audio_stream:
                mp3_file.write(chunk)

        # Run the blocking conversion in the default executor so the other
        # in-flight streams keep being consumed while this file is resampled.
        loop = asyncio.get_running_loop()
        duration_s = await loop.run_in_executor(None, _resample_and_measure)

    return output_wav_path, duration_s

async def generate_audio_by_sentences_async(utterances, speakers, batch_size=25):
    """Synthesize every row of ``utterances``, assigning voices round-robin.

    Rows are processed in consecutive batches of ``batch_size``; all rows of a
    batch are submitted together through ``asyncio.gather``. After each batch
    the ``audio_path`` and ``audio_duration_s`` columns are filled in for the
    rows that succeeded and the whole frame is written to
    ``./elevenlabs_paths_dataset/dataset_final.csv``.

    Args:
        utterances: DataFrame with a ``sentence`` column; modified in place.
        speakers: Iterable of voice ids, cycled across the rows.
        batch_size: Number of rows per batch; must be at least 1.

    Returns:
        The same ``utterances`` frame that was passed in.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or if there are rows
            to process but ``speakers`` is empty.
        Exception: The first error raised while synthesizing a row. It is
            re-raised only after the successful rows of that batch have been
            recorded and written to the CSV.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    speakers = list(speakers)
    total_rows = len(utterances)
    if total_rows and not speakers:
        raise ValueError("speakers must contain at least one voice id")

    # Make sure the CSV can be written before any audio is generated.
    output_csv = "./elevenlabs_paths_dataset/dataset_final.csv"
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    speaker_cycle = cycle(speakers)
    total_duration_s = 0.0  # track total duration in seconds

    print(f"Starting round-robin TTS generation over {total_rows} rows...")

    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        chunk = utterances.iloc[start_idx:end_idx]

        print(f"\nProcessing batch {start_idx} to {end_idx - 1}...")

        row_indices = []
        tasks = []
        for index, sentence in chunk["sentence"].items():
            row_indices.append(index)
            tasks.append(text_to_audio_and_save(sentence, next(speaker_cycle), index))

        # Collect exceptions instead of propagating the first one immediately,
        # so the rows that did succeed in this batch are not thrown away.
        results = await asyncio.gather(*tasks, return_exceptions=True)

        first_error = None
        for idx, result in zip(row_indices, results):
            if isinstance(result, BaseException):
                if first_error is None:
                    first_error = result
                continue
            file_path, duration_s = result
            utterances.at[idx, 'audio_path'] = file_path
            utterances.at[idx, 'audio_duration_s'] = duration_s
            total_duration_s += duration_s

        # Save partial results (including a partially failed batch)
        utterances.to_csv(output_csv, index=False)

        if first_error is not None:
            raise first_error

        # Print how many minutes so far
        total_duration_min = total_duration_s / 60
        print(f"Saved partial results up to row {end_idx - 1}. "
              f"Total audio so far: {total_duration_min:.2f} minutes.")

    print("All batches processed.")
    return utterances

def get_path_list(path="./elevenlabs_audio_files"):
    """Return the paths matching ``f'{path}/*'``, printing each one.

    Args:
        path: Directory whose direct entries are listed.

    Returns:
        List of matching paths in the order ``iglob`` yields them.
    """
    entries = list(iglob(f'{path}/*'))
    for entry in entries:
        print(entry)
    return entries

def extract_audio_numbers(file_paths):
    """Extract the numeric index from audio file names.

    Two naming schemes are recognized:

    * legacy names such as ``audio_speaker3_12.flac`` (speaker digit 1, 3, 6 or 7);
    * names written by ``text_to_audio_and_save``, i.e.
      ``audio_speaker_<voice_id>_<index>.mp3`` / ``.wav``.

    Args:
        file_paths: Iterable of path strings.

    Returns:
        List of indices as ``int``, in input order; paths that do not match
        either scheme are skipped.
    """
    # Single backslashes inside the raw string: ``\d`` is a digit class and
    # ``\.`` a literal dot. (Doubling them would match a literal backslash.)
    pattern = re.compile(
        r'audio_speaker(?:[1367]|_[A-Za-z0-9]+)_(\d+)\.(?:flac|mp3|wav)'
    )
    matches = (pattern.search(path) for path in file_paths)
    return [int(match.group(1)) for match in matches if match]

def find_disruption_pairs(sorted_numbers):
    """Return the adjacent pairs whose difference is not exactly 1.

    Args:
        sorted_numbers: Sequence of numbers, compared element by element with
            its successor.

    Returns:
        List of ``(previous, current)`` tuples, one per gap, in input order.
    """
    neighbours = zip(sorted_numbers, sorted_numbers[1:])
    return [(prev, curr) for prev, curr in neighbours if curr - prev != 1]

def convert_str_to_dict(string):
    """Parse ``string`` as a Python literal and return the resulting value.

    Parsing is done with ``ast.literal_eval``, so the result is whatever
    literal the text spells (a dict for ``"{'a': 1}"``); invalid input raises
    the error ``ast.literal_eval`` raises.
    """
    parsed = ast.literal_eval(string)
    return parsed

async def main():
    """Load the sentence CSV, strip pipe characters and synthesize every row.

    Reads ``./northvolt.csv``, removes every ``|`` from the ``sentence``
    column and runs ``generate_audio_by_sentences_async`` over the result
    with a fixed list of voice ids.
    """
    # Load the full dataset
    utterances = pd.read_csv("./northvolt.csv")
    utterances['sentence'] = utterances['sentence'].str.replace('|', '', regex=False)

    # Define speakers
    speakers = [
        "aSLKtNoVBZlxQEMsnGL2",
        "7UMEOkIJdI4hjmR2SWNq",
        "fFe6F6cCl526GpIxiUxu",
        "XB0fDUnXU5powFXDhCwa",
        "pqHfZKP75CvOlQylNhV4"
    ]

    # Generate audio in batches of 20, printing partial durations each time
    await generate_audio_by_sentences_async(utterances, speakers, batch_size=20)
    



In [17]:
# Entry point for the generation run. This uses top-level await (the notebook
# form of asyncio.run(main())), so it only works in a Jupyter/IPython cell.
if __name__ == '__main__':
    await main()


Starting round-robin TTS generation over 40 rows...

Processing batch 0 to 19...
Saved partial results up to row 19. Total audio so far: 3.22 minutes.

Processing batch 20 to 39...
Saved partial results up to row 39. Total audio so far: 5.50 minutes.
All batches processed.


In [7]:
import pandas as pd

# Reload the raw sentences and strip every '|' character
# (the same cleaning main() applies before synthesis).
utterances = pd.read_csv("./northvolt.csv")
utterances['sentence'] = utterances['sentence'].str.replace('|', '', regex=False)
# Bare last expression so the notebook renders the frame.
utterances

Unnamed: 0,sentence
0,Northvolt har satt som mål att 50 procent av v...
1,Gigafabriken förväntas tas i drift i slutet av...
2,Northvolt producerade sin första battericell v...
3,Northvolt och Hydro tillkännagav idag bildande...
4,Northvolt AB och vissa av dess dotterbolag ans...
5,Northvolt kommer att utveckla platsen till en ...
6,Northvolt har hittills säkrat kontrakt värda ö...
7,Northvolt är en europeisk leverantör av hållba...
8,Northvolt utvecklar tillverkningskapacitet för...
9,Northvolt avser att möjliggöra att 50 % av sin...


In [None]:
from huggingface_hub import Repository, login
from datasets import Dataset, load_dataset, DatasetDict
from dotenv import load_dotenv
import os
import pandas as pd
import librosa
import json
import soundfile as sf
import numpy as np
from datasets import Audio, Features, Value

# Login to Huggingface (you need to run `huggingface-cli login` beforehand)
# https://huggingface.co/datasets/my-north-ai/cv_mls_psfb_fs17_68k

load_dotenv()
token = os.getenv('HF_WRITE')
if not token:
    # Fail early with a clear message instead of a TypeError on the
    # os.environ assignment below.
    raise RuntimeError("HF_WRITE is not set; add it to the environment or the .env file")
# Do not print the token itself: cell outputs are saved with the notebook.
print("HF_WRITE token loaded")

# Reset the current active token
os.environ['HF_TOKEN'] = token

# Login with the new token
login(token=token)


# Define the path to your datasets
company = "northvolt"
# CSV read below; its 'audio_path' and 'sentence' columns are used to build the manifest.
# NOTE(review): the generation cell writes ./elevenlabs_paths_dataset/dataset_final.csv,
# not {company}.csv — confirm the file is renamed or copied before this cell runs.
csv_path = f'./elevenlabs_paths_dataset/{company}.csv'
# Folder holding the MP3 files referenced by the manifest.
audio_folder_path = './elevenlabs_audio_files/'

df = pd.read_csv(csv_path)

def get_audio_duration(file_path):
    """Return the duration of ``file_path`` in seconds, or ``None`` on failure.

    NOTE: this has the same name as the soundfile-based helper defined in the
    generation cell; running this cell rebinds the name to this version.

    Args:
        file_path: Path of the audio file to inspect.

    Returns:
        Duration in seconds as reported by ``librosa.get_duration``, or
        ``None`` when the file cannot be read (the reason is printed).
    """
    try:
        try:
            # Newer librosa releases take the file as `path`; the `filename`
            # keyword is deprecated there and absent from the latest versions.
            return librosa.get_duration(path=file_path)
        except TypeError:
            # Older librosa releases only know the `filename` keyword.
            return librosa.get_duration(filename=file_path)
    except Exception as exc:
        # Best effort: report why the file was skipped instead of hiding it.
        print(f"get_audio_duration failed for {file_path}: {exc}")
        return None

# Build our manifest with lazy loading: only store the *file path* in "audio".
manifest = []
total_duration = 0.0

for index, row in df.iterrows():
    # Rows without a generated file are skipped.
    if pd.isna(row['audio_path']):
        continue

    # Point at the MP3 that shares the recorded file's base name. splitext swaps
    # only the final extension, so a '.wav' substring elsewhere in the name
    # cannot truncate the path.
    file_stem = os.path.splitext(os.path.basename(row['audio_path']))[0]
    audio_path = os.path.join(audio_folder_path, file_stem + '.mp3')

    print(f"Processing: {audio_path}")
    duration = get_audio_duration(audio_path)
    if duration is None:
        print(f"Warning: could not determine duration for {audio_path}")
        continue

    total_duration += duration

    # Store only the path in "audio" for lazy loading
    manifest.append({
        "audio": audio_path,
        "text": row["sentence"],
        "duration": duration,
        "path": audio_path,
        "entities": "Northvolt",
    })

print(f"\nTotal duration of all audio: {total_duration:.2f} seconds")
print(f"Total duration in hours: {total_duration / 3600:.2f} hrs")

# Optionally save the manifest
with open("manifest.json", "w") as f:
    json.dump(manifest, f)

# Dataset schema: "audio" is an Audio feature at 16 kHz fed with file paths,
# the remaining columns are plain strings / a float32 duration.
features = Features(
    {
        "audio": Audio(sampling_rate=16000),  # Lazy load from file path
        "text": Value("string"),
        "duration": Value("float32"),
        "path": Value("string"),
        "entities": Value("string"),
    }
)

# Turn the list of manifest records into one list per column.
manifest_dict = {
    column: [item[column] for item in manifest]
    for column in ("audio", "text", "duration", "path")
}
manifest_dict["entities"] = ["Northvolt"] * len(manifest)

# Create a single Dataset from the entire manifest
full_dataset = Dataset.from_dict(manifest_dict, features=features)

# Shuffle and split into train/validation
# A fixed seed is passed so the shuffle, and therefore the split, is repeatable.
full_dataset = full_dataset.shuffle(seed=42)
train_size = int(0.8 * len(full_dataset))  # 80% train, 20% validation
# The first train_size shuffled rows become "train"; the remainder is "validation".
train_dataset = full_dataset.select(range(train_size))
val_dataset = full_dataset.select(range(train_size, len(full_dataset)))

dataset_dict = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset
})

print(dataset_dict)

# Push to your Hugging Face Hub repo with both splits
dataset_dict.push_to_hub("grdphilip/elevenlabs_syndata")