In [None]:
# Mount Google Drive at /content/drive; later cells write their outputs
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!huggingface-cli login


In [None]:
!pip install --upgrade datasets


In [None]:
from datasets import load_dataset

# Load the ClArTTS dataset. This needs an internet connection, and a
# Hugging Face login if the dataset is private.
ds = load_dataset("MBZUAI/ClArTTS")

In [None]:
# Show the available splits together with their features and row counts
print(ds)

# Keep a handle on the train split for inspection
train_ds = ds["train"]

# List the columns/features of the train split
print(train_ds.column_names)  # ['text', 'file', 'audio', 'sampling_rate', 'duration']

DatasetDict({
    train: Dataset({
        features: ['text', 'file', 'audio', 'sampling_rate', 'duration'],
        num_rows: 9500
    })
    test: Dataset({
        features: ['text', 'file', 'audio', 'sampling_rate', 'duration'],
        num_rows: 205
    })
})
['text', 'file', 'audio', 'sampling_rate', 'duration']


In [None]:
from collections import Counter

# Combine train and test splits.
# Each column is materialised with list() before concatenating: plain lists
# support `+`, whereas the column object returned by newer `datasets`
# releases is not guaranteed to.
all_sampling_rates = list(ds['train']['sampling_rate']) + list(ds['test']['sampling_rate'])

# Count occurrences of each sampling rate
sampling_rate_counts = Counter(all_sampling_rates)

# Display results
for rate, count in sampling_rate_counts.items():
    print(f"Sampling rate: {rate} Hz — {count} samples")


Sampling rate: 40100 Hz — 9705 samples


# Get the Phonemes

In [None]:
!git clone https://github.com/nipponjo/tts-arabic-pytorch.git

In [None]:
import os

import sys
sys.path.append('tts-arabic-pytorch/text/')

from phonetise_buckwalter import (
    arabic_to_buckwalter,
    process_utterance
)

def custom_arabic_to_phones(text, return_phonemes=True):
    """Transliterate Arabic text to Buckwalter, optionally with its phonemes.

    Args:
        text: Arabic text handed to ``arabic_to_buckwalter``.
        return_phonemes: When true, also run ``process_utterance`` on the
            transliteration.

    Returns:
        The Buckwalter string alone when ``return_phonemes`` is false;
        otherwise a ``(buckwalter, phonemes)`` tuple in which every "+ "
        substring has been removed from the ``process_utterance`` output.
    """
    buckwalter = arabic_to_buckwalter(text)
    if not return_phonemes:
        return buckwalter
    phoneme_string = process_utterance(buckwalter)
    return buckwalter, phoneme_string.replace("+ ", "")

base_output_dir = "/content/drive/MyDrive/New_ClArTTS"

# Write two tab-separated transcripts per split, one line per utterance:
#   orthographic-transcript.txt : <file stem>\t<Buckwalter text>
#   phonetic-transcript.txt     : <file stem>\t<phoneme string>
for split in ["train", "test"]:
    split_dir = os.path.join(base_output_dir, split)
    os.makedirs(split_dir, exist_ok=True)

    ortho_path = os.path.join(split_dir, "orthographic-transcript.txt")
    phone_path = os.path.join(split_dir, "phonetic-transcript.txt")

    # Only the text and the file name are used below; selecting just those
    # columns keeps the loop from loading every sample's audio.
    text_columns = ds[split].select_columns(["text", "file"])

    with open(ortho_path, "w", encoding="utf-8") as ortho_f, open(phone_path, "w", encoding="utf-8") as phone_f:
        print(f"Processing {split} set...")
        for i, sample in enumerate(text_columns):
            try:
                text = sample["text"]
                file_path = sample["file"]
                # Utterance id = base name of the audio file without its extension
                file_name = os.path.splitext(os.path.basename(file_path))[0]

                bw_text, phonemes = custom_arabic_to_phones(text)

                ortho_f.write(f"{file_name}\t{bw_text}\n")
                phone_f.write(f"{file_name}\t{phonemes}\n")

                if i % 100 == 0:
                    print(f"[{split}] Processed {i} samples")
            except Exception as e:
                # Best effort: report the failing sample and carry on with the rest
                print(f"[{split}] Error at index {i}: {e}")


In [None]:
input_path = "/content/drive/MyDrive/New_ClArTTS/train/phonetic-transcript.txt"
output_path = "/content/drive/MyDrive/New_ClArTTS/train/phonetic-transcript_cleaned.txt"

# The "." token is dropped from every phoneme sequence
invalid_phoneme = "."

with open(input_path, "r", encoding="utf-8") as infile, \
     open(output_path, "w", encoding="utf-8") as outfile:
    for raw_line in infile:
        entry = raw_line.strip()
        if entry:  # blank lines are ignored
            try:
                # Exactly two tab-separated fields are expected; anything else
                # raises ValueError on unpacking and is reported below.
                utt_id, phoneme_seq = entry.split("\t")
                kept_tokens = [tok for tok in phoneme_seq.split() if tok != invalid_phoneme]
                cleaned_phonemes = " ".join(kept_tokens)
                outfile.write(f"{utt_id}\t{cleaned_phonemes}\n")
            except ValueError:
                print(f"Skipping invalid line: {entry}")


# Get Labs


In [None]:
import os

# Define paths to orthographic-transcript.txt files (one per split); these
# are the inputs the .lab files are generated from
train_transcript_path = "/content/drive/MyDrive/New_ClArTTS/train/orthographic-transcript.txt"
test_transcript_path = "/content/drive/MyDrive/New_ClArTTS/test/orthographic-transcript.txt"

# Function to create .lab files from transcript
def create_lab_files(transcript_path, lab_output_dir):
    """Write one ``<file_id>.lab`` file per transcript line.

    Each non-blank line of the transcript is expected to hold exactly two
    tab-separated fields: a file id and its text. The text, followed by a
    newline, becomes the content of ``<lab_output_dir>/<file_id>.lab``.
    Lines with any other number of fields are reported and skipped.

    Args:
        transcript_path: Path of the UTF-8 transcript to read.
        lab_output_dir: Directory that receives the .lab files; it is
            created if it does not exist.
    """
    os.makedirs(lab_output_dir, exist_ok=True)

    created = 0
    with open(transcript_path, "r", encoding="utf-8") as f:
        # Stream the transcript rather than loading every line up front
        for line in f:
            entry = line.strip()
            if not entry:
                continue  # blank lines carry no utterance

            parts = entry.split("\t")
            if len(parts) != 2:
                print(f"Skipping malformed line: {entry}")
                continue

            file_id, buckwalter_text = parts
            lab_file_path = os.path.join(lab_output_dir, f"{file_id}.lab")

            with open(lab_file_path, "w", encoding="utf-8") as lab_file:
                lab_file.write(buckwalter_text + "\n")
            created += 1

    # Report the files actually written, not the number of lines read
    print(f"Created {created} .lab files in {lab_output_dir}")

# Generate the .lab files for each split, train first
lab_jobs = [
    (train_transcript_path, "/content/drive/MyDrive/New_ClArTTS/train/lab"),
    (test_transcript_path, "/content/drive/MyDrive/New_ClArTTS/test/lab"),
]
for lab_source, lab_target_dir in lab_jobs:
    create_lab_files(lab_source, lab_target_dir)


# Get TextGrid

In [None]:
!pip install soundfile


In [None]:
!pip install textgrid


In [None]:
from textgrid import TextGrid, IntervalTier
import os
import re

# === Config ===
# Phoneme transcripts to read and the directories that receive the TextGrids.
# NOTE(review): train points at the "_cleaned" transcript while test points at
# the original phonetic transcript — confirm this asymmetry is intended.
train_transcript_path = "/content/drive/MyDrive/New_ClArTTS/train/phonetic-transcript_cleaned.txt"
test_transcript_path = "/content/drive/MyDrive/New_ClArTTS/test/phonetic-transcript.txt"
train_output_dir = "/content/drive/MyDrive/New_ClArTTS/train/textgrid"
test_output_dir = "/content/drive/MyDrive/New_ClArTTS/test/textgrid"

# Create both output directories up front (no error if they already exist)
os.makedirs(train_output_dir, exist_ok=True)
os.makedirs(test_output_dir, exist_ok=True)

# === Efficiently Get Durations from Dataset ===
def get_duration_dict(dataset):
    """Map each sample's file stem to its duration.

    The key is the base name of the sample's ``file`` value with its
    extension removed, which is how the transcript ids are derived, so a
    lookup by transcript id still succeeds when ``file`` carries a directory
    or an extension other than ".wav". The ``id`` value is used instead when
    ``file`` is missing or empty.

    Args:
        dataset: Iterable of mapping-like samples, each with a ``duration``
            value and a ``file`` or ``id`` value.

    Returns:
        dict mapping file stem to the sample's ``duration``.
    """
    durations = {}
    for sample in dataset:
        name = sample.get("file") or sample.get("id")
        stem = os.path.splitext(os.path.basename(name))[0]
        durations[stem] = sample["duration"]
    return durations

# === Sanitize Phonemes ===
def sanitize_phoneme(ph):
    """Return ``ph`` without surrounding whitespace and without any double
    quote, newline, carriage return or tab characters."""
    trimmed = ph.strip()
    cleaned = re.sub(r'[\"\n\r\t]', '', trimmed)
    return cleaned

# === Create TextGrid ===
def create_textgrid(file_id, phonemes_str, duration, save_path):
    """Write a TextGrid that spreads the phonemes evenly over ``duration``.

    Each whitespace-separated token of ``phonemes_str`` gets an interval of
    ``duration / number_of_tokens`` on a single tier named "phones"; the last
    interval always ends exactly at ``duration``. When there are no tokens a
    warning is printed and nothing is written.

    Args:
        file_id: Identifier used only in the warning message.
        phonemes_str: Whitespace-separated phoneme tokens.
        duration: Total length used as the grid's and the tier's ``maxTime``.
        save_path: Destination path passed to ``TextGrid.write``.
    """
    tokens = phonemes_str.strip().split()
    if len(tokens) == 0:
        print(f"Warning: No phonemes for {file_id}, skipping.")
        return

    grid = TextGrid(minTime=0.0, maxTime=duration)
    phone_tier = IntervalTier(name="phones", minTime=0.0, maxTime=duration)

    step = duration / len(tokens)
    final_index = len(tokens) - 1
    cursor = 0.0

    for index, token in enumerate(tokens):
        boundary = cursor + step
        # Pin the closing interval (and any overshoot) to the total duration
        if index == final_index or boundary > duration:
            boundary = duration
        phone_tier.add(cursor, boundary, sanitize_phoneme(token))
        cursor = boundary

    grid.append(phone_tier)
    grid.write(save_path)

# === Process and Write TextGrid Files ===
def process_transcript(transcript_path, duration_dict, output_dir):
    """Create one TextGrid per line of a phoneme transcript.

    Every non-blank line must hold exactly two tab-separated fields, a file id
    and its phoneme string. Ids without an entry in ``duration_dict`` are
    reported and skipped; any exception raised while handling a line is
    printed and processing moves on to the next line.

    Args:
        transcript_path: Path of the UTF-8 transcript to read.
        duration_dict: Mapping of file id to duration.
        output_dir: Directory in which ``<file_id>.TextGrid`` is written.
    """
    with open(transcript_path, "r", encoding="utf-8") as transcript:
        for line_number, raw_line in enumerate(transcript, 1):
            entry = raw_line.strip()
            if entry:
                try:
                    file_id, phoneme_str = entry.split('\t')
                    file_id = file_id.strip("\t")
                    duration = duration_dict.get(file_id)

                    if duration is not None:
                        save_path = os.path.join(output_dir, f"{file_id}.TextGrid")
                        create_textgrid(file_id, phoneme_str, duration, save_path)
                        print(f"Created TextGrid for {file_id}")
                    else:
                        print(f"Warning [Line {line_number}]: Duration not found for '{file_id}'")

                except Exception as e:
                    print(f"Error processing line {line_number}: {entry} | {e}")

# === Run ===
# Build the duration lookups for both splits first, then write the TextGrids
train_durations, test_durations = (
    get_duration_dict(ds[split_key]) for split_key in ("train", "test")
)

for split_transcript, split_durations, split_output_dir in (
    (train_transcript_path, train_durations, train_output_dir),
    (test_transcript_path, test_durations, test_output_dir),
):
    process_transcript(split_transcript, split_durations, split_output_dir)


# Get Wave Files

In [None]:
! pip install librosa


In [None]:
import os
import soundfile as sf
import librosa
import numpy as np  # Make sure this is imported

# Root of the exported audio, with one sub-directory per split
base_dir = "/content/drive/MyDrive/New_ClArTTS/Audio"
train_dir, test_dir = (os.path.join(base_dir, split_key) for split_key in ("train", "test"))

for audio_dir in (train_dir, test_dir):
    os.makedirs(audio_dir, exist_ok=True)

# Sampling rate (Hz) that every exported file is written at
TARGET_SR = 22050

def save_audio_files(dataset_split, split_name):
    """Export every sample of a split as an audio file at ``TARGET_SR``.

    Files are written to ``<base_dir>/<split_name>/`` under the base name of
    the sample's ``file`` value, with ".wav" appended when it is missing.
    Audio whose rate differs from ``TARGET_SR`` is resampled first.

    Args:
        dataset_split: Iterable of samples providing ``audio``, ``file`` and
            ``sampling_rate``. ``audio`` may be the raw sample values or a
            dict holding them under "array".
        split_name: Sub-directory of ``base_dir`` to write into.
    """
    output_dir = os.path.join(base_dir, split_name)
    # Do not depend on another statement having created the directory
    os.makedirs(output_dir, exist_ok=True)

    for i, sample in enumerate(dataset_split):
        audio = sample["audio"]
        if isinstance(audio, dict):
            audio_array = audio["array"]
            # A dict that carries its own rate describes its own array, so
            # that rate takes precedence over the separate metadata column.
            original_sr = audio.get("sampling_rate") or sample["sampling_rate"]
        else:
            audio_array = audio
            original_sr = sample["sampling_rate"]

        # Ensure audio_array is a float32 NumPy array
        audio_array = np.asarray(audio_array, dtype=np.float32)

        # Resample if needed
        if original_sr != TARGET_SR:
            audio_array = librosa.resample(audio_array, orig_sr=original_sr, target_sr=TARGET_SR)

        # File name: keep the original base name and guarantee a .wav suffix
        original_filename = os.path.basename(sample["file"])
        filename = original_filename if original_filename.endswith(".wav") else f"{original_filename}.wav"
        filepath = os.path.join(output_dir, filename)

        # Write audio
        sf.write(filepath, audio_array, TARGET_SR)

        if i % 100 == 0:
            # Report the configured rate instead of a hard-coded number
            print(f"Saved {i} audio files in {split_name} at {TARGET_SR} Hz")

# Export both splits, train first
for split_key in ("train", "test"):
    save_audio_files(ds[split_key], split_key)
