<h1>Preparing the Data</h1>

In [1]:
import re
from pathlib import Path
import PyPDF2

In [2]:
DIALECTS = {
    # After parsing the PDF, these are the dialect labels that actually appear
    "العربية  الفصحى": "MSA",  # Note: the double space is a parsing artifact (verified empirically)
    "اللهجة المصرية": "Egy",
    "اللهجة السعودية": "Sau",
    "اللهجة الشامية": "Lav",
    "اللهجة المغربية": "Mor",
}

In [3]:
class ArabicTTSParser:
    """
    Parser for Arabic TTS/ASR dataset with multiple dialects.
    
    Extracts sentences grouped by dialect from a PDF file.
    """
    
    def __init__(self, filepath):
        """
        Initialize parser with file path.
        
        Args:
            filepath: Path to the Arabic sentences file (PDF)
        """
        self.filepath = filepath
        self.data = {}
        self.current_dialect = None
        
    def read_file(self):
        """Read the PDF file."""
        filepath = Path(self.filepath)             
        with open(self.filepath, 'rb') as f:
            pdf_reader = PyPDF2.PdfReader(f)
            text = []
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                text.extend(page_text.split('\n'))
            return text
    
    def is_dialect_header(self, line):
        """
        Check if line is a dialect header.
        
        A header is any non-empty, non-numbered line (after the title is skipped).
        """
        # Should not start with a number (which indicates a sentence)
        if re.match(r'^\d+\.', line):
            return False
        
        return line
    
    def parse(self):
        """
        Main parsing method.
        
        Returns:
            dict: Dictionary with dialects as keys and sentence lists as values
        """
        lines = self.read_file()
        
        if not lines:
            print("No content to parse. Check file path and PDF library installation.")
            return self.data
        
        print(f"Read {len(lines)} lines from file")

        first_line_skipped = False
        
        for i, line in enumerate(lines):
            if isinstance(line, bytes):
                line = line.decode('utf-8', errors='ignore')
            
            line = line.strip()
            
            # Skip empty lines
            if not line:
                continue
            
            # Skip the first non-empty line (title)
            if not first_line_skipped:
                first_line_skipped = True
                continue
            
            # Check if this is a dialect header
            if self.is_dialect_header(line):
                self.current_dialect = DIALECTS[line]
                self.data[DIALECTS[line]] = []
                print(f"Found dialect: {line}")
                continue
            
            # Extract sentence if it's a numbered line
            # Match pattern: number + period + sentence
            match = re.match(r'^\d+\.(.*)', line)
            sentence = match.group(1).strip()
            if sentence:
                self.data[self.current_dialect].append(sentence)
        
        return self.data
    
    def get_statistics(self):
        """Get statistics about the parsed data."""
        stats = {
            'total_dialects': len(self.data),
            'total_sentences': sum(len(sentences) for sentences in self.data.values()),
            'sentences_per_dialect': {
                dialect: len(sentences) 
                for dialect, sentences in self.data.items()
            }
        }
        return stats
    
    def display_summary(self):
        """Display a summary of parsed data."""
        stats = self.get_statistics()
        print("\n" + "="*60)
        print("DATASET SUMMARY")
        print("="*60)
        print(f"Total Dialects: {stats['total_dialects']}")
        print(f"Total Sentences: {stats['total_sentences']}")
        print("\nSentences per Dialect:")
        for dialect, count in stats['sentences_per_dialect'].items():
            print(f"  • {dialect}: {count} sentences")
        print("="*60)

In [4]:
# Initialize parser
parser = ArabicTTSParser(r'data\raw\Arabic Speech Sentences for ASR.pdf')

print("Starting PDF parsing...")

# Parse the file
data = parser.parse()
parser.display_summary()
    
print("\nParsing complete!")

Starting PDF parsing...
Read 112 lines from file
Found dialect: العربية  الفصحى
Found dialect: اللهجة المصرية
Found dialect: اللهجة السعودية
Found dialect: اللهجة الشامية
Found dialect: اللهجة المغربية

DATASET SUMMARY
Total Dialects: 5
Total Sentences: 100

Sentences per Dialect:
  • MSA: 20 sentences
  • Egy: 20 sentences
  • Sau: 20 sentences
  • Lav: 20 sentences
  • Mor: 20 sentences

Parsing complete!


<h1>Text-to-Speech</h1>

In [5]:
import torch
from TTS.api import TTS
import os

# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

  import pkg_resources


In [6]:
# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)


In [None]:
# matching speaker reference files
refs = {
    "MSA": "data/references/MSA.wav", # from: https://www.youtube.com/watch?v=nhDySCWLgG4
    "Egy": "data/references/Egy.wav", # from: https://www.youtube.com/watch?v=coHnimwMeaM
    "Sau": "data/references/Sau.wav", # from: https://www.youtube.com/watch?v=Y1HfRhfHwUc
    "Lav": "data/references/Lav.wav", # from: https://www.youtube.com/watch?v=T5XCtOvMHyM
    "Mor": "data/references/Mor.wav" # from: https://www.youtube.com/watch?v=uJ503bWppR4
}

BASE_OUT = "data/generated_speech/outputs"
os.makedirs(BASE_OUT, exist_ok=True)

for dialect, sentences in data.items():
    speaker_wav = refs[dialect]

    # Create a folder per dialect
    dialect_dir = os.path.join(BASE_OUT, dialect)
    os.makedirs(dialect_dir, exist_ok=True)

    for i, text in enumerate(sentences, start=1):
        out_path = os.path.join(dialect_dir, f"{dialect}_{i:02d}.wav")

        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_wav,
            language="ar",
            file_path=out_path
        )

        print(f"[OK] {out_path}")


 > Text splitted to sentences.
['اللغة العربية من أقدم اللغات وأكثرها انتشارًا  في العالم .']
 > Processing time: 18.323134899139404
 > Real-time factor: 2.9496052193524696
[OK] data/generated_speech/outputs\MSA\MSA_01.wav
 > Text splitted to sentences.
['قرأت هذا  الصباح مقالً  عن تطور الذكاء الصطناعي  في الطب.']
 > Processing time: 19.92735767364502
 > Real-time factor: 3.6206182984828
[OK] data/generated_speech/outputs\MSA\MSA_02.wav
 > Text splitted to sentences.
['الشمس تشرق من الشرق وتغيب  في الغرب.']
 > Processing time: 8.142912864685059
 > Real-time factor: 2.0685625422385434
[OK] data/generated_speech/outputs\MSA\MSA_03.wav
 > Text splitted to sentences.
['يحب األطفال اللعب في الحدائق عندما يكون الجو معتدلً .']
 > Processing time: 11.281314373016357
 > Real-time factor: 2.107504591339727
[OK] data/generated_speech/outputs\MSA\MSA_04.wav
 > Text splitted to sentences.
['في كل صباح أتناول كوبًا من القهوة قبل الذهاب إلى  العمل.']
 > Processing time: 12.486547470092773
 > Real-tim

<h1> Adding Controlled Noise </h1>

In [8]:
import numpy as np
import soundfile as sf
import os
import librosa

In [9]:
def generate_pink_noise(length):
    white = np.random.randn(length)
    fft = np.fft.rfft(white)
    freqs = np.fft.rfftfreq(length)
    freqs[0] = 1e-10
    pink_fft = fft / np.sqrt(freqs)
    pink = np.fft.irfft(pink_fft, n=length)
    return pink / np.std(pink)


def generate_brown_noise(length):
    white = np.random.randn(length)
    fft = np.fft.rfft(white)
    freqs = np.fft.rfftfreq(length)
    freqs[0] = 1e-10
    brown_fft = fft / freqs
    brown = np.fft.irfft(brown_fft, n=length)
    return brown / np.std(brown)


def add_noise(audio, noise_type, snr_db):
    audio = audio.astype(np.float32)
    signal_power = np.mean(audio ** 2)
    noise_power = signal_power / (10 ** (snr_db / 10))
    length = len(audio)

    if noise_type == "white":
        noise = np.random.randn(length)
    elif noise_type == "pink":
        noise = generate_pink_noise(length)
    elif noise_type == "brown":
        noise = generate_brown_noise(length)
    elif os.path.exists(noise_type):
        noise, _ = sf.read(noise_type)
        if noise.ndim > 1:
            noise = np.mean(noise, axis=1)
        if len(noise) < length:
            noise = np.tile(noise, int(np.ceil(length / len(noise))))
        noise = noise[:length]
    else:
        raise ValueError(f"Unknown noise type: {noise_type}")

    noise = noise * np.sqrt(noise_power / np.mean(noise ** 2))
    noisy_audio = audio + noise

    max_val = np.max(np.abs(noisy_audio))
    if max_val > 1.0:
        noisy_audio = noisy_audio / max_val * 0.99 # Prevents audio clipping by scaling the signal if it exceeds the safe range

    return noisy_audio

In [10]:
INPUT_DIR = "data/generated_speech/outputs"
OUTPUT_DIR = "data/generated_speech/noisy_outputs"
SAMPLE_RATE = 16000
SNR_LEVELS = [0, 5, 10, 15, 20]

NOISE_TYPES = {
    "white": "white",
    "pink": "pink",
    "brown": "brown",
    "street": "data/noise/street_noise.wav",
    "babble": "data/noise/babble_noise.wav",
}

os.makedirs(OUTPUT_DIR, exist_ok=True)

total_files = 0

for dialect in data.keys():
    for i in range(1, 21):
        input_path = os.path.join(INPUT_DIR, dialect, f"{dialect}_{i:02d}.wav")

        # Load clean audio at 16kHz
        audio, _ = librosa.load(input_path, sr=SAMPLE_RATE, mono=True)

        for noise_name, noise_source in NOISE_TYPES.items():
            for snr in SNR_LEVELS:
                out_dir = os.path.join(
                    OUTPUT_DIR,
                    dialect,
                    noise_name,
                    f"snr_{snr:02d}"
                )
                os.makedirs(out_dir, exist_ok=True)

                noisy_audio = add_noise(audio, noise_source, snr)

                out_path = os.path.join(
                    out_dir,
                    f"{dialect}_{i:02d}.wav"
                )

                sf.write(out_path, noisy_audio, SAMPLE_RATE)
                total_files += 1

                print(f"  [OK] {out_path}")

print("\n" + "=" * 60)
print("PROCESSING COMPLETE")
print("=" * 60)
print(f"Total noisy files created: {total_files}")

  [OK] data/generated_speech/noisy_outputs\MSA\white\snr_00\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\white\snr_05\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\white\snr_10\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\white\snr_15\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\white\snr_20\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\pink\snr_00\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\pink\snr_05\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\pink\snr_10\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\pink\snr_15\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\pink\snr_20\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\brown\snr_00\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\brown\snr_05\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\brown\snr_10\MSA_01.wav
  [OK] data/generated_speech/noisy_outputs\MSA\brown\snr_15\MSA_01.wav
  [OK] data

<h1> ASR </h1>

In [11]:
import whisper

In [12]:
# Load the Whisper model
model = whisper.load_model("base")

def transcribe(audio_path):
    """Load audio with librosa and transcribe"""
    # Load audio file and resample to 16kHz (Whisper's requirement)
    audio, sr = librosa.load(audio_path, sr=16000, mono=True)
        
    # Transcribe with Arabic language
    result = model.transcribe(audio, language="ar")
    
    return result["text"].strip()

In [13]:
asr_outputs = {}

AUDIO_CLEAN = "data/generated_speech/outputs"

for dialect in data.keys():
    asr_outputs.setdefault(dialect, {})
    asr_outputs[dialect]["clean"] = []

    for i in range(1, 21):
        audio_path = f"{AUDIO_CLEAN}/{dialect}/{dialect}_{i:02d}.wav"
        hyp = transcribe(audio_path)
        asr_outputs[dialect]["clean"].append(hyp)

In [15]:
AUDIO_NOISY = "data/generated_speech/noisy_outputs"

for dialect in asr_outputs.keys():
    asr_outputs[dialect]["noisy"] = {}

    for noise in NOISE_TYPES.keys():
        asr_outputs[dialect]["noisy"][noise] = {}

        for snr in SNR_LEVELS:
            asr_outputs[dialect]["noisy"][noise][snr] = []

            for i in range(1, 21):
                audio_path = (
                    f"{AUDIO_NOISY}/{dialect}/{noise}/snr_{snr:02d}/"
                    f"{dialect}_{i:02d}.wav"
                )

                hyp = transcribe(audio_path)
                asr_outputs[dialect]["noisy"][noise][snr].append(hyp)

<h1> Performance Evaluation </h1>

In [76]:
import pyarabic.araby as araby
from jiwer import wer

In [77]:
def normalize_ar(text):
    text = text.strip()
    text = re.sub(r"[^\u0600-\u06FF\s]", "", text)  # keep Arabic only
    text = re.sub(r'[أإآٱ]', 'ا', text)
    text = araby.strip_diacritics(text)
    text = re.sub(r"\s+", " ", text)
    return text

In [78]:
wers = {}

for dialect in data.keys():
    wers[dialect] = {}

    # -------- CLEAN --------
    wers[dialect]["clean"] = []

    for ref, hyp in zip(data[dialect], asr_outputs[dialect]["clean"]):
        ref_n = normalize_ar(ref)
        hyp_n = normalize_ar(hyp)

        w = wer(ref_n, hyp_n)
        wers[dialect]["clean"].append(w)

    # -------- NOISY --------
    wers[dialect]["noisy"] = {}

    for noise in NOISE_TYPES.keys():
        wers[dialect]["noisy"][noise] = {}

        for snr in SNR_LEVELS:
            wers[dialect]["noisy"][noise][snr] = []

            for ref, hyp in zip(
                data[dialect],
                asr_outputs[dialect]["noisy"][noise][snr]
            ):
                ref_n = normalize_ar(ref)
                hyp_n = normalize_ar(hyp)

                w = wer(ref_n, hyp_n)
                wers[dialect]["noisy"][noise][snr].append(w)


In [79]:
wer_avgs = {}

for dialect in wers.keys():
    wer_avgs[dialect] = {}

    # -------- CLEAN --------
    clean_wers = wers[dialect]["clean"]
    wer_avgs[dialect]["clean"] = sum(clean_wers) / len(clean_wers)

    # -------- NOISY --------
    wer_avgs[dialect]["noisy"] = {}

    for noise in wers[dialect]["noisy"].keys():
        wer_avgs[dialect]["noisy"][noise] = {}

        for snr in wers[dialect]["noisy"][noise].keys():
            snr_wers = wers[dialect]["noisy"][noise][snr]
            wer_avgs[dialect]["noisy"][noise][snr] = (
                sum(snr_wers) / len(snr_wers)
            )

In [80]:
print("=" * 70)
print("AVERAGE WER RESULTS")
print("=" * 70)

for dialect in wer_avgs.keys():
    print(f"\nDialect: {dialect}")
    print(f"  Clean WER: {wer_avgs[dialect]['clean']:.3f}")

    for noise, snrs in wer_avgs[dialect]["noisy"].items():
        print(f"  Noise: {noise}")
        for snr, avg in snrs.items():
            print(f"    SNR {snr:>2} dB → WER: {avg:.3f}")

AVERAGE WER RESULTS

Dialect: MSA
  Clean WER: 0.329
  Noise: white
    SNR  0 dB → WER: 0.931
    SNR  5 dB → WER: 0.725
    SNR 10 dB → WER: 0.568
    SNR 15 dB → WER: 0.440
    SNR 20 dB → WER: 0.382
  Noise: pink
    SNR  0 dB → WER: 0.326
    SNR  5 dB → WER: 0.342
    SNR 10 dB → WER: 0.347
    SNR 15 dB → WER: 0.321
    SNR 20 dB → WER: 0.338
  Noise: brown
    SNR  0 dB → WER: 0.351
    SNR  5 dB → WER: 0.351
    SNR 10 dB → WER: 0.336
    SNR 15 dB → WER: 0.324
    SNR 20 dB → WER: 0.324
  Noise: street
    SNR  0 dB → WER: 0.569
    SNR  5 dB → WER: 0.449
    SNR 10 dB → WER: 0.364
    SNR 15 dB → WER: 0.335
    SNR 20 dB → WER: 0.321
  Noise: babble
    SNR  0 dB → WER: 0.555
    SNR  5 dB → WER: 0.424
    SNR 10 dB → WER: 0.371
    SNR 15 dB → WER: 0.332
    SNR 20 dB → WER: 0.347

Dialect: Egy
  Clean WER: 0.528
  Noise: white
    SNR  0 dB → WER: 0.941
    SNR  5 dB → WER: 0.816
    SNR 10 dB → WER: 0.660
    SNR 15 dB → WER: 0.567
    SNR 20 dB → WER: 0.527
  Noise: pink

<h2> Plots </h2>

In [82]:
import matplotlib.pyplot as plt
import seaborn as sns

os.makedirs("plots", exist_ok=True)

In [83]:
plt.rcParams.update({
    "font.size": 10,
    "axes.grid": True,
    "grid.alpha": 0.3
})

In [84]:
dialects = list(wer_avgs.keys())
clean_wers = [wer_avgs[d]["clean"] for d in dialects]

plt.figure(figsize=(6, 4))
plt.bar(dialects, clean_wers)
plt.ylabel("Word Error Rate (WER)")
plt.xlabel("Dialect")
plt.title("Clean Speech Recognition Performance per Dialect")
plt.ylim(0, 1)

plt.tight_layout()
plt.savefig("plots/Clean WER per Dialect (Bar Chart).png")

In [85]:
avg_wer_vs_snr = {}

for dialect in wer_avgs:
    avg_wer_vs_snr[dialect] = []
    for snr in SNR_LEVELS:
        vals = [
            wer_avgs[dialect]["noisy"][noise][snr]
            for noise in wer_avgs[dialect]["noisy"]
        ]
        avg_wer_vs_snr[dialect].append(sum(vals) / len(vals))

plt.figure(figsize=(7, 5))

for dialect, values in avg_wer_vs_snr.items():
    plt.plot(SNR_LEVELS, values, marker="o", label=dialect)

plt.xlabel("Signal-to-Noise Ratio (dB)")
plt.ylabel("Word Error Rate (WER)")
plt.title("WER vs SNR Averaged Over Noise Types")
plt.ylim(0, 1)
plt.legend(title="Dialect")

plt.tight_layout()
plt.savefig("plots/WER vs SNR (Noise-Averaged), per Dialect (Line Plot).png")

In [86]:
dialect = "Egy"

plt.figure(figsize=(7, 5))

for noise in wer_avgs[dialect]["noisy"]:
    values = [
        wer_avgs[dialect]["noisy"][noise][snr]
        for snr in SNR_LEVELS
    ]
    plt.plot(SNR_LEVELS, values, marker="o", label=noise)

plt.xlabel("Signal-to-Noise Ratio (dB)")
plt.ylabel("Word Error Rate (WER)")
plt.title(f"WER vs SNR for {dialect} Dialect Across Noise Types")
plt.ylim(0, 1)
plt.legend(title="Noise Type")

plt.tight_layout()
plt.savefig("plots/WER vs SNR per Noise Type (Single Dialect).png")

In [87]:
snr = 5
dialects = list(wer_avgs.keys())
noises = list(next(iter(wer_avgs.values()))["noisy"].keys())

heatmap_data = np.array([
    [wer_avgs[d]["noisy"][n][snr] for n in noises]
    for d in dialects
])

plt.figure(figsize=(7, 4))
sns.heatmap(
    heatmap_data,
    xticklabels=noises,
    yticklabels=dialects,
    annot=True,
    fmt=".2f",
    cmap="viridis"
)

plt.title(f"WER Heatmap Across Dialects and Noise Types (SNR = {snr} dB)")
plt.xlabel("Noise Type")
plt.ylabel("Dialect")

plt.tight_layout()
plt.savefig("plots/Heatmap.png")

In [88]:
dialect = "Egy"
SNRS = [0, 5, 10, 15, 20]
NOISES = list(wer_avgs[dialect]["noisy"].keys())

num_snrs = len(SNRS)
num_noises = len(NOISES)

x = np.arange(num_snrs)  # SNR positions
width = 0.15             # width of each bar

plt.figure(figsize=(8, 5))

for i, noise in enumerate(NOISES):
    wer_values = [
        wer_avgs[dialect]["noisy"][noise][snr]
        for snr in SNRS
    ]
    plt.bar(
        x + i * width,
        wer_values,
        width,
        label=noise
    )

plt.xlabel("Signal-to-Noise Ratio (dB)")
plt.ylabel("Word Error Rate (WER)")
plt.title(f"WER vs SNR for {dialect} Dialect Across Noise Types")
plt.xticks(
    x + width * (num_noises - 1) / 2,
    [str(snr) for snr in SNRS]
)
plt.ylim(0, 1)
plt.legend(title="Noise Type", ncol=2)

plt.tight_layout()
plt.savefig("plots/Grouped bar chart for ONE dialect.png")

<h2> Text Outputs </h2>

In [89]:
import json

In [81]:
print(data)
print(asr_outputs)
print(wers)

{'MSA': ['اللغة العربية من أقدم اللغات وأكثرها انتشارًا  في العالم .', 'قرأت هذا  الصباح مقالً  عن تطور الذكاء الصطناعي  في الطب.', 'الشمس تشرق من الشرق وتغيب  في الغرب.', 'يحب األطفال اللعب في الحدائق عندما يكون الجو معتدلً .', 'في كل صباح أتناول كوبًا من القهوة قبل الذهاب إلى  العمل.', 'يحتاج اإلنسان إلى الماء والهواء والغذاء ليعيش بصحة جيدة.', 'تسافر الطيور في أسراب بحثًا عن الدفء في فصل  الشتاء.', 'تعلم البرمجة أصبح مهارة ضرورية في هذا العصر  الرقمي.', 'القطط من أكثر الحيوانات األليفة المحبوبة لدى الناس.', 'يعمل األطباء بجد إلنقاذ األرواح  ومساعدة المرضى.', 'سأزور معرض الكتاب الدولي األسبوع القادم لشراء بعض الروايات.', 'في المساء نجتمع كأسرة لتناول العشاء والتحدث عن يومنا.', 'القراءة تفتح آفاقًا جديدة للعقل وتزيد من المعرفة.', 'في عطلة نهاية األسبوع أحب الذهاب إلى البحر مع أصدقائي .', 'تتميز مصر بتاريخ عريق وحضارة تمتد آللف السنين.', 'تطوير التقنيات الحديثة يسهم في تحسين جودة الحياة اليومية.', 'تحتاج النباتات إلى ضوء الشمس لتنمو وتزدهر.', 'من المهم أن نحافظ  على  البيئة من التلوث و

In [91]:

with open('data/text_outputs/parsed_arabic_tts_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

with open('data/text_outputs/ASR_outputs.json', 'w', encoding='utf-8') as f:
    json.dump(asr_outputs, f, ensure_ascii=False, indent=2)

with open('data/text_outputs/WERs.json', 'w', encoding='utf-8') as f:
    json.dump(wers, f, ensure_ascii=False, indent=2)