# Piper TTS Dataset Preprocessing (Google Colab)

This notebook preprocesses your audio dataset for Piper TTS training.

**Your Dataset**: `/content/drive/MyDrive/english` folder

### 1. Mount Google Drive

In [None]:
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Navigate to working directory
%cd /content

print(f"\n‚úÖ Google Drive mounted successfully")
print(f"Working directory: {os.getcwd()}")

### 2. Clone Piper Repository

In [None]:
import os

# Clone Piper only if not already present
if not os.path.exists('piper_repo'):
    print("Cloning Piper repository...")
    !git clone https://github.com/rhasspy/piper.git piper_repo
    print("‚úÖ Repository cloned")
else:
    print("‚úÖ Piper repository already exists")

piper_src_path = os.path.abspath("piper_repo/src/python")
print(f"Piper source: {piper_src_path}")

### 3. Install Build Dependencies

In [None]:
print("üì¶ Installing build dependencies...\n")

# Install core build tools
!pip install -q --upgrade pip setuptools wheel
!pip install -q cython numpy pybind11

print("‚úÖ Build tools installed")

### 4. Build and Install piper-phonemize from Source

Since piper-phonemize isn't on PyPI, we'll build it from the GitHub source.

In [None]:
import os

# Clone piper-phonemize repo
if not os.path.exists('piper-phonemize'):
    print("Cloning piper-phonemize repository...")
    !git clone https://github.com/rhasspy/piper-phonemize.git
    print("‚úÖ Repository cloned")
else:
    print("‚úÖ piper-phonemize repository already exists")

# Install system dependencies for espeak-ng
print("\nInstalling system dependencies...")
!apt-get update -qq
!apt-get install -y -qq espeak-ng libespeak-ng-dev

# Build and install piper-phonemize
print("\nBuilding piper-phonemize from source...")
%cd piper-phonemize
!pip install -e .
%cd /content

print("\n‚úÖ piper-phonemize installed")

# Verify installation
try:
    import piper_phonemize
    print("‚úÖ piper_phonemize imported successfully!")
except ImportError as e:
    print(f"‚ùå Import failed: {e}")

### 5. Install ML and Audio Dependencies

In [None]:
print("üì¶ Installing ML and audio processing libraries...\n")

# Install ML frameworks
!pip install -q torch>=2.0.0
!pip install -q lightning>=2.0.0

# Install audio processing
!pip install -q librosa<1
!pip install -q numba==0.62.1

# Install other requirements
!pip install -q onnx onnxruntime
!pip install -q tensorboard tensorboardX
!pip install -q pysilero-vad>=2.1
!pip install -q jsonargparse[signatures]>=4.27.7
!pip install -q pathvalidate>=3
!pip install -q phonemizer Unidecode tqdm inflect matplotlib pandas

print("\n‚úÖ All dependencies installed!")

### 6. Verify Dataset Folder

In [None]:
import os

# Your dataset path
dataset_path = '/content/drive/MyDrive/english'

if os.path.exists(dataset_path):
    print(f"‚úÖ Found dataset folder")
    print(f"\nüìÇ Contents:")
    !ls -la "{dataset_path}"
else:
    print(f"\n‚ùå ERROR: Dataset folder not found!")
    print(f"Expected location: {dataset_path}")
    print(f"\nPlease check that your audio files are uploaded to this folder in Google Drive.")

### 7. Analyze Dataset

In [None]:
import numpy as np
import pandas as pd

# Look for transcript file
data_file = os.path.join(dataset_path, 'txt.done.data')  # Adjust if your file has a different name

if os.path.exists(data_file):
    text = open(data_file, 'r', encoding='utf-8', errors='ignore').read()
    
    print("üìä Dataset Preview:")
    print(text[:1000])
    
    # Analyze vocabulary
    vocab = sorted(set(text))
    print(f"\nüìù Vocabulary size: {len(vocab)} unique characters")
    print(f"Characters: {vocab[:20]}...")  # Show first 20
else:
    print(f"‚ùå Transcript file not found at: {data_file}")
    print(f"\nSearching for text files...")
    !find "{dataset_path}" -type f \( -name "*.txt" -o -name "*.data" -o -name "*.csv" \) | head -10

### 8. Create metadata.csv

In [None]:
import os

import pandas as pd

# Input and output paths
metadata_input = os.path.join(dataset_path, "txt.done.data")  # Update if your file has a different name
metadata_output = os.path.join(dataset_path, "metadata.csv")

if os.path.exists(metadata_input):
    # Split each line on the double quote: column 0 is the text before the
    # first quote (the file id), column 1 is the quoted transcript.
    # NOTE(review): presumably Festival-style lines such as `( id "text" )` - confirm.
    df = pd.read_csv(metadata_input, sep='"', usecols=[0, 1], header=None)

    # Clean filename: remove '(' and surrounding whitespace
    df[0] = df[0].str.replace(r'\(', '', regex=True).str.strip()

    # Clean transcript
    df[1] = df[1].str.strip()

    # Lines without a quoted transcript parse to NaN; writing them would
    # produce entries with an empty text field, so drop and report them.
    rows_before = len(df)
    df = df.dropna(subset=[0, 1])
    dropped = rows_before - len(df)
    if dropped:
        print(f"Skipped {dropped} line(s) without a parsable file id or transcript")

    # Write exactly two columns: `id|text`.
    # The preprocessing step in this notebook runs with --single-speaker, and
    # Piper's ljspeech layout is `id|text` for one speaker or `id|speaker|text`
    # for several. A trailing speaker column (`id|text|speaker`) matches neither
    # layout and risks the speaker label being read as the transcript.
    # For a multi-speaker dataset, insert the speaker as the MIDDLE column and
    # remove --single-speaker from the preprocessing command.
    df.to_csv(metadata_output, sep='|', index=False, header=False)

    print(f"‚úÖ Created metadata.csv with {len(df)} entries")
    print(f"Saved to: {metadata_output}")
    print(f"\nPreview:")
    display(df.head())
else:
    print(f"‚ùå Input file not found: {metadata_input}")
    print("\nüí° Tip: Update the 'metadata_input' variable to match your transcript file name.")

### 9. Run Preprocessing

This converts audio files and creates training data.

In [None]:
import os

# Paths
dataset_input = '/content/drive/MyDrive/english'
piper_src_path = os.path.abspath("piper_repo/src/python")
output_dir = "/content/drive/MyDrive/training_dir"  # Save to Drive

print(f"Dataset: {dataset_input}")
print(f"Output: {output_dir}")
print(f"\nStarting preprocessing...\n")

# Run preprocessing
!PYTHONPATH="{piper_src_path}" python3 -m piper_train.preprocess \
  --language en \
  --input-dir "{dataset_input}" \
  --output-dir "{output_dir}" \
  --dataset-format ljspeech \
  --single-speaker \
  --sample-rate 22050

print("\n‚úÖ Preprocessing complete!")
print(f"Training data saved to: {output_dir}")

### 10. Build Monotonic Align Extension

In [None]:
import os
import subprocess
import sys

# Directory holding the monotonic_align setup.py inside the Piper sources
monotonic_align_path = os.path.join(piper_src_path, "piper_train/vits/monotonic_align")

print(f"Building in: {monotonic_align_path}")

try:
    # Pass the command as an argument list and set the working directory via
    # `cwd` instead of composing a shell string: a path containing a quote or
    # other shell metacharacter can then neither break nor alter the command.
    # sys.executable ensures the build uses the same interpreter as the kernel.
    subprocess.check_call(
        [sys.executable, "setup.py", "build_ext", "--inplace"],
        cwd=monotonic_align_path,
    )
    print("\n‚úÖ Monotonic align built successfully")
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: the build exited non-zero.
    # OSError: the directory or interpreter could not be found/started
    # (previously surfaced as a failed `cd` inside the shell).
    print(f"\n‚ùå Build failed: {e}")

### 11. Verify Setup

In [None]:
import glob

# (label, path) pairs to verify; a '*' in the path marks it as a glob pattern
checks = [
    ("metadata.csv", os.path.join(dataset_input, "metadata.csv")),
    ("config.json", os.path.join(output_dir, "config.json")),
    ("dataset.jsonl", os.path.join(output_dir, "dataset.jsonl")),
    ("monotonic_align", os.path.join(monotonic_align_path, "core*.so"))
]

separator = "=" * 50

print("\nüîç Verification:")
print(separator)

all_good = True
for name, path in checks:
    # Glob patterns pass when at least one file matches; plain paths must exist
    exists = bool(glob.glob(path)) if "*" in path else os.path.exists(path)
    marker = "‚úÖ" if exists else "‚ùå"
    print(marker + " " + name)
    all_good = exists and all_good

print(separator)
if not all_good:
    print("\n‚ö†Ô∏è Some checks failed. Review the output above.")
else:
    print("\nüéâ All checks passed! Ready for training.")
    print(f"\nTraining directory: {output_dir}")
    print("\nNext step: Use this training_dir with your training notebook.")

### 12. Preview Training Data (Optional)

In [None]:
# Print the first three records of dataset.jsonl and the total record count
dataset_jsonl = os.path.join(output_dir, "dataset.jsonl")

if not os.path.exists(dataset_jsonl):
    print("‚ùå dataset.jsonl not found")
else:
    print("üìÑ Preview of training data:\n")
    total = 0
    with open(dataset_jsonl, 'r') as f:
        for line in f:
            # Only the first 3 entries are echoed; every line is counted
            if total < 3:
                print(line.strip())
            total += 1
    print(f"\nüìä Total training examples: {total}")