# Piper TTS Dataset Preprocessing (Google Colab)

This notebook preprocesses your audio dataset for Piper TTS training.

**Requirements**:
- Your dataset ZIP file uploaded to Google Drive
- Google Colab with GPU runtime (recommended)

### 1. Mount Google Drive

In [None]:
from google.colab import drive
import os

# Mount Google Drive at /content/drive so files stored there are reachable
# from this runtime.
drive.mount('/content/drive')

# Project folder on Drive. It is created if missing and then made the current
# working directory, so the relative paths used by later cells
# ('piper_repo', 'dataset', 'training_dir') resolve inside it.
drive_folder = '/content/drive/MyDrive/piper-model-training'
os.makedirs(drive_folder, exist_ok=True)
%cd {drive_folder}

# Echo the resulting working directory as a quick sanity check.
print(f"\n‚úÖ Working directory: {os.getcwd()}")

### 2. Clone Piper Repository

In [None]:
import os

# Clone Piper only if not already present
if not os.path.exists('piper_repo'):
    print("Cloning Piper repository...")
    !git clone https://github.com/rhasspy/piper.git piper_repo
    print("‚úÖ Repository cloned")
else:
    print("‚úÖ Piper repository already exists")

piper_src_path = os.path.abspath("piper_repo/src/python")
print(f"Piper source: {piper_src_path}")

### 3. Install Dependencies

In [None]:
# Install required packages
!pip install -q cython numpy pandas matplotlib tensorflow
!pip install -q piper-phonemize

print("‚úÖ Dependencies installed")

### 4. Extract Dataset

In [None]:
import zipfile
import os

# TODO: Update this to match your ZIP file name in Google Drive
zip_filename = 'hindi_female_english.zip'  # Change this to your actual filename

# Extraction target (relative to the working directory) and archive location
output_dir = 'dataset'
zip_path = os.path.join(drive_folder, zip_filename)

if not os.path.exists(zip_path):
    # Nothing to extract: report where the archive was expected to be
    print(f"\n‚ùå ERROR: Zip file not found!")
    print(f"Expected location: {zip_path}")
    print(f"\nPlease upload '{zip_filename}' to: {drive_folder}")
    print("\nOr update 'zip_filename' variable in this cell to match your file.")
else:
    print(f"üì¶ Found zip file: {zip_filename}")
    print("Extracting...")

    # Unpack the whole archive into output_dir
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(output_dir)

    print(f"‚úÖ Extracted to '{output_dir}/'")

### 5. Analyze Dataset

In [None]:
import numpy as np
import pandas as pd

# TODO: Update this path to match your dataset structure
# Common paths: 'dataset/english/txt.done.data' or 'dataset/metadata.txt'
data_file = 'dataset/english/txt.done.data'

if os.path.exists(data_file):
    text = open(data_file, 'r', encoding='utf-8', errors='ignore').read()
    
    print("üìä Dataset Preview:")
    print(text[:1000])
    
    # Analyze vocabulary
    vocab = sorted(set(text))
    print(f"\nüìù Vocabulary size: {len(vocab)} unique characters")
    print(f"Characters: {vocab[:20]}...")  # Show first 20
else:
    print(f"‚ùå Data file not found at: {data_file}")
    print(f"\nAvailable files in dataset:")
    !ls -la dataset/

### 6. Create metadata.csv

In [None]:
import pandas as pd

# TODO: Update paths based on your dataset structure
metadata_input = "dataset/english/txt.done.data"
metadata_output = "dataset/english/metadata.csv"


def festvox_to_ljspeech(raw, speaker='female'):
    """Arrange parsed txt.done.data rows in Piper's metadata.csv column order.

    Args:
        raw: DataFrame produced by splitting each input line on double quotes.
            Column 0 holds the text before the opening quote (for example
            '( utt_0001 ') and column 1 holds the quoted transcript.
        speaker: Speaker label written on every row.

    Returns:
        DataFrame with columns 'id', 'speaker', 'text', in that order. Piper's
        ljspeech layout is `id|text` or `id|speaker|text`, so the transcript
        must be the LAST column; putting the speaker label last would make
        the label be read as the transcript of every utterance.
    """
    # Clean filename: remove the leading '(' and surrounding whitespace
    utt_id = raw[0].str.replace(r'\(', '', regex=True).str.strip()
    # Clean transcript
    transcript = raw[1].str.strip()
    return pd.DataFrame({'id': utt_id, 'speaker': speaker, 'text': transcript})


if os.path.exists(metadata_input):
    # Read file split by double quotes. dtype=str keeps transcripts that look
    # numeric (e.g. "1984") as text so the .str accessors keep working.
    raw_rows = pd.read_csv(
        metadata_input, sep='"', usecols=[0, 1], header=None, dtype=str
    )

    # Add speaker info (optional)
    df = festvox_to_ljspeech(raw_rows, speaker='female')  #  Change to 'male' or speaker ID as needed

    # Save metadata.csv as id|speaker|text (no header row, no index column)
    df.to_csv(metadata_output, sep='|', index=False, header=False)

    print(f"‚úÖ Created metadata.csv with {len(df)} entries")
    print(f"\nPreview:")
    display(df.head())
else:
    print(f"‚ùå Input file not found: {metadata_input}")

### 7. Run Preprocessing

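This step runs `piper_train.preprocess` on the dataset: it processes the audio files and their transcripts and writes the training data (`config.json` and `dataset.jsonl`) to `training_dir/`.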

In [None]:
import os

# Define paths
# dataset_path and piper_src_path are made absolute before being interpolated
# into the shell command below; output_dir is left relative to the current
# working directory.
# NOTE: `output_dir` reuses the name the extraction cell set to 'dataset';
# from here on it refers to the preprocessing output folder.
dataset_path = os.path.abspath("dataset/english")  # Update if different
piper_src_path = os.path.abspath("piper_repo/src/python")
output_dir = "training_dir"

print(f"Dataset: {dataset_path}")
print(f"Output: {output_dir}")
print(f"\nStarting preprocessing...\n")

# Run preprocessing
# PYTHONPATH points at the cloned Piper sources so that `piper_train` can be
# imported with `-m` without installing it as a package.
# NOTE(review): with `--dataset-format ljspeech` the tool presumably reads
# metadata.csv from the input directory - confirm against the Piper docs.
!PYTHONPATH="{piper_src_path}" python3 -m piper_train.preprocess \
  --language en \
  --input-dir "{dataset_path}" \
  --output-dir "{output_dir}" \
  --dataset-format ljspeech \
  --single-speaker \
  --sample-rate 22050

# This message is printed unconditionally: the exit status of the command
# above is not checked, so scroll up to confirm that it actually succeeded.
print("\n‚úÖ Preprocessing complete!")

### 8. Build Monotonic Align Extension

In [None]:
import sys
import subprocess

# Directory where the monotonic_align build is run (expected to hold setup.py)
monotonic_align_path = os.path.join(piper_src_path, "piper_train/vits/monotonic_align")

print(f"Building in: {monotonic_align_path}")

try:
    # Run the build with an argument list and `cwd` instead of a shell string:
    # no quoting is needed, so paths containing quotes or other shell
    # metacharacters cannot break (or alter) the command.
    subprocess.check_call(
        [sys.executable, "setup.py", "build_ext", "--inplace"],
        cwd=monotonic_align_path,
    )
    print("\n‚úÖ Monotonic align built successfully")
except (subprocess.CalledProcessError, OSError) as e:
    # CalledProcessError: the build exited with a non-zero status.
    # OSError: the build directory or interpreter could not be found/started
    # (previously reported by the shell as a failed `cd`), so it is reported
    # the same way instead of raising.
    print(f"\n‚ùå Build failed: {e}")

### 9. Verify Setup

In [None]:
# Artifacts the previous steps should have produced, as (label, path) pairs.
# A path containing "*" is treated as a glob pattern.
import glob

checks = [
    ("metadata.csv", "dataset/english/metadata.csv"),
    ("config.json", "training_dir/config.json"),
    ("dataset.jsonl", "training_dir/dataset.jsonl"),
    ("monotonic_align", os.path.join(monotonic_align_path, "core*.so"))
]

print("\nüîç Verification:")
print("="*50)

all_good = True
for label, target in checks:
    if "*" not in target:
        # Plain path: a simple existence test is enough
        found = os.path.exists(target)
    else:
        # Wildcard (.so file): present when at least one file matches
        found = len(glob.glob(target)) > 0

    if found:
        mark = "‚úÖ"
    else:
        mark = "‚ùå"
        all_good = False
    print(f"{mark} {label}")

print("="*50)
if not all_good:
    print("\n‚ö†Ô∏è Some checks failed. Review the output above.")
else:
    print("\nüéâ All checks passed! Ready for training.")
    print("\nNext step: Run the training notebook or copy training_dir to your local machine.")

### 10. Preview Training Data (Optional)

In [None]:
# Show first few lines of dataset.jsonl
dataset_jsonl = "training_dir/dataset.jsonl"


def preview_jsonl(path, limit=3):
    """Return the first `limit` lines of a text file and its total line count.

    The file is read once, as UTF-8, instead of being opened a second time
    just to count lines.

    Args:
        path: Path of the file to read.
        limit: Maximum number of leading lines to return.

    Returns:
        Tuple `(head, total)`: `head` is a list of the first `limit` lines with
        surrounding whitespace stripped, `total` is the number of lines in the
        file (0 for an empty file).
    """
    head = []
    total = 0
    with open(path, 'r', encoding='utf-8') as f:
        for total, line in enumerate(f, start=1):
            if total <= limit:
                head.append(line.strip())
    return head, total


if os.path.exists(dataset_jsonl):
    print("üìÑ Preview of training data:\n")
    # Show first 3 entries and count total entries in the same pass
    head, total = preview_jsonl(dataset_jsonl, limit=3)
    for entry in head:
        print(entry)
    print(f"\nüìä Total training examples: {total}")
else:
    print("‚ùå dataset.jsonl not found")