# TTS Pipeline Correlation Verification

Compares board outputs with calibration data at each stage:
1. **Text Encoder**: `text_emb` output
2. **Vector Estimator**: `denoised_latent` output (after 1, 3, and 10 steps)
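3. **Vocoder**: `wav_tts` audio output (compared against the board audio in `wav_tts_calib.raw`, labelled as generated from the calibration latent)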

In [None]:
import numpy as np
import os

def cosine_similarity(a, b):
    """Compute the cosine similarity between two arrays.

    Both inputs are flattened first, so they only need to hold the same
    number of elements, not the same shape.

    Args:
        a: First array (or array-like).
        b: Second array (or array-like) with as many elements as ``a``.

    Returns:
        The cosine of the angle between the flattened inputs, or ``0.0``
        when either input has zero norm (all zeros or empty), where the
        ratio is undefined.
    """
    vec_a = np.asarray(a).ravel()
    vec_b = np.asarray(b).ravel()
    norm_a = np.linalg.norm(vec_a)
    norm_b = np.linalg.norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        # Dividing by a zero norm would produce NaN and a RuntimeWarning;
        # report "no similarity" for the degenerate case instead.
        return 0.0
    return np.dot(vec_a, vec_b) / (norm_a * norm_b)

def compare_outputs(name, output_path, calib_path, transpose=None):
    """Compare a raw float32 output file against a reference file.

    Both files are read as flat float32 arrays and compared by cosine
    similarity. The value ranges and the similarity are printed.

    Args:
        name: Label printed with the result and with any skip/error message.
        output_path: Path to the raw file under test.
        calib_path: Path to the raw reference file.
        transpose: Optional 3-D shape describing the layout of the output
            file. When given, the output is reshaped to this shape and its
            last two axes are swapped before the comparison.

    Returns:
        The cosine similarity, or ``None`` when the comparison could not be
        run (missing file, size mismatch, empty data, or a ``transpose``
        shape that does not fit the data).
    """
    # Check the output file first, then the reference, and report the first
    # one that is missing.
    for path in (output_path, calib_path):
        if not os.path.exists(path):
            print(f"[SKIP] {name}: {path} not found")
            return None

    output = np.fromfile(output_path, dtype=np.float32)
    calib = np.fromfile(calib_path, dtype=np.float32)

    if output.size != calib.size:
        print(f"[ERROR] {name}: Size mismatch - output={output.size}, calib={calib.size}")
        return None

    if output.size == 0:
        # min()/max() below raise on zero-size arrays, and the similarity of
        # empty data is meaningless anyway.
        print(f"[ERROR] {name}: No data - both files are empty")
        return None

    # Apply transpose if needed
    if transpose:
        try:
            output = output.reshape(transpose).transpose(0, 2, 1).flatten()
        except ValueError as err:
            # Raised when the shape does not match the element count or is
            # not 3-D; report it like the other comparison failures.
            print(
                f"[ERROR] {name}: Cannot apply transpose shape {transpose} "
                f"to {output.size} elements ({err})"
            )
            return None

    sim = cosine_similarity(output, calib)
    print(f"[{name}]")
    print(f"  Output range: [{output.min():.4f}, {output.max():.4f}]")
    print(f"  Calib range:  [{calib.min():.4f}, {calib.max():.4f}]")
    print(f"  Cosine similarity: {sim:.4f} ({sim*100:.1f}%)")
    return sim

## 1. Text Encoder Output Verification

In [None]:
# Text encoder check: board `text_emb` against the reference text_emb.raw.
# The board file is read as (1, 128, 256); compare_outputs swaps the last two
# axes, giving (1, 256, 128), before the comparison.
text_emb_sim = compare_outputs(
    "Text Encoder - text_emb",
    "./board_output/text_emb.raw",
    "./qnn_calibration/vector_estimator/text_emb.raw",
    (1, 128, 256),
)

## 2. Vector Estimator Output Verification

In [None]:
# Vector estimator checks, one per denoising step count
print("=== Vector Estimator Outputs ===")

# Every run is compared against the same reference latent and is read with
# the same (1, 256, 144) layout before its last two axes are swapped.
ve_calib_latent = "./qnn_calibration/vocoder/latent.raw"
ve_layout = (1, 256, 144)
ve_runs = [
    ("Vector Estimator (1 step)", "./board_output/denoised_latent.raw"),
    ("Vector Estimator (3 steps)", "./board_output/denoised_latent_3steps.raw"),
    ("Vector Estimator (10 steps)", "./board_output/denoised_latent_10steps.raw"),
]

ve_sims = []
for position, (label, latent_raw) in enumerate(ve_runs):
    if position > 0:
        # Blank line between consecutive reports
        print()
    ve_sims.append(
        compare_outputs(
            name=label,
            output_path=latent_raw,
            calib_path=ve_calib_latent,
            transpose=ve_layout,
        )
    )

# Individual names are read by the summary table further down.
ve_1step_sim, ve_3step_sim, ve_10step_sim = ve_sims

## 3. Vocoder Output Verification

In [None]:
# Vocoder audio checks
print("=== Vocoder Audio Outputs ===")

# Reference audio that every vocoder output is compared against.
voc_reference = "./board_output/wav_tts_calib.raw"
voc_checks = [
    # Reference compared with itself: expected to report 1.0 (sanity check).
    ("Vocoder (calibration latent)", voc_reference),
    ("Vocoder (1-step latent) vs Calibration", "./board_output/wav_tts.raw"),
    ("Vocoder (10-step latent) vs Calibration", "./board_output/wav_tts_10steps.raw"),
]

voc_sims = []
for position, (label, audio_raw) in enumerate(voc_checks):
    if position > 0:
        # Blank line between consecutive reports
        print()
    # Audio is a flat stream, so no transpose shape is passed.
    voc_sims.append(
        compare_outputs(name=label, output_path=audio_raw, calib_path=voc_reference)
    )

# Individual names are read by the summary table further down.
voc_calib_sim, voc_1step_sim, voc_10step_sim = voc_sims

## 4. Summary Table

In [None]:
# Print one row per pipeline stage: label, similarity in percent, and a status.
print("="*60)
print("CORRELATION SUMMARY")
print("="*60)
print(f"{'Stage':<35} {'Similarity':>12} {'Status':>10}")
print("-"*60)

# A similarity of None means the comparison was skipped or failed.
results = [
    ("Text Encoder (text_emb)", text_emb_sim),
    ("Vector Estimator (1 step)", ve_1step_sim),
    ("Vector Estimator (3 steps)", ve_3step_sim),
    ("Vector Estimator (10 steps)", ve_10step_sim),
    ("Vocoder (1-step) vs Calib Audio", voc_1step_sim),
    ("Vocoder (10-step) vs Calib Audio", voc_10step_sim),
]

for name, sim in results:
    if sim is None:
        print(f"{name:<35} {'N/A':>12} {'SKIP':>10}")
        continue

    # Above 0.8 is OK, above 0.5 is WARN, anything else is LOW.
    if sim > 0.8:
        status = "OK"
    elif sim > 0.5:
        status = "WARN"
    else:
        status = "LOW"

    # Format the percentage (including the % sign) as one 12-wide field so the
    # row lines up with the header and the N/A rows.
    percent = f"{sim * 100:.1f}%"
    print(f"{name:<35} {percent:>12} {status:>10}")

print("="*60)

## 5. WAV Export for Audio Playback (Optional)

In [None]:
# Convert to WAV files for playback
from scipy.io import wavfile
import os

# Sample rate written to each WAV file and used for the duration readout.
# NOTE(review): assumes the vocoder emits 44.1 kHz float32 samples - confirm.
SAMPLE_RATE_HZ = 44100

# (raw float32 input, WAV output) pairs
audio_files = [
    ("./board_output/wav_tts_calib.raw", "./board_output/output_calib.wav"),
    ("./board_output/wav_tts.raw", "./board_output/output_1step.wav"),
    ("./board_output/wav_tts_10steps.raw", "./board_output/output_10steps.wav"),
]

for raw_path, wav_path in audio_files:
    if not os.path.exists(raw_path):
        # Report missing inputs the way compare_outputs does instead of
        # skipping them silently.
        print(f"[SKIP] {raw_path} not found")
        continue
    data = np.fromfile(raw_path, dtype=np.float32)
    wavfile.write(wav_path, SAMPLE_RATE_HZ, data)
    print(f"Saved: {wav_path} ({data.size/SAMPLE_RATE_HZ:.2f} sec)")