In [None]:
# =========================================================
# 🎙️ VoiceMorphAI — Public Working Version (No HF login)
# =========================================================

!pip install -q torch torchvision torchaudio diffusers transformers accelerate gradio librosa soundfile soxr matplotlib scipy

import gradio as gr
import torch, librosa, soundfile as sf, numpy as np, io, matplotlib.pyplot as plt
from diffusers import AudioLDM2Pipeline   # open public model

# ---- Load a public audio-diffusion model (no token needed)
# The checkpoint is fetched by its hub identifier and then moved to the GPU
# when CUDA is available, falling back to the CPU otherwise.
model_id = "cvssp/audioldm2-large"  # open-access model
pipe = AudioLDM2Pipeline.from_pretrained(model_id).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)

# ---- Core conversion function
def convert_voice(prompt, input_audio):
    """Generate audio from a text prompt, matching the input clip's duration.

    The uploaded clip is only measured for its duration; its content is not
    passed to the diffusion pipeline, which is driven by ``prompt`` alone.

    Args:
        prompt: Text description of the desired voice/sound style.
        input_audio: Filesystem path of the uploaded audio clip (the Gradio
            ``Audio`` input is configured with ``type="filepath"``).

    Returns:
        A ``(wav_path, png_path)`` tuple: the path of the generated WAV file
        (written at 16 kHz) and the path of a PNG image holding its
        log-frequency spectrogram.

    Raises:
        gr.Error: If no audio was provided or the clip has zero duration.
    """
    # Function-scope imports keep this block self-contained: ``tempfile`` is
    # not imported at file level, and ``librosa.display`` is a submodule that
    # is not guaranteed to be loaded by a bare ``import librosa``.
    import tempfile

    import librosa.display

    sample_rate = 16000

    if not input_audio:
        # Submitting the form without a clip hands us None instead of a path.
        raise gr.Error("Please upload an input voice sample first.")

    audio, sr = librosa.load(input_audio, sr=sample_rate)
    dur = librosa.get_duration(y=audio, sr=sr)
    if dur <= 0:
        raise gr.Error("The uploaded audio clip is empty.")

    result = pipe(
        prompt=prompt,
        num_inference_steps=10,
        audio_length_in_s=dur,
    ).audios[0]

    # A unique file per call: a fixed name would let concurrent requests
    # overwrite each other's output. The file is intentionally left on disk
    # (delete=False) so the UI can read it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        out_path = wav_file.name
    sf.write(out_path, result, sample_rate)

    # Spectrogram of the generated audio, drawn through the explicit
    # figure/axes objects rather than pyplot's implicit "current figure".
    fig, ax = plt.subplots(figsize=(8, 3))
    try:
        spec = librosa.amplitude_to_db(np.abs(librosa.stft(result)), ref=np.max)
        img = librosa.display.specshow(
            spec, sr=sample_rate, x_axis="time", y_axis="log", ax=ax
        )
        fig.colorbar(img, ax=ax, format="%+2.0f dB")
        ax.set_title("Generated Spectrogram")
        # The image output needs a file path (or array / PIL image), not an
        # in-memory byte stream, so the PNG is written to a temporary file.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as png_file:
            fig.savefig(png_file, format="png")
            spec_path = png_file.name
    finally:
        # Always release the figure; otherwise figures pile up per request.
        plt.close(fig)

    return out_path, spec_path

# ---- Launch Gradio UI
# Two inputs (style prompt + uploaded clip passed as a file path) feed
# convert_voice; its two return values fill the audio player and the
# spectrogram image. share=True asks Gradio for a public share link.
gr.Interface(
    fn=convert_voice,
    inputs=[
        gr.Textbox(label="Describe new voice style (e.g. calm female, robotic, whisper)"),
        gr.Audio(type="filepath", label="Input voice (.wav)"),
    ],
    outputs=[gr.Audio(label="Generated Voice"), gr.Image(label="Spectrogram")],
    # Title repaired from mis-encoded text (garbled emoji and em dash).
    title="🎵 VoiceMorphAI — Diffusion Voice Conversion (Public Model)",
    description="Upload any voice sample and describe a target style. Generates new voice using AudioLDM2 diffusion model.",
).launch(share=True)