<a href="https://colab.research.google.com/github/dt-cs/IST-402-W8-L1/blob/main/Speech_to_Image.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =====================================================
#  Audio-to-Image Generator — TESTED & WORKING
#  Run this entire cell in Google Colab
# =====================================================

# ==================== STEP 1: Clean Environment ====================
# Force-kill (SIGKILL) any running processes matched by `pkill` for the two
# tools this cell launches later; presumably these are leftovers from an
# earlier run of the cell.
print("üßπ Cleaning up...")
import os

for stale_process in ("streamlit", "ngrok"):
    os.system(f'pkill -9 {stale_process}')

# ==================== STEP 2: Install Packages ====================
# NOTE: lines starting with `!` are IPython/Colab shell escapes, not Python,
# so this cell only runs inside a notebook environment.
print("üì¶ Installing packages (2-3 minutes)...")
# Remove any existing copies of these three packages before reinstalling.
!pip uninstall -y transformers diffusers huggingface-hub
# Quietly (-q) install the packages used for tunnelling, the web UI and audio.
!pip install -q pyngrok streamlit soundfile
# Install the newest transformers/diffusers/accelerate. huggingface-hub is not
# named here; presumably it is pulled back in as a dependency - verify.
!pip install -q --upgrade transformers diffusers accelerate

print("‚úÖ Packages installed!")

# ==================== STEP 3: Import & Setup ====================
import time
from pyngrok import ngrok

# Read the ngrok auth token from the NGROK_AUTHTOKEN environment variable when
# it is set, so the secret does not have to be pasted into (and possibly
# shared with) the notebook. If the variable is unset or empty, the literal
# below is used: replace the "NGROK TOKEN" placeholder with your own token.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN") or "NGROK TOKEN"

if NGROK_TOKEN == "NGROK TOKEN":
    # Warn instead of raising so the rest of the cell still runs; the tunnel
    # creation step will surface ngrok's own error if the token is rejected.
    print("WARNING: ngrok token not configured - set NGROK_AUTHTOKEN or "
          "replace the 'NGROK TOKEN' placeholder before launching.")

ngrok.set_auth_token(NGROK_TOKEN)

# Kill existing tunnels so this run starts without any open tunnel
for tunnel in ngrok.get_tunnels():
    ngrok.disconnect(tunnel.public_url)

# ==================== STEP 4: Create Streamlit App ====================
# The Streamlit app is kept as source text and written to app.py, which the
# launch step then runs with `streamlit run app.py`.
app_code = '''
import io
import streamlit as st
import torch
from transformers import pipeline
from diffusers import StableDiffusionPipeline
import time

# Config
st.set_page_config(page_title="üéôÔ∏è Audio-to-Image", layout="centered")

# ==================== Load Models ====================
@st.cache_resource
def load_models():
    """Load the Whisper speech-to-text pipeline and the Stable Diffusion pipeline.

    Decorated with st.cache_resource so the models are built once and reused.
    Both models are placed on the GPU when CUDA is available, otherwise on CPU.

    Returns:
        A (whisper, sd) tuple: the speech-recognition pipeline and the
        StableDiffusionPipeline.
    """
    st.info("Loading AI models... (first run takes 3-5 minutes)")

    # Whisper for speech-to-text
    whisper = pipeline(
        "automatic-speech-recognition",
        model="openai/whisper-tiny",
        device=0 if torch.cuda.is_available() else -1
    )

    # Stable Diffusion for image generation
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sd = StableDiffusionPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5",
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        safety_checker=None
    ).to(device)

    if device == "cuda":
        sd.enable_attention_slicing()

    return whisper, sd

whisper_model, sd_model = load_models()

# ==================== UI ====================
st.title("üéôÔ∏è Audio-to-Image Generator")
st.markdown("Transform your voice into stunning AI-generated images!")
st.markdown("---")

# Input methods
tab1, tab2 = st.tabs(["üé§ Upload Audio", "‚úçÔ∏è Type Text"])

with tab1:
    st.write("Upload an audio file with your image description")
    audio_file = st.file_uploader(
        "Choose audio file",
        type=["wav", "mp3", "m4a", "flac"],
        help="Speak clearly: 'A beautiful sunset over mountains'"
    )

    if audio_file:
        st.audio(audio_file)

        if st.button("üéß Transcribe Audio", type="primary"):
            with st.spinner("Converting speech to text..."):
                try:
                    # Pass the uploaded bytes straight to the pipeline instead
                    # of writing them to a fixed file on disk: no path shared
                    # between sessions, no leftover file, and the real
                    # container format (mp3/m4a/flac) is not mislabelled.
                    result = whisper_model(audio_file.getvalue())
                    prompt_text = result["text"].strip()
                except Exception as e:
                    # Same reporting style as the image-generation step
                    prompt_text = None
                    st.error(f"Transcription failed: {e}")

            if prompt_text:
                st.success(f"‚úÖ Transcription: **{prompt_text}**")
                st.session_state.prompt = prompt_text
            elif prompt_text is not None:
                # The model ran but produced no text
                st.warning("No speech was detected in the audio. Please try another recording.")

with tab2:
    manual_prompt = st.text_area(
        "Describe the image you want to generate:",
        placeholder="Example: A serene lake surrounded by autumn trees at sunset",
        height=100
    )
    # A non-empty typed prompt replaces any stored transcription
    if manual_prompt:
        st.session_state.prompt = manual_prompt

# Settings
with st.expander("‚öôÔ∏è Advanced Settings"):
    col1, col2 = st.columns(2)
    steps = col1.slider("Quality (inference steps)", 10, 50, 25,
                       help="More steps = better quality but slower")
    guidance = col2.slider("Prompt strength", 5.0, 15.0, 7.5,
                          help="Higher = follows prompt more closely")

# Generate button
st.markdown("---")
if st.button("üé® Generate Image", type="primary", use_container_width=True):

    # Get prompt from session state
    final_prompt = st.session_state.get('prompt', None)

    if not final_prompt:
        st.error("‚ùå Please provide audio or text first!")
        st.stop()

    # Generate image
    st.info(f"üé® Generating image from: **{final_prompt}**")
    st.write("This may take 30 seconds to 3 minutes depending on your GPU...")

    progress_bar = st.progress(0)
    start_time = time.time()

    with st.spinner("Creating your masterpiece..."):
        try:
            # Generate
            image = sd_model(
                prompt=final_prompt,
                num_inference_steps=steps,
                guidance_scale=guidance,
                height=512,
                width=512
            ).images[0]

            elapsed = time.time() - start_time
            progress_bar.progress(100)

            # Display
            st.success(f"‚úÖ Generated in {elapsed:.1f} seconds!")
            st.image(image, caption=final_prompt, use_container_width=True)

            # Encode the PNG in memory for the download button, so sessions
            # do not overwrite each other's output through a shared file.
            png_buffer = io.BytesIO()
            image.save(png_buffer, format="PNG")
            st.download_button(
                "üíæ Download Image",
                data=png_buffer.getvalue(),
                file_name=f"ai_art_{int(time.time())}.png",
                mime="image/png",
                use_container_width=True
            )

        except Exception as e:
            st.error(f"‚ùå Generation failed: {e}")
            st.info("Try simplifying your prompt or reducing quality settings")

# Footer
st.markdown("---")
st.caption("üîä Powered by OpenAI Whisper + Stable Diffusion v1.5")

# GPU info
device_info = "üöÄ GPU Accelerated" if torch.cuda.is_available() else "üê¢ CPU Mode (slower)"
st.caption(device_info)
'''

# The source contains non-ASCII characters, so fix the encoding explicitly
# rather than relying on the platform default.
with open("app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

print("‚úÖ App created!")

# ==================== STEP 5: Launch ====================
# Start the Streamlit server in the background, then open an ngrok tunnel to
# port 8501 and print the public address (or troubleshooting hints on failure).
print("\nüöÄ Starting Streamlit...")
# os.system runs the command through /bin/sh, where the bash-only `&>`
# redirect is not portable (a POSIX shell reads it as `&` followed by `>`).
# Use `>/dev/null 2>&1` so both output streams are discarded and only the
# trailing `&` puts the server in the background.
os.system('streamlit run app.py >/dev/null 2>&1 &')
time.sleep(8)  # fixed pause to give the server time to start

print("üåê Creating public URL...")
try:
    public_url = ngrok.connect(8501)
    print("\n" + "="*60)
    print("‚úÖ SUCCESS! Your app is running!")
    print("="*60)
    print("\nüåê Open this URL in your browser:")
    # Print the tunnel's address rather than the tunnel object's repr
    print(f"   {public_url.public_url}")
    print("\nüìå Tips:")
    print("   ‚Ä¢ Keep this Colab notebook running")
    print("   ‚Ä¢ First image generation takes longer (loading models)")
    print("   ‚Ä¢ Use short, clear voice prompts")
    print("   ‚Ä¢ Free Colab = CPU mode (slower but works!)")
    print("\n" + "="*60)

except Exception as e:
    # Top-level boundary of the cell: report the failure with hints
    print(f"\n‚ùå Error: {e}")
    print("\nüîß Troubleshooting:")
    print("   1. Check your ngrok token is correct")
    print("   2. Try: Runtime ‚Üí Restart runtime")
    print("   3. Make sure you replaced the 'NGROK TOKEN' placeholder")

üßπ Cleaning up...
üì¶ Installing packages (2-3 minutes)...
Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: diffusers 0.35.2
Uninstalling diffusers-0.35.2:
  Successfully uninstalled diffusers-0.35.2
Found existing installation: huggingface-hub 0.36.0
Uninstalling huggingface-hub-0.36.0:
  Successfully uninstalled huggingface-hub-0.36.0
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.2/10.2 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m87.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.