Improve Audio Component (#5966)
* replace <audio> with wavesurfer: add recording, playback, and trimming

* add changeset

* merge cleanup

* improving recording styling

* add recording timer

* add trim region duration

* allow trimming recordings

* clean up playing logic

* add pause_recording event

* remove crop min/max

* add waveform options param

* remove trimmingmode and use mode

* streaming + cleanup

* add changeset

* clean up types

* mobile adjustments

* add min/max length + trim accessibility

* update pnpm lock

* amend source to a list and allow source switching

* fix no microphone found logic

* change undo logic to reset trims

* tweaks

* tweak reset logic

* ensure recording is sent to backend

* fix audio duration reactivity

* list tweak

* clean up

* change source -> sources + restore wasm changes

* formatting

* fix tests

* fix test

* add default sources value in fe + fix audio demos

* fix audio file name test

* add better sources typing

* ui test tweaks

* add default value in templates.py

* formatting

* remove unused prop

* add audio story

* add changeset

* revert sources changes

* remove story id

* fix be test

* fix be test

* fix notebooks

* formatting

* fix test

* fix test again

---------

Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
Co-authored-by: pngwn <hello@pngwn.io>
4 people committed Oct 27, 2023
1 parent 205f613 commit 9cad212
Showing 58 changed files with 5,045 additions and 3,792 deletions.
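
Read together, the bullets above amount to the following component-level API. A hedged sketch: `sources` is confirmed by the demo diffs below, while `waveform_options`, `min_length`, and `max_length` are assumed names inferred from the "add waveform options param" and "add min/max length" bullets, and are not shown in this diff.

import gradio as gr

# Confirmed by the demo diffs: `source` (a string) becomes `sources` (a list),
# which is what enables switching between recording and uploading.
audio = gr.Audio(sources=["microphone", "upload"])

# Assumed parameter names for the remaining bullets; illustrative only.
recorder = gr.Audio(
    sources=["microphone"],
    waveform_options={"waveform_color": "#01C6FF"},  # assumed option key
    min_length=1,   # assumed: shortest accepted recording, in seconds
    max_length=60,  # assumed: longest accepted recording, in seconds
)
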
10 changes: 10 additions & 0 deletions .changeset/great-moles-matter.md
@@ -0,0 +1,10 @@
---
"@gradio/app": minor
"@gradio/audio": minor
"@gradio/icons": minor
"@gradio/storybook": minor
"@gradio/utils": minor
"gradio": minor
---

feat:Improve Audio Component
2 changes: 1 addition & 1 deletion demo/asr/run.ipynb
@@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", " sr, y = audio\n", " y = y.astype(np.float32)\n", " y /= np.max(np.abs(y))\n", "\n", " return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]\n", "\n", "\n", "demo = gr.Interface(\n", " transcribe,\n", " gr.Audio(source=\"microphone\"),\n", " \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: asr"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio torch torchaudio transformers"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from transformers import pipeline\n", "import numpy as np\n", "\n", "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n", "\n", "def transcribe(audio):\n", " sr, y = audio\n", " y = y.astype(np.float32)\n", " y /= np.max(np.abs(y))\n", "\n", " return transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]\n", "\n", "\n", "demo = gr.Interface(\n", " transcribe,\n", " gr.Audio(sources=[\"microphone\"]),\n", " \"text\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
2 changes: 1 addition & 1 deletion demo/asr/run.py
@@ -14,7 +14,7 @@ def transcribe(audio):
 
 demo = gr.Interface(
     transcribe,
-    gr.Audio(source="microphone"),
+    gr.Audio(sources=["microphone"]),
     "text",
 )

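Because `sources` is now a list, a single input can expose both recording and upload, with in-place switching in the UI (the "allow source switching" bullet). A minimal, self-contained variant of the asr demo above — a sketch, not part of this diff:

import gradio as gr

# Stand-in for the transcribe function from demo/asr/run.py above;
# any (sample_rate, numpy_array) -> str handler works here.
def transcribe(audio):
    sr, y = audio
    return f"received {len(y)} samples at {sr} Hz"

# With a list-valued `sources`, the user can toggle between recording
# from the microphone and uploading a file.
demo = gr.Interface(transcribe, gr.Audio(sources=["microphone", "upload"]), "text")

if __name__ == "__main__":
    demo.launch()
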
2 changes: 1 addition & 1 deletion demo/blocks_kitchen_sink/run.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion demo/blocks_kitchen_sink/run.py
@@ -167,7 +167,7 @@ def clear():
     with gr.Tab("Audio"):
         with gr.Row():
             gr.Audio()
-            gr.Audio(source="microphone")
+            gr.Audio(sources=["microphone"])
             gr.Audio(join(KS_FILES, "cantina.wav"))
     with gr.Tab("Other"):
         # gr.Image(source="webcam")
2 changes: 1 addition & 1 deletion demo/kitchen_sink/run.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion demo/kitchen_sink/run.py
@@ -107,7 +107,7 @@ def fn(
         gr.Image(label="Webcam", source="webcam"),
         gr.Video(label="Video"),
         gr.Audio(label="Audio"),
-        gr.Audio(label="Microphone", source="microphone"),
+        gr.Audio(label="Microphone", sources=["microphone"]),
         gr.File(label="File"),
         gr.Dataframe(label="Dataframe", headers=["Name", "Age", "Gender"]),
     ],
2 changes: 1 addition & 1 deletion demo/main_note/run.ipynb
@@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: main_note"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio scipy numpy matplotlib"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "os.mkdir('audio')\n", "!wget -q -O audio/cantina.wav https://github.com/gradio-app/gradio/raw/main/demo/main_note/audio/cantina.wav\n", "!wget -q -O audio/recording1.wav https://github.com/gradio-app/gradio/raw/main/demo/main_note/audio/recording1.wav"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["from math import log2, pow\n", "import os\n", "\n", "import numpy as np\n", "from scipy.fftpack import fft\n", "\n", "import gradio as gr\n", "\n", "A4 = 440\n", "C0 = A4 * pow(2, -4.75)\n", "name = [\"C\", \"C#\", \"D\", \"D#\", \"E\", \"F\", \"F#\", \"G\", \"G#\", \"A\", \"A#\", \"B\"]\n", "\n", "\n", "def get_pitch(freq):\n", " h = round(12 * log2(freq / C0))\n", " n = h % 12\n", " return name[n]\n", "\n", "\n", "def main_note(audio):\n", " rate, y = audio\n", " if len(y.shape) == 2:\n", " y = y.T[0]\n", " N = len(y)\n", " T = 1.0 / rate\n", " yf = fft(y)\n", " yf2 = 2.0 / N * np.abs(yf[0 : N // 2])\n", " xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)\n", "\n", " volume_per_pitch = {}\n", " total_volume = np.sum(yf2)\n", " for freq, volume in zip(xf, yf2):\n", " if freq == 0:\n", " continue\n", " pitch = get_pitch(freq)\n", " if pitch not in volume_per_pitch:\n", " volume_per_pitch[pitch] = 0\n", " volume_per_pitch[pitch] += 1.0 * volume / total_volume\n", " volume_per_pitch = {k: float(v) for k, v in volume_per_pitch.items()}\n", " return volume_per_pitch\n", "\n", "\n", "demo = gr.Interface(\n", " main_note,\n", " gr.Audio(source=\"microphone\"),\n", " gr.Label(num_top_classes=4),\n", " examples=[\n", " [os.path.join(os.path.abspath(''),\"audio/recording1.wav\")],\n", " [os.path.join(os.path.abspath(''),\"audio/cantina.wav\")],\n", " ],\n", " interpretation=\"default\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": "302934307671667531413257853548643485645", "metadata": {}, "source": ["# Gradio Demo: main_note"]}, {"cell_type": "code", "execution_count": null, "id": "272996653310673477252411125948039410165", "metadata": {}, "outputs": [], "source": ["!pip install -q gradio scipy numpy matplotlib"]}, {"cell_type": "code", "execution_count": null, "id": "288918539441861185822528903084949547379", "metadata": {}, "outputs": [], "source": ["# Downloading files from the demo repo\n", "import os\n", "os.mkdir('audio')\n", "!wget -q -O audio/cantina.wav https://github.com/gradio-app/gradio/raw/main/demo/main_note/audio/cantina.wav\n", "!wget -q -O audio/recording1.wav https://github.com/gradio-app/gradio/raw/main/demo/main_note/audio/recording1.wav"]}, {"cell_type": "code", "execution_count": null, "id": "44380577570523278879349135829904343037", "metadata": {}, "outputs": [], "source": ["from math import log2, pow\n", "import os\n", "\n", "import numpy as np\n", "from scipy.fftpack import fft\n", "\n", "import gradio as gr\n", "\n", "A4 = 440\n", "C0 = A4 * pow(2, -4.75)\n", "name = [\"C\", \"C#\", \"D\", \"D#\", \"E\", \"F\", \"F#\", \"G\", \"G#\", \"A\", \"A#\", \"B\"]\n", "\n", "\n", "def get_pitch(freq):\n", " h = round(12 * log2(freq / C0))\n", " n = h % 12\n", " return name[n]\n", "\n", "\n", "def main_note(audio):\n", " rate, y = audio\n", " if len(y.shape) == 2:\n", " y = y.T[0]\n", " N = len(y)\n", " T = 1.0 / rate\n", " yf = fft(y)\n", " yf2 = 2.0 / N * np.abs(yf[0 : N // 2])\n", " xf = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)\n", "\n", " volume_per_pitch = {}\n", " total_volume = np.sum(yf2)\n", " for freq, volume in zip(xf, yf2):\n", " if freq == 0:\n", " continue\n", " pitch = get_pitch(freq)\n", " if pitch not in volume_per_pitch:\n", " volume_per_pitch[pitch] = 0\n", " volume_per_pitch[pitch] += 1.0 * volume / total_volume\n", " volume_per_pitch = {k: float(v) for k, v in volume_per_pitch.items()}\n", " return volume_per_pitch\n", "\n", "\n", "demo = gr.Interface(\n", " main_note,\n", " gr.Audio(sources=[\"microphone\"]),\n", " gr.Label(num_top_classes=4),\n", " examples=[\n", " [os.path.join(os.path.abspath(''),\"audio/recording1.wav\")],\n", " [os.path.join(os.path.abspath(''),\"audio/cantina.wav\")],\n", " ],\n", " interpretation=\"default\",\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
2 changes: 1 addition & 1 deletion demo/main_note/run.py
@@ -42,7 +42,7 @@ def main_note(audio):
 
 demo = gr.Interface(
     main_note,
-    gr.Audio(source="microphone"),
+    gr.Audio(sources=["microphone"]),
     gr.Label(num_top_classes=4),
     examples=[
         [os.path.join(os.path.dirname(__file__),"audio/recording1.wav")],
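Note that every hunk above touches only the gr.Audio constructor call; transcribe and main_note themselves are unchanged, so the Python-side contract is still a (sample_rate, numpy_array) tuple. A small handler in that style — the 16-bit PCM assumption is mine, not something this commit guarantees:

import numpy as np

def peak_dbfs(audio):
    # Default gr.Audio value: (sample_rate, data) with integer PCM data;
    # stereo arrays have shape (samples, 2), which is what main_note's
    # y.T[0] channel selection assumes.
    sr, y = audio
    if y.ndim == 2:
        y = y.T[0]
    y = y.astype(np.float32) / np.iinfo(np.int16).max  # assumes 16-bit PCM
    return 20 * np.log10(np.max(np.abs(y)) + 1e-9)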
