<a href="https://colab.research.google.com/github/detektor777/colab_list_audio/blob/main/fishaudio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title ##**Install** { display-mode: "form" }
%%capture
!apt-get update
!apt-get install -y libsox-dev ffmpeg

!pip install huggingface_hub
!git clone https://github.com/fishaudio/fish-speech.git
!pip install -e /content/fish-speech[stable]
!pip install tiktoken lightning pyrootutils loralib vector_quantize_pytorch loguru hydra-core
!huggingface-cli download fishaudio/fish-speech-1.5 --local-dir /content/checkpoints/fish-speech-1.5
!npm install -g localtunnel
!pip install gradio

In [None]:
#@title ##**Run Web-UI** { display-mode: "form" }

# Make sure the localtunnel CLI (`lt`) is installed; all output is discarded.
!npm install -g localtunnel > /dev/null 2>&1

# Kill whatever process is still listening on TCP port 7860
# (presumably a Web UI left over from an earlier run of this cell).
!fuser -k 7860/tcp > /dev/null 2>&1

import subprocess
import threading
import time
import socket
import urllib.request
from IPython.display import HTML, display

# Set to True by iframe_thread once the tunnel buttons have been rendered.
# It is reset to False on every run of this cell.
button_displayed = False

def iframe_thread(port):
    """Wait for a local server on ``port``, then expose it through localtunnel.

    Polls 127.0.0.1:``port`` every 0.5 s until a TCP connection succeeds.
    Then, unless the module-level ``button_displayed`` flag is already set,
    it fetches this machine's public IPv4 address (displayed as the tunnel
    "password"), starts ``lt --port <port>``, reads the tunnel URL from its
    output and renders "Open Tunnel" / "Copy Password" buttons.

    Args:
        port: TCP port of the local server to wait for and to tunnel.
    """
    global button_displayed

    # Block until something accepts connections on the port. The context
    # manager closes every probe socket, including the one that connected
    # (a bare `break` before `close()` would leak it).
    while True:
        time.sleep(0.5)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            if sock.connect_ex(('127.0.0.1', port)) == 0:
                break

    if not button_displayed:
        with urllib.request.urlopen('https://ipv4.icanhazip.com') as response:
            password = response.read().decode('utf8').strip("\n")
        p = subprocess.Popen(["lt", "--port", "{}".format(port)], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)

        # `lt` announces the public address on a line starting with
        # "your url is:". The loop also ends when `lt` closes its output
        # without printing that line, in which case `url` stays None.
        url = None
        for line in p.stdout:
            decoded = line.decode()
            if decoded.startswith('your url is:'):
                url = decoded.split(': ')[1].strip()
                break

        if url is None:
            print("Error: localtunnel closed its output without reporting a tunnel URL.")
            return

        html = f"""
        <script>
        function copyPassword() {{
            navigator.clipboard.writeText("{password}");
            alert("Password copied to clipboard!");
        }}
        </script>
        <button onclick="window.open('{url}', '_blank')">Open Tunnel</button>
        <button onclick="copyPassword()">Copy Password</button>
        <p>Password: {password}</p>
        """
        display(HTML(html))
        button_displayed = True

# Start the tunnel watcher as a daemon thread *before* launching the server:
# the `!python` line below does not return until the Web UI process exits.
# 7860 is presumably the port the Web UI listens on - confirm against run_webui.py.
if not button_displayed:
    threading.Thread(target=iframe_thread, daemon=True, args=(7860,)).start()

# Launch the Web UI; its stdout and stderr are discarded.
!python /content/fish-speech/tools/run_webui.py > /dev/null 2>&1

**Optional: run inference directly from the notebook (without the Web-UI)**

In [None]:
#@title ##**Run** { display-mode: "form" }
import os
from IPython.display import Audio, display

project_root = "/content/fish-speech"
checkpoint_path = "/content/checkpoints/fish-speech-1.5"
decoder_checkpoint_path = "/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
output_dir = "/content/output"

os.makedirs(output_dir, exist_ok=True)

os.environ['PYTHONPATH'] = f"{project_root}{os.pathsep}{os.environ.get('PYTHONPATH', '')}"

text = 'Hello world' #@param {type:"string"}

chunk_length = 200 #@param {type:"slider", min:0, max:300, step:8}
max_new_tokens = 0 #@param {type:"slider", min:0, max:2048, step:8}
top_p = 0.7 #@param {type:"slider", min:0.6, max:0.9, step:0.01}
repetition_penalty = 1.2 #@param {type:"slider", min:1.0, max:1.5, step:0.01}
temperature = 0.7 #@param {type:"slider", min:0.6, max:0.9, step:0.01}
seed = 0 #@param {type:"slider", min:0, max:1000000, step:1}

!python -m fish_speech.models.text2semantic.inference \
    --text "$text" \
    --checkpoint-path "$checkpoint_path" \
    --output-dir "$output_dir" \
    --chunk-length $chunk_length \
    --max-new-tokens $max_new_tokens \
    --top-p $top_p \
    --repetition-penalty $repetition_penalty \
    --temperature $temperature \
    --seed $seed 2>/dev/null

codes_file = os.path.join(output_dir, "codes_0.npy")

output_audio = os.path.join(output_dir, "fake.wav")
!python -m fish_speech.models.vqgan.inference \
    -i "$codes_file" \
    --checkpoint-path "$decoder_checkpoint_path" \
    --output-path "$output_audio" 2>/dev/null

if os.path.exists(output_audio):
    display(Audio(filename=output_audio))
else:
    print(f"File {output_audio} was not generated. Check errors above.")

**Clone voice**

In [None]:
#@title ##**Upload Audio** { display-mode: "form" }
from google.colab import files
import os

uploaded = files.upload()

os.makedirs('/content/audio', exist_ok=True)

for filename, content in uploaded.items():
    with open(f'/content/audio/{filename}', 'wb') as f:
        f.write(content)
    audio_filename = filename

print(f"Audio file {audio_filename} successfully uploaded to /content/audio/")

import os
import shutil

decoder_checkpoint_path = "/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
temp_folder = "/content/temp_audio_folder"

if 'audio_filename' not in globals():
    print("Error: First, execute the audio upload cell!")
else:
    audio_path = f"/content/audio/{audio_filename}"
    npy_filename = os.path.splitext(audio_filename)[0] + ".npy"
    prompt_tokens = f"/content/audio/{npy_filename}"

    if not os.path.exists(audio_path):
        print(f"Error: File {audio_path} not found. Please re-upload the audio.")
    else:
        os.makedirs(temp_folder, exist_ok=True)
        shutil.copy(audio_path, temp_folder)

        !python /content/fish-speech/tools/vqgan/extract_vq.py \
            "$temp_folder" \
            --num-workers 1 \
            --batch-size 16 \
            --config-name "firefly_gan_vq" \
            --checkpoint-path "$decoder_checkpoint_path"

        generated_npy = f"{temp_folder}/{os.path.splitext(audio_filename)[0]}.npy"
        if os.path.exists(generated_npy):
            shutil.move(generated_npy, prompt_tokens)
            print(f"Tokens successfully saved to {prompt_tokens}")
        else:
            print(f"Error: File {prompt_tokens} was not created. Check the command output.")

        shutil.rmtree(temp_folder)


In [None]:
#@title ##**Generate Audio from Text** { display-mode: "form" }
import os
from IPython.display import Audio, display

checkpoint_path = "/content/checkpoints/fish-speech-1.5"
decoder_checkpoint_path = "/content/checkpoints/fish-speech-1.5/firefly-gan-vq-fsq-8x1024-21hz-generator.pth"
output_dir = "/content/output"

reference_text = '' #@param {type:"string"}
generate_text = '' #@param {type:"string"}

chunk_length = 200 #@param {type:"slider", min:0, max:300, step:8}
max_new_tokens = 0 #@param {type:"slider", min:0, max:2048, step:8}
top_p = 0.7 #@param {type:"slider", min:0.6, max:0.9, step:0.01}
repetition_penalty = 1.2 #@param {type:"slider", min:1.0, max:1.5, step:0.01}
temperature = 0.7 #@param {type:"slider", min:0.6, max:0.9, step:0.01}
seed = 0 #@param {type:"slider", min:0, max:1000000, step:1}

if 'prompt_tokens' not in globals():
    print("Error: First, execute the voice cloning cell!")
else:
    if not os.path.exists(prompt_tokens):
        print(f"Error: File {prompt_tokens} not found. Please perform voice cloning.")
    else:
        os.makedirs(output_dir, exist_ok=True)

        !python -m fish_speech.models.text2semantic.inference \
            --text "$generate_text" \
            --prompt-text "$reference_text" \
            --prompt-tokens "$prompt_tokens" \
            --checkpoint-path "$checkpoint_path" \
            --output-dir "$output_dir" \
            --num-samples 1 \
            --chunk-length $chunk_length \
            --max-new-tokens $max_new_tokens \
            --top-p $top_p \
            --repetition-penalty $repetition_penalty \
            --temperature $temperature \
            --seed $seed 2>/dev/null

        codes_file = os.path.join(output_dir, "codes_0.npy")
        output_audio = os.path.join(output_dir, "fake.wav")

        !python -m fish_speech.models.vqgan.inference \
            -i "$codes_file" \
            --checkpoint-path "$decoder_checkpoint_path" \
            --output-path "$output_audio" 2>/dev/null

        if os.path.exists(output_audio):
            print("Generation completed! Here is your audio:")
            display(Audio(filename=output_audio))
        else:
            print(f"Error: File {output_audio} was not generated.")