In [None]:
import os
from dotenv import load_dotenv
from urllib.parse import urlparse, parse_qs
import requests
# ① .env 파일 로드
load_dotenv()
SPEECH_SERVICE_ENDPOINT = os.getenv("SPEECH_SERVICE_ENDPOINT")
SPEECH_SERVICE_KEY      = os.getenv("SPEECH_SERVICE_KEY")
SPEECH_REGION = os.getenv("SPEECH_REGION")

일반 rest api로 연결

In [11]:
import gradio as gr
import requests

def request_stt(file_path):
    # endpoint
    endpoint = SPEECH_SERVICE_ENDPOINT

    query_params = {
        "language" : "ko-KR",
        "format" : "detailed"
    }
    # method:POST
    # headers
    headers = {
        "Content-Type" : "audio/wav",
        "Ocp-Apim-Subscription-Key" : SPEECH_SERVICE_KEY

    }
    # body
    with open(file_path, "rb") as audio:
        audio_data = audio.read()


    response = requests.post(endpoint, params=query_params, headers=headers, data = audio_data)
    print(response.status_code, response.text)


    if response.status_code == 200:
        response_json = response.json()
        text = response_json['DisplayText']

        return text
    
    else:
        return ""


request_stt("/Users/parkjiyon/Desktop/MSAI6/finalProject/fastapi-app/app/data/whatstheweatherlike.wav")

  from .autonotebook import tqdm as notebook_tqdm


200 {"RecognitionStatus":"Success","Offset":1900000,"Duration":10000000,"DisplayText":"Watch the wedding like.","NBest":[{"Confidence":0.60624075,"Lexical":"watch the wedding like","ITN":"watch the wedding like","MaskedITN":"watch the wedding like","Display":"Watch the wedding like."}]}


'Watch the wedding like.'

sdk 언어 감지 동기 방식

In [None]:
import azure.cognitiveservices.speech as speechsdk
import threading

key = SPEECH_SERVICE_KEY
region = SPEECH_REGION

speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
auto_detect_config = speechsdk.AutoDetectSourceLanguageConfig(languages=["en-US", "ko-KR", "zh-CN"])
audio_config = speechsdk.AudioConfig(filename="/Users/parkjiyon/Desktop/MSAI6/finalProject/fastapi-app/app/data/audio1.wav")

recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config,
    auto_detect_source_language_config=auto_detect_config,
    audio_config=audio_config
)

all_results = []
detected_lang = None
done = threading.Event()

def handle_final_result(evt):
    global detected_lang

    # 첫번째 인식 결과에서 자동 감지된 언어 가져오기 (한번만)
    if detected_lang is None:
        detected_lang = evt.result.properties.get(speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult)
    
    all_results.append(evt.result.text)

def handle_session_stopped(evt):
    done.set()

def handle_canceled(evt):
    done.set()

recognizer.recognized.connect(handle_final_result)
recognizer.session_stopped.connect(handle_session_stopped)
recognizer.canceled.connect(handle_canceled)

recognizer.start_continuous_recognition()
done.wait()
recognizer.stop_continuous_recognition()

full_transcript = ' '.join(all_results)

print(f"Detected language: {detected_lang}")
print(f"Transcription:    {full_transcript}")


Detected language: ko-KR
Transcription:    데이터. 테스트 및 끝점 같은 콘텐츠는 사용자 지정 음성 포털에서 프로젝트로 구성됩니다. 각 프로젝트는 도메인 및 국가 슬래시 언어에만 적용됩니다. 예를 들어 미국에서 영어를 사용하는 콜센터에 대한 프로젝트를 만들 수 있습니다. 1번째 프로젝트를 만들려면 음성 텍스트 슬래시 사용자 지정 음성 선택한 다음 세 프로젝트를 클릭합니다. 프로젝트를 만들려면 마법사에서 제공하는 지침을 따릅니다. 프로젝트를 만든 후 내게 탭이 표시됩니다. 데이터 테스트 학습 및 배포. 닭을 사용하는 방법을 배우려면 다음 단계에서 제공되는 링크를 사용합니다.


sdk 비동기 방식

In [None]:
import azure.cognitiveservices.speech as speechsdk
import asyncio

key = SPEECH_SERVICE_KEY
region = SPEECH_REGION

speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
auto_detect_config = speechsdk.AutoDetectSourceLanguageConfig(languages=["en-US", "ko-KR", "zh-CN"])
audio_config = speechsdk.AudioConfig(filename="/Users/parkjiyon/Desktop/MSAI6/finalProject/fastapi-app/app/data/audio1.wav")

recognizer = speechsdk.SpeechRecognizer(
    speech_config=speech_config,
    auto_detect_source_language_config=auto_detect_config,
    audio_config=audio_config
)

all_results = []
detected_lang = None

async def recognize_continuous():
    global detected_lang

    done = asyncio.Event()

    def handle_final_result(evt):
        global detected_lang
        if detected_lang is None:
            detected_lang = evt.result.properties.get(
                speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult)
            print(f"Detected language: {detected_lang}")
        if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print(f"Partial transcription: {evt.result.text}")
            all_results.append(evt.result.text)

    def handle_session_stopped(evt):
        done.set()

    def handle_canceled(evt):
        print(f"Recognition canceled: {evt.reason}")
        if evt.reason == speechsdk.CancellationReason.Error:
            print(f"Error details: {evt.error_details}")
        done.set()

    recognizer.recognized.connect(handle_final_result)
    recognizer.session_stopped.connect(handle_session_stopped)
    recognizer.canceled.connect(handle_canceled)

    recognizer.start_continuous_recognition()
    await done.wait()
    recognizer.stop_continuous_recognition()

    full_transcript = ' '.join(all_results)
    print("\nFull transcription:")
    print(full_transcript)

asyncio.run(recognize_continuous())


RuntimeError: asyncio.run() cannot be called from a running event loop

In [4]:
with gr.Blocks() as demo:

    gr.Markdown("# AI Speech World!")

    def change_audio(audio_path):
        return audio_path


    with gr.Column():
        gr.Markdown("### STT ###")

        input_mic = gr.Audio(label="마이크 입력", sources="microphone", type="filepath", show_download_button=True)
        output_textbox = gr.Textbox(label="텍스트", interactive=False)

        input_mic.change(change_audio, inputs=[input_mic], outputs=[output_textbox])

    demo.launch()

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.
