In [5]:
# apt-get update && apt-get install -y ffmpeg

!pip install --upgrade google-cloud-texttospeech
!pip install --upgrade google-cloud-speech

Collecting google-cloud-texttospeech
  Downloading google_cloud_texttospeech-2.33.0-py3-none-any.whl.metadata (10.0 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.1 (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0,>=1.34.1->google-cloud-texttospeech)
  Downloading google_api_core-2.28.1-py3-none-any.whl.metadata (3.3 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0,>=2.14.1 (from google-cloud-texttospeech)
  Downloading google_auth-2.43.0-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting grpcio<2.0.0,>=1.33.2 (from google-cloud-texttospeech)
  Downloading grpcio-1.76.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.7 kB)
Collecting proto-plus<2.0.0,>=1.22.3 (from google-cloud-texttospeech)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=

In [6]:
import os
os.environ['PATH'] = f"{os.path.expanduser('~/google-cloud-sdk/bin')}:{os.environ['PATH']}"

In [7]:
import os
import base64
from google.cloud import texttospeech

def text_to_speech(text, output_file):
    """텍스트를 음성 파일로 변환"""
    # 클라이언트 초기화
    client = texttospeech.TextToSpeechClient()

    # 텍스트 입력 설정
    synthesis_input = texttospeech.SynthesisInput(text=text)

    # 음성 설정
    voice = texttospeech.VoiceSelectionParams(
        language_code="ko-KR",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    )

    # 오디오 출력 형식 설정
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    # TTS 요청
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )

    # 오디오 파일로 저장
    with open(output_file, "wb") as out:
        out.write(response.audio_content)
        print(f'Audio content written to file "{output_file}"')


In [8]:
from train_dataset import train_addresses
from test_dataset import test_addresses
address = train_addresses 
print(address[:4])

test_address = test_addresses 
print(test_address[:4])

['서울특별시 영등포구 압구정로 136, SK뷰 293동 3047호', '서울특별시 광진구 디지털로 40, 파크자이 672동 4159호', '서울특별시 종로구 백제고분로 48, 센트럴자이 705동 1544호', '서울특별시 양천구 역삼로 671, 삼성아파트 808동 192호']
['서울특별시 서초구 테헤란로 941, 현대아파트 553동 3722호', '서울특별시 은평구 마포대로 571, 래미안 198동 392호', '서울특별시 노원구 사당로 302, 엘에이치 669동 1290호', '서울특별시 서초구 선릉로 337, 파크자이 1087동 550호']


In [10]:
import re

# 출력 폴더 생성 (선택사항)
output_dir = "address_audio"
os.makedirs(output_dir, exist_ok=True)

# 각 주소를 개별 mp3 파일로 변환
for i, addr in enumerate(address, 1):
    # 공백 제거 및 파일명으로 사용 가능하게 변환
    filename = addr.replace(" ", "").replace(",", "_")
    # 파일명에 사용할 수 없는 문자 제거 (/, \, :, *, ?, ", <, >, | 등)
    filename = re.sub(r'[\\/:*?"<>|]', '', filename)

    output_filename = os.path.join(output_dir, f"{filename}.mp3")
    text_to_speech(addr, output_filename)

Audio content written to file "address_audio/서울특별시영등포구압구정로136_SK뷰293동3047호.mp3"
Audio content written to file "address_audio/서울특별시광진구디지털로40_파크자이672동4159호.mp3"
Audio content written to file "address_audio/서울특별시종로구백제고분로48_센트럴자이705동1544호.mp3"
Audio content written to file "address_audio/서울특별시양천구역삼로671_삼성아파트808동192호.mp3"
Audio content written to file "address_audio/서울특별시용산구압구정로700_센트럴자이351동2339호.mp3"
Audio content written to file "address_audio/서울특별시용산구천호대로960_대우아파트480동3185호.mp3"
Audio content written to file "address_audio/서울특별시영등포구마포대로274_힐스테이트830동3715호.mp3"
Audio content written to file "address_audio/서울특별시양천구백제고분로632_더샵1179동2753호.mp3"
Audio content written to file "address_audio/서울특별시영등포구논현로531_센트럴자이842동3137호.mp3"
Audio content written to file "address_audio/서울특별시금천구디지털로420_삼성아파트432동3645호.mp3"
Audio content written to file "address_audio/서울특별시영등포구봉은사로791_푸르지오498동150호.mp3"
Audio content written to file "address_audio/서울특별시성동구천호대로415_대우아파트428동3924호.mp3"
Audio content written to file "add

In [11]:
test_output_dir = "test_address_audio"
os.makedirs(test_output_dir, exist_ok=True)

# 각 주소를 개별 mp3 파일로 변환
for i, addr in enumerate(test_address, 1):
    # 공백 제거 및 파일명으로 사용 가능하게 변환
    filename = addr.replace(" ", "").replace(",", "_")
    # 파일명에 사용할 수 없는 문자 제거 (/, \, :, *, ?, ", <, >, | 등)
    filename = re.sub(r'[\\/:*?"<>|]', '', filename)

    output_filename = os.path.join(test_output_dir, f"{filename}.mp3")
    text_to_speech(addr, output_filename)

Audio content written to file "test_address_audio/서울특별시서초구테헤란로941_현대아파트553동3722호.mp3"
Audio content written to file "test_address_audio/서울특별시은평구마포대로571_래미안198동392호.mp3"
Audio content written to file "test_address_audio/서울특별시노원구사당로302_엘에이치669동1290호.mp3"
Audio content written to file "test_address_audio/서울특별시서초구선릉로337_파크자이1087동550호.mp3"
Audio content written to file "test_address_audio/서울특별시노원구백제고분로755_호반베르디움983동4235호.mp3"
Audio content written to file "test_address_audio/서울특별시관악구강남대로150_대우아파트929동1637호.mp3"
Audio content written to file "test_address_audio/서울특별시도봉구선릉로850_롯데캐슬498동1702호.mp3"
Audio content written to file "test_address_audio/서울특별시도봉구논현로949_아이파크686동2090호.mp3"
Audio content written to file "test_address_audio/서울특별시서대문구여의대로576_엘에이치727동3368호.mp3"
Audio content written to file "test_address_audio/서울특별시도봉구천호대로775_e편한세상504동2107호.mp3"
Audio content written to file "test_address_audio/서울특별시도봉구압구정로968_호반베르디움654동2687호.mp3"
Audio content written to file "test_address_audio/서울특별시강북구고덕로4

In [12]:
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset


device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)


`torch_dtype` is deprecated! Use `dtype` instead!
`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


In [15]:
import IPython.display as ipd

sample = "/workspace/test_address_audio/강원특별자치도강릉시평화로944_호반베르디움658동3850호.mp3"
ipd.Audio(sample)

In [16]:
from pathlib import Path
from datasets import Dataset, Audio, DatasetDict

# 학습용 데이터 수집
train_audio_files = []
train_texts = []

train_audio_root_dir = "./address_audio"

for file in Path(train_audio_root_dir).glob("*.mp3"):
    train_audio_files.append(str(file))
    # 파일명에서 주소 텍스트 추출
    text = file.stem.replace("_", " ")
    train_texts.append(text)

print(f"학습용 데이터: {len(train_audio_files)}개 파일 수집")

# 평가용 데이터 수집
test_audio_files = []
test_texts = []

test_audio_root_dir = "./test_address_audio"

for file in Path(test_audio_root_dir).glob("*.mp3"):
    test_audio_files.append(str(file))
    # 파일명에서 주소 텍스트 추출
    text = file.stem.replace("_", " ")
    test_texts.append(text)

print(f"평가용 데이터: {len(test_audio_files)}개 파일 수집")

# 학습용 Dataset 생성
train_dataset = Dataset.from_dict({
    "audio": train_audio_files,
    "text": train_texts
}).cast_column("audio", Audio())

# 평가용 Dataset 생성
test_dataset = Dataset.from_dict({
    "audio": test_audio_files,
    "text": test_texts
}).cast_column("audio", Audio())

# DatasetDict로 묶기
audio_dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

print(audio_dataset)

학습용 데이터: 3400개 파일 수집
평가용 데이터: 340개 파일 수집
DatasetDict({
    train: Dataset({
        features: ['audio', 'text'],
        num_rows: 3400
    })
    test: Dataset({
        features: ['audio', 'text'],
        num_rows: 340
    })
})


In [18]:
# 허깅페이스에 업로드
audio_dataset.push_to_hub("daje/korean-address-voice-v2")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3400 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/34 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/340 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/daje/korean-address-voice-v2/commit/cabb44cdfcfd1e6d012424a1f5b2465aafa44523', commit_message='Upload dataset', commit_description='', oid='cabb44cdfcfd1e6d012424a1f5b2465aafa44523', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/daje/korean-address-voice-v2', endpoint='https://huggingface.co', repo_type='dataset', repo_id='daje/korean-address-voice-v2'), pr_revision=None, pr_num=None)