In [None]:
# Install the speech-to-text dependencies. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel, not whichever pip is first on PATH.
%pip install openai-whisper librosa

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->openai-whisper)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12

In [None]:
# Install the ffmpeg system package. `-y` pre-answers the confirmation prompt,
# which a notebook cell has no interactive stdin to answer.
!sudo apt-get install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
import torch
import whisper
import librosa

def transcribe_audio(audio_path, model_name="base", language=None):
    """Transcribe an audio file with Whisper and return the raw result.

    Args:
        audio_path: Path to the audio file; it is decoded with librosa at 16 kHz.
        model_name: Name passed to ``whisper.load_model``.
        language: Optional language code forwarded to ``model.transcribe``.
            When falsy, no ``language`` argument is passed at all.

    Returns:
        The object returned by ``model.transcribe``; callers in this notebook
        read its "text" and "segments" entries.
    """
    # Prefer the GPU whenever PyTorch can see one.
    use_gpu = torch.cuda.is_available()
    device = "cuda" if use_gpu else "cpu"
    print(f"Using device: {device}")

    # Load the Whisper checkpoint and place it on the chosen device.
    model = whisper.load_model(model_name).to(device)

    # Decode the file to a 16 kHz waveform (the returned sample rate is unused)
    # and move the samples to the same device as the model.
    waveform, _ = librosa.load(audio_path, sr=16000)
    waveform = torch.from_numpy(waveform).to(device)

    # Forward `language` only when the caller actually supplied one.
    decode_options = {"language": language} if language else {}

    # Speech-to-text; no gradients are needed for inference.
    with torch.no_grad():
        return model.transcribe(waveform, **decode_options)

# Usage example: ask for an audio path, transcribe it with the "base" model,
# then print the full text followed by the per-segment timestamps.
audio_file = input("경로를 입력하세요: ")  # prompts for the path to the audio file (e.g. an MP3)
result = transcribe_audio(audio_file, model_name="base")
#result = transcribe_audio(audio_file, model_name="base", language="ko")  # sets the language to Korean; change as needed

print(result["text"])

# Print the result segment by segment (optional)
for segment in result["segments"]:
    print(f"[{segment['start']:.2f}s -> {segment['end']:.2f}s] {segment['text']}")

경로를 입력하세요: /content/drive/MyDrive/IAP/STT/demo.wav
Using device: cuda


100%|████████████████████████████████████████| 139M/139M [00:01<00:00, 100MiB/s]


 문장에서 가장 적절한 따뜻한 고르는 부분입니다. 1번부터 10번까지 총 10무제가 출지되며 각 질문마다 4개의 복기를 들려줍니다.
[0.00s -> 5.00s]  문장에서 가장 적절한 따뜻한 고르는 부분입니다.
[5.00s -> 11.00s]  1번부터 10번까지 총 10무제가 출지되며 각 질문마다 4개의 복기를 들려줍니다.


## wav2vec 2.0 (Facebook AI)

In [4]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio
import numpy as np

def process_audio_chunk(model, processor, audio_chunk, sampling_rate):
    """Transcribe one chunk of audio with a CTC speech model.

    The chunk is resampled to 16 kHz, passed through ``processor`` to build the
    model inputs, and decoded greedily (argmax over the logits of each frame).

    Args:
        model: Model called as ``model(**inputs)`` whose result exposes
            ``.logits``. The inputs are moved to the device of its parameters.
        processor: Callable that builds the model inputs from a NumPy waveform
            and provides ``batch_decode`` to turn predicted ids into text.
        audio_chunk: CPU tensor holding the waveform (``.numpy()`` is called on
            it); the caller in this notebook passes a (1, num_samples) slice.
        sampling_rate: Sample rate of ``audio_chunk`` in Hz.

    Returns:
        Whatever ``processor.batch_decode`` returns for the predicted ids.
    """
    target_rate = 16000

    # Resample to the 16 kHz rate that is declared to the processor below.
    # Skipped when the chunk is already at that rate, where it would be a no-op.
    if sampling_rate != target_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_rate)
        audio_chunk = resampler(audio_chunk)

    # Build the model inputs from the waveform.
    inputs = processor(
        audio_chunk.squeeze().numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
        padding=True,
    )

    # Use the device the model actually lives on rather than a notebook-level
    # `device` variable, which may be undefined when this function is called.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")  # parameter-free model: stay on CPU
    inputs = {key: value.to(model_device) for key, value in inputs.items()}

    # Inference only, so no gradients are tracked.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: most likely token id per frame, then ids -> text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(predicted_ids)

# Load the model and its processor
#model_name = "facebook/wav2vec2-large-960h"
model_name = "kresnik/wav2vec2-large-xlsr-korean"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Run on the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the audio file
speech_array, sampling_rate = torchaudio.load('/content/drive/MyDrive/IAP/STT/demo.wav')

# Down-mix multi-channel audio to a single channel (only when needed)
if speech_array.shape[0] > 1:
    speech_array = torch.mean(speech_array, dim=0, keepdim=True)

# Split the audio into 10-second chunks.
chunk_size = 10 * sampling_rate
total_samples = speech_array.shape[1]
chunk_starts = list(range(0, total_samples, chunk_size))

# A remainder of only a few samples is too short for the model's convolutional
# front end (it needs roughly 25 ms of 16 kHz audio) and would raise an error,
# so a tail shorter than 0.1 s is merged into the preceding chunk instead.
min_tail_size = sampling_rate // 10
if len(chunk_starts) > 1 and total_samples - chunk_starts[-1] < min_tail_size:
    chunk_starts.pop()
chunk_ends = chunk_starts[1:] + [total_samples]

transcriptions = []
for start, end in zip(chunk_starts, chunk_ends):
    audio_chunk = speech_array[:, start:end]
    transcription = process_audio_chunk(model, processor, audio_chunk, sampling_rate)
    transcriptions.extend(transcription)

# Print the full text (chunk transcriptions joined with spaces)
full_transcription = ' '.join(transcriptions)
print(full_transcription)



Some weights of the model checkpoint at kresnik/wav2vec2-large-xlsr-korean were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at kresnik/wav2vec2-large-xlsr-korean and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

문장에서 가장 적절한 다뼈 고르던 부분입니 일본부터 십 억까지 총 열은제가 출제되며 각 증원마받다 네 개 다 기을 들려 줍니다
