#### 安裝套件

#### 載入函式庫與參數設定

In [1]:
import os
import pandas as pd
import torch
import numpy as np
from datasets import Dataset, Audio
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate
from tqdm import tqdm
from transformers import EarlyStoppingCallback
from ctranslate2.converters import TransformersConverter
import whisperx
from transformers import WhisperProcessor, WhisperTokenizer
import json
import zipfile
from jiwer import wer
import re

In [2]:
def setup_directories(version: str = "v99"):
    """Create the ``model_result/task1/<version>/model`` tree and return its paths.

    Args:
        version: Name of the run sub-directory (for example ``"v1"``).

    Returns:
        A ``(version_dir, model_dir)`` tuple of paths relative to the current
        working directory.
    """
    task_root = os.path.join("model_result", "task1")
    run_root = os.path.join(task_root, version)
    weights_root = os.path.join(run_root, "model")

    # Outermost directory first; exist_ok makes re-running the cell harmless.
    for folder in (task_root, run_root, weights_root):
        os.makedirs(folder, exist_ok=True)

    print(task_root, run_root, weights_root)
    return run_root, weights_root

In [3]:
# Create (or reuse) the output directories for this run.
version = "v1"
version_dir, model_dir = setup_directories(version)


# Report which compute device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Locations of the fine-tuned Hugging Face model and of its CTranslate2 conversion.
final_model_path = os.path.join(model_dir, "fine-tuning-whisper")
final_ct2_model_path = os.path.join(model_dir, "fine-tuning-whisper-ct2")
print(f"Model saved to {final_model_path}")

# Unpack the fine-tuned Whisper checkpoint.
zip_model_path = "/home/student1/ai/fine-tuning-whisper.zip"
# Extract into final_model_path, the directory the conversion step reads from.
# The previous hard-coded target (/home/student1/model_result/...) pointed at a
# different tree than final_model_path, so the archive never landed where the
# rest of the notebook looks for the model.
extract_model_path = final_model_path
with zipfile.ZipFile(zip_model_path, 'r') as zip_ref:
    zip_ref.extractall(extract_model_path)

model_result/task1 model_result/task1/v1 model_result/task1/v1/model
Using device: cuda
GPU: NVIDIA GeForce RTX 3090 Ti
Model saved to model_result/task1/v1/model/fine-tuning-whisper


#### 資料前處理

In [4]:
def prepare_dataset(validation_path: str):
    """Build the inference dataset from the ``.wav`` files in a directory.

    Args:
        validation_path: Directory that is scanned (non-recursively) for files
            whose name ends with ``.wav``.

    Returns:
        A dict with a single key ``"val"`` holding a ``Dataset`` with two
        columns: ``"audio"`` (file paths cast to ``Audio(sampling_rate=16000)``)
        and ``"text"`` (an empty string per file, since no reference
        transcript is available here).
    """
    # os.listdir returns entries in arbitrary order; sort so that the dataset
    # row order is the same on every run and machine.
    val_audio_files = sorted(
        f for f in os.listdir(validation_path) if f.endswith('.wav')
    )
    val_dict = {
        "audio": [os.path.join(validation_path, audio_file) for audio_file in val_audio_files],
        "text": ["" for _ in val_audio_files],  # empty placeholder transcripts
    }
    val_dataset = Dataset.from_dict(val_dict)
    val_dataset = val_dataset.cast_column("audio", Audio(sampling_rate=16000))
    # Only the validation split is produced.
    return {
        "val": val_dataset
    }

In [5]:
def prepare_features(batch, processor):
    """Add model inputs, label ids and file identifiers to one dataset row.

    Args:
        batch: A row with an ``"audio"`` entry (providing ``"array"``,
            ``"sampling_rate"`` and ``"path"``) and a ``"text"`` entry.
        processor: Object exposing ``feature_extractor`` and ``tokenizer``
            callables.

    Returns:
        The same ``batch`` object, extended with ``"input_features"``,
        ``"labels"``, ``"file_path"`` and ``"audio_file_name"``.
    """
    clip = batch["audio"]

    # Features are computed from the raw waveform; keep the first (only) item.
    extracted = processor.feature_extractor(
        clip["array"],
        sampling_rate=clip["sampling_rate"],
        return_tensors="pt",
    )
    batch["input_features"] = extracted.input_features[0]

    # Tokenize the target text, truncated/padded to exactly 448 ids.
    encoded = processor.tokenizer(
        batch["text"],
        max_length=448,
        truncation=True,
        padding="max_length",
    )
    batch["labels"] = encoded.input_ids

    # Keep the source path and its extension-less base name for later lookup.
    clip_path = batch["audio"]["path"]
    batch["file_path"] = clip_path
    stem, _extension = os.path.splitext(os.path.basename(clip_path))
    batch["audio_file_name"] = stem

    return batch

In [6]:
# Load the processor from final_model_path, the same directory that is later
# converted to CTranslate2, instead of repeating that location as a separate
# hard-coded absolute path that can drift out of sync.
processor = WhisperProcessor.from_pretrained(final_model_path)

# Directory holding the private-set .wav files to transcribe.
private_dataset_dir = "/home/student1/ai/Private_dataset/private"
dataset = prepare_dataset(private_dataset_dir)

# Run feature preparation on every split (only "val" exists for inference).
processed_dataset = {}
for split in ["val"]:
    print(f"Processing {split} dataset...")
    processed_dataset[split] = dataset[split].map(
        lambda x: prepare_features(x, processor=processor),
        # Drop the raw columns; prepare_features re-adds what is needed later.
        remove_columns=dataset[split].column_names,
        num_proc=1
    )

Processing val dataset...


Map:   0%|          | 0/709 [00:00<?, ? examples/s]

#### 進行轉檔 Huggingface to Ctranslate2

In [7]:
def convert_hf_model_to_ct2(model_name_or_path: str, output_dir: str, quantization: str = "float32", trust_remote_code: bool = True, device: str = "cuda"):
    """Convert a Hugging Face model to CTranslate2 and load it with WhisperX.

    Args:
        model_name_or_path (str): Hugging Face model name or local directory.
        output_dir (str): Directory where the CTranslate2 model is written.
            Existing contents are overwritten (``force=True``).
        quantization (str): Weight quantization scheme forwarded to the
            converter (possible values are: int8, int8_float32, int8_float16,
            int8_bfloat16, int16, float16, bfloat16, float32).
        trust_remote_code (bool): Allow converting models using custom code.
        device (str): Device passed to ``whisperx.load_model``.

    Returns:
        The model object returned by ``whisperx.load_model(output_dir, ...)``.
    """
    # Copy the processor/tokenizer files next to the converted weights so the
    # output directory is self-contained.
    converter = TransformersConverter(
        model_name_or_path,
        copy_files=["preprocessor_config.json", "tokenizer.json"],
        trust_remote_code=trust_remote_code,
    )
    # Forward the requested quantization; it used to be accepted by this
    # function but silently dropped.
    converter.convert(output_dir=output_dir, quantization=quantization, force=True)
    model = whisperx.load_model(output_dir, device=device)
    return model

In [8]:
print("Converting model to CTranslate2 format...")
# Source and destination are passed positionally; the optional quantization
# argument is not supplied here, so the function's default applies.
model = convert_hf_model_to_ct2(final_model_path, final_ct2_model_path, trust_remote_code=True)

Converting model to CTranslate2 format...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../miniconda3/envs/lin/lib/python3.11/site-packages/whisperx/assets/pytorch_model.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
>>Performing voice activity detection using Pyannote...
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.0+cu126. Bad things might happen unless you revert torch to 1.x.


#### WhisperX進行轉錄

In [9]:
# Persist the predictions (tab-separated) and the word-level timing JSON.
def save_predictions_to_tsv(predictions, filenames, version_dir, export_json_file):
    """Write the transcripts and the timing data of one run into ``version_dir``.

    Two files are produced:

    * ``task1_answer.txt`` - one ``<filename>\\t<transcript>`` line per item.
    * ``val_time_step.json`` - ``export_json_file`` dumped as UTF-8 JSON.

    Args:
        predictions: Transcript strings, parallel to ``filenames``.
        filenames: File identifiers, parallel to ``predictions``.
        version_dir: Existing directory that receives both files.
        export_json_file: JSON-serialisable object with the timing details.

    Raises:
        ValueError: If ``predictions`` and ``filenames`` differ in length.
    """
    if len(predictions) != len(filenames):
        raise ValueError(
            f"predictions ({len(predictions)}) and filenames ({len(filenames)}) "
            "must have the same length"
        )

    task1_output_file = os.path.join(version_dir, "task1_answer.txt")
    json_output_file = os.path.join(version_dir, "val_time_step.json")
    print("Saving predictions to file...")

    # The answer file was previously announced but never written.
    with open(task1_output_file, "w", encoding="utf-8", newline="\n") as f:
        for name, text in zip(filenames, predictions):
            # Tabs and line breaks inside a transcript would corrupt the
            # one-row-per-file TSV layout, so flatten them to spaces.
            flat_text = (
                str(text).replace("\t", " ").replace("\r", " ").replace("\n", " ").strip()
            )
            f.write(f"{name}\t{flat_text}\n")

    with open(json_output_file, "w", encoding="utf-8") as f:
        json.dump(export_json_file, f, ensure_ascii=False, indent=2)
    print(f"Results saved to {task1_output_file}")

In [10]:
import re
import whisperx
import torch
from tqdm import tqdm

def _words_within_segment(word_segments, seg):
    """Return the aligned words whose time span lies inside segment ``seg``."""
    words = []
    for w in word_segments:
        start, end = w.get("start"), w.get("end")
        # NOTE(review): per the WhisperX README, words that cannot be aligned
        # are returned without timing; such entries cannot be placed in a
        # segment, so skip them instead of failing with KeyError.
        if start is None or end is None:
            continue
        if start >= seg["start"] and end <= seg["end"]:
            words.append({
                "word": w["word"],
                "start": start,
                "end": end,
                # NOTE(review): aligned words appear to carry their confidence
                # under "score" rather than "probability" - confirm against
                # the installed WhisperX version.
                "probability": w.get("probability", w.get("score")),
            })
    return words


def calculate_output(model, eval_dataloader, version_dir, device="cuda", zh_id_threshold=80000):
    """Transcribe and word-align every item, then save the results.

    Args:
        model: Object whose ``transcribe(audio, batch_size=..., language=...)``
            returns a dict with ``"segments"`` and ``"language"``.
        eval_dataloader: Iterable of rows providing ``"file_path"`` and
            ``"audio_file_name"``.
        version_dir: Directory passed on to ``save_predictions_to_tsv``.
        device: Device used for the alignment models and for alignment.
        zh_id_threshold: Files whose numeric id is greater than or equal to
            this value are treated as Chinese (``"zh"``), all others as
            English (``"en"``).

    Returns:
        Always ``0``.

    Raises:
        ValueError: If an ``"audio_file_name"`` contains no digits.
    """
    print("Calculating ouput score...")
    predictions = []
    filenames = []
    export_json_file = {}

    # Load both alignment models once, up front, keyed by language code.
    align_models = {
        "en": whisperx.load_align_model(language_code="en", device=device),
        "zh": whisperx.load_align_model(language_code="zh", device=device),
    }

    for batch in tqdm(eval_dataloader, desc="處理評估資料", unit="batch"):
        with torch.no_grad():
            file_id_str = batch["audio_file_name"]
            # The language is derived from the digits of the file name.
            file_id_num = int(re.sub(r"\D", "", file_id_str))
            language_hint = "zh" if file_id_num >= zh_id_threshold else "en"

            # Decode the file once and reuse the waveform for both
            # transcription and alignment.
            audio = whisperx.load_audio(batch["file_path"])

            # Speech to text.
            result = model.transcribe(audio, batch_size=1, language=language_hint)
            segments = result.get("segments", [])

            # Nothing transcribed: record an empty entry and move on.
            if len(segments) == 0:
                print(f"⚠️ 無轉錄結果: {file_id_str}")
                predictions.append("")
                filenames.append(file_id_str)
                export_json_file[file_id_str] = {
                    "language": language_hint,
                    "segments": []
                }
                continue

            # Align with the model matching the language hint.
            align_model, align_metadata = align_models[language_hint]
            alignment = whisperx.align(segments, align_model, align_metadata, audio, device=device)
            word_segments = alignment["word_segments"]

            transcript_dict = {
                "language": result["language"],
                "segments": [
                    {
                        "text": seg["text"],
                        "start": seg["start"],
                        "end": seg["end"],
                        "words": _words_within_segment(word_segments, seg),
                    }
                    for seg in segments
                ],
            }

            # Use the text of every segment; taking only segments[0] dropped
            # everything after the first segment of longer recordings.
            joiner = "" if language_hint == "zh" else " "
            pred_str = joiner.join(seg["text"].strip() for seg in segments)

            export_json_file[file_id_str] = transcript_dict
            predictions.append(pred_str)
            filenames.append(file_id_str)

    save_predictions_to_tsv(predictions, filenames, version_dir, export_json_file)
    return 0


In [11]:
# Transcribe and align every item of the "val" split, then save the results
# under version_dir. The call returns 0, which the notebook echoes as the
# cell's output.
calculate_output(model, processed_dataset["val"], version_dir)

Calculating ouput score...


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

處理評估資料:  47%|████▋     | 334/709 [06:39<05:18,  1.18batch/s]

No active speech found in audio
⚠️ 無轉錄結果: 84296


處理評估資料: 100%|██████████| 709/709 [14:14<00:00,  1.20s/batch]

Saving predictions to file...
Results saved to model_result/task1/v1/task1_answer.txt





0