# install

In [3]:
# %%capture
# !pip install datasets>=2.6.1
# !pip install transformers==4.41.1
# !pip install sentence-transformers==2.7.0
# !pip install peft==0.10.0
# !pip install evaluate>=0.30
# !pip install jiwer
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install wandb
# !pip install matplotlib

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import wandb
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from datasets import Dataset, DatasetDict
from datasets import Audio

In [None]:
from dotenv import load_dotenv

load_dotenv()

hf_token = os.getenv("HF_TOKEN")
wandb_key = os.getenv("WANDB_API_KEY")

# import

# 데이터 전처리

In [2]:
import os
import glob
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

In [6]:
from pathlib import Path

text_base_dir = Path("C:/Users/Playdata/Desktop/test/train_cut/D03_Transcription")
audio_base_dir = Path("C:/Users/Playdata/Desktop/test/train_cut/D03_Audio")

# .txt 파일 glob 재설정 (재귀적 탐색)
labeled_data_list = sorted(text_base_dir.rglob("*.txt"))  # ✅ 모든 하위 폴더의 .txt 탐색
raw_data_list = sorted(audio_base_dir.rglob("*.wav"))     # ✅ 모든 하위 폴더의 .wav 탐색

In [7]:
print(f"file_list : {raw_data_list[:10]}")
print(len(raw_data_list))

file_list : [WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0001.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0002.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0003.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0004.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0005.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0006.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0007.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0008.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0009.wav'), WindowsPath('C:/Users/Playdata/Desktop/test/train_cut/D03_Audio/J14/S000001/0010.wav')]
62327


# config

In [8]:
class Config():
    def __init__(self):
        self.train_batch_size = 16 # 기존 : 64
        self.valid_batch_size = 16 # 기존 : 32
        self.lr = 1e-4
        self.seed = 2024
        self.n_epochs = 3
        self.gradient_accumulation_steps = 4
        # self.warm_up = 1000
        # self.max_steps = 4000

In [9]:
config = Config()
config.train_batch_size

16

In [10]:
effective_batch_size = config.train_batch_size * config.gradient_accumulation_steps

# Device

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

# Set Seed

In [12]:
# 시드 설정 X

def set_max_performance():
    # 시드 설정을 아예 하지 않음
    # 또는 아래처럼 일부 기본값으로 초기화해도 됨 (필수 아님)

    # CuDNN 자동 최적 알고리즘 활성화 (성능 향상 가능)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

    # 필요시 여러 개의 GPU 중 가장 빠른 걸 자동 선택하게 할 수도 있음
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")

set_max_performance()

Using CPU


# Load Dataset

In [13]:
save_dir = Path("C:/Users/Playdata/Desktop/test")

df = pd.read_csv(save_dir / "path_and_transcript_validation.csv")
print(df.shape)
df.head()

(62327, 2)


Unnamed: 0,transcript,raw_data
0,안녕하세요. 이번에 그 정보기술 전략에 지원한 지원자입니다.,C:\Users\Playdata\Desktop\test\train_cut\D03_A...
1,아 네 안녕하세요. 지원자님 어떤 일로 연락 주셨을까요.,C:\Users\Playdata\Desktop\test\train_cut\D03_A...
2,아 다름이 아니라 정보기술 전략팀이 뭔지 좀 더 상세히 좀 알고 싶어서요.,C:\Users\Playdata\Desktop\test\train_cut\D03_A...
3,아 네 정보기술 전략 직무에 대해 조금 더 자세히 알고 싶으시단 말씀이시죠.,C:\Users\Playdata\Desktop\test\train_cut\D03_A...
4,네 맞습니다.,C:\Users\Playdata\Desktop\test\train_cut\D03_A...


In [17]:
df.tail()

Unnamed: 0,transcript,raw_data
62322,총 37시간입니다.,C:\encore-skn11\정원\train_cut\train_cut\D03_Aud...
62323,예 알겠습니다.,C:\encore-skn11\정원\train_cut\train_cut\D03_Aud...
62324,네 그 외에 다른 문의사항 있으신가요?,C:\encore-skn11\정원\train_cut\train_cut\D03_Aud...
62325,아니 없습니다. 수고하세요.,C:\encore-skn11\정원\train_cut\train_cut\D03_Aud...
62326,네 다른 문의사항 있으시면 큐앤에이나 유선을 이용하여 질문 주시면 문의사항 도와드리...,C:\encore-skn11\정원\train_cut\train_cut\D03_Aud...


# Download Processors

In [17]:
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor, WhisperTokenizer

model_name = "SungBeom/whisper-small-ko"
lang = "ko"

# 파인튜닝을 진행하고자 하는 모델의 feature extractor를 로드
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

# 파인튜닝을 진행하고자 하는 모델의 tokenizer를 로드
tokenizer = WhisperTokenizer.from_pretrained(model_name, language = lang, task="transcribe")

# All we need is Processor
processor = WhisperProcessor.from_pretrained(model_name, language = lang, task="transcribe")

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [18]:
from torch.utils.data import Dataset, DataLoader
import torchaudio
from torchaudio import transforms

class SimpleASRDataset(Dataset):
    def __init__(self,
                 df,
                 processor,
                 resample_rate=16000):
        self.df = df
        self.device = device
        self.processor = processor
        self.resample_rate = resample_rate
        self.audios = self.df.raw_data.to_list()
        self.transcripts = self.df.transcript.to_list()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        # audio
        wav, sample_rate = torchaudio.load(self.audios[idx])
        transform = transforms.Resample(sample_rate, self.resample_rate)
        resampled_waveform = transform(wav)

        input_features = self.processor(resampled_waveform.squeeze(0),     # Batch Size 제거
                                        sampling_rate= self.resample_rate, # whisper-small은 sampling rate을 16000인 데이터에 대해서만 input으로 받는다고 합니다.
                                        return_tensors="pt"
                                        ).input_features.squeeze(0)

        # transcripts
        labels = self.processor.tokenizer(self.transcripts[idx],
                                          padding=True,
                                          truncation=True,
                                          return_tensors="pt").input_ids.squeeze(0)

        # return
        return {'input_features': input_features, 'labels': labels}

In [20]:
processor.tokenizer("넌 누구니?", return_tensors="pt")

{'input_ids': tensor([[50258, 50264, 50359, 50363, 33386,   234, 36385,  1425,    30, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
ds = SimpleASRDataset(df = df, processor = processor)
sample = next(iter(ds))
sample

{'input_features': tensor([[-0.7505, -0.7505, -0.7505,  ..., -0.7505, -0.7505, -0.7505],
         [-0.7505, -0.7505, -0.7505,  ..., -0.7505, -0.7505, -0.7505],
         [-0.7505, -0.7505, -0.7505,  ..., -0.7505, -0.7505, -0.7505],
         ...,
         [-0.7505, -0.7505, -0.7505,  ..., -0.7505, -0.7505, -0.7505],
         [-0.7505, -0.7505, -0.7505,  ..., -0.7505, -0.7505, -0.7505],
         [-0.7505, -0.7505, -0.7505,  ..., -0.7505, -0.7505, -0.7505]]),
 'labels': tensor([50258, 50264, 50359, 50363, 49200, 12831, 15377,    13, 40692,  4296,
          4980, 16112,  2401, 21619, 19617,   252,   113,  1517, 47284,  3049,
         47284,  4264,  7416,    13, 50257])}

In [22]:
sample['input_features'].shape, sample['labels'].shape

(torch.Size([80, 3000]), torch.Size([25]))

In [23]:
sample['labels']

tensor([50258, 50264, 50359, 50363, 49200, 12831, 15377,    13, 40692,  4296,
         4980, 16112,  2401, 21619, 19617,   252,   113,  1517, 47284,  3049,
        47284,  4264,  7416,    13, 50257])

# DataCollatorSpeechSeq2SeqWithPadding

In [19]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

        # 인풋 데이터와 라벨 데이터의 길이가 다르며, 따라서 서로 다른 패딩 방법이 적용되어야 한다. 그러므로 두 데이터를 분리해야 한다.
        # 먼저 오디오 인풋 데이터를 간단히 토치 텐서로 반환하는 작업을 수행한다.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Tokenize된 레이블 시퀀스를 가져온다.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # 레이블 시퀀스에 대해 최대 길이만큼 패딩 작업을 실시한다.
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 패딩 토큰을 -100으로 치환하여 loss 계산 과정에서 무시되도록 한다.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 이전 토크나이즈 과정에서 bos 토큰이 추가되었다면 bos 토큰을 잘라낸다.
        # 해당 토큰은 이후 언제든 추가할 수 있다.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [20]:
# 데이터 콜레이터 초기화
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Datasets

In [21]:
# 기존 version

def prepare_datasets(df,
                    train_ratio,
                    config = config,
                    shuffle=True,
                    seed=42
                    ):
    if shuffle:
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
        
    total_rows = df.shape[0]
    cutoff = int(train_ratio * total_rows)
    print(cutoff)

    # DataFrame Indexing
    train_df = df[:cutoff].reset_index(drop = True)
    valid_df = df[cutoff:].reset_index(drop = True)

    # MyDataset
    train_ds = SimpleASRDataset(df = train_df, processor = processor)
    valid_ds = SimpleASRDataset(df = valid_df, processor = processor)

    print("Dataset Completed")
    return train_ds, valid_ds

In [22]:
train_ds, valid_ds = prepare_datasets(df, train_ratio = 0.8, config = config)

49861
Dataset Completed


In [23]:
# # ✅ 분할 비율 및 설정값 정의
# train_ratio = 0.6
# val_ratio = 0.2
shuffle_data = True
random_seed = 42

In [29]:
sample = next(iter(train_ds))
sample['input_features'].shape, sample['labels'].shape

(torch.Size([80, 3000]), torch.Size([14]))

# DataLoaders

In [30]:
import os
os.cpu_count()

32

In [31]:
# 기존 버전
# 지금은 필요하지 않습니다.

def prepare_loaders(df,
                    train_ratio,
                    config = config,
                    collate_fn = data_collator
                    ):

    total_rows = df.shape[0]
    cutoff = int(train_ratio * total_rows)
    print(cutoff)

    # DataFrame Indexing
    train_df = df[:cutoff].reset_index(drop = True)
    valid_df = df[cutoff:].reset_index(drop = True)

    # MyDataset
    train_ds = SimpleASRDataset(df = train_df, processor = processor)
    valid_ds = SimpleASRDataset(df = valid_df, processor = processor)

    # DataLoader
    train_loader = DataLoader(train_ds,
                              batch_size = config.train_batch_size,
                              shuffle= True,
                            #   num_workers= 1,
                              collate_fn= data_collator,
                              drop_last = True
                              )
    valid_loader = DataLoader(valid_ds,
                              batch_size = config.valid_batch_size,
                              shuffle= False,
                            #   num_workers= 1,
                              collate_fn= data_collator,
                              drop_last = True
                              )

    print("DataLoader Completed")
    return train_loader, valid_loader

In [32]:
train_loader, valid_loader = prepare_loaders(df, 0.8)

49861
DataLoader Completed


In [33]:
sample = next(iter(train_loader))
sample['input_features'].shape, sample['labels'].shape

(torch.Size([16, 80, 3000]), torch.Size([16, 61]))

# Evaluation Metric : `CER`

In [24]:
import evaluate

def compute_metrics(pred):
    # import evaluate
    metric = evaluate.load('cer')

    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # pad_token을 -100으로 치환
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # metrics 계산 시 special token들을 빼고 계산하도록 설정
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    cer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

# Download & Load Model

In [35]:
print(model_name)

SungBeom/whisper-small-ko


In [25]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/967M [00:00<?, ?B/s]

In [37]:
# GPU
model = model.to(device)

In [38]:
# model cuda? Check!
next(model.parameters()).is_cuda

True

# Optimizer

In [26]:
optimizer = torch.optim.AdamW(model.parameters(), lr = config.lr, weight_decay=0.01)
print("Optimizer Defined")

Optimizer Defined


# Scheduler

In [27]:
steps_per_epoch = len(train_ds) // effective_batch_size  # 기존 : config.train_batch_size
steps_per_epoch

779

In [28]:
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                               max_lr = config.lr,
                                               steps_per_epoch = int(steps_per_epoch),
                                               epochs = config.n_epochs
                                               )

# wandb init

In [None]:
# Colab에서 진행하는 경우, wandb에 Google 계정으로 가입되어있다면, 자동으로 로그인이 되기도 합니다. (지금도 가능한 지는 모르겠네요)

# login at CLI

wandb.login(key=wandb_key)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user\_netrc
[34m[1mwandb[0m: Currently logged in as: [33mjwk20001007[0m ([33mjwk1007[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
run = wandb.init(project= 'Whisper-ko-finetune',
                 config = config,
                 job_type = 'Training',
                 name = "whisper-small-0701-v2",
                 anonymous = 'must'
                 )

# TrainingArguments

In [45]:
print(steps_per_epoch)

779


In [29]:
max_iters = int(steps_per_epoch * config.n_epochs)
max_iters

2337

In [47]:
print(f"Effective batch size: {effective_batch_size}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total max steps: {max_iters}")

Effective batch size: 64
Steps per epoch: 779
Total max steps: 2337


In [32]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./model",                        # 원하는 경로
    # per_gpu_train_batch_size= config.train_batch_size,
    per_device_train_batch_size= config.train_batch_size,
    gradient_accumulation_steps= config.gradient_accumulation_steps, # 기존 : 1
    learning_rate= config.lr,
    # warmup_steps=500,                     # Scheduler를 이전 셀에서 선언해서 입력하지 않습니다.
    # evaluation_strategy = 'epoch',        # epoch 기준으로 평가할 수 있습니다.
    evaluation_strategy = 'steps',
    eval_steps=350,
    # num_train_epochs= config.n_epochs,
    max_steps = max_iters,                  # epoch 대신 설정
    seed = config.seed,                     # 이전에서 선언했지만, 여기서 한 번 더 해도 상관없습니다.
    gradient_checkpointing=True,
    group_by_length = True,
    weight_decay=0.01,  # L2 정규화
    lr_scheduler_type="linear",  # 학습률 스케줄러
    # fp16=True,                            # mixed_precision="fp16"
    # bf16=True,                              # mixed_precision="bf16"
    per_gpu_eval_batch_size= config.valid_batch_size,
    # per_device_eval_batch_size= config.valid_batch_size,
    predict_with_generate=True,
    generation_max_length=225,
    logging_strategy="steps",
    logging_steps=200,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",            # 한국어의 경우 'wer'보다는 'cer'이 더 적합할 것
    greater_is_better=False,
    save_strategy="steps",                  # 'epoch': epoch 기준으로 저장 가능
    save_steps=350,
    save_total_limit = 1,
    # push_to_hub=False,
)


# Trainer

In [33]:
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset = train_ds,
    eval_dataset = valid_ds,  # or "test"
    data_collator = data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    optimizers = (optimizer, scheduler),
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
)


max_steps is given, it will override any value given in num_train_epochs


# Training

In [50]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


Step,Training Loss,Validation Loss,Cer
350,2.2776,0.28729,8.038269
700,0.382,0.268233,7.319475
1050,0.2718,0.235926,6.414792
1400,0.2228,0.202487,5.468522
1750,0.1854,0.180468,4.697308
2100,0.1167,0.164454,4.150281


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
`generation_config` default values have been modified to match model-specific defaults: {'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
A custom logits processor of type <class 'transformers.generation.logits_process.SuppressTokensLogitsProcessor'> ha

TrainOutput(global_step=2337, training_loss=0.4168257523562391, metrics={'train_runtime': 19050.6082, 'train_samples_per_second': 7.851, 'train_steps_per_second': 0.123, 'total_flos': 4.3129088077824e+19, 'train_loss': 0.4168257523562391, 'epoch': 2.9971126082771895})

In [45]:
print("Training Finished")

Training Finished


> Evaluation 진행

In [46]:
trainer.evaluate()

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.3501710593700409,
 'eval_cer': 10.046542156453249,
 'eval_runtime': 1343.6309,
 'eval_samples_per_second': 9.278,
 'eval_steps_per_second': 0.581,
 'epoch': 1.025665704202759}

In [47]:
val_metrics = trainer.evaluate()
print("📊 Validation CER:", val_metrics["eval_cer"])

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


📊 Validation CER: 10.046542156453249


In [48]:
print("Evaluation Finished")

Evaluation Finished


> Test 진행

In [None]:
test_results = trainer.predict(test_dataset=test_ds)
print("📊 Test CER:", test_results.metrics["test_cer"])

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


📊 Test CER: 15.901157840967137


# save

In [34]:
## Save

from transformers import Seq2SeqTrainer, EarlyStoppingCallback
from transformers import Seq2SeqTrainingArguments

trainer.model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.json',
 './model\\merges.txt',
 './model\\normalizer.json',
 './model\\added_tokens.json')

In [None]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

tokenizer = WhisperTokenizer.from_pretrained('./model')
feature_extractor = WhisperFeatureExtractor.from_pretrained("kimthegarden/whisper-small-ko-low-qual-voice")
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained('./model')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[]

# training is finished

In [None]:
import gc

torch.cuda.empty_cache()
_ = gc.collect()

print("Train Completed")

Train Completed


# wandb finish()

In [None]:
run.finish()

In [36]:
from huggingface_hub import whoami
print(whoami())

{'type': 'user', 'id': '67fc9b4baf940ad6c49fef50', 'name': 'kimthegarden', 'fullname': 'JEONGWONKIM', 'email': 'jwk20001007@gmail.com', 'emailVerified': True, 'canPay': False, 'periodEnd': None, 'isPro': False, 'avatarUrl': '/avatars/95b113bc05badc7021436ba071d30e66.svg', 'orgs': [], 'auth': {'type': 'access_token', 'accessToken': {'displayName': 'jeongwon', 'role': 'write', 'createdAt': '2025-07-02T01:57:45.264Z'}}}


In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from huggingface_hub import login

# 1. 로그인
login(token=hf_token)  # Write 권한 포함된 토큰 입력

# 2. 저장된 모델 로드
model = WhisperForConditionalGeneration.from_pretrained("./model")
processor = WhisperProcessor.from_pretrained("./model")

# 3. push to hub
model.push_to_hub("kimthegarden/whisper-small-ko-low-qual-voice",use_auth_token=True)
processor.push_to_hub("kimthegarden/whisper-small-ko-low-qual-voice",use_auth_token=True)