# install

In [None]:
# %%capture
# !pip install datasets>=2.6.1
# !pip install transformers==4.41.1
# !pip install sentence-transformers==2.7.0
# !pip install peft==0.10.0
# !pip install evaluate>=0.30
# !pip install jiwer
# !pip install accelerate -U
# !pip install transformers[torch]
# !pip install wandb
# !pip install matplotlib

# import

In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import wandb
import torch
from dataclasses import dataclass
from typing import Any, Dict, List, Union

from datasets import Dataset, DatasetDict
from datasets import Audio

# Google Mount

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# text_base_dir = f"/workspace/train_cut/D03_Transcription"
# audio_base_dir = f"/workspace/train_cut/D03_Audio"
# save_dir = f"/workspace/"

# 데이터 전처리

In [5]:
import os
import glob
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

In [None]:
# # 1. 오디오 파일 경로 취합
# # import glob

# path = audio_base_dir + "*/*/*" # "오디오 파일들이 포함된 경로를 입력한다. - 예) /content/drive/MyDrive/NLP_Project_data/raw_data/*"
# raw_data_list = glob.glob(path)
# raw_data_list = sorted(raw_data_list)

In [None]:
from pathlib import Path

text_base_dir = Path("C:/Users/Playdata/Desktop/test/train_cut/D03_Transcription")
audio_base_dir = Path("C:/Users/Playdata/Desktop/test/train_cut/D03_Audio")

# .txt 파일 glob 재설정 (재귀적 탐색)
labeled_data_list = sorted(text_base_dir.rglob("*.txt"))  # ✅ 모든 하위 폴더의 .txt 탐색
raw_data_list = sorted(audio_base_dir.rglob("*.wav"))     # ✅ 모든 하위 폴더의 .wav 탐색

In [7]:
print(f"file_list : {raw_data_list[:10]}")
print(len(raw_data_list))

file_list : ['/workspace/D03_Audio/J13/S000001/0001.wav', '/workspace/D03_Audio/J13/S000001/0002.wav', '/workspace/D03_Audio/J13/S000001/0003.wav', '/workspace/D03_Audio/J13/S000001/0004.wav', '/workspace/D03_Audio/J13/S000001/0005.wav', '/workspace/D03_Audio/J13/S000001/0006.wav', '/workspace/D03_Audio/J13/S000002/0001.wav', '/workspace/D03_Audio/J13/S000002/0002.wav', '/workspace/D03_Audio/J13/S000002/0003.wav', '/workspace/D03_Audio/J13/S000002/0005.wav']
272973


# config

In [8]:
class Config():
    def __init__(self):
        self.train_batch_size = 32 # 기존 : 64
        self.valid_batch_size = 32 # 기존 : 32
        self.lr = 2e-4
        self.seed = 2024
        self.n_epochs = 1
        self.gradient_accumulation_steps = 2
        # self.warm_up = 1000
        # self.max_steps = 4000

In [9]:
config = Config()
config.train_batch_size

32

In [10]:
effective_batch_size = config.train_batch_size * config.gradient_accumulation_steps

# Device

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# Set Seed

In [12]:
# 시드 설정 X

def set_max_performance():
    # 시드 설정을 아예 하지 않음
    # 또는 아래처럼 일부 기본값으로 초기화해도 됨 (필수 아님)

    # CuDNN 자동 최적 알고리즘 활성화 (성능 향상 가능)
    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

    # 필요시 여러 개의 GPU 중 가장 빠른 걸 자동 선택하게 할 수도 있음
    if torch.cuda.is_available():
        device = torch.device("cuda")
        print(f"Using GPU: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device("cpu")
        print("Using CPU")

set_max_performance()

Using GPU: NVIDIA A100-SXM4-80GB


# Load Dataset

In [None]:
save_dir = Path("C:/projects/genbotlabs/models/STT")

df = pd.read_csv(save_dir / "path_and_transcript_final.csv")
print(df.shape)
df.head()

(272973, 2)


Unnamed: 0,transcript,raw_data
0,네 감사합니다. NCS 교육과정 문의 체험입니다.,/workspace/D03_Audio/J13/S000001/0001.wav
1,네 여보세요.,/workspace/D03_Audio/J13/S000001/0002.wav
2,아 네 저기 그 NCS 인사담당자 기본 심화과정 신청하고 싶은데 교육 시간이 어...,/workspace/D03_Audio/J13/S000001/0003.wav
3,"아 네 맞습니다. 홈페이지에 나와있는 어 네, 네. 기본과정이나 심화과정 2 다...",/workspace/D03_Audio/J13/S000001/0004.wav
4,네 알겠습니다.,/workspace/D03_Audio/J13/S000001/0005.wav


In [14]:
df.tail()

Unnamed: 0,transcript,raw_data
272968,확실한 자료는 없습니다.,/workspace/D03_Audio/J15/S003011/0014.wav
272969,취업률 자료까지 있었다면 좋을 텐데 아쉽네요. 어 혼자 생각해 보고 강의 신청 여부...,/workspace/D03_Audio/J15/S003011/0015.wav
272970,네 더 필요하신 거 있으실까요?,/workspace/D03_Audio/J15/S003011/0016.wav
272971,지금은 없어요. 생기면 다시 전화드릴게요.,/workspace/D03_Audio/J15/S003011/0017.wav
272972,네 감사합니다.,/workspace/D03_Audio/J15/S003011/0018.wav


# Download Processors

In [15]:
from transformers import WhisperProcessor
from transformers import WhisperFeatureExtractor, WhisperTokenizer

model_name = "openai/whisper-small"
lang = "Korean"

# 파인튜닝을 진행하고자 하는 모델의 feature extractor를 로드
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)

# 파인튜닝을 진행하고자 하는 모델의 tokenizer를 로드
tokenizer = WhisperTokenizer.from_pretrained(model_name, language = lang, task="transcribe")

# All we need is Processor
processor = WhisperProcessor.from_pretrained(model_name, language = lang, task="transcribe")

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Dataset

In [16]:
from torch.utils.data import Dataset, DataLoader
import torchaudio
from torchaudio import transforms

class SimpleASRDataset(Dataset):
    def __init__(self,
                 df,
                 processor,
                 resample_rate=16000):
        self.df = df
        self.device = device
        self.processor = processor
        self.resample_rate = resample_rate
        self.audios = self.df.raw_data.to_list()
        self.transcripts = self.df.transcript.to_list()

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):

        # audio
        wav, sample_rate = torchaudio.load(self.audios[idx])
        transform = transforms.Resample(sample_rate, self.resample_rate)
        resampled_waveform = transform(wav)

        input_features = self.processor(resampled_waveform.squeeze(0),     # Batch Size 제거
                                        sampling_rate= self.resample_rate, # whisper-small은 sampling rate을 16000인 데이터에 대해서만 input으로 받는다고 합니다.
                                        return_tensors="pt"
                                        ).input_features.squeeze(0)

        # transcripts
        labels = self.processor.tokenizer(self.transcripts[idx],
                                          padding=True,
                                          truncation=True,
                                          return_tensors="pt").input_ids.squeeze(0)

        # return
        return {'input_features': input_features, 'labels': labels}

In [17]:
processor.tokenizer("넌 누구니?", return_tensors="pt")

{'input_ids': tensor([[50258, 50264, 50359, 50363, 33386,   234, 36385,  1425,    30, 50257]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [18]:
ds = SimpleASRDataset(df = df, processor = processor)
sample = next(iter(ds))
sample

{'input_features': tensor([[-1.0557, -1.0557, -1.0557,  ..., -1.0557, -1.0557, -1.0557],
         [-1.0557, -1.0557, -1.0557,  ..., -1.0557, -1.0557, -1.0557],
         [-1.0557, -1.0557, -1.0557,  ..., -1.0557, -1.0557, -1.0557],
         ...,
         [-1.0557, -1.0557, -1.0557,  ..., -1.0557, -1.0557, -1.0557],
         [-1.0557, -1.0557, -1.0557,  ..., -1.0557, -1.0557, -1.0557],
         [-1.0557, -1.0557, -1.0557,  ..., -1.0557, -1.0557, -1.0557]]),
 'labels': tensor([50258, 50264, 50359, 50363,   220,  8808, 24399,    13, 20786,    50,
         24915, 36265,  7097,  6170, 13086,  2785, 39667, 24651,  7416,    13,
         50257])}

In [19]:
sample['input_features'].shape, sample['labels'].shape

(torch.Size([80, 3000]), torch.Size([21]))

In [20]:
sample['labels']

tensor([50258, 50264, 50359, 50363,   220,  8808, 24399,    13, 20786,    50,
        24915, 36265,  7097,  6170, 13086,  2785, 39667, 24651,  7416,    13,
        50257])

# DataCollatorSpeechSeq2SeqWithPadding

In [21]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:

        # 인풋 데이터와 라벨 데이터의 길이가 다르며, 따라서 서로 다른 패딩 방법이 적용되어야 한다. 그러므로 두 데이터를 분리해야 한다.
        # 먼저 오디오 인풋 데이터를 간단히 토치 텐서로 반환하는 작업을 수행한다.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Tokenize된 레이블 시퀀스를 가져온다.
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        # 레이블 시퀀스에 대해 최대 길이만큼 패딩 작업을 실시한다.
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # 패딩 토큰을 -100으로 치환하여 loss 계산 과정에서 무시되도록 한다.
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # 이전 토크나이즈 과정에서 bos 토큰이 추가되었다면 bos 토큰을 잘라낸다.
        # 해당 토큰은 이후 언제든 추가할 수 있다.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch

In [22]:
# 데이터 콜레이터 초기화
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Datasets

In [24]:
# 기존 version

def prepare_datasets(df,
                    train_ratio,
                    config = config,
                    ):

    total_rows = df.shape[0]
    cutoff = int(train_ratio * total_rows)
    print(cutoff)

    # DataFrame Indexing
    train_df = df[:cutoff].reset_index(drop = True)
    valid_df = df[cutoff:].reset_index(drop = True)

    # MyDataset
    train_ds = SimpleASRDataset(df = train_df, processor = processor)
    valid_ds = SimpleASRDataset(df = valid_df, processor = processor)

    print("Dataset Completed")
    return train_ds, valid_ds

In [25]:
train_ds, valid_ds = prepare_datasets(df, train_ratio = 0.8, config = config)

218378
Dataset Completed


In [23]:
# # new version

# def prepare_datasets(df,
#                      train_ratio,
#                      val_ratio,
#                      processor,
#                      config,
#                      shuffle=True,
#                      seed=42):
#     """
#     DataFrame을 train / val / test로 분할하고 Dataset 객체로 반환
#     """
#     assert train_ratio + val_ratio < 1.0, "train + val 비율의 합은 1보다 작아야 test 세트가 존재합니다."

#     # 셔플
#     if shuffle:
#         df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

#     # 분할 지점 계산
#     total_rows = df.shape[0]
#     train_cut = int(train_ratio * total_rows)
#     val_cut = int((train_ratio + val_ratio) * total_rows)

#     print(f"총 샘플 수: {total_rows}")
#     print(f"Train: {train_cut}, Val: {val_cut - train_cut}, Test: {total_rows - val_cut}")

#     # DataFrame 분리
#     train_df = df[:train_cut].reset_index(drop=True)
#     val_df = df[train_cut:val_cut].reset_index(drop=True)
#     test_df = df[val_cut:].reset_index(drop=True)

#     # Dataset 생성
#     train_ds = SimpleASRDataset(df=train_df, processor=processor)
#     val_ds = SimpleASRDataset(df=val_df, processor=processor)
#     test_ds = SimpleASRDataset(df=test_df, processor=processor)

#     return train_ds, val_ds, test_ds

In [26]:
# # ✅ 분할 비율 및 설정값 정의
# train_ratio = 0.6
# val_ratio = 0.2
shuffle_data = True
random_seed = 42

In [29]:
# train_ds, val_ds, test_ds = prepare_datasets(df=df,
#                                               train_ratio=train_ratio,
#                                               val_ratio=val_ratio,
#                                               processor=processor,
#                                               config=config,
#                                               shuffle=shuffle_data,
#                                               seed=random_seed)

총 샘플 수: 14487
Train: 8692, Val: 2897, Test: 2898


In [27]:
sample = next(iter(train_ds))
sample['input_features'].shape, sample['labels'].shape

(torch.Size([80, 3000]), torch.Size([21]))

# DataLoaders

In [28]:
import os
os.cpu_count()

255

In [29]:
# 기존 버전
# 지금은 필요하지 않습니다.

def prepare_loaders(df,
                    train_ratio,
                    config = config,
                    collate_fn = data_collator
                    ):

    total_rows = df.shape[0]
    cutoff = int(train_ratio * total_rows)
    print(cutoff)

    # DataFrame Indexing
    train_df = df[:cutoff].reset_index(drop = True)
    valid_df = df[cutoff:].reset_index(drop = True)

    # MyDataset
    train_ds = SimpleASRDataset(df = train_df, processor = processor)
    valid_ds = SimpleASRDataset(df = valid_df, processor = processor)

    # DataLoader
    train_loader = DataLoader(train_ds,
                              batch_size = config.train_batch_size,
                              shuffle= True,
                            #   num_workers= 1,
                              collate_fn= data_collator,
                              drop_last = True
                              )
    valid_loader = DataLoader(valid_ds,
                              batch_size = config.valid_batch_size,
                              shuffle= False,
                            #   num_workers= 1,
                              collate_fn= data_collator,
                              drop_last = True
                              )

    print("DataLoader Completed")
    return train_loader, valid_loader

In [31]:
train_loader, valid_loader = prepare_loaders(df, 0.8)

218378
DataLoader Completed


In [33]:
# # new version

# def prepare_loaders(df,
#                     train_ratio,
#                     val_ratio,
#                     processor,
#                     config,
#                     collate_fn,
#                     shuffle=True,
#                     seed=42):
#     """
#     DataFrame을 train / val / test로 분할하고 DataLoader 객체로 반환
#     """
#     assert train_ratio + val_ratio < 1.0, "train + val 비율의 합은 1보다 작아야 test 세트가 존재합니다."

#     if shuffle:
#         df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

#     total_rows = df.shape[0]
#     train_cut = int(train_ratio * total_rows)
#     val_cut = int((train_ratio + val_ratio) * total_rows)

#     print(f"총 샘플 수: {total_rows}")
#     print(f"Train: {train_cut}, Val: {val_cut - train_cut}, Test: {total_rows - val_cut}")

#     train_df = df[:train_cut].reset_index(drop=True)
#     val_df = df[train_cut:val_cut].reset_index(drop=True)
#     test_df = df[val_cut:].reset_index(drop=True)

#     train_ds = SimpleASRDataset(df=train_df, processor=processor)
#     val_ds = SimpleASRDataset(df=val_df, processor=processor)
#     test_ds = SimpleASRDataset(df=test_df, processor=processor)

#     train_loader = DataLoader(train_ds,
#                               batch_size=config.train_batch_size,
#                               shuffle=True,
#                               collate_fn=collate_fn,
#                               drop_last=True)

#     val_loader = DataLoader(val_ds,
#                             batch_size=config.valid_batch_size,
#                             shuffle=False,
#                             collate_fn=collate_fn,
#                             drop_last=False)

#     test_loader = DataLoader(test_ds,
#                              batch_size=config.valid_batch_size,
#                              shuffle=False,
#                              collate_fn=collate_fn,
#                              drop_last=False)

#     return train_loader, val_loader, test_loader

In [34]:
# train_loader, val_loader, test_loader = prepare_loaders(df=df,
#                                                         train_ratio=train_ratio,
#                                                         val_ratio=val_ratio,
#                                                         processor=processor,
#                                                         config=config,
#                                                         collate_fn=data_collator,
#                                                         shuffle=shuffle_data,
#                                                         seed=random_seed)

총 샘플 수: 14487
Train: 8692, Val: 2897, Test: 2898


In [32]:
sample = next(iter(train_loader))
sample['input_features'].shape, sample['labels'].shape

(torch.Size([32, 80, 3000]), torch.Size([32, 65]))

# Evaluation Metric : `CER`

In [33]:
import evaluate

def compute_metrics(pred):
    # import evaluate
    metric = evaluate.load('cer')

    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # pad_token을 -100으로 치환
    label_ids[label_ids == -100] = tokenizer.pad_token_id

    # metrics 계산 시 special token들을 빼고 계산하도록 설정
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    cer = 100 * metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

# Download & Load Model

In [34]:
print(model_name)

openai/whisper-small


In [35]:
from transformers import WhisperForConditionalGeneration

model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

In [36]:
# GPU
model = model.to(device)

In [37]:
# model cuda? Check!
next(model.parameters()).is_cuda

True

# Optimizer

In [38]:
optimizer = torch.optim.Adam(model.parameters(), lr = config.lr)
print("Optimizer Defined")

Optimizer Defined


# Scheduler

In [39]:
steps_per_epoch = len(train_ds) // effective_batch_size  # 기존 : config.train_batch_size
steps_per_epoch

3412

In [40]:
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer,
                                               max_lr = config.lr,
                                               steps_per_epoch = int(steps_per_epoch),
                                               epochs = config.n_epochs
                                               )

# wandb init

In [41]:
# Colab에서 진행하는 경우, wandb에 Google 계정으로 가입되어있다면, 자동으로 로그인이 되기도 합니다. (지금도 가능한 지는 모르겠네요)

# login at CLI
!wandb login --relogin '2ea04dd0d2989d8bfd46b99ce355214f661b1b46'

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [42]:
run = wandb.init(project= 'Korean-Whisper-Fine-Tune',
                 config = config,
                 job_type = 'Training',
                 name = "whisper-small-base-0630",
                 anonymous = 'must'
                 )

[34m[1mwandb[0m: Currently logged in as: [33mjwk20001007[0m ([33mjwk1007[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [43]:
# import torch
# torch.cuda.empty_cache()

# TrainingArguments

In [44]:
print(steps_per_epoch)

3412


In [45]:
max_iters = int(steps_per_epoch * config.n_epochs)
max_iters

3412

In [46]:
print(f"Effective batch size: {effective_batch_size}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total max steps: {max_iters}")

Effective batch size: 64
Steps per epoch: 3412
Total max steps: 3412


In [48]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./model",                        # 원하는 경로
    # per_gpu_train_batch_size= config.train_batch_size,
    per_device_train_batch_size= config.train_batch_size,
    gradient_accumulation_steps= config.gradient_accumulation_steps, # 기존 : 1
    learning_rate= config.lr,
    # warmup_steps=500,                     # Scheduler를 이전 셀에서 선언해서 입력하지 않습니다.
    # evaluation_strategy = 'epoch',        # epoch 기준으로 평가할 수 있습니다.
    eval_strategy = 'steps',
    eval_steps=500,
    # num_train_epochs= config.n_epochs,
    max_steps = max_iters,                  # epoch 대신 설정
    seed = config.seed,                     # 이전에서 선언했지만, 여기서 한 번 더 해도 상관없습니다.
    gradient_checkpointing=True,
    group_by_length = True,
    # fp16=True,                            # mixed_precision="fp16"
    bf16=True,                              # mixed_precision="bf16"
    per_gpu_eval_batch_size= config.valid_batch_size,
    # per_device_eval_batch_size= config.valid_batch_size,
    predict_with_generate=True,
    generation_max_length=225,
    logging_strategy="steps",
    logging_steps=250,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",            # 한국어의 경우 'wer'보다는 'cer'이 더 적합할 것
    greater_is_better=False,
    save_strategy="steps",                  # 'epoch': epoch 기준으로 저장 가능
    save_steps=500,
    save_total_limit = 1,
    # push_to_hub=False,
)


# Trainer

In [50]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset = train_ds,
    eval_dataset = valid_ds,  # or "test"
    data_collator = data_collator,
    compute_metrics=compute_metrics,
    tokenizer=processor.feature_extractor,
    optimizers = (optimizer, scheduler),
    # callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


max_steps is given, it will override any value given in num_train_epochs


# Training

In [51]:
trainer.train()

KeyboardInterrupt: 

In [None]:
print("Training Finished")

> Evaluation 진행

In [None]:
# trainer.evaluate()

In [None]:
val_metrics = trainer.evaluate()
print("📊 Validation CER:", val_metrics["eval_cer"])

In [None]:
print("Evaluation Finished")

> Test 진행

In [58]:
test_results = trainer.predict(test_dataset=test_ds)
print("📊 Test CER:", test_results.metrics["test_cer"])

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


📊 Test CER: 15.901157840967137


# save

In [123]:
## Save

trainer.model.save_pretrained('./model')
tokenizer.save_pretrained('./model')

Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.json',
 './model/merges.txt',
 './model/normalizer.json',
 './model/added_tokens.json')

In [126]:
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor

tokenizer = WhisperTokenizer.from_pretrained('./model')
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
processor = WhisperProcessor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained('./model')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[]

# training is finished

In [124]:
import gc

torch.cuda.empty_cache()
_ = gc.collect()

print("Train Completed")

Train Completed


# wandb finish()

In [125]:
run.finish()

0,1
eval/cer,▃█▇▅▃▂▁▂▁▁
eval/loss,▁▃██▆▆▁▃▂▂
eval/runtime,▁▃▄█▂▁▁▄▂▂
eval/samples_per_second,█▆▄▁▇██▅▆▆
eval/steps_per_second,█▆▄▁▇██▅▆▆
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇██
train/grad_norm,█▆▅▅▄▂▃▂▁
train/learning_rate,▃▆██▇▅▃▂▁
train/loss,█▄▄▃▂▂▁▁▁

0,1
eval/cer,17.79124
eval/loss,0.64426
eval/runtime,797.3118
eval/samples_per_second,3.815
eval/steps_per_second,0.478
total_flos,1.835518580195328e+19
train/epoch,6.96933
train/global_step,994.0
train/grad_norm,0.34609
train/learning_rate,0.0


In [None]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from huggingface_hub import login

# 1. 로그인
login("hf_xxxxx")  # Write 권한 포함된 토큰 입력

# 2. 저장된 모델 로드
model = WhisperForConditionalGeneration.from_pretrained("./model")
processor = WhisperProcessor.from_pretrained("./model")

# 3. push to hub
model.push_to_hub("kimthegarden/whisper-korean-stt-v3-250630")
processor.push_to_hub("kimthegarden/whisper-korean-stt-v3-250630")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [], 'begin_suppress_tokens': [220, 50257]}


model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kimthegarden/whisper-korean-stt-v1-250625/commit/723b6aefec0a6710237abdff2b5be509fd11dc13', commit_message='Upload processor', commit_description='', oid='723b6aefec0a6710237abdff2b5be509fd11dc13', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kimthegarden/whisper-korean-stt-v1-250625', endpoint='https://huggingface.co', repo_type='model', repo_id='kimthegarden/whisper-korean-stt-v1-250625'), pr_revision=None, pr_num=None)