In [5]:
sample_text = "쉟"
j2hcj(h2j(sample_text))

'ㅅㅞㅀ'

In [6]:
__all__ = ["split_syllable_char", "split_syllables",
           "join_jamos", "join_jamos_char",
           "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]

import itertools

INITIAL = 0x001
MEDIAL = 0x010
FINAL = 0x100
CHAR_LISTS = {
    INITIAL: list(map(chr, [
        0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
        0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
        0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
        0x314e
    ])),
    MEDIAL: list(map(chr, [
        0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
        0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
        0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
        0x3161, 0x3162, 0x3163
    ])),
    FINAL: list(map(chr, [
        0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
        0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
        0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
        0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
        0x314c, 0x314d, 0x314e
    ]))
}
CHAR_INITIALS = CHAR_LISTS[INITIAL]
CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
CHAR_FINALS = CHAR_LISTS[FINAL]
CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
CHARSET = set(itertools.chain(*CHAR_SETS.values()))
CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
                for k, v in CHAR_LISTS.items()}


def is_hangul_syllable(c):
    return 0xac00 <= ord(c) <= 0xd7a3  # Hangul Syllables


def is_hangul_jamo(c):
    return 0x1100 <= ord(c) <= 0x11ff  # Hangul Jamo


def is_hangul_compat_jamo(c):
    return 0x3130 <= ord(c) <= 0x318f  # Hangul Compatibility Jamo


def is_hangul_jamo_exta(c):
    return 0xa960 <= ord(c) <= 0xa97f  # Hangul Jamo Extended-A


def is_hangul_jamo_extb(c):
    return 0xd7b0 <= ord(c) <= 0xd7ff  # Hangul Jamo Extended-B


def is_hangul(c):
    return (is_hangul_syllable(c) or
            is_hangul_jamo(c) or
            is_hangul_compat_jamo(c) or
            is_hangul_jamo_exta(c) or
            is_hangul_jamo_extb(c))


def is_supported_hangul(c):
    return is_hangul_syllable(c) or is_hangul_compat_jamo(c)


def check_hangul(c, jamo_only=False):
    if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
        raise ValueError(f"'{c}' is not a supported hangul character. "
                         f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
                         f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
                         f"supported at the moment.")


def get_jamo_type(c):
    check_hangul(c)
    assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
    return sum(t for t, s in CHAR_SETS.items() if c in s)


def split_syllable_char(c):
    """
    Splits a given korean syllable into its components. Each component is
    represented by Unicode in 'Hangul Compatibility Jamo' range.

    Arguments:
        c: A Korean character.

    Returns:
        A triple (initial, medial, final) of Hangul Compatibility Jamos.
        If no jamo corresponds to a position, `None` is returned there.

    Example:
        >>> split_syllable_char("안")
        ("ㅇ", "ㅏ", "ㄴ")
        >>> split_syllable_char("고")
        ("ㄱ", "ㅗ", None)
        >>> split_syllable_char("ㅗ")
        (None, "ㅗ", None)
        >>> split_syllable_char("ㅇ")
        ("ㅇ", None, None)
    """
    check_hangul(c)
    if len(c) != 1:
        raise ValueError("Input string must have exactly one character.")

    init, med, final = None, None, None
    if is_hangul_syllable(c):
        offset = ord(c) - 0xac00
        x = (offset - offset % 28) // 28
        init, med, final = x // 21, x % 21, offset % 28
        if not final:
            final = None
        else:
            final -= 1
    else:
        pos = get_jamo_type(c)
        if pos & INITIAL == INITIAL:
            pos = INITIAL
        elif pos & MEDIAL == MEDIAL:
            pos = MEDIAL
        elif pos & FINAL == FINAL:
            pos = FINAL
        idx = CHAR_INDICES[pos][c]
        if pos == INITIAL:
            init = idx
        elif pos == MEDIAL:
            med = idx
        else:
            final = idx
    return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
                 for pos, idx in
                 zip([INITIAL, MEDIAL, FINAL], [init, med, final]))


def split_syllables(s, ignore_err=True, pad=None):
    """
    Performs syllable-split on a string.

    Arguments:
        s (str): A string (possibly mixed with non-Hangul characters).
        ignore_err (bool): If set False, it ensures that all characters in
            the string are Hangul-splittable and throws a ValueError otherwise.
            (default: True)
        pad (str): Pad empty jamo positions (initial, medial, or final) with
            `pad` character. This is useful for cases where fixed-length
            strings are needed. (default: None)

    Returns:
        Hangul-split string

    Example:
        >>> split_syllables("안녕하세요")
        "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
        >>> split_syllables("안녕하세요~~", ignore_err=False)
        ValueError: encountered an unsupported character: ~ (0x7e)
        >>> split_syllables("안녕하세요ㅛ", pad="x")
        'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
    """

    def try_split(c):
        try:
            return split_syllable_char(c)
        except ValueError:
            if ignore_err:
                return (c,)
            raise ValueError(f"encountered an unsupported character: "
                             f"{c} (0x{ord(c):x})")

    s = map(try_split, s)
    if pad is not None:
        tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
    else:
        tuples = map(lambda x: filter(None, x), s)
    return "".join(itertools.chain(*tuples))


def join_jamos_char(init, med, final=None):
    """
    Combines jamos into a single syllable.

    Arguments:
        init (str): Initial jao.
        med (str): Medial jamo.
        final (str): Final jamo. If not supplied, the final syllable is made
            without the final. (default: None)

    Returns:
        A Korean syllable.
    """
    chars = (init, med, final)
    for c in filter(None, chars):
        check_hangul(c, jamo_only=True)

    idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
                for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
    init_idx, med_idx, final_idx = idx
    # final index must be shifted once as
    # final index with 0 points to syllables without final
    final_idx = 0 if final_idx is None else final_idx + 1
    return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)


def join_jamos(s, ignore_err=True):
    """
    Combines a sequence of jamos to produce a sequence of syllables.

    Arguments:
        s (str): A string (possible mixed with non-jamo characters).
        ignore_err (bool): If set False, it will ensure that all characters
            will be consumed for the making of syllables. It will throw a
            ValueError when it fails to do so. (default: True)

    Returns:
        A string

    Example:
        >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안녕하세요"
        >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안ㄴ녕하세요"
        >>> join_jamos()
    """
    last_t = 0
    queue = []
    new_string = ""

    def flush(n=0):
        new_queue = []
        while len(queue) > n:
            new_queue.append(queue.pop())
        if len(new_queue) == 1:
            if not ignore_err:
                raise ValueError(f"invalid jamo character: {new_queue[0]}")
            result = new_queue[0]
        elif len(new_queue) >= 2:
            try:
                result = join_jamos_char(*new_queue)
            except (ValueError, KeyError):
                # Invalid jamo combination
                if not ignore_err:
                    raise ValueError(f"invalid jamo characters: {new_queue}")
                result = "".join(new_queue)
        else:
            result = None
        return result

    for c in s:
        if c not in CHARSET:
            if queue:
                new_c = flush() + c
            else:
                new_c = c
            last_t = 0
        else:
            t = get_jamo_type(c)
            new_c = None
            if t & FINAL == FINAL:
                if not (last_t == MEDIAL):
                    new_c = flush()
            elif t == INITIAL:
                new_c = flush()
            elif t == MEDIAL:
                if last_t & INITIAL == INITIAL:
                    new_c = flush(1)
                else:
                    new_c = flush()
            last_t = t
            queue.insert(0, c)
        if new_c:
            new_string += new_c
    if queue:
        new_string += flush()
    return new_string

In [7]:
join_jamos("ㅅㅞㅀㅅㅞㅀ")

'쉟쉟'

In [None]:
!pip uninstall torchvision
!pip install torchvision

In [7]:
!pip install jamo




[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:


import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
from jamo import h2j, j2hcj

In [2]:
model_name = "Kkonjeong/wav2vec2-base-korean"
model = Wav2Vec2ForCTC.from_pretrained(model_name).to("cuda")
processor = Wav2Vec2Processor.from_pretrained(model_name)

config.json:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/257 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/697 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

In [8]:

# model.eval()

def load_and_preprocess_audio(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    if sampling_rate != 16000:
        resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
        speech_array = resampler(speech_array)
    input_values = processor(speech_array.squeeze().numpy(), sampling_rate=16000).input_values[0]
    return input_values

def predict(file_path):
    input_values = load_and_preprocess_audio(file_path)
    input_values = torch.tensor(input_values).unsqueeze(0).to("cuda")
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription

audio_file_path = f"C:\\Users\\euije\\Downloads\\1과\\1\\track024.mp3"
transcription = predict(audio_file_path)
print("Transcription:", join_jamos(transcription))

Transcription: 산 당살들고 라즈랑겨랄십시아은행하고 서점 사이에 미용실이있엉미용실하고 식담 사이에 서점이 있어요서점위에 야국이 ㅇㅆ어요식당 위에 커피숍이 있어요영화관하고 백화점 사이에 편의점이 있어요영화관 왼 쪽ㅇ 무체국이 있어요


In [None]:
train_test_split = dataset.train_test_split(train_size=1000, test_size=100)
train_dataset = train_test_split['train']
valid_dataset = train_test_split['test']

In [12]:
len(train_dataset['article'])

200

In [6]:
def preprocess_function(examples):
    # 입력 텍스트 설정
    inputs = examples['article']
    
    # 입력 텍스트 토큰화 (패딩, 트렁케이션)
    model_inputs = tokenizer(
        inputs, 
        max_length=768, 
        truncation=True, 
        padding="max_length", 
        return_tensors="pt"
    )
    
    # 요약 텍스트 토큰화 (패딩, 트렁케이션)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['summary'], 
            max_length=256, 
            truncation=True, 
            padding="max_length", 
            return_tensors="pt"
        )
    
    # 모델에 입력할 데이터에 타겟 추가
    # CUDA 디바이스로 이동
    model_inputs = {key: value.to(device) for key, value in model_inputs.items()}
    labels = labels['input_ids'].to(device)
    
    # 딕셔너리 형태로 반환
    model_inputs["labels"] = labels
    return model_inputs

# 데이터셋을 토크나이징
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_valid = valid_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [7]:
print(tokenized_train)
print(tokenized_train[0]['input_ids'])
print(len(tokenized_train[0]['input_ids']))
print(tokenized_train[0]['labels'])
print(len(tokenized_train[0]['labels']))

Dataset({
    features: ['article', 'summary', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})
[5282, 222, 262, 222, 6851, 311, 222, 1021, 381, 222, 741, 222, 39561, 291, 222, 1508, 222, 530, 15, 222, 4244, 13, 222, 831, 911, 279, 222, 19316, 1400, 2452, 278, 222, 3263, 538, 222, 284, 222, 1231, 222, 1444, 222, 1516, 6544, 222, 17720, 1271, 302, 222, 1693, 279, 222, 745, 222, 1972, 278, 222, 5922, 222, 334, 222, 530, 15, 222, 19316, 1400, 302, 222, 2312, 262, 222, 911, 3339, 279, 222, 7335, 222, 2075, 279, 222, 745, 222, 1271, 274, 222, 1231, 302, 222, 642, 262, 222, 451, 222, 830, 222, 495, 222, 3452, 764, 222, 334, 222, 991, 222, 334, 222, 530, 15, 222, 1161, 222, 2855, 222, 2119, 3055, 222, 17720, 1775, 13797, 222, 262, 222, 1283, 1278, 222, 3293, 222, 710, 222, 39561, 262, 222, 991, 222, 334, 222, 530, 15, 222, 3119, 13, 222, 19316, 1400, 2452, 302, 222, 2542, 222, 418, 26091, 279, 222, 40824, 222, 1546, 222, 15748, 222, 491, 222, 1271, 278, 222, 3452, 764, 222, 912,

In [7]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# 훈련 설정
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # 매 epoch마다 평가
    save_strategy="epoch",        # 매 epoch마다 체크포인트 저장
    num_train_epochs=30,  # 최대 30 epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=2,
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=True,  # 훈련 종료 후 가장 좋은 모델을 불러오기
)

# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer, 
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # 조기 종료 콜백 추가
)

# 파인튜닝 시작
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.266612
2,No log,0.247432
3,No log,0.241627
4,0.820000,0.245901


KeyboardInterrupt: 

In [None]:
# 평가
trainer.evaluate()

In [9]:
model.save_pretrained("./korean_summary_finetuned")
tokenizer.save_pretrained("./korean_summary_finetuned")

('./korean_summary_finetuned\\tokenizer_config.json',
 './korean_summary_finetuned\\special_tokens_map.json',
 './korean_summary_finetuned\\tokenizer.json')

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained("./korean_summary_finetuned")
finetuned_tokenizer = AutoTokenizer.from_pretrained("./korean_summary_finetuned")

In [12]:
report = "일본의 조선침략 이후 조선의 각 개항장에 진출한 일본인 상업자본가는 소작제 농장경영을 통한 고율의 토지수익률을 확인하고, 미곡반출로 상업이윤을 극대화하는 동시에 차츰 자기자본의 투자대상으로서 토지에 눈을 돌리게 되었다. 일본인 상업자본가의 토지매수는 초기에는 비옥한 삼남지방의 평야지대에 집중되었으나, 토지확보경쟁이 가열되면서 그 범위는 점차 서남해안의 도서지역으로까지 확대되었다. 이에 대해 조선농민은 ‘토지조사사업’ 이전의 사적 토지소유권의 확대·강화를 근거로 ‘토지소유권 확인소송’이나 ‘부당이익 반환소송’을 제기하는 등 일본인 지주의 비합법적인 토지매수에 맞서 갖가지 반대운동을 전개하였다. 그러나 조선통치의 사회적·경제적 기반으로서 식민지 지주제의 확립을 옹호한 식민지 권력은 ‘토지가옥증명규칙’(1906년)과 ‘토지가옥소유권증명규칙’(1908년)을 공포하는 등 일련의 토지법제 개정을 통해 일본인 지주의 토지매수를 합법화하고, 최종적으로는 ‘토지조사사업’을 실시하여 토지소유권을 법률적으로 확정지었다."
inputs = finetuned_tokenizer(report, return_tensors="pt", max_length=1024, truncation=True)
summary_ids = finetuned_model.generate(inputs["input_ids"], num_beams=4, max_length=350, early_stopping=True)
summary = finetuned_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summary)

식민지 권력은 일련의 토지법제 개정을 통해 일본인 지주의 토지매수를 합법화하고, 최종적으로는 ‘토지조사사업’을 실시하여 토지소유권을 확정지었다.
