## Dataset

In [1]:
!pip install datasets
!pip install sentencepiece
!pip install git+https://github.com/Tiiiger/bert_score.git


Collecting git+https://github.com/Tiiiger/bert_score.git
  Cloning https://github.com/Tiiiger/bert_score.git to /tmp/pip-req-build-41qmf9n2
  Running command git clone --filter=blob:none --quiet https://github.com/Tiiiger/bert_score.git /tmp/pip-req-build-41qmf9n2
  Resolved https://github.com/Tiiiger/bert_score.git to commit 19e7f551fe4fa43fdd07b8129ae947015b902b2d
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score==0.3.13)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score==0.3.13)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score==0.3.13)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12

In [9]:
from google.colab import drive
drive.mount('/content/drive')
from datasets import load_dataset

Mounted at /content/drive


In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
from datasets import load_dataset

# CSV 파일 로드
ds = load_dataset(data_files="/test.csv")



TypeError: load_dataset() missing 1 required positional argument: 'path'

In [17]:
# '한국어-영어 번역(병렬) 말뭉치'만 사용
filtered_ds = ds.filter(lambda example: example["source"] == 126)

AttributeError: 'str' object has no attribute 'filter'

In [None]:
filtered_ds['train']['en'][1]

'I did not get back the deposit when I checked out on the 28th of May, the date of termination of contract.'

In [None]:
len(filtered_ds['train'])

128104

In [None]:
filtered_ds['train'].column_names

['ko', 'en', 'source']

In [None]:
# 1. 필요한 라이브러리 설치
!pip install -q transformers datasets sentencepiece

# 2. 라이브러리 임포트
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
import torch
import random
from bert_score import score


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from bert_score import score
import torch
import random

# 1. 모델 및 토크나이저 로드
model_name = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# 2. 언어 코드 설정
source_lang = "eng_Latn"
target_lang = "kor_Hang"

# 3. 번역 함수 정의
def translate(text, source_lang=source_lang, target_lang=target_lang, max_length=512):
    tokenizer.src_lang = source_lang
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    target_lang_id = tokenizer.convert_tokens_to_ids(target_lang)
    generated_tokens = model.generate(
        **encoded,
        forced_bos_token_id=target_lang_id,
        max_length=max_length
    )
    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

# 4. 무작위 샘플 추출
train_data = filtered_ds["train"]
random_indices = random.sample(range(len(train_data)), 200)  # 개수는 원하는 대로 조절
sample_data = [train_data[i] for i in random_indices]

# 5. 문장별 번역 및 BERTScore 즉시 평가
for i, item in enumerate(sample_data):
    en_text = item['en']
    ko_gold = item['ko']
    ko_nllb = translate(en_text)

    # 문장별 BERTScore 계산 (리스트로 감싸야 함)
    P, R, F1 = score([ko_nllb], [ko_gold], lang="ko")

    print(f"\n==== {i+1}번 문장 ====")
    print(f"[영문]   : {en_text}")
    print(f"[정답]   : {ko_gold}")
    print(f"[번역]   : {ko_nllb}")
    print(f"[BERTScore] Precision: {P[0]:.4f}, Recall: {R[0]:.4f}, F1: {F1[0]:.4f}")



==== 1번 문장 ====
[영문]   : There were wet wipes, a sanitary napkin, underwear and a skirt in the bag.
[정답]   : 가방에는 물티슈와 생리대, 속옷과 치마가 들어 있었다.
[번역]   : 물은 바늘, 세련된 냅킨, 속옷, 가방에 드레스를 넣었습니다.
[BERTScore] Precision: 0.7803, Recall: 0.7891, F1: 0.7847

==== 2번 문장 ====
[영문]   : One can check the application list directly whether or not education application is received.
[정답]   : 교육 신청 접수여부는 신청리스트에서 본인이 직접 확인 가능하다.
[번역]   : 교육 신청서가 수신되었는지 여부를 직접 신청 목록에서 확인할 수 있습니다.
[BERTScore] Precision: 0.7695, Recall: 0.8062, F1: 0.7875

==== 3번 문장 ====
[영문]   : The Dongdaemun market has made up the Dongdaemun Apparel Cluster, which combines 20,000 apparel retailers and 7,000 sewing factories. Although It accounts for 21% of exports and 26% of employment o Korea's textile fashion, it has been unable to find a new growth engine from lack of strategies.
[정답]   : 동대문 시장은 2만여개의 의류 도소매점과 7천여개 봉제공장이 결합된 동대문 의류클러스터를 이루며 한국 섬유패션 수출의 21%, 고용의 26%를 담당해왔지만 최근들어 전략부재로 새로운 성장동력을 찾지 못하는 상황이다.
[번역]   : 동대문 시장은 20000명의 의류 소매업체

In [None]:
print(type(item['en']), type(item['ko_nllb']), type(item['ko']))
print("en 개수:", len(item['en']))
print("ko_nllb 개수:", len(item['ko_nllb']))
print("ko 개수:", len(item['ko']))



<class 'str'> <class 'str'> <class 'str'>
en 개수: 69
ko_nllb 개수: 26
ko 개수: 29


In [None]:
from bert_score import score

# 예시 문장 (reference, candidate)
references = item['en']
candidates = item['ko_nllb']

# BERTScore 계산 (한국어 모델 명시)
P, R, F1 = score(candidates, references, lang="ko")

# 결과 출력
for i, (p, r, f) in enumerate(zip(P, R, F1)):
    print(f"문장 {i+1}")
    print(f"  Precision : {p:.4f}")
    print(f"  Recall    : {r:.4f}")
    print(f"  F1 Score  : {f:.4f}")



AssertionError: Different number of candidates and references