In [None]:
!pip install datasets

from datasets import load_dataset

# Tatoeba 데이터셋 불러오기
dataset = load_dataset("tatoeba", "en-mr", split="train")

# 영어 문장은 sentence_1에 있음
english_sentences = [item['translation']['en'] for item in dataset]

# 짧고 쉬운 문장만 (단어 수 4~12개)
filtered_sentences = [s for s in english_sentences if 4 <= len(s.split()) <= 12]

# 500개 샘플링
sampled_english = filtered_sentences[:500]

# 확인
for i in range(5):
    print(sampled_english[i])




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Today is June 18th and it is Muiriel's birthday!
Muiriel is 20 now.
The password is "Muiriel".
I will be back soon.
I will be back soon.


In [None]:
# googletrans 설치
!pip install googletrans==4.0.0-rc1

from googletrans import Translator

translator = Translator()

# 영어 문장 500개 자동 번역
translated_sentences = []

for sentence in sampled_english:
    try:
        translated = translator.translate(sentence, src='en', dest='ko')
        translated_sentences.append(translated.text)
    except Exception as e:
        print(f"번역 실패 문장: {sentence} | 에러: {e}")

# 번역된 문장 출력 확인
for i in range(5):
    print(translated_sentences[i])


번역 실패 문장: Can you drive a car? | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: Why can't you come? | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: How much money do you want? | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: I know your name. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: Your birthday is drawing near. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: Write your address here. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: Write your address here. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: I have no more time to talk with you. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: Empty vessels make the most sound. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: Among modern novels, this is the best. | 에러: 'Translator' object has no attribute 'raise_Exception'
번역 실패 문장: 

In [None]:
# Report how many sentences were translated successfully.
num_translated = len(translated_sentences)
print(f"번역 성공한 문장 수: {num_translated}개")


번역 성공한 문장 수: 479개


In [None]:
import random

# Number of lines the generated corpus file should contain.
TARGET_LINES = 10000

# Fail with a clear message instead of a ZeroDivisionError when no sentence
# was translated successfully.
if not translated_sentences:
    raise ValueError("translated_sentences is empty; nothing to write to translated_ordered.txt")

# Repeat the translated sentences until there are at least TARGET_LINES of
# them, trim to exactly TARGET_LINES, then shuffle the order.
repeat_count = TARGET_LINES // len(translated_sentences) + 1
multiplied_sentences = (translated_sentences * repeat_count)[:TARGET_LINES]
random.shuffle(multiplied_sentences)

# Save to a file, one sentence per line.
with open("translated_ordered.txt", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in multiplied_sentences)

print("✅ translated_ordered.txt 파일 생성 완료!")


✅ translated_ordered.txt 파일 생성 완료!


In [None]:
from google.colab import files

# Hand the generated corpus file to the browser for download.
corpus_path = "translated_ordered.txt"
files.download(corpus_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ✅ 완전 새로 만드는 translated_ordered.txt 학습용 Colab 코드

# 1. Install libraries
!pip install -U transformers datasets

# 2. Upload translated_ordered.txt
from google.colab import files
uploaded = files.upload()

# 3. Load and split the text
def load_dataset_from_txt(file_path, seed=None):
    """Read a one-sentence-per-line file and split it into train/val/test datasets.

    The split is made over *distinct* sentences (90% / 5% / 5%), and every
    line is then routed to the split that owns its sentence. A file that
    repeats the same sentences many times therefore never places one sentence
    in more than one split, which a plain line-level split would do and which
    would make validation/test scores meaningless.

    Args:
        file_path: Path to a UTF-8 text file. Blank lines are skipped and
            surrounding whitespace is stripped.
        seed: Optional seed for a private random generator. When None, the
            global ``random`` module state is used.

    Returns:
        A ``(train_dataset, val_dataset, test_dataset)`` tuple of
        ``datasets.Dataset`` objects, each with a single ``"text"`` column.
    """
    from datasets import Dataset
    import random

    with open(file_path, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(lines)

    # Sort before shuffling so a given seed always yields the same partition.
    unique_sentences = sorted(set(lines))
    rng.shuffle(unique_sentences)

    split1 = int(0.9 * len(unique_sentences))
    split2 = int(0.95 * len(unique_sentences))
    val_sentences = set(unique_sentences[split1:split2])
    test_sentences = set(unique_sentences[split2:])

    # Route each (already shuffled) line to the split that owns its sentence.
    train_lines, val_lines, test_lines = [], [], []
    for line in lines:
        if line in val_sentences:
            val_lines.append(line)
        elif line in test_sentences:
            test_lines.append(line)
        else:
            train_lines.append(line)

    train_dataset = Dataset.from_dict({"text": train_lines})
    val_dataset = Dataset.from_dict({"text": val_lines})
    test_dataset = Dataset.from_dict({"text": test_lines})

    return train_dataset, val_dataset, test_dataset

# Build the three splits from the corpus file.
(train_dataset,
 val_dataset,
 test_dataset) = load_dataset_from_txt("translated_ordered.txt")

# 4. Load model and tokenizer
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Tokenizer first: reuse the end-of-sequence token as the padding token.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Then the model, told which token id is used for padding.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.config.pad_token_id = tokenizer.pad_token_id

# 5. Tokenize dataset
def tokenize_function(example):
    """Tokenize ``example["text"]`` and attach causal-LM labels.

    Text is truncated/padded to 128 tokens. Labels are a copy of the input
    ids, except that padded positions (attention mask 0) are set to -100 so
    the loss ignores them. Without this mask, short sentences padded to 128
    tokens would have most of their loss computed on padding.

    Works for a single example (``text`` is a string) and for a batch
    (``text`` is a list of strings).

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoding = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

    def _mask_padding(token_ids, attention):
        # -100 is the label value the causal-LM loss skips.
        return [tok if keep == 1 else -100 for tok, keep in zip(token_ids, attention)]

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one id list per example.
        encoding["labels"] = [
            _mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        encoding["labels"] = _mask_padding(input_ids, attention_mask)
    return encoding

# Tokenize the train and validation splits the same way, dropping the raw
# "text" column so only model inputs remain.
tokenized_train, tokenized_val = (
    split.map(tokenize_function, remove_columns=["text"])
    for split in (train_dataset, val_dataset)
)

# 6. Set training arguments
import torch
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results-translated",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    # No per-epoch evaluation is configured; the validation split is scored
    # by an explicit trainer.evaluate() call after training.
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing.
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

# 7. Train model
trainer.train()

# 8. Save model
trainer.save_model("checkpoint-translated")

# 9. Evaluate Perplexity
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# 10. Download model checkpoint
!zip -r checkpoint-translated.zip checkpoint-translated
files.download("checkpoint-translated.zip")




Saving translated_ordered (1).txt to translated_ordered (1) (1).txt


Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,0.9339
200,0.365
300,0.3163
400,0.2652
500,0.2313
600,0.2021
700,0.1715
800,0.148
900,0.1267
1000,0.1154


>>> Perplexity: 1.03
  adding: checkpoint-translated/ (stored 0%)
  adding: checkpoint-translated/training_args.bin (deflated 52%)
  adding: checkpoint-translated/config.json (deflated 52%)
  adding: checkpoint-translated/model.safetensors (deflated 7%)
  adding: checkpoint-translated/generation_config.json (deflated 24%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>