# Dataset Augmentation and Translation Notebook

Previously, a JSONL file with 50 Q&A pairs about urban heat islands (UHI) was created with GPT 5.0, based on the three papers listed below.

-----------

도시지역의 기후변화 적응을 위한 열섬현상 완화방안 연구, 2009, 명수정

도시녹지의 도시환경질 개선 연구(**I**) - 도시열섬현상을 중심으로, 2014, 공학양 외 12

도시개발사업의 열섬 완화 대책, 조흥곤

------------

This notebook performs the following tasks:
1.  **Translates** the `urban_heat_island_finetune_expanded.jsonl` file from Korean to English using the OpenAI API (note: the code below opens `initial_data.jsonl`, so save or rename the file accordingly).
2.  **Augments** the dataset by paraphrasing the user questions in the translated English JSONL file (`translated_data.jsonl`) using a T5-based model.
3.  **Splits** the augmented dataset into training and validation sets.
4.  **Fine-tunes** an OpenAI chat model on the training set (the code below uses `gpt-4o-2024-08-06`, not gpt-4.1-mini).

In [None]:
import os
# Placeholder only: replace "your_API_KEY" with your own key for this session.
# This assignment overwrites any OPENAI_API_KEY already present in the environment,
# so delete the line if the variable is set elsewhere. Never commit a real key.
os.environ['OPENAI_API_KEY'] = "your_API_KEY"


## 1. Translate Korean to English

In [None]:
import json
import openai
import os

def translate_ko_to_en(text, client, model_id):
    """
    Ask an OpenAI chat model to translate a piece of Korean text into English.

    Args:
        text: The Korean text to translate.
        client: An object exposing ``chat.completions.create`` (an OpenAI client).
        model_id: Name of the chat model to use.

    Returns:
        The content of the first choice returned by the model, or ``None`` when
        any exception is raised (the error is printed, not re-raised).
    """
    try:
        # Deterministic decoding (temperature 0) so repeated runs agree.
        request = {
            "model": model_id,
            "messages": [
                {"role": "system", "content": "You are a helpful assistant that translates Korean to English."},
                {"role": "user", "content": f"Translate the following Korean text to English: {text}"},
            ],
            "temperature": 0,
        }
        completion = client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        print('An error occurred during translation: {}'.format(err))
        return None

def translate_file(input_file='initial_data.jsonl',
                   output_file='translated_data.jsonl',
                   model_id="gpt-4.1-2025-04-14"):
    """
    Translate every message of a chat-format JSONL file from Korean to English.

    Each non-blank input line must be a JSON object with a ``messages`` list whose
    items have ``role`` and ``content`` keys. Every ``content`` is translated with
    ``translate_ko_to_en`` and the record is written to ``output_file`` with the
    roles unchanged.

    Args:
        input_file: Path of the Korean JSONL file to read.
        output_file: Path of the English JSONL file to write (overwritten).
        model_id: OpenAI chat model used for translation.

    Records for which any message fails to translate are skipped (with a printed
    notice) instead of being written with a ``null`` content.
    """
    # Configure the OpenAI client
    try:
        # Prefer Streamlit secrets when Streamlit and a secrets entry are available.
        import streamlit as st
        api_key = st.secrets["OPENAI_API_KEY"]
    except Exception:
        # Broad on purpose: a missing package, missing secrets file and missing key
        # raise different exception types; all of them mean "use the environment".
        api_key = os.environ['OPENAI_API_KEY']
    client = openai.OpenAI(api_key=api_key)

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile):
            if not line.strip():
                # Blank lines are not JSON records.
                continue

            data = json.loads(line)
            translated_messages = []
            failed = False
            for message in data['messages']:
                translated_content = translate_ko_to_en(message['content'], client, model_id)
                if translated_content is None:
                    # translate_ko_to_en already printed the underlying error.
                    failed = True
                    break
                translated_messages.append({
                    'role': message['role'],
                    'content': translated_content
                })

            if failed:
                print(f"Skipped line {i+1}: translation failed, record not written.")
                continue

            translated_data = {'messages': translated_messages}
            outfile.write(json.dumps(translated_data, ensure_ascii=False) + '\n')
            print(f"Translated line {i+1}: {line.strip()}")

    print(f"Translation complete. Translated file saved as {output_file}")

translate_file()  # Runs the translation immediately (calls the OpenAI API); comment out to skip

Translated line 1: {"messages": [{"role": "user", "content": "도시열섬(UHI)이란 무엇인가요?"}, {"role": "assistant", "content": "도시열섬(Urban Heat Island, UHI)은 인공 구조물과 인구·에너지 밀집으로 인해 도시의 기온이 주변 교외지역보다 높아지는 현상입니다. 도시의 표면·대기·하부층 등 여러 층위에서 발생할 수 있으며, 시간대와 공간규모에 따라 특성이 달라집니다."}]}
Translated line 2: {"messages": [{"role": "user", "content": "열섬은 어떤 유형으로 구분되나요?"}, {"role": "assistant", "content": "일반적으로 지표면 열섬(Surface UHI), 캐노피층 열섬(Canopy-layer UHI), 경계층 열섬(Boundary-layer UHI) 등으로 구분합니다. 각 유형은 측정방법(지표면온도 vs 기상관측 기온)과 영향범위가 다릅니다."}]}
Translated line 3: {"messages": [{"role": "user", "content": "열섬현상이 가장 뚜렷해지는 조건은 언제인가요?"}, {"role": "assistant", "content": "맑고 바람이 약한 날, 특히 여름철 밤(야간)에 도시열섬이 가장 뚜렷하게 관측됩니다. 낮에는 복사와 열저장이, 밤에는 저장된 열의 재방출과 대류효율 저하가 중요합니다."}]}
Translated line 4: {"messages": [{"role": "user", "content": "도시에서 가장 큰 원인은 무엇인가요?"}, {"role": "assistant", "content": "주요 원인은 식생 감소(증발산 저하), 불투수면(아스팔트·콘크리트) 증가, 열저장량이 큰 건축자재 사용, 고밀도 도시구조로 인한 통풍 저해, 차량·냉난방 등 인공열 배출 등입니다."}]}
Translated line 5: {"messages":

## 2. Augment Dataset by Paraphrasing

In [None]:
import json
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

def paraphrase(question, model, tokenizer, device, num_return_sequences=3):
    """
    Generate paraphrases of a question with a seq2seq model using beam search.

    Args:
        question: The text to paraphrase; it is prefixed with ``paraphrase: ``.
        model: Object exposing ``generate`` (a seq2seq language model).
        tokenizer: Callable tokenizer that also exposes ``batch_decode``.
        device: Device the input ids are moved to before generation.
        num_return_sequences: How many paraphrases to return.

    Returns:
        A list of decoded strings as returned by ``tokenizer.batch_decode``.
    """
    input_ids = tokenizer(
        f'paraphrase: {question}',
        return_tensors="pt", padding="longest",
        max_length=128,
        truncation=True,
    ).input_ids.to(device)

    outputs = model.generate(
        input_ids,
        # No `temperature`: sampling is not enabled, so the library reports the
        # flag as invalid/ignored. Dropping it leaves the beam-search result as is.
        repetition_penalty=10.0,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=2,
        # Beam search cannot return more sequences than it has beams.
        num_beams=max(5, num_return_sequences),
        max_length=128,
    )

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def augment_dataset(input_file='translated_data.jsonl',
                    output_file='paraphrased_data.jsonl',
                    num_paraphrases=3,
                    model_name="humarin/chatgpt_paraphraser_on_T5_base"):
    """
    Augment a chat-format JSONL dataset with paraphrased user questions.

    Every input record is copied to ``output_file`` unchanged. For each record
    that has a user message, up to ``num_paraphrases`` extra records are written
    in which only the first user message's content is replaced by a paraphrase;
    all other messages are kept as they are.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the augmented JSONL file to write (overwritten).
        num_paraphrases: Number of paraphrases requested per question.
        model_name: Hugging Face model id loaded for tokenizer and model.

    Paraphrases that are empty, identical to the original question, or identical
    to each other (ignoring case and surrounding whitespace) are dropped, so a
    record may yield fewer than ``num_paraphrases`` new records.
    """
    # Prefer MPS (Apple Silicon), then CUDA, then fall back to CPU
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    print(f"Using device: {device}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        for i, line in enumerate(infile):
            if not line.strip():
                # Blank lines are not JSON records.
                continue

            data = json.loads(line)

            # Write the original data
            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')

            messages = data['messages']
            # Locate the user turn by role rather than by position so records
            # that start with another role (e.g. a system message) still work.
            user_index = next(
                (idx for idx, msg in enumerate(messages) if msg.get('role') == 'user'),
                None,
            )
            user_question = messages[user_index].get('content') if user_index is not None else None
            if not user_question:
                print(f"Processed line {i+1}: no user question found, kept the original record only.")
                continue

            candidates = paraphrase(user_question, model, tokenizer, device, num_paraphrases)

            # Drop paraphrases that add nothing new to the dataset.
            seen = {user_question.strip().casefold()}
            paraphrased_questions = []
            for pq in candidates:
                key = pq.strip().casefold()
                if key and key not in seen:
                    seen.add(key)
                    paraphrased_questions.append(pq)

            for pq in paraphrased_questions:
                new_messages = [
                    dict(msg, content=pq) if idx == user_index else msg
                    for idx, msg in enumerate(messages)
                ]
                new_data = {"messages": new_messages}
                outfile.write(json.dumps(new_data, ensure_ascii=False) + '\n')

            print(f"Processed line {i+1} and generated {len(paraphrased_questions)} paraphrases.")

    print(f"Dataset augmentation complete. New dataset saved as {output_file}")

augment_dataset()  # Runs the augmentation immediately (downloads the model on first use); comment out to skip

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Processed line 1 and generated 3 paraphrases.
Processed line 2 and generated 3 paraphrases.
Processed line 3 and generated 3 paraphrases.
Processed line 4 and generated 3 paraphrases.
Processed line 5 and generated 3 paraphrases.
Processed line 6 and generated 3 paraphrases.
Processed line 7 and generated 3 paraphrases.
Processed line 8 and generated 3 paraphrases.
Processed line 9 and generated 3 paraphrases.
Processed line 10 and generated 3 paraphrases.
Processed line 11 and generated 3 paraphrases.
Processed line 12 and generated 3 paraphrases.
Processed line 13 and generated 3 paraphrases.
Processed line 14 and generated 3 paraphrases.
Processed line 15 and generated 3 paraphrases.
Processed line 16 and generated 3 paraphrases.
Processed line 17 and generated 3 paraphrases.
Processed line 18 and generated 3 paraphrases.
Processed line 19 and generated 3 paraphrases.
Processed line 20 and generated 3 paraphrases.
Processed line 21 and generated 3 paraphrases.
Processed line 22 and 

## 3. Split data (train, validation)

In [None]:
import json
import random

def split_dataset(input_file, train_file, validation_file, split_ratio=0.8, seed=None):
    """
    Shuffle the records of a JSONL file and split them into train/validation files.

    Args:
        input_file: Path of the JSONL file to read.
        train_file: Path the first ``split_ratio`` share of shuffled lines is written to.
        validation_file: Path the remaining lines are written to.
        split_ratio: Fraction of records (0..1) that goes to the training file.
        seed: Optional seed for a reproducible shuffle; ``None`` keeps the
            previous behaviour of using the global ``random`` state.

    Raises:
        ValueError: If ``split_ratio`` is outside the range 0..1.

    NOTE(review): the split is per line. If the input holds several variants of
    the same Q&A pair (``augment_dataset`` in this file writes the original plus
    its paraphrases to one file), variants of one pair can end up on both sides
    of the split — confirm that this is acceptable for validation.
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    with open(input_file, 'r', encoding='utf-8') as infile:
        # Skip blank lines and give every record exactly one trailing newline, so
        # a file whose last line lacks one cannot merge two records after shuffling.
        lines = [line.rstrip('\r\n') + '\n' for line in infile if line.strip()]

    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(lines)

    split_index = int(len(lines) * split_ratio)

    train_lines = lines[:split_index]
    validation_lines = lines[split_index:]

    with open(train_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(train_lines)

    with open(validation_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(validation_lines)

    print("Dataset split complete.")
    print(f"Training set: {len(train_lines)} lines, saved to {train_file}")
    print(f"Validation set: {len(validation_lines)} lines, saved to {validation_file}")

# Paths for the split: augmented input, then the two output files.
input_file = 'paraphrased_data.jsonl'
train_file = 'train_dataset.jsonl'
validation_file = 'validation_dataset.jsonl'
split_dataset(
    input_file=input_file,
    train_file=train_file,
    validation_file=validation_file,
)


Dataset split complete.
Training set: 160 lines, saved to train_dataset.jsonl
Validation set: 40 lines, saved to validation_dataset.jsonl


In [None]:
from openai import OpenAI
from time import sleep

# Initialize OpenAI client from the OPENAI_API_KEY environment variable.
# NOTE: `os` is not imported in this cell; it relies on the `import os` of an earlier cell.
client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])
def upload_training_file(file_path):
    """Upload a local file with purpose "fine-tune" via the module-level client.

    Args:
        file_path: Path of the file to upload; it is opened in binary mode.

    Returns:
        The ``id`` attribute of the response returned by ``client.files.create``.
    """
    with open(file_path, "rb") as handle:
        uploaded = client.files.create(purpose="fine-tune", file=handle)
    return uploaded.id

# Upload both training and validation files; the returned file ids are
# passed to the fine-tuning job below.
training_file_id = upload_training_file("train_dataset.jsonl")
validation_file_id = upload_training_file("validation_dataset.jsonl")
def create_fine_tuning_job(training_file_id, validation_file_id=None, model="gpt-4o-2024-08-06"):
    """Start a fine-tuning job through the module-level client.

    Args:
        training_file_id: Id of the uploaded training file.
        validation_file_id: Id of the uploaded validation file, or ``None``.
        model: Name of the base model to fine-tune.

    Returns:
        The ``id`` attribute of the created job.
    """
    job_request = {
        "training_file": training_file_id,
        "validation_file": validation_file_id,
        "model": model,
    }
    created_job = client.fine_tuning.jobs.create(**job_request)
    return created_job.id

# Start the fine-tuning job on the uploaded files; `model` is the base model name
# and `job_id` is what the monitoring loop below polls.
model = "gpt-4o-2024-08-06"
job_id = create_fine_tuning_job(training_file_id, validation_file_id, model)
def monitor_job(job_id, poll_interval=30, api_client=None):
    """Poll a fine-tuning job until it reaches a terminal state.

    Args:
        job_id: Id of the fine-tuning job to watch.
        poll_interval: Seconds to wait between two status checks.
        api_client: Client to use; defaults to the module-level ``client``.

    Returns:
        The job object as last retrieved, once its status is ``succeeded``,
        ``failed`` or ``cancelled``.

    The status is printed on every poll. Each event is printed once, oldest
    first, instead of repeating the latest events on every iteration.
    """
    api = client if api_client is None else api_client
    # "cancelled" is terminal too; without it a cancelled job is polled forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")
    seen_event_ids = set()

    while True:
        job = api.fine_tuning.jobs.retrieve(job_id)
        print(f"Status: {job.status}")

        if job.status in terminal_statuses:
            return job

        # List latest events
        events = api.fine_tuning.jobs.list_events(
            fine_tuning_job_id=job_id,
            limit=5
        )
        # The listing is newest-first; reverse it to print in chronological order.
        for event in reversed(events.data):
            if event.id in seen_event_ids:
                continue
            seen_event_ids.add(event.id)
            print(f"Event: {event.message}")

        sleep(poll_interval)  # Wait before the next status check

# Block until the job reaches a final status, then report the outcome.
job = monitor_job(job_id)
if job.status != "succeeded":
    print("Fine-tuning failed.")
else:
    # The fine-tuned model name is what later inference calls would use.
    fine_tuned_model = job.fine_tuned_model
    print(f"Fine-tuned model ID: {fine_tuned_model}")

Status: validating_files
Event: Validating training file: file-9aNkLt7wJw6yNcGTrpFu3V and validation file: file-6qJVAvZ5mC5Se8JeDSdEqR
Event: Created fine-tuning job: ftjob-iPZjFwDXbp1IBdFug0a7sfmB
Status: validating_files
Event: Validating training file: file-9aNkLt7wJw6yNcGTrpFu3V and validation file: file-6qJVAvZ5mC5Se8JeDSdEqR
Event: Created fine-tuning job: ftjob-iPZjFwDXbp1IBdFug0a7sfmB
Status: validating_files
Event: Validating training file: file-9aNkLt7wJw6yNcGTrpFu3V and validation file: file-6qJVAvZ5mC5Se8JeDSdEqR
Event: Created fine-tuning job: ftjob-iPZjFwDXbp1IBdFug0a7sfmB
Status: validating_files
Event: Validating training file: file-9aNkLt7wJw6yNcGTrpFu3V and validation file: file-6qJVAvZ5mC5Se8JeDSdEqR
Event: Created fine-tuning job: ftjob-iPZjFwDXbp1IBdFug0a7sfmB
Status: queued
Event: Files validated, moving job to queued state
Event: Validating training file: file-9aNkLt7wJw6yNcGTrpFu3V and validation file: file-6qJVAvZ5mC5Se8JeDSdEqR
Event: Created fine-tuning job: 

KeyboardInterrupt: 