In [None]:
import os
import requests
import json
from datasets import load_dataset

# -----------------------------
# 1. GitHub API access used to fetch .py diffs
# The token is read from the GITHUB_TOKEN environment variable so that no
# credential is stored in the notebook. The previous placeholder string was
# truthy, which meant the guard below could never trigger.
github_token = os.environ.get("GITHUB_TOKEN", "").strip()
if not github_token:
    raise ValueError("GitHub API token not set.")

def get_py_diff_from_commit(repo, base_commit, github_token, timeout=30):
    """Return the patches of the ``.py`` files changed by a single commit.

    Queries GitHub's compare endpoint for ``{base_commit}^...{base_commit}``
    and keeps only the entries whose filename ends with ``.py`` and that carry
    a ``patch`` field.

    Args:
        repo: Repository path inserted after ``/repos/`` in the API URL
            (presumably ``"owner/name"``).
        base_commit: Commit reference to compare against its ``^`` parent.
        github_token: Token sent in the ``Authorization`` header.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the whole run. Defaults to 30.

    Returns:
        A string made of ``"File: <name>\\n<patch>\\n"`` sections joined by a
        newline; the empty string when no ``.py`` file has a patch.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
    """
    url = f"https://api.github.com/repos/{repo}/compare/{base_commit}^...{base_commit}"
    headers = {
        "Authorization": f"token {github_token}",
        "Accept": "application/vnd.github.v3+json"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"GitHub API error: {response.status_code} {response.text}")

    # "files" may be missing or null in the payload; treat both as "no files".
    changed_files = response.json().get("files") or []
    py_diffs = []
    for file in changed_files:
        filename = file.get("filename", "")
        patch = file.get("patch")
        # Entries without a "patch" value are skipped.
        if filename.endswith(".py") and patch:
            py_diffs.append(f"File: {filename}\n{patch}\n")
    return "\n".join(py_diffs)

In [None]:
# 2. Load the dataset (here: princeton-nlp/SWE-bench_Verified, "test" split)
dataset = load_dataset(
    path="princeton-nlp/SWE-bench_Verified",
    split="test",
)

In [None]:
# 3. Prompt-building function
def build_prompt_with_diff(data, py_diff):
    """Assemble the patch-request prompt for one sample.

    Args:
        data: Mapping that provides ``'problem_statement'`` (required) and
            ``'hints_text'`` (optional; an empty string is used when absent).
        py_diff: Diff text inserted between the problem description and the
            hints.

    Returns:
        The prompt string, which ends with the ``### PATCH CODE START ###``
        marker line.
    """
    # The pieces are listed in their final order and concatenated once.
    pieces = [
        f"A problem similar to '{data['problem_statement']}' has occurred, ",
        "and it seems that the root cause lies in the modified section of the .py file:\n\n",
        py_diff,
        "\n\n",
        f"{data.get('hints_text', '')}\n\n",
        "Based on the information above, suggest a fix in the form of a code patch. ",
        "Do not include any unnecessary explanations—only provide the modified patch code. ",
        "Do not write any internal reasoning or thoughts. Only the modified patch code!\n",
        "### PATCH CODE START ###\n",
    ]
    return "".join(pieces)

In [None]:
# 4. Load the model (int8 quantization)
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

# Example of forcing the device map to "cuda:0" for the whole model.
custom_device_map = {"": "cuda:0"}

# int8 quantization is requested through BitsAndBytesConfig; passing
# load_in_8bit=True directly to from_pretrained is deprecated in transformers.
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map=custom_device_map,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# 5. Function that generates the patch code with the model
def generate_response(prompt, max_new_tokens=512):
    """Sample a completion for ``prompt`` and return the text after the marker.

    Uses the module-level ``tokenizer`` and ``model``. The first sequence
    returned by ``model.generate`` is decoded with special tokens skipped.

    Args:
        prompt: Text fed to the tokenizer.
        max_new_tokens: Forwarded to ``model.generate``.

    Returns:
        If the decoded text contains ``### PATCH CODE START ###``, everything
        after its first occurrence with surrounding whitespace stripped;
        otherwise the decoded text unchanged.
    """
    marker = "### PATCH CODE START ###"

    encoded = tokenizer(prompt, return_tensors="pt")
    # Move every tokenizer output onto the device the model reports.
    on_device = {}
    for key, tensor in encoded.items():
        on_device[key] = tensor.to(model.device)

    sequences = model.generate(
        **on_device,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
    )
    decoded = tokenizer.decode(sequences[0], skip_special_tokens=True)

    if marker not in decoded:
        return decoded
    _, _, after_marker = decoded.partition(marker)
    return after_marker.strip()

In [None]:
# 6. Run the full pipeline: fetch diff -> build prompt -> generate patch -> save
all_patch_results = []  # results produced during this run, kept in memory
output_dir = "swe-bench-verified"
os.makedirs(output_dir, exist_ok=True)

json_output_path = os.path.join(output_dir, "patch_results.jsonl")

# Results are appended to the JSONL file one line per sample, so partial
# progress survives an interruption. Collect the instance_ids already on disk
# so that re-running this cell resumes instead of writing duplicate rows.
completed_ids = set()
if os.path.exists(json_output_path):
    with open(json_output_path, "r", encoding="utf-8") as f:
        for existing_line in f:
            existing_line = existing_line.strip()
            if not existing_line:
                continue
            try:
                completed_ids.add(json.loads(existing_line)["instance_id"])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Ignore lines that are not a JSON object with an instance_id
                # (e.g. a line cut short by an interrupted write).
                continue

for i, sample in enumerate(dataset):
    print(f"\n--- Processing sample {i} ---")
    try:
        instance_id = sample["instance_id"]
        if instance_id in completed_ids:
            print(f"Skipping sample {i}: {instance_id} already saved in {json_output_path}")
            continue

        repo = sample['repo']
        base_commit = sample['base_commit']

        py_diff = get_py_diff_from_commit(repo, base_commit, github_token)
        print("Fetched .py diff:")
        print(py_diff)

        # Build the prompt
        prompt = build_prompt_with_diff(sample, py_diff)
        print("Generated Prompt:")
        print(prompt)

        # Generate the patch with the model
        patch_code = generate_response(prompt, max_new_tokens=512)
        print("Generated Patch Code:")
        print(patch_code)

        # (A) Structure the result as one JSON record
        result_dict = {
            "instance_id": instance_id,
            "model_patch": patch_code,
            "model_name_or_path": "Changhyun Lee"  # user-defined label
        }

        # Add to the in-memory result list
        all_patch_results.append(result_dict)

        # (Optional) also save each patch as its own text file
        # output_file = os.path.join(output_dir, f"patch_code_sample_{i}.txt")
        # with open(output_file, "w", encoding="utf-8") as f:
        #     f.write(patch_code)
        # print(f"Patch code saved to {output_file}")

        # (B) Append this record to the shared JSONL file right away
        with open(json_output_path, "a", encoding="utf-8") as f:
            line = json.dumps(result_dict, ensure_ascii=False)
            f.write(line + "\n")
        completed_ids.add(instance_id)

    except Exception as e:
        # Best effort: report the failure and move on to the next sample.
        print(f"Error processing sample {i}: {e}")

print(f"\nFinal JSONL results saved to: {json_output_path}")