# DeepSeek-Chat Modele Prompt Gönderip Sonuç Almak

In [None]:
import os
import json
import re
import time
from typing import Dict, List
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv

# --- Load environment variables ---
# Reads variables from the .env file located by find_dotenv() into the process
# environment, so the os.getenv() lookup below can see them.
load_dotenv(find_dotenv())

# --- Constants / settings ---
LEVELS = ["A2", "B1", "B2", "C1"]  # A1 was already generated
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-chat"
TEMPERATURE = 1.0                  # sampling temperature; original note: 0.0-1.0, 1.0 gives more creative results (NOTE(review): confirm the range against the provider docs)
N_EVALS = 2                        # number of repeated evaluations per task

# Fail fast when the API key is missing or empty.
API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not API_KEY:
    raise RuntimeError("DEEPSEEK_API_KEY ortam deƒüi≈ükeni tanƒ±mlƒ± deƒüil!")

# --- Client ---
# OpenAI SDK client pointed at the DeepSeek endpoint via base_url.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# --- Paths ---
# All paths are resolved relative to the current working directory:
# ../data/tasks for inputs and ../data/ratings/deepseek_ratings for outputs.
root = os.getcwd()
tasks_dir = os.path.abspath(os.path.join(root, "..", "data", "tasks"))
ratings_root = os.path.abspath(os.path.join(root, "..", "data", "ratings"))
output_dir = os.path.join(ratings_root, "deepseek_ratings")
os.makedirs(output_dir, exist_ok=True)

# --- Ayrƒ±≈ütƒ±rma ---
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """Extract per-sentence ratings from a model reply.

    Every occurrence of ``Sentence <A-F>: d, d, d, d`` (each ``d`` a single
    digit 1-5) becomes an entry keyed ``"Sentence <letter>"`` whose four
    scores are mapped, in order, to ``word_usage``, ``clarity``, ``grammar``
    and ``naturalness``. Non-matching text is ignored; if a label occurs more
    than once, the last match wins.
    """
    metric_names = ("word_usage", "clarity", "grammar", "naturalness")
    line_re = re.compile(
        r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])"
    )
    parsed: Dict[str, Dict[str, int]] = {}
    for match in line_re.finditer(response_text):
        letter, *raw_scores = match.groups()
        parsed[f"Sentence {letter}"] = {
            name: int(raw) for name, raw in zip(metric_names, raw_scores)
        }
    return parsed

# --- G√ºvenli √ßaƒürƒ± ---
def call_deepseek(prompt: str, retries: int = 3, backoff: float = 2.0) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Makes up to ``retries`` attempts. Any exception is printed; between
    attempts the function sleeps ``backoff ** attempt`` seconds, and the
    exception from the final attempt is re-raised. An empty string is
    returned only when ``retries`` is below 1, i.e. no attempt is made.
    """
    for attempt in range(1, retries + 1):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=TEMPERATURE,
                stream=False,
            )
            first_choice = completion.choices[0]
            return first_choice.message.content
        except Exception as err:
            print(f"[HATA] API √ßaƒürƒ±sƒ± ba≈üarƒ±sƒ±z (deneme {attempt}/{retries}): {err}")
            if attempt >= retries:
                # Out of attempts: surface the last failure to the caller.
                raise
            time.sleep(backoff ** attempt)
    return ""

def average_scores(score_dicts: List[Dict[str, Dict[str, int]]]) -> Dict[str, Dict[str, float]]:
    """Average the per-label metric scores across several parsed runs.

    Each element of ``score_dicts`` maps a sentence label to its metric
    scores. For every label seen in any run, the result holds the mean of
    each of the four metrics over the runs that supplied a numeric value;
    a metric with no values averages to ``0.0``. Non-numeric values are
    skipped.
    """
    metric_names = ("word_usage", "clarity", "grammar", "naturalness")
    collected: Dict[str, Dict[str, List[float]]] = {}
    for run in score_dicts:
        for label, metrics in run.items():
            if label not in collected:
                collected[label] = {name: [] for name in metric_names}
            buckets = collected[label]
            for metric, value in metrics.items():
                if not isinstance(value, (int, float)):
                    continue
                buckets[metric].append(float(value))

    result: Dict[str, Dict[str, float]] = {}
    for label, buckets in collected.items():
        means: Dict[str, float] = {}
        for metric, values in buckets.items():
            means[metric] = sum(values) / len(values) if values else 0.0
        result[label] = means
    return result

def process_level(level: str):
    """Rate every task of one level and save the averaged scores as JSON.

    Reads ``tasks_{level}.json`` from ``tasks_dir``, queries the model
    ``N_EVALS`` times per task, averages the parsed scores per sentence label
    and writes one row per label to ``ratings_{level}.json`` in
    ``output_dir``. A missing task file is reported and skipped. A task whose
    evaluation raises is logged and the loop moves on to the next task.
    """
    metric_names = ("word_usage", "clarity", "grammar", "naturalness")
    source_file = os.path.join(tasks_dir, f"tasks_{level}.json")
    target_file = os.path.join(output_dir, f"ratings_{level}.json")

    if not os.path.exists(source_file):
        print(f"[ATLA] Task dosyasƒ± yok: {source_file}")
        return

    with open(source_file, "r", encoding="utf-8") as fh:
        task_list = json.load(fh)

    rows = []
    for item in tqdm(task_list, desc=f"Processing {level}"):
        prompt = item["prompt"]
        mapping = item["mapping"]
        task_id = item["task_id"]
        word = item["word"]

        try:
            # One parsed reply per repetition, then the mean per label/metric.
            parsed_runs = []
            for _ in range(N_EVALS):
                parsed_runs.append(parse_response(call_deepseek(prompt)))

            for label, rating in average_scores(parsed_runs).items():
                if label not in mapping:
                    print(f"[UYARI] E≈üle≈ümeyen etiket: {label} (task_id={task_id})")
                    continue
                entry = mapping[label]
                rows.append({
                    "task_id":  task_id,
                    "model":    entry["model"],
                    "level":    level,
                    "word":     word,
                    "label":    label,
                    "sentence": entry["sentence"],
                    "ratings":  {
                        name: round(rating.get(name, 0.0), 3) for name in metric_names
                    },
                })
        except Exception as err:
            print(f"[HATA] Level={level} Task={task_id}: {err}")
            continue

    with open(target_file, "w", encoding="utf-8") as fh:
        json.dump(rows, fh, ensure_ascii=False, indent=2)
    print(f"‚úÖ {level}: {len(rows)} satƒ±r kaydedildi ‚Üí {target_file}")

if __name__ == "__main__":
    # Process each configured level in order, printing a banner before it.
    for lvl in LEVELS:
        print("\n", "=" * 32, f"  START {lvl}  ", "=" * 32, sep="")
        process_level(lvl)
    print("\nüéâ Bitti: A2‚ÜíC1 deƒüerlendirmeleri yazƒ±ldƒ±.")





Processing A2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [02:24<00:00, 14.48s/it]


‚úÖ A2: 60 satƒ±r kaydedildi ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_A2.json



Processing B1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [02:19<00:00, 13.96s/it]


‚úÖ B1: 60 satƒ±r kaydedildi ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_B1.json



Processing B2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [02:29<00:00, 14.95s/it]


‚úÖ B2: 60 satƒ±r kaydedildi ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_B2.json



Processing C1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [02:20<00:00, 14.10s/it]

‚úÖ C1: 60 satƒ±r kaydedildi ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_C1.json

üéâ Bitti: A2‚ÜíC1 deƒüerlendirmeleri yazƒ±ldƒ±.





# deepseek-reasoner

In [5]:
import os
import json
import re
import time
from typing import Dict, List
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv

# --- ENV ---
# Load variables from the .env file located by find_dotenv(), then fail fast
# when the API key is missing or empty.
load_dotenv(find_dotenv())
API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not API_KEY:
    raise RuntimeError("DEEPSEEK_API_KEY ortam deƒüi≈ükeni tanƒ±mlƒ± deƒüil!")

# --- Settings ---
LEVELS = ["A1", "A2", "B1", "B2", "C1"]
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-reasoner"   # <<<< the reasoner model
TEMPERATURE = 1.0                  # sampling temperature; original note: 0.0-1.0, 1.0 is more creative (NOTE(review): confirm the range against the provider docs)
N_EVALS = 2                        # number of repetitions per task (results are averaged)

# --- Client ---
# OpenAI SDK client pointed at the DeepSeek endpoint via base_url.
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# --- Paths ---
# Resolved relative to the current working directory.
root = os.getcwd()
tasks_dir = os.path.abspath(os.path.join(root, "..", "data", "tasks"))
ratings_root = os.path.abspath(os.path.join(root, "..", "data", "ratings"))
output_dir = os.path.join(ratings_root, "deepseek_ratings")
os.makedirs(output_dir, exist_ok=True)

# (Optional) raw logs: the log directory is only created when RAW_LOG is True.
RAW_LOG = False
raw_dir = os.path.join(output_dir, "reasoner_raw_logs")
if RAW_LOG:
    os.makedirs(raw_dir, exist_ok=True)

# --- Ayrƒ±≈ütƒ±rma ---
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """Parse rating lines out of a model reply.

    Example of the expected format:
      Sentence A: 4, 4, 5, 4
      ...
    Returns: {"Sentence A": {"word_usage": int, "clarity": int,
    "grammar": int, "naturalness": int}, ...}. Only labels A-F and
    single-digit scores 1-5 are recognised; everything else is ignored.
    """
    keys = ("word_usage", "clarity", "grammar", "naturalness")
    pattern = r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])"
    ratings: Dict[str, Dict[str, int]] = {}
    for hit in re.finditer(pattern, response_text):
        values = [int(group) for group in hit.groups()[1:]]
        ratings["Sentence " + hit.group(1)] = dict(zip(keys, values))
    return ratings

def average_scores(score_dicts: List[Dict[str, Dict[str, int]]]) -> Dict[str, Dict[str, float]]:
    """Return the per-label mean of each metric over all runs.

    ``score_dicts`` holds one parsed result per run. Labels missing from a
    run simply contribute no values; a metric that never received a numeric
    value is reported as ``0.0``.
    """
    metric_names = ["word_usage", "clarity", "grammar", "naturalness"]
    pooled: Dict[str, Dict[str, List[float]]] = {}
    for one_run in score_dicts:
        for label, metrics in one_run.items():
            slot = pooled.setdefault(label, {name: [] for name in metric_names})
            numeric = (
                (name, value)
                for name, value in metrics.items()
                if isinstance(value, (int, float))
            )
            for name, value in numeric:
                slot[name].append(float(value))

    def _mean(values: List[float]) -> float:
        # Empty lists average to 0.0 instead of dividing by zero.
        return sum(values) / len(values) if values else 0.0

    return {
        label: {name: _mean(values) for name, values in slot.items()}
        for label, slot in pooled.items()
    }

# --- API √ßaƒürƒ±sƒ± (retry/backoff) ---
def call_deepseek(prompt: str, retries: int = 3, backoff: float = 2.0) -> str:
    """Query ``MODEL_NAME`` with ``prompt`` and return the reply content.

    Up to ``retries`` attempts are made. Each failure is printed; all but the
    last are followed by a sleep of ``backoff ** attempt`` seconds, and the
    last failure is re-raised. With ``retries`` below 1 no request is sent
    and ``""`` is returned.
    """
    for attempt in range(1, retries + 1):
        is_last_attempt = not (attempt < retries)
        try:
            response = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=TEMPERATURE,
                stream=False,
            )
            return response.choices[0].message.content
        except Exception as exc:
            print(f"[HATA] API √ßaƒürƒ±sƒ± ba≈üarƒ±sƒ±z (deneme {attempt}/{retries}): {exc}")
            if is_last_attempt:
                raise
            # Exponential wait before the next attempt.
            time.sleep(backoff ** attempt)
    return ""

def process_level(level: str):
    """Rate every task of one level with the reasoner model and save the result.

    Reads ``tasks_{level}.json`` from ``tasks_dir``, queries the model
    ``N_EVALS`` times per task, averages the parsed scores per sentence label
    and writes one row per label to ``ratings_reasoner_{level}.json`` in
    ``output_dir``. When ``RAW_LOG`` is enabled, each prompt and reply is also
    written to ``raw_dir``.

    A missing task file is reported and skipped. A task whose evaluation
    raises is logged and the loop continues with the next task. After
    writing, the number of saved rows is compared with the number of labels
    in all task mappings, and a warning is printed on mismatch.
    """
    tasks_path = os.path.join(tasks_dir, f"tasks_{level}.json")
    out_path   = os.path.join(output_dir, f"ratings_reasoner_{level}.json")  # separate file name for the reasoner run

    if not os.path.exists(tasks_path):
        print(f"[ATLA] Task dosyasƒ± yok: {tasks_path}")
        return

    with open(tasks_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    all_ratings = []
    # Counted over ALL tasks (one row per mapping label), so tasks that fail
    # completely still show up as missing rows in the final sanity check.
    expected_rows = 0
    for task in tqdm(tasks, desc=f"Processing {level} (reasoner)"):
        prompt  = task["prompt"]
        mapping = task["mapping"]
        task_id = task["task_id"]
        word    = task["word"]
        expected_rows += len(mapping)

        try:
            runs = []
            for run_idx in range(N_EVALS):
                reply = call_deepseek(prompt)
                if RAW_LOG:
                    with open(os.path.join(raw_dir, f"{level}__{task_id}__run{run_idx+1}__prompt.txt"), "w", encoding="utf-8") as pf:
                        pf.write(prompt)
                    with open(os.path.join(raw_dir, f"{level}__{task_id}__run{run_idx+1}__reply.txt"), "w", encoding="utf-8") as rf:
                        rf.write(reply)
                runs.append(parse_response(reply))

            averaged = average_scores(runs)

            for label, rating in averaged.items():
                if label not in mapping:
                    print(f"[UYARI] E≈üle≈ümeyen etiket: {label} (task_id={task_id})")
                    continue

                all_ratings.append({
                    "task_id":  task_id,
                    "model":    mapping[label]["model"],
                    "level":    level,
                    "word":     word,
                    "label":    label,
                    "sentence": mapping[label]["sentence"],
                    "ratings":  {
                        "word_usage": round(rating.get("word_usage", 0.0), 3),
                        "clarity":     round(rating.get("clarity", 0.0), 3),
                        "grammar":     round(rating.get("grammar", 0.0), 3),
                        "naturalness": round(rating.get("naturalness", 0.0), 3),
                    }
                })

        except Exception as e:
            print(f"[HATA] Level={level} Task={task_id}: {e}")
            continue

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_ratings, f, ensure_ascii=False, indent=2)

    print(f"‚úÖ {level} (reasoner): {len(all_ratings)} satƒ±r kaydedildi ‚Üí {out_path}")
    if len(all_ratings) != expected_rows:
        print(f"‚ö†Ô∏è  {level}: Satƒ±r sayƒ±sƒ± beklenenle uyu≈ümuyor (beklenen {expected_rows}, ger√ßek {len(all_ratings)}).")

if __name__ == "__main__":
    # Announce the run, then process each configured level in order.
    print("\n=== deepseek-reasoner deƒüerlendirmesi: A1‚ÜíC1 ===")
    for lvl in LEVELS:
        print("\n", "=" * 24, f"  START {lvl}  ", "=" * 24, sep="")
        process_level(lvl)
    print("\nüéâ Bitti: Sonu√ßlar data/ratings/deepseek_ratings/ altƒ±nda ratings_reasoner_{LEVEL}.json dosyalarƒ±na yazƒ±ldƒ±.")



=== deepseek-reasoner deƒüerlendirmesi: A1‚ÜíC1 ===



Processing A1 (reasoner):   0%|          | 0/10 [00:00<?, ?it/s]

[HATA] API √ßaƒürƒ±sƒ± ba≈üarƒ±sƒ±z (deneme 1/3): <!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.deepseek.com | 504: Gateway time-out</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">
    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font-ligh

Processing A1 (reasoner):   0%|          | 0/10 [15:35<?, ?it/s]


KeyboardInterrupt: 

# === DEBUG: 2 görevle uçtan uca test (DeepSeek-Chat) ===

In [8]:
# === DEBUG: 2 görevle uçtan uca test hücresi ===

import os, json, re, textwrap
from typing import Dict
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

# ---- Settings ----
LEVEL = "A1"  # choose the level to test here (A1, A2, B1, B2, C1)
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-chat"

# ---- Environment / client ----
# Load variables from the .env file located by find_dotenv(), fail fast when
# the key is missing or empty, then build a client pointed at BASE_URL.
load_dotenv(find_dotenv())
API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not API_KEY:
    raise RuntimeError("DEEPSEEK_API_KEY bulunamadƒ±. .env veya ortam deƒüi≈ükenini kontrol et.")
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# ---- Paths ----
# The task file is resolved relative to the current working directory.
root = os.getcwd()
tasks_path = os.path.abspath(os.path.join(root, "..", "data", "tasks", f"tasks_{LEVEL}.json"))

# ---- Parser ----
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """Parse rating lines out of a model reply.

    Expected lines:
      Sentence A: 4, 4, 5, 4
    Keys of the returned dict: "Sentence A" ... "Sentence F"; each value maps
    ``word_usage``, ``clarity``, ``grammar`` and ``naturalness`` to an int.
    """
    pattern = r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])"
    found: Dict[str, Dict[str, int]] = {}
    for match in re.finditer(pattern, response_text):
        letter, usage, clarity, grammar, naturalness = match.groups()
        found[f"Sentence {letter}"] = {
            "word_usage":  int(usage),
            "clarity":     int(clarity),
            "grammar":     int(grammar),
            "naturalness": int(naturalness),
        }
    return found

def assert_parsed_ok(parsed, mapping, task_id):
    """Check that a parsed reply matches the task mapping and has valid scores.

    Args:
        parsed: Result of ``parse_response``: label -> metric -> score.
        mapping: The task's label mapping; its keys are the expected labels.
        task_id: Identifier included in error messages.

    Raises:
        ValueError: If ``parsed`` is missing expected labels or has extra ones.
        AssertionError: If any score is not an ``int`` between 1 and 5.
    """
    # Label match: parsed labels must equal the mapping's labels exactly.
    expected = set(mapping.keys())                 # {"Sentence A", ..., "Sentence F"}
    got = set(parsed.keys())
    missing = expected - got
    extra   = got - expected
    if missing or extra:
        raise ValueError(f"[PARSE MISMATCH] task={task_id} missing={sorted(missing)} extra={sorted(extra)}")
    # Score range. Raised explicitly rather than via an `assert` statement so
    # the check is not stripped when Python runs with -O.
    for lbl, r in parsed.items():
        for k, v in r.items():
            if not (isinstance(v, int) and 1 <= v <= 5):
                raise AssertionError(f"[SCORE RANGE] task={task_id} {lbl}/{k} -> {v}")

# ---- Yardƒ±mcƒ±: kƒ±sa c√ºmle √∂nizleme ----
def preview(s: str, n=80):
    """Return ``s`` on one line, cut to ``n`` characters plus "..." if longer."""
    flattened = " ".join(s.split("\n"))
    if len(flattened) > n:
        return flattened[:n] + "..."
    return flattened

# ---- √áalƒ±≈ütƒ±r ----
with open(tasks_path, "r", encoding="utf-8") as f:
    tasks = json.load(f)

tasks = tasks[:2]  # SADECE ƒ∞LK 2 G√ñREV

for idx, task in enumerate(tasks, start=1):
    prompt  = task["prompt"]
    mapping = task["mapping"]
    task_id = task["task_id"]
    word    = task["word"]
    level   = task["level"]

    print("="*90)
    print(f"[{idx}/2] TASK ID: {task_id} | LEVEL={level} | WORD={word}")
    print("-"*90)
    print(">> PROMPT (ilk 600 karakter):")
    #print(textwrap.shorten(prompt, width=600, placeholder=" ..."))
    print(prompt)
    print("-"*90)
    print(">> MAPPING LABELS:", list(mapping.keys()))
    print(">> MAPPING SENTENCE PREVIEW:")
    for lbl in ["Sentence A","Sentence B","Sentence C","Sentence D","Sentence E","Sentence F"]:
        sent = mapping[lbl]["sentence"]
        print(f"   {lbl}: {preview(sent)}")

    # --- API √ßaƒürƒ±sƒ± ---
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        stream=False,
    )
    reply = resp.choices[0].message.content

    print("-"*90)
    print(">> RAW REPLY:")
    print(reply)

    # --- Parse & doƒürulama ---
    parsed = parse_response(reply)
    print("-"*90)
    print(">> PARSED LABELS:", list(parsed.keys()))
    assert_parsed_ok(parsed, mapping, task_id)
    print("‚úÖ PARSE OK & LABELS MATCH & SCORES IN RANGE")

    # --- E≈üle≈ütirilmi≈ü √∂rnek √ßƒ±ktƒ± (ilk 2 label g√∂sterelim) ---
    show_labels = ["Sentence A","Sentence B"]
    for lbl in show_labels:
        model_name = mapping[lbl]["model"]
        sentence   = mapping[lbl]["sentence"]
        scores     = parsed[lbl]
        print(f"‚Üí {lbl} | model={model_name} | {preview(sentence)}")
        print(f"   scores: {scores}")

print("="*90)
print("üéâ Test tamam: 2 g√∂rev i√ßin prompt ‚Üí reply ‚Üí parse ‚Üí mapping doƒürulandƒ±.")


[1/2] TASK ID: A1_age | LEVEL=A1 | WORD=age
------------------------------------------------------------------------------------------
>> PROMPT (ilk 600 karakter):
You are a professional CEFR-aligned English sentence evaluator.

Your task is to evaluate 6 example sentences that all use the target word: "age" at CEFR level: A1.

Rate each sentence from 1 (poor) to 5 (excellent) for the following **four independent criteria**:

1. **Word Usage** ‚Äì Is the target word used correctly and meaningfully in context?
2. **Clarity** ‚Äì Is the sentence understandable and suitable for the given CEFR level?
3. **Grammar** ‚Äì Is the grammar accurate and appropriate for the level?
4. **Naturalness** ‚Äì Does the sentence sound fluent and natural to a native speaker?

‚ö†Ô∏è Important Instructions:
- **Only return numerical ratings** for each criterion.
- **Do not include any explanations, comments, or justifications.**
- Follow the exact output format below.

### Output Format:
Sentence A: <Word 

# === DEBUG: 2 görevle uçtan uca test (OpenAI / ChatGPT) ===

In [None]:


import os, json, re
from typing import Dict
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

# ---- Settings ----
LEVEL = "A1"                      # A1, A2, B1, B2, C1
MODEL_NAME = "gpt-5"              # alternatives: "gpt-5-mini", "gpt-4.1"
TEMPERATURE = 1.0                   # sampling temperature; NOTE(review): the original note called this "deterministic for numeric rating", which a value of 1.0 is not — confirm the intended value

# ---- Environment / client ----
# Load variables from the .env file located by find_dotenv() and fail fast
# when the key is missing or empty.
load_dotenv(find_dotenv())
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError("OPENAI_API_KEY bulunamadƒ±. .env veya ortam deƒüi≈ükenini ayarla.")
client = OpenAI(api_key=API_KEY)  # no base_url is passed for OpenAI

# ---- Paths ----
# The task file is resolved relative to the current working directory.
root = os.getcwd()
tasks_path = os.path.abspath(os.path.join(root, "..", "data", "tasks", f"tasks_{LEVEL}.json"))

# ---- Parser ----
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """Turn rating lines of a model reply into a nested dict.

    Expected lines:
      Sentence A: 4, 4, 5, 4
    Keys of the returned dict: "Sentence A" ... "Sentence F". Lines that do
    not fit the format (other labels, scores outside 1-5) are ignored.
    """
    fields = ["word_usage", "clarity", "grammar", "naturalness"]
    rating_line = re.compile(
        r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])"
    )
    by_label: Dict[str, Dict[str, int]] = {}
    for match in rating_line.finditer(response_text):
        digits = match.groups()[1:]
        by_label[f"Sentence {match.group(1)}"] = {
            field: int(digit) for field, digit in zip(fields, digits)
        }
    return by_label

def assert_parsed_ok(parsed, mapping, task_id):
    """Check that a parsed reply matches the task mapping and has valid scores.

    Args:
        parsed: Result of ``parse_response``: label -> metric -> score.
        mapping: The task's label mapping; its keys are the expected labels.
        task_id: Identifier included in error messages.

    Raises:
        ValueError: If ``parsed`` is missing expected labels or has extra ones.
        AssertionError: If any score is not an ``int`` between 1 and 5.
    """
    expected = set(mapping.keys())  # {"Sentence A",..., "Sentence F"}
    got = set(parsed.keys())
    missing = expected - got
    extra   = got - expected
    if missing or extra:
        raise ValueError(f"[PARSE MISMATCH] task={task_id} missing={sorted(missing)} extra={sorted(extra)}")
    # Raised explicitly rather than via an `assert` statement so the range
    # check is not stripped when Python runs with -O.
    for lbl, r in parsed.items():
        for k, v in r.items():
            if not (isinstance(v, int) and 1 <= v <= 5):
                raise AssertionError(f"[SCORE RANGE] task={task_id} {lbl}/{k} -> {v}")

def preview(s: str, n=80):
    """Collapse newlines in ``s`` to spaces and shorten it to ``n`` characters.

    Strings longer than ``n`` are cut and suffixed with "...".
    """
    one_line = s.replace("\n", " ")
    needs_cut = len(one_line) > n
    return one_line[:n] + "..." if needs_cut else one_line

# ---- √áalƒ±≈ütƒ±r ----
with open(tasks_path, "r", encoding="utf-8") as f:
    tasks = json.load(f)

tasks = tasks[:2]  # SADECE ƒ∞LK 2 G√ñREV

for idx, task in enumerate(tasks, start=1):
    prompt  = task["prompt"]
    mapping = task["mapping"]
    task_id = task["task_id"]
    word    = task["word"]
    level   = task["level"]

    print("="*100)
    print(f"[{idx}/2] TASK ID: {task_id} | LEVEL={level} | WORD={word}")
    print("-"*100)
    print(">> PROMPT (tam hali):")
    print(prompt)
    print("-"*100)
    print(">> MAPPING LABELS:", list(mapping.keys()))
    print(">> MAPPING SENTENCE PREVIEW:")
    for lbl in ["Sentence A","Sentence B","Sentence C","Sentence D","Sentence E","Sentence F"]:
        sent = mapping[lbl]["sentence"]
        print(f"   {lbl}: {preview(sent)}")

    # --- OpenAI API √ßaƒürƒ±sƒ± ---
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=TEMPERATURE,
        stream=False,
    )
    reply = resp.choices[0].message.content

    print("-"*100)
    print(">> RAW REPLY:")
    print(reply)

    # --- Parse & doƒürulama ---
    parsed = parse_response(reply)
    print("-"*100)
    print(">> PARSED LABELS:", list(parsed.keys()))
    assert_parsed_ok(parsed, mapping, task_id)
    print("‚úÖ PARSE OK & LABELS MATCH & SCORES IN RANGE")

    # --- E≈üle≈ütirilmi≈ü √∂rnek √ßƒ±ktƒ± (A ve B'yi g√∂ster) ---
    for lbl in ["Sentence A","Sentence B"]:
        model_name = mapping[lbl]["model"]
        sentence   = mapping[lbl]["sentence"]
        scores     = parsed[lbl]
        print(f"‚Üí {lbl} | model={model_name} | {preview(sentence)}")
        print(f"   scores: {scores}")

print("="*100)
print("üéâ Test tamam: 2 g√∂rev i√ßin prompt ‚Üí reply ‚Üí parse ‚Üí mapping doƒürulandƒ±.")


[1/2] TASK ID: A1_age | LEVEL=A1 | WORD=age
----------------------------------------------------------------------------------------------------
>> PROMPT (tam hali):
You are a professional CEFR-aligned English sentence evaluator.

Your task is to evaluate 6 example sentences that all use the target word: "age" at CEFR level: A1.

Rate each sentence from 1 (poor) to 5 (excellent) for the following **four independent criteria**:

1) Word Usage ‚Äî Is the given word used with the correct meaning and appropriately in context?
2) Level Appropriateness ‚Äî Are the tense, structure, and syntax appropriate for the target CEFR level (A1, A2, B1, B2, C1)?
3) Grammatical Accuracy ‚Äî Are the grammatical structures correct and suitable for the expected level (simple / intermediate / advanced)?
4) Naturalness ‚Äî Does the sentence sound natural and align with standard usage by native English speakers?

‚ö†Ô∏è Important Instructions:
- **Only return numerical ratings** for each criterion.
- **Do no

# GPT-5 Modeline Prompt Gönderip Sonuçları Alma

In [3]:
import os
import json
import re
import time
from typing import Dict, List
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv

# ========= ENV / CLIENT =========
# Load variables from the .env file located by find_dotenv() and fail fast
# when the key is missing or empty.
load_dotenv(find_dotenv())
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
    raise RuntimeError("OPENAI_API_KEY not set. Put it in .env or export it.")

# Change the model name here: ("gpt-5", "gpt-5-mini", "gpt-4.1", etc.)
MODEL_NAME = "gpt-5"
TEMPERATURE = 1.0                   # for creative answers; per the original note, apparently only 1.0 is accepted now (NOTE(review): confirm against the API docs)

client = OpenAI(api_key=API_KEY)

# ========= PATHS =========
# Resolved relative to the current working directory.
root = os.getcwd()
tasks_dir = os.path.abspath(os.path.join(root, "..", "data", "tasks"))
ratings_root = os.path.abspath(os.path.join(root, "..", "data", "ratings"))
output_dir = os.path.join(ratings_root, "chatgpt_ratings")
os.makedirs(output_dir, exist_ok=True)

# Raw prompt/reply logging; the log directory is only created when enabled.
RAW_LOG = False
raw_dir = os.path.join(output_dir, "raw_logs")
if RAW_LOG:
    os.makedirs(raw_dir, exist_ok=True)

# ========= PARSER =========
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """Extract ``Sentence X: a, b, c, d`` ratings from a model reply.

    Labels A-F with four single-digit scores (1-5) are recognised; the scores
    are stored, in order, as ``word_usage``, ``clarity``, ``grammar`` and
    ``naturalness`` under the key ``"Sentence X"``.
    """
    pattern = r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])"
    extracted: Dict[str, Dict[str, int]] = {}
    for match in re.finditer(pattern, response_text):
        letter = match.group(1)
        usage, clarity, grammar, naturalness = (int(g) for g in match.groups()[1:])
        extracted[f"Sentence {letter}"] = {
            "word_usage": usage,
            "clarity": clarity,
            "grammar": grammar,
            "naturalness": naturalness,
        }
    return extracted

def average_scores(score_dicts: List[Dict[str, Dict[str, int]]]) -> Dict[str, Dict[str, float]]:
    """Average each label's metric scores over all runs.

    Every value is converted with ``float()`` before averaging. A metric that
    received no values for a label is reported as ``0.0``.
    """
    metric_names = ("word_usage", "clarity", "grammar", "naturalness")
    samples: Dict[str, Dict[str, List[float]]] = {}
    for run in score_dicts:
        for label, metrics in run.items():
            if label not in samples:
                samples[label] = {name: [] for name in metric_names}
            per_metric = samples[label]
            for name, value in metrics.items():
                per_metric[name].append(float(value))

    means: Dict[str, Dict[str, float]] = {}
    for label, per_metric in samples.items():
        label_means: Dict[str, float] = {}
        for name, values in per_metric.items():
            label_means[name] = sum(values) / len(values) if values else 0.0
        means[label] = label_means
    return means

# ========= API CALL =========
def call_openai(prompt: str, retries: int = 3, backoff: float = 2.0) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Makes up to ``retries`` attempts. Every failure is printed; the function
    sleeps ``backoff ** attempt`` seconds before retrying and re-raises the
    exception from the final attempt. Returns ``""`` only when ``retries`` is
    below 1, in which case no request is made.
    """
    for attempt in range(1, retries + 1):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=TEMPERATURE,
                stream=False,
            )
            message = completion.choices[0].message
            return message.content
        except Exception as err:
            print(f"[ERROR] API call failed (attempt {attempt}/{retries}): {err}")
            if attempt >= retries:
                # No attempts left: let the caller see the last error.
                raise
            time.sleep(backoff ** attempt)
    return ""

# ========= RUN ONE LEVEL =========
def process_level(level: str, N_EVALS: int = 2, skip_if_exists: bool = False):
    """Rate every task of one level and save the averaged scores as JSON.

    Reads ``tasks_{level}.json`` from ``tasks_dir``, queries the model
    ``N_EVALS`` times per task, averages the parsed scores per sentence label
    (rounded to one decimal) and writes one row per label to
    ``ratings_{level}.json`` in ``output_dir``. When ``RAW_LOG`` is enabled,
    each prompt and reply is also written to ``raw_dir``.

    Args:
        level: Level name used to build the input and output file names.
        N_EVALS: Number of model calls per task whose scores are averaged.
        skip_if_exists: When True, do nothing if the output file already exists.

    A missing task file is reported and skipped. A task whose evaluation
    raises is logged and the loop continues with the next task. After
    writing, the number of saved rows is compared with the number of labels
    in all task mappings, and a warning is printed on mismatch.
    """
    tasks_path = os.path.join(tasks_dir, f"tasks_{level}.json")
    out_path   = os.path.join(output_dir, f"ratings_{level}.json")

    if not os.path.exists(tasks_path):
        print(f"[SKIP] No tasks file: {tasks_path}")
        return

    if skip_if_exists and os.path.exists(out_path):
        print(f"[SKIP] Output exists ‚Üí {out_path}")
        return

    with open(tasks_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    all_ratings = []
    # Counted over ALL tasks (one row per mapping label), so tasks that fail
    # completely still show up as missing rows in the final sanity check.
    expected_rows = 0
    for task in tqdm(tasks, desc=f"Processing {level}"):
        prompt  = task["prompt"]
        mapping = task["mapping"]
        task_id = task["task_id"]
        word    = task["word"]
        expected_rows += len(mapping)

        try:
            runs = []
            for run_idx in range(N_EVALS):
                reply = call_openai(prompt)
                if RAW_LOG:
                    with open(os.path.join(raw_dir, f"{level}__{task_id}__run{run_idx+1}__prompt.txt"), "w", encoding="utf-8") as pf:
                        pf.write(prompt)
                    with open(os.path.join(raw_dir, f"{level}__{task_id}__run{run_idx+1}__reply.txt"), "w", encoding="utf-8") as rf:
                        rf.write(reply)
                runs.append(parse_response(reply))

            averaged = average_scores(runs)

            for label, rating in averaged.items():
                if label not in mapping:
                    print(f"[WARN] Unmatched label: {label} (task_id={task_id})")
                    continue
                all_ratings.append({
                    "task_id":  task_id,
                    "model":    mapping[label]["model"],
                    "level":    level,
                    "word":     word,
                    "label":    label,
                    "sentence": mapping[label]["sentence"],
                    "ratings":  {
                        "word_usage": round(rating.get("word_usage", 0.0), 1),
                        "clarity":     round(rating.get("clarity", 0.0), 1),
                        "grammar":     round(rating.get("grammar", 0.0), 1),
                        "naturalness": round(rating.get("naturalness", 0.0), 1),
                    }
                })
        except Exception as e:
            print(f"[ERROR] level={level} task={task_id}: {e}")
            continue

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_ratings, f, ensure_ascii=False, indent=2)

    print(f"‚úÖ {level}: {len(all_ratings)} rows saved ‚Üí {out_path}")
    if len(all_ratings) != expected_rows:
        print(f"‚ö†Ô∏è  {level}: Row count mismatch (expected {expected_rows}, got {len(all_ratings)}).")

# ========= MAIN =========
if __name__ == "__main__":
    # Process all five levels in order, always regenerating the output files.
    LEVELS = ["A1", "A2", "B1", "B2", "C1"]
    for lvl in LEVELS:
        print("\n", "=" * 18, f"  START {lvl}  ", "=" * 18, sep="")
        process_level(lvl, N_EVALS=2, skip_if_exists=False)
    print("\nüéâ Done: All levels written to data/ratings/chatgpt_ratings/")






Processing A1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [12:19<00:00, 73.91s/it]


‚úÖ A1: 60 rows saved ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/chatgpt_ratings/ratings_A1.json



Processing A2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [07:30<00:00, 45.04s/it]


‚úÖ A2: 60 rows saved ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/chatgpt_ratings/ratings_A2.json



Processing B1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [08:59<00:00, 53.98s/it]


‚úÖ B1: 60 rows saved ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/chatgpt_ratings/ratings_B1.json



Processing B2: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [07:48<00:00, 46.85s/it]


‚úÖ B2: 60 rows saved ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/chatgpt_ratings/ratings_B2.json



Processing C1: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [10:28<00:00, 62.80s/it]

‚úÖ C1: 60 rows saved ‚Üí /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/chatgpt_ratings/ratings_C1.json

üéâ Done: All levels written to data/ratings/chatgpt_ratings/



