In [11]:
import os
import json
import re
import time
from typing import Dict, List
from openai import OpenAI
from tqdm import tqdm
from dotenv import load_dotenv, find_dotenv

# --- Load environment variables (from the nearest .env file, if any) ---
load_dotenv(find_dotenv())

# --- Constants / settings ---
LEVELS = ["A2", "B1", "B2", "C1"]  # A1 is not listed: it was already generated earlier
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-chat"
TEMPERATURE = 1.0                  # sampling temperature; higher values give more varied replies.
                                   # NOTE(review): the original comment gave the range as 0.0-1.0 - confirm against the API docs
N_EVALS = 2                        # number of repeated evaluations per task (their scores are averaged)

# Fail fast when the API key is missing, before any request is attempted.
API_KEY = os.getenv("DEEPSEEK_API_KEY")
if not API_KEY:
    raise RuntimeError("DEEPSEEK_API_KEY ortam değişkeni tanımlı değil!")

# --- Client (OpenAI SDK pointed at BASE_URL) ---
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# --- Paths ---
# All paths are resolved relative to the current working directory, so the notebook
# has to be run from a folder whose parent contains data/tasks and data/ratings.
root = os.getcwd()
tasks_dir = os.path.abspath(os.path.join(root, "..", "data", "tasks"))
ratings_root = os.path.abspath(os.path.join(root, "..", "data", "ratings"))
output_dir = os.path.join(ratings_root, "deepseek_ratings")
os.makedirs(output_dir, exist_ok=True)  # created up front so the per-level writes cannot fail on a missing folder

# --- Ayrıştırma ---
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """Extract per-sentence ratings from a model reply.

    Scans the text for fragments shaped like ``Sentence A: 4, 4, 5, 4`` (label
    letter A-F, four single-digit scores 1-5) and maps the four numbers, in
    order, to word_usage, clarity, grammar and naturalness.

    Text that does not match is ignored. If the same label appears more than
    once, the last occurrence wins.

    Returns:
        Dict keyed by "Sentence <letter>"; empty when nothing matched.
    """
    metric_order = ("word_usage", "clarity", "grammar", "naturalness")
    line_re = re.compile(r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])")

    parsed = {}
    for match in line_re.finditer(response_text):
        letter, *digits = match.groups()
        parsed[f"Sentence {letter}"] = {
            name: int(digit) for name, digit in zip(metric_order, digits)
        }
    return parsed

# --- Güvenli çağrı ---
def call_deepseek(prompt: str, retries: int = 3, backoff: float = 2.0) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the only message of the conversation.
        retries: Maximum number of attempts.
        backoff: Base of the exponential wait; after failed attempt ``k`` the
            function sleeps ``backoff ** k`` seconds before trying again.

    Returns:
        The content of the first choice. An empty string is returned when the
        API delivered no text content (``content`` is ``None``) or when
        ``retries`` is less than 1, so the result is always a ``str``.

    Raises:
        Exception: whatever the final attempt raised, once every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = client.chat.completions.create(
                model=MODEL_NAME,
                messages=[{"role": "user", "content": prompt}],
                temperature=TEMPERATURE,
                stream=False,
            )
            # The SDK types `content` as Optional; normalise None to "" so callers
            # can hand the result straight to a regex without a TypeError.
            return resp.choices[0].message.content or ""
        except Exception as e:
            print(f"[HATA] API çağrısı başarısız (deneme {attempt}/{retries}): {e}")
            if attempt >= retries:
                # Out of attempts: surface the original error to the caller.
                raise
            time.sleep(backoff ** attempt)
    # Only reached when retries < 1 (the loop body never ran).
    return ""

def average_scores(score_dicts: List[Dict[str, Dict[str, int]]]) -> Dict[str, Dict[str, float]]:
    """Average the four rating metrics per sentence label across several runs.

    Args:
        score_dicts: One dict per evaluation run, each mapping a label to its
            metric -> score dict.

    Returns:
        For every label seen in any run, a dict with the mean of each of the
        four metrics. A metric with no numeric value in any run averages to
        0.0. Non-numeric values are ignored.
    """
    metric_names = ["word_usage", "clarity", "grammar", "naturalness"]

    # Phase 1: gather every numeric score, grouped by label and metric.
    collected: Dict[str, Dict[str, List[float]]] = {}
    for run in score_dicts:
        for label in run:
            bucket = collected.setdefault(label, {name: [] for name in metric_names})
            for metric, value in run[label].items():
                if not isinstance(value, (int, float)):
                    continue
                bucket[metric].append(float(value))

    # Phase 2: collapse each list of scores into its mean.
    result: Dict[str, Dict[str, float]] = {}
    for label, per_metric in collected.items():
        means = {}
        for metric, values in per_metric.items():
            means[metric] = sum(values) / len(values) if values else 0.0
        result[label] = means
    return result

def process_level(level: str):
    """Rate every task of one CEFR level and write the averaged scores to JSON.

    Reads ``tasks_{level}.json`` from ``tasks_dir``. For each task the model is
    queried ``N_EVALS`` times, the parsed scores are averaged per sentence
    label, each label is resolved through the task's ``mapping`` and one row
    per sentence is collected. All rows are written to ``ratings_{level}.json``
    in ``output_dir``.

    Error handling:
        * A missing task file is reported and the level is skipped.
        * A task that fails for any reason (API error, missing field, ...) is
          reported and skipped; none of its rows are kept, and the remaining
          tasks are still processed.
        * Labels the task expects but no reply contained, and labels a reply
          contained but the task does not know, are reported as warnings.

    Args:
        level: Level tag used in the input and output file names (e.g. "B1").
    """
    tasks_path = os.path.join(tasks_dir, f"tasks_{level}.json")
    out_path   = os.path.join(output_dir, f"ratings_{level}.json")

    if not os.path.exists(tasks_path):
        print(f"[ATLA] Task dosyası yok: {tasks_path}")
        return

    with open(tasks_path, "r", encoding="utf-8") as f:
        tasks = json.load(f)

    all_ratings = []
    for task in tqdm(tasks, desc=f"Processing {level}"):
        # Placeholder so the error message below works even when the task has no id.
        task_id = "?"
        try:
            # Field access lives inside the try: one malformed task must not
            # abort the whole level and throw away the ratings gathered so far.
            task_id = task["task_id"]
            prompt  = task["prompt"]
            mapping = task["mapping"]
            word    = task["word"]

            runs = [parse_response(call_deepseek(prompt)) for _ in range(N_EVALS)]
            averaged = average_scores(runs)

            # Sentences the model never rated would otherwise vanish silently.
            missing = [lbl for lbl in mapping if lbl not in averaged]
            if missing:
                print(f"[UYARI] Yanıtta eksik etiket(ler): {missing} (task_id={task_id})")

            # Rows are staged per task and merged only on success, so a failure
            # half-way through never leaves a partial task in the output.
            task_rows = []
            for label, rating in averaged.items():
                if label not in mapping:
                    print(f"[UYARI] Eşleşmeyen etiket: {label} (task_id={task_id})")
                    continue
                task_rows.append({
                    "task_id":  task_id,
                    "model":    mapping[label]["model"],
                    "level":    level,
                    "word":     word,
                    "label":    label,
                    "sentence": mapping[label]["sentence"],
                    "ratings":  {
                        "word_usage": round(rating.get("word_usage", 0.0), 3),
                        "clarity":     round(rating.get("clarity", 0.0), 3),
                        "grammar":     round(rating.get("grammar", 0.0), 3),
                        "naturalness": round(rating.get("naturalness", 0.0), 3),
                    }
                })
            all_ratings.extend(task_rows)
        except Exception as e:
            print(f"[HATA] Level={level} Task={task_id}: {e}")
            continue

    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(all_ratings, f, ensure_ascii=False, indent=2)
    print(f"✅ {level}: {len(all_ratings)} satır kaydedildi → {out_path}")

if __name__ == "__main__":
    # Process the configured levels one after another, each under its own banner line.
    rule = "="*32
    for lvl in LEVELS:
        print("\n" + rule + f"  START {lvl}  " + rule)
        process_level(lvl)
    print("\n🎉 Bitti: A2→C1 değerlendirmeleri yazıldı.")





Processing A2: 100%|██████████| 10/10 [02:24<00:00, 14.48s/it]


✅ A2: 60 satır kaydedildi → /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_A2.json



Processing B1: 100%|██████████| 10/10 [02:19<00:00, 13.96s/it]


✅ B1: 60 satır kaydedildi → /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_B1.json



Processing B2: 100%|██████████| 10/10 [02:29<00:00, 14.95s/it]


✅ B2: 60 satır kaydedildi → /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_B2.json



Processing C1: 100%|██████████| 10/10 [02:20<00:00, 14.10s/it]

✅ C1: 60 satır kaydedildi → /home/user/Documents/Tez/Deneyler/LLM_Degerlendirme/data/ratings/deepseek_ratings/ratings_C1.json

🎉 Bitti: A2→C1 değerlendirmeleri yazıldı.





In [8]:
# === DEBUG: 2 görevle uçtan uca test hücresi ===

import os, json, re, textwrap
from typing import Dict
from dotenv import load_dotenv, find_dotenv
from openai import OpenAI

# ---- Settings ----
LEVEL = "A1"  # level to test in this debug cell (A1, A2, B1, B2, C1)
BASE_URL = "https://api.deepseek.com"
MODEL_NAME = "deepseek-chat"

# ---- Environment / client ----
load_dotenv(find_dotenv())
API_KEY = os.getenv("DEEPSEEK_API_KEY")
# Stop immediately when no key is configured, before any request is attempted.
if not API_KEY:
    raise RuntimeError("DEEPSEEK_API_KEY bulunamadı. .env veya ortam değişkenini kontrol et.")
client = OpenAI(api_key=API_KEY, base_url=BASE_URL)

# ---- Paths ----
# Resolved relative to the current working directory: ../data/tasks/tasks_<LEVEL>.json
root = os.getcwd()
tasks_path = os.path.abspath(os.path.join(root, "..", "data", "tasks", f"tasks_{LEVEL}.json"))

# ---- Parser ----
def parse_response(response_text: str) -> Dict[str, Dict[str, int]]:
    """
    Parse rating lines out of a model reply.

    Expected lines look like:
      Sentence A: 4, 4, 5, 4
    The returned dict is keyed by "Sentence A" ... "Sentence F"; each value maps
    word_usage, clarity, grammar and naturalness (in that order) to an int.
    Non-matching text is skipped; a repeated label keeps its last scores.
    """
    pattern = r"Sentence\s*([A-F])\s*:\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])\s*,\s*([1-5])"
    fields = ("word_usage", "clarity", "grammar", "naturalness")
    return {
        f"Sentence {m.group(1)}": dict(zip(fields, map(int, m.groups()[1:])))
        for m in re.finditer(pattern, response_text)
    }

def assert_parsed_ok(parsed, mapping, task_id):
    """Validate a parsed reply against the labels a task expects.

    Raises:
        ValueError: when the label sets of ``parsed`` and ``mapping`` differ;
            the message lists the missing and the unexpected labels.
        AssertionError: when any score is not an int between 1 and 5.
    """
    # Label check: both dicts must carry exactly the same keys.
    expected, got = set(mapping.keys()), set(parsed.keys())
    missing = expected.difference(got)
    extra = got.difference(expected)
    if missing or extra:
        raise ValueError(f"[PARSE MISMATCH] task={task_id} missing={sorted(missing)} extra={sorted(extra)}")

    # Score check: every metric of every sentence must be an int within 1..5.
    for lbl in parsed:
        for k, v in parsed[lbl].items():
            in_range = isinstance(v, int) and 1 <= v <= 5
            assert in_range, f"[SCORE RANGE] task={task_id} {lbl}/{k} -> {v}"

# ---- Yardımcı: kısa cümle önizleme ----
def preview(s: str, n=80):
    """Return ``s`` on a single line, cut to ``n`` characters plus "..." when longer."""
    flat = " ".join(s.split("\n"))
    if len(flat) > n:
        return flat[:n] + "..."
    return flat

# ---- Çalıştır ----
# End-to-end smoke test: prompt -> API reply -> parse -> validation against the mapping.
with open(tasks_path, "r", encoding="utf-8") as f:
    tasks = json.load(f)

tasks = tasks[:2]   # debug run: only the first two tasks
total = len(tasks)  # may be smaller than 2 when the file is short

for idx, task in enumerate(tasks, start=1):
    prompt  = task["prompt"]
    mapping = task["mapping"]
    task_id = task["task_id"]
    word    = task["word"]
    level   = task["level"]

    # Use the labels the task actually defines instead of assuming A..F exist,
    # so an unexpected label set is reported by the validation step below
    # rather than by a bare KeyError here.
    labels = sorted(mapping)

    print("="*90)
    print(f"[{idx}/{total}] TASK ID: {task_id} | LEVEL={level} | WORD={word}")
    print("-"*90)
    # The whole prompt is printed on purpose, so the header no longer claims a 600-character cut.
    print(">> PROMPT:")
    print(prompt)
    print("-"*90)
    print(">> MAPPING LABELS:", list(mapping.keys()))
    print(">> MAPPING SENTENCE PREVIEW:")
    for lbl in labels:
        sent = mapping[lbl]["sentence"]
        print(f"   {lbl}: {preview(sent)}")

    # --- API call (temperature 0 keeps the debug run as repeatable as possible) ---
    resp = client.chat.completions.create(
        model=MODEL_NAME,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        stream=False,
    )
    # `content` is Optional in the SDK; fall back to "" so the parser gets a str.
    reply = resp.choices[0].message.content or ""

    print("-"*90)
    print(">> RAW REPLY:")
    print(reply)

    # --- Parse & validate ---
    parsed = parse_response(reply)
    print("-"*90)
    print(">> PARSED LABELS:", list(parsed.keys()))
    assert_parsed_ok(parsed, mapping, task_id)
    print("✅ PARSE OK & LABELS MATCH & SCORES IN RANGE")

    # --- Sample of the joined output (first two labels) ---
    for lbl in labels[:2]:
        model_name = mapping[lbl]["model"]
        sentence   = mapping[lbl]["sentence"]
        scores     = parsed[lbl]
        print(f"→ {lbl} | model={model_name} | {preview(sentence)}")
        print(f"   scores: {scores}")

print("="*90)
print(f"🎉 Test tamam: {total} görev için prompt → reply → parse → mapping doğrulandı.")


[1/2] TASK ID: A1_age | LEVEL=A1 | WORD=age
------------------------------------------------------------------------------------------
>> PROMPT (ilk 600 karakter):
You are a professional CEFR-aligned English sentence evaluator.

Your task is to evaluate 6 example sentences that all use the target word: "age" at CEFR level: A1.

Rate each sentence from 1 (poor) to 5 (excellent) for the following **four independent criteria**:

1. **Word Usage** – Is the target word used correctly and meaningfully in context?
2. **Clarity** – Is the sentence understandable and suitable for the given CEFR level?
3. **Grammar** – Is the grammar accurate and appropriate for the level?
4. **Naturalness** – Does the sentence sound fluent and natural to a native speaker?

⚠️ Important Instructions:
- **Only return numerical ratings** for each criterion.
- **Do not include any explanations, comments, or justifications.**
- Follow the exact output format below.

### Output Format:
Sentence A: <Word Usage>, <Cla