# Tek Bir JSON Dosyasından Veri Çekelim

In [1]:
import json

# Örnek dosya yolu
file_path = "Model2.json"

# JSON dosyasını oku
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

entries = []

model_name = data["model_name"]

for generation in data["generations"]:
    level = generation["cefr_level"]
    words = generation["word_list"]
    sentences = generation["generated_sentences"]

    for word, sentence in zip(words, sentences):
        entries.append({
            "model": model_name,
            "level": level,
            "word": word,
            "sentence": sentence
        })

# Kontrol amaçlı ilk birkaç girdiyi yazdıralım
for e in entries[:5]:
    print(e)


{'model': 'Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf', 'level': 'A1', 'word': 'age', 'sentence': 'My birthday is on January 12th.'}
{'model': 'Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf', 'level': 'A1', 'word': 'animal', 'sentence': 'The cat is a pet animal.'}
{'model': 'Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf', 'level': 'A1', 'word': 'ask', 'sentence': 'Can you ask your teacher for help?'}
{'model': 'Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf', 'level': 'A1', 'word': 'computer', 'sentence': 'The computer is very useful for students.'}
{'model': 'Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf', 'level': 'A1', 'word': 'eat', 'sentence': 'I like to eat pizza on Fridays.'}


# Çoklu Dosyadan Veriyi Al, Etiketle, Karıştır

In [6]:
import os
import json
import random
from collections import defaultdict

# 📁 JSON dosyalarının bulunduğu klasör
json_folder = "data/model_results"

# 💾 Tüm cümleleri saklayacağımız yer
all_entries = []

# 📥 Tüm dosyaları oku
for filename in os.listdir(json_folder):
    if filename.endswith(".json"):
        filepath = os.path.join(json_folder, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
            model_name = data["model_name"]
            for generation in data["generations"]:
                level = generation["cefr_level"]
                words = generation["word_list"]
                sentences = generation["generated_sentences"]
                for word, sentence in zip(words, sentences):
                    all_entries.append({
                        "model": model_name,
                        "level": level,
                        "word": word,
                        "sentence": sentence
                    })

# ✅ Her kelime için 6 modelin cümlesini gruplama
grouped = defaultdict(list)

for entry in all_entries:
    key = (entry["level"], entry["word"])  # örnek: ("A1", "age")
    grouped[key].append(entry)

# ✅ Cümleleri karıştır, etiketle ve mapping oluştur
all_tasks = []

for (level, word), sentence_group in grouped.items():
    if len(sentence_group) != 6:
        print(f"Uyarı: {level} seviyesinde '{word}' kelimesi için {len(sentence_group)} cümle var. Atlanıyor.")
        continue

    # Cümleleri karıştır
    random.shuffle(sentence_group)

    labels = ["Sentence A", "Sentence B", "Sentence C", "Sentence D", "Sentence E", "Sentence F"]

    labeled_sentences = []
    mapping = {}

    for label, item in zip(labels, sentence_group):
        labeled_sentences.append((label, item["sentence"]))
        mapping[label] = {
            "model": item["model"],
            "level": level,
            "word": word,
            "sentence": item["sentence"]
        }

    all_tasks.append({
        "level": level,
        "word": word,
        "labeled_sentences": labeled_sentences,
        "mapping": mapping
    })

# 🔍 Örnek çıktı (bir task)
example = all_tasks[0]
print(f"\nSample task for word '{example['word']}' (Level: {example['level']})\n")
for label, sentence in example["labeled_sentences"]:
    print(f"{label}: {sentence}")



Sample task for word 'age' (Level: A1)

Sentence A: I am ten years old.
Sentence B: My birthday is on January 12th.
Sentence C: I am 20 years old.
Sentence D: My age is twenty-five years old.
Sentence E: My age is seven years old.
Sentence F: I am 10 years old, and I love playing with my pet dog.


# Promptları Otomatik Üretmek

In [7]:
task_prompts = []

prompt_template = """You are a professional CEFR-aligned English sentence evaluator.

Your task is to evaluate 6 example sentences that all use the target word: "{word}" at CEFR level: {level}.

Rate each sentence from 1 (poor) to 5 (excellent) for the following **four independent criteria**:

1. **Word Usage** – Is the target word used correctly and meaningfully in context?
2. **Clarity** – Is the sentence understandable and suitable for the given CEFR level?
3. **Grammar** – Is the grammar accurate and appropriate for the level?
4. **Naturalness** – Does the sentence sound fluent and natural to a native speaker?

⚠️ Important Instructions:
- **Only return numerical ratings** for each criterion.
- **Do not include any explanations, comments, or justifications.**
- Follow the exact output format below.

### Output Format:
Sentence A: <Word Usage>, <Clarity>, <Grammar>, <Naturalness>  
Sentence B: ...  
Sentence C: ...  
Sentence D: ...  
Sentence E: ...  
Sentence F: ...  

### Sentences:
Sentence A: {sentence_A}  
Sentence B: {sentence_B}  
Sentence C: {sentence_C}  
Sentence D: {sentence_D}  
Sentence E: {sentence_E}  
Sentence F: {sentence_F}
"""

for task in all_tasks:
    word = task["word"]
    level = task["level"]
    labeled = dict(task["labeled_sentences"])  # {'Sentence A': '...', ...}

    prompt = prompt_template.format(
        word=word,
        level=level,
        sentence_A=labeled["Sentence A"],
        sentence_B=labeled["Sentence B"],
        sentence_C=labeled["Sentence C"],
        sentence_D=labeled["Sentence D"],
        sentence_E=labeled["Sentence E"],
        sentence_F=labeled["Sentence F"],
    )

    # Prompt'la birlikte task ID'sini (level, word) saklıyoruz
    task_prompts.append({
        "level": level,
        "word": word,
        "prompt": prompt
    })


In [21]:
print("🔍 First Prompt Preview:\n")
print(task_prompts[27]["prompt"])


🔍 First Prompt Preview:

You are an expert English evaluator.

Evaluate the following 6 example sentences that all use the word: "religion" (CEFR Level: B1).

Rate each sentence from 1 (poor) to 5 (excellent) on the following four criteria:

1. Word Usage – Is the word used correctly and meaningfully?
2. Clarity – Is the sentence understandable and appropriate for the given CEFR level?
3. Grammar – Is the grammar correct and level-appropriate?
4. Naturalness – Does the sentence sound natural and fluent (not AI-generated)?

⚠️ Do not give any explanations or comments.
⚠️ Just return the ratings in the following format, with no extra output:

Sentence A: <Word Usage>, <Clarity>, <Grammar>, <Naturalness>
Sentence B: ...
Sentence C: ...
Sentence D: ...
Sentence E: ...
Sentence F: ...

Sentences:
Sentence A: People should respect each other's religion.
Sentence B: The new employee has to respect the company's religion policy.
Sentence C: Different cultures have their own religion and tradit

# DeepSeek ile 2 Prompt Testi 

In [9]:
from openai import OpenAI
import time

# DeepSeek API key ve base_url
client = OpenAI(
    api_key="sk-9115f967efad41a09f30b761d5f36f53",
    base_url="https://api.deepseek.com"
)

# İlk iki prompt (daha önce tanımladığın task_prompts listesi)
prompts_to_test = task_prompts[:2]

def parse_scores(raw_response):
    parsed = {}
    lines = raw_response.strip().split("\n")
    for line in lines:
        if ":" in line:
            parts = line.split(":")
            label = parts[0].strip()
            scores = [int(s.strip()) for s in parts[1].split(",") if s.strip().isdigit()]
            if len(scores) == 4:
                parsed[label] = {
                    "Word Usage": scores[0],
                    "Clarity": scores[1],
                    "Grammar": scores[2],
                    "Naturalness": scores[3]
                }
    return parsed

# Test et
for i, task in enumerate(prompts_to_test):
    print(f"\n--- Prompt {i+1} ({task['word']}, {task['level']}) ---\n")
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "user", "content": task["prompt"]}
            ],
            temperature=1.0
        )

        raw_output = response.choices[0].message.content
        print("🟦 Raw Output:\n", raw_output)

        parsed = parse_scores(raw_output)
        print("\n✅ Parsed Scores:")
        for sentence, scores in parsed.items():
            print(f"{sentence}: {scores}")

    except Exception as e:
        print("❌ Hata:", e)

    time.sleep(2)



--- Prompt 1 (age, A1) ---

🟦 Raw Output:
 Sentence A: 5, 5, 5, 5  
Sentence B: 1, 5, 5, 5  
Sentence C: 5, 5, 5, 5  
Sentence D: 3, 4, 3, 3  
Sentence E: 3, 4, 3, 3  
Sentence F: 5, 5, 5, 5

✅ Parsed Scores:
Sentence A: {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5}
Sentence B: {'Word Usage': 1, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5}
Sentence C: {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5}
Sentence D: {'Word Usage': 3, 'Clarity': 4, 'Grammar': 3, 'Naturalness': 3}
Sentence E: {'Word Usage': 3, 'Clarity': 4, 'Grammar': 3, 'Naturalness': 3}
Sentence F: {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5}

--- Prompt 2 (animal, A1) ---

🟦 Raw Output:
 Sentence A: 4, 4, 5, 5  
Sentence B: 5, 5, 5, 5  
Sentence C: 5, 5, 5, 5  
Sentence D: 5, 5, 5, 5  
Sentence E: 4, 4, 5, 4  
Sentence F: 5, 5, 5, 5

✅ Parsed Scores:
Sentence A: {'Word Usage': 4, 'Clarity': 4, 'Grammar': 5, 'Naturalness': 5}
Sentence B: {'Word Usage': 5, 'Clarity': 5, '

In [11]:
# ✅ Önce DeepSeek'ten dönen raw skorları hardcoded şekilde ekleyelim:

deepseek_results = {
    ("A1", "age"): {
        "Sentence A": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
        "Sentence B": {'Word Usage': 1, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
        "Sentence C": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
        "Sentence D": {'Word Usage': 3, 'Clarity': 4, 'Grammar': 3, 'Naturalness': 3},
        "Sentence E": {'Word Usage': 3, 'Clarity': 4, 'Grammar': 3, 'Naturalness': 3},
        "Sentence F": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
    },
    ("A1", "animal"): {
        "Sentence A": {'Word Usage': 4, 'Clarity': 4, 'Grammar': 5, 'Naturalness': 5},
        "Sentence B": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
        "Sentence C": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
        "Sentence D": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
        "Sentence E": {'Word Usage': 4, 'Clarity': 4, 'Grammar': 5, 'Naturalness': 4},
        "Sentence F": {'Word Usage': 5, 'Clarity': 5, 'Grammar': 5, 'Naturalness': 5},
    }
}

# ✅ Model bazlı puanları toplayacağımız dict
model_scores = {}

# ✅ all_tasks içinden ilgili kelimeleri bul
for task in all_tasks:
    key = (task["level"], task["word"])
    if key not in deepseek_results:
        continue

    scores_for_task = deepseek_results[key]
    mapping = task["mapping"]

    for sentence_label, scores in scores_for_task.items():
        model_name = mapping[sentence_label]["model"]

        if model_name not in model_scores:
            model_scores[model_name] = {}

        if key not in model_scores[model_name]:
            model_scores[model_name][key] = {
                "Word Usage": [],
                "Clarity": [],
                "Grammar": [],
                "Naturalness": []
            }

        # Skorları ekle
        for criterion, value in scores.items():
            model_scores[model_name][key][criterion].append(value)

# ✅ Sonuçları yazdır
for model, words in model_scores.items():
    print(f"\n🧠 Model: {model}")
    for (level, word), criteria in words.items():
        print(f"  🔹 Word: '{word}' (Level: {level})")
        for criterion, values in criteria.items():
            avg = round(sum(values) / len(values), 2)
            print(f"    - {criterion}: {values} → Avg: {avg}")



🧠 Model: GPT-4-turbo
  🔹 Word: 'age' (Level: A1)
    - Word Usage: [5] → Avg: 5.0
    - Clarity: [5] → Avg: 5.0
    - Grammar: [5] → Avg: 5.0
    - Naturalness: [5] → Avg: 5.0
  🔹 Word: 'animal' (Level: A1)
    - Word Usage: [5] → Avg: 5.0
    - Clarity: [5] → Avg: 5.0
    - Grammar: [5] → Avg: 5.0
    - Naturalness: [5] → Avg: 5.0

🧠 Model: Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf
  🔹 Word: 'age' (Level: A1)
    - Word Usage: [1] → Avg: 1.0
    - Clarity: [5] → Avg: 5.0
    - Grammar: [5] → Avg: 5.0
    - Naturalness: [5] → Avg: 5.0
  🔹 Word: 'animal' (Level: A1)
    - Word Usage: [4] → Avg: 4.0
    - Clarity: [4] → Avg: 4.0
    - Grammar: [5] → Avg: 5.0
    - Naturalness: [5] → Avg: 5.0

🧠 Model: Ministral-8B-Instruct-2410.Q4_K_M.gguf
  🔹 Word: 'age' (Level: A1)
    - Word Usage: [5] → Avg: 5.0
    - Clarity: [5] → Avg: 5.0
    - Grammar: [5] → Avg: 5.0
    - Naturalness: [5] → Avg: 5.0
  🔹 Word: 'animal' (Level: A1)
    - Word Usage: [5] → Avg: 5.0
    - Clarity: [5]

In [13]:
# Age ve animal kelimeleri için mapping'leri yazdıralım
for task in all_tasks:
    if (task["level"], task["word"]) in [("A1", "age"), ("A1", "animal")]:
        print(f"\n📌 Mapping for '{task['word']}' ({task['level']})")
        for label, info in task["mapping"].items():
            print(f"{label} → {info['model']}")



📌 Mapping for 'age' (A1)
Sentence A → GPT-4-turbo
Sentence B → Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf
Sentence C → Ministral-8B-Instruct-2410.Q4_K_M.gguf
Sentence D → Claude Sonnet 4
Sentence E → Gemini 2.5 Flash
Sentence F → Llama-3.2-3B-Instruct-Q4_K_M.gguf

📌 Mapping for 'animal' (A1)
Sentence A → Llama-3.2-8B-Instruct/Llama-3.2-8B-Instruct-Q4_K_M.gguf
Sentence B → Ministral-8B-Instruct-2410.Q4_K_M.gguf
Sentence C → Llama-3.2-3B-Instruct-Q4_K_M.gguf
Sentence D → Gemini 2.5 Flash
Sentence E → Claude Sonnet 4
Sentence F → GPT-4-turbo
