<a href="https://colab.research.google.com/github/eyupdalan/BLM6114-hw2/blob/main/BLM6114_hw2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hesaplamalı Anlambilim Dersi 2.Ödevi
Konusu: Sınıflandırma

## Package installations

In [None]:
!pip install datasets torch scikit-learn transformers # comment this line out if the packages are already installed

### Necessary imports

In [None]:
import torch
import random
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

## Dataset preparation
https://huggingface.co/datasets/turkish-nlp-suite/TrGLUE

sst2 için eğitim kümesini 60K-->5K, test kümesini 9K-->1K düşürerek kullanınız.

> TrSST-2 The Stanford Sentiment Treebank is a sentiment analysis dataset includes sentences from movie reviews, annotated by human annotators. The task is to predict the sentiment of a given sentence. Our dataset is compiled from movie review websites BeyazPerde.com and Sinefil.com, both reviews and sentiment ratings are compiled from those websites. Here we offer a binary classification task to be compatible with the original GLUE task, however we offer a 10-way classification challenge in this dataset's standalone HuggingFace repo.

cola için eğitim kümesini 8K-->5K, test kümesini değiştirmeden kullanınız.

> TrCOLA The original Corpus of Linguistic Acceptability consists of sentences compiled from English literature textbooks. The task is to determine if the sentences are grammatically correct and acceptable sentences. Our corpus is also compiled from Turkish linguistic textbooks and include morphological, syntactic and semantic violations. This dataset also has a standalone repo on HuggingFace.


In [None]:
# TrSST-2: shuffle with a fixed seed, then keep the leading rows of each split.
sst2 = load_dataset("turkish-nlp-suite/TrGLUE", "sst2")
sst2_train = (
    sst2["train"]
    .shuffle(seed=42)
    .select(range(5000))  # train: 60K -> 5K
)
sst2_test = (
    sst2["test"]
    .shuffle(seed=42)
    .select(range(1000))  # test: 9K -> 1K
)

# TrCoLA: only the training split is subsampled; the test split is used whole.
cola = load_dataset("turkish-nlp-suite/TrGLUE", "cola")
cola_train = (
    cola["train"]
    .shuffle(seed=42)
    .select(range(5000))  # train: 8K -> 5K
)
cola_test = cola["test"]



## Check imported data

In [None]:
# Row counts of the four splits.
sst_train_len, sst_test_len = len(sst2_train), len(sst2_test)
cola_train_len, cola_test_len = len(cola_train), len(cola_test)

print(f"SST2 eğitim kümesi uzunluğu: {sst_train_len}")
print(f"SST2 test kümesi uzunluğu: {sst_test_len}")
print(f"CoLA eğitim kümesi uzunluğu: {cola_train_len}")
print(f"CoLA test kümesi uzunluğu: {cola_test_len}")

# Print the first 10 rows of every split under its own heading.
split_previews = [
    ("sst2_train:", sst2_train),
    ("\nsst2_test:", sst2_test),
    ("\ncola_train:", cola_train),
    ("\ncola_test:", cola_test),
]
for heading, split in split_previews:
    print(heading)
    for row_idx in range(10):
        print(split[row_idx])

## Model loading

1. https://huggingface.co/ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1
2. https://huggingface.co/google/gemma-2-9b-it
3. https://huggingface.co/Trendyol/Llama-3-Trendyol-LLM-8b-chat-v2.0
4. https://huggingface.co/TURKCELL/Turkcell-LLM-7b-v1
5. https://huggingface.co/WiroAI/wiroai-turkish-llm-9b

In [None]:
def generate_llm(model_name, torch_dtype=torch.float16):
    """Load a causal language model and wrap it in a text-generation pipeline.

    Args:
        model_name: Model identifier passed unchanged to both
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.
        torch_dtype: dtype the model weights are loaded with. Defaults to
            ``torch.float16``, the value that was previously hard-coded.
            NOTE(review): some checkpoints may prefer ``torch.bfloat16`` —
            confirm against each model card.

    Returns:
        The ``"text-generation"`` pipeline built from the loaded model and
        tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",  # device placement is delegated to the library
        torch_dtype=torch_dtype,
    )
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

# Short alias -> model identifier for every model under evaluation.
model_repo_ids = {
    "cosmos_dpo": "ytu-ce-cosmos/Turkish-Llama-8b-DPO-v0.1",
    "gemma2_9b": "Metin/Gemma-2-9b-it-TR-DPO-V1",
    "trendyol_llm": "Trendyol/Trendyol-LLM-8b-chat-v2.0",
    "turkcell_llm": "TURKCELL/Turkcell-LLM-7b-v1",
    "wiroai_llm": "WiroAI/wiroai-turkish-llm-9b",
}

# Load every model up front, in the order listed above.
models = {alias: generate_llm(repo_id) for alias, repo_id in model_repo_ids.items()}


In [None]:
def build_prompts_with_examples_sst2(sentence, shot_count):
    """Build the SST-2 sentiment prompt, optionally preceded by few-shot examples.

    Args:
        sentence: The sentence the model is asked to classify.
        shot_count: Number of labelled examples taken from ``sst2_train`` to
            place before the question. Zero (or a negative value) produces
            the bare zero-shot prompt.

    Returns:
        The prompt string: the formatted examples and a line asking the model
        to consider them (only when ``shot_count`` is positive), followed by
        the question about ``sentence``.
    """
    prompt = f'Cümle: "{sentence}"\nBu cümledeki duygu olumlu mu? Lütfen sadece "Evet" veya "Hayır" şeklinde cevap verin.\n'

    # Zero-shot: there is nothing to sample, so do not touch the training split.
    if shot_count <= 0:
        return prompt

    # The fixed seed means every prompt is built from the same examples.
    selected_examples = sst2_train.shuffle(seed=42).select(range(shot_count))
    # Read each column once instead of once per example.
    example_sentences = selected_examples["sentence"]
    example_labels = selected_examples["label"]

    # NOTE(review): the example answers are full sentences, while the final
    # instruction asks for "Evet"/"Hayır" only — confirm this is intended.
    examples = ""
    for example_sentence, example_label in zip(example_sentences, example_labels):
        if example_label == 1:
            cevap = "Bu cümledeki duygu olumludur"
        else:
            cevap = "Bu cümledeki duygu olumsuzdur"
        examples += f'Cümle: "{example_sentence}"\nCevap: {cevap}\n'

    examples += "Yukarıdaki cümleleri ve cevaplarını değerlendir.\n"
    return f"{examples}{prompt}"

def build_prompts_with_examples_cola(sentence, shot_count):
    """Build the CoLA acceptability prompt, optionally preceded by few-shot examples.

    Args:
        sentence: The sentence the model is asked to judge.
        shot_count: Number of labelled examples taken from ``cola_train`` to
            place before the question. Zero (or a negative value) produces
            the bare zero-shot prompt.

    Returns:
        The prompt string: the formatted examples and a line asking the model
        to consider them (only when ``shot_count`` is positive), followed by
        the question about ``sentence``.
    """
    prompt = f'Cümle: "{sentence}"\nBu cümlenin Türkçe dilbilgisi kurallarına uygunluğunu değerlendirin. Eğer cümle dilbilgisi açısından kabul edilebilir ise, "Evet" değil ise "Hayır" şeklinde cevap verin.Lütfen sadece "Evet" veya "Hayır" şeklinde cevap verin.\n'

    # Zero-shot: there is nothing to sample, so do not touch the training split.
    if shot_count <= 0:
        return prompt

    # The fixed seed means every prompt is built from the same examples.
    selected_examples = cola_train.shuffle(seed=42).select(range(shot_count))
    # Read each column once instead of once per example.
    example_sentences = selected_examples["sentence"]
    example_labels = selected_examples["label"]

    # NOTE(review): the example answers are full sentences, while the final
    # instruction asks for "Evet"/"Hayır" only — confirm this is intended.
    examples = ""
    for example_sentence, example_label in zip(example_sentences, example_labels):
        if example_label == 1:
            cevap = "Bu cümle kurallara uygundur"
        else:
            cevap = "Bu cümle kurallara uygun değildir"
        examples += f'Cümle: "{example_sentence}"\nCevap: {cevap}\n'

    examples += "Yukarıdaki cümleleri ve cevaplarını değerlendir.\n"
    return f"{examples}{prompt}"


def run_prompt(prompt, model_name, max_new_tokens=150):
    """Run one prompt through a loaded model and return only its continuation.

    Args:
        prompt: The prompt text sent to the model.
        model_name: Key into the module-level ``models`` mapping.
        max_new_tokens: Forwarded to the pipeline call.

    Returns:
        The generated text with the leading ``len(prompt)`` characters
        removed, lower-cased and stripped of surrounding whitespace.
    """
    print(f"Model: {model_name}")
    print(f"Prompt: {prompt}")

    llm = models[model_name]
    # Sampling is switched off for this call.
    first_result = llm(prompt, max_new_tokens=max_new_tokens, do_sample=False)[0]
    full_text = first_result['generated_text']

    # The returned text starts with the prompt; drop that prefix.
    continuation = full_text[len(prompt):]
    return continuation.lower().strip()

def run_bulk_prompt(prompts, model_name, max_new_tokens=150):
    """Run several prompts through a loaded model and return each continuation.

    Args:
        prompts: List of prompt strings, passed to the pipeline in one call.
        model_name: Key into the module-level ``models`` mapping.
        max_new_tokens: Forwarded to the pipeline call.

    Returns:
        A list with one entry per prompt, in input order: the generated text
        with that prompt's prefix removed, lower-cased and stripped.
    """
    outputs = models[model_name](prompts, max_new_tokens=max_new_tokens, do_sample=False)

    generated_outputs = []
    # Pair each output with its own prompt by position. Looking the output up
    # with ``outputs.index(output)`` is quadratic and returns the first *equal*
    # output, which strips the wrong prefix length when two outputs match.
    for prompt, output in zip(prompts, outputs):
        full_text = output[0]['generated_text']
        generated = full_text[len(prompt):].lower().strip()  # drop the prompt prefix
        generated_outputs.append(generated)

    return generated_outputs

def bulk_predict_for_few_shot_sst2(sentences, shot_count, model_name):
    """Predict SST-2 labels for a batch of sentences with a few-shot prompt.

    Args:
        sentences: Sentences to classify.
        shot_count: Number of few-shot examples placed in each prompt.
        model_name: Key of the model to query.

    Returns:
        One label per sentence: ``1`` when the first answer word in the model
        output is "evet", ``0`` when it is "hayır", and ``-1`` when neither
        word occurs.
    """
    prompts = [build_prompts_with_examples_sst2(sentence, shot_count) for sentence in sentences]
    outputs = run_bulk_prompt(prompts, model_name)

    # The outputs arrive lower-cased. Python lower-cases "HAYIR" to "hayir"
    # (dotted i), so that spelling is accepted as well. Matching whole words
    # keeps e.g. "hayırlı" from counting as an answer, and taking the FIRST
    # answer word keeps a later "evet" in a long generation from overriding
    # a leading "hayır".
    answer_pattern = re.compile(r"\b(evet|hayır|hayir)\b")

    pred_labels = []
    for output in outputs:
        match = answer_pattern.search(output)
        if match is None:
            pred_labels.append(-1)  # no recognisable answer
        elif match.group(1) == "evet":
            pred_labels.append(1)
        else:
            pred_labels.append(0)
    return pred_labels

def bulk_predict_for_few_shot_cola(sentences, shot_count, model_name):
    """Predict CoLA labels for a batch of sentences with a few-shot prompt.

    Args:
        sentences: Sentences to judge.
        shot_count: Number of few-shot examples placed in each prompt.
        model_name: Key of the model to query.

    Returns:
        One label per sentence: ``1`` when the first answer word in the model
        output is "evet", ``0`` when it is "hayır", and ``-1`` when neither
        word occurs.
    """
    prompts = [build_prompts_with_examples_cola(sentence, shot_count) for sentence in sentences]
    outputs = run_bulk_prompt(prompts, model_name)

    # The outputs arrive lower-cased. Python lower-cases "HAYIR" to "hayir"
    # (dotted i), so that spelling is accepted as well. Matching whole words
    # keeps e.g. "hayırlı" from counting as an answer, and taking the FIRST
    # answer word keeps a later "evet" in a long generation from overriding
    # a leading "hayır".
    answer_pattern = re.compile(r"\b(evet|hayır|hayir)\b")

    pred_labels = []
    for output in outputs:
        match = answer_pattern.search(output)
        if match is None:
            pred_labels.append(-1)  # no recognisable answer
        elif match.group(1) == "evet":
            pred_labels.append(1)
        else:
            pred_labels.append(0)
    return pred_labels

In [None]:
# Peek at the first five test sentences, then at their labels.
for column in ("sentence", "label"):
    print(sst2_test[column][:5])

# Same idea through `select`: the sentences of the first three rows.
first_three_rows = sst2_test.select(range(3))
print(first_three_rows["sentence"])

In [None]:
# Smoke test: zero-shot CoLA prompt for the first test sentence, run on one model.
first_cola_sentence = cola_test["sentence"][0]
zero_shot_prompt = build_prompts_with_examples_cola(first_cola_sentence, 0)
print(run_prompt(zero_shot_prompt, "cosmos_dpo"))

In [None]:
# Show the 0-, 3- and 5-shot prompt for the first test sentence of each task.
prompt_builders = (
    (build_prompts_with_examples_sst2, sst2_test),
    (build_prompts_with_examples_cola, cola_test),
)
for build_prompt, test_split in prompt_builders:
    first_sentence = test_split["sentence"][0]
    for shots in (0, 3, 5):
        print(build_prompt(first_sentence, shots))

In [None]:
# Number of leading test rows per task that the evaluation below scores
# (used as the `[:sample_count]` slice on the test sentences and labels).
sample_count = 10
print(f"Started for {sample_count} samples")

def _score_predictions(actual_labels, predicted_labels):
    """Return accuracy plus weighted F1 / precision / recall for one run."""
    # zero_division=0 keeps the scores at 0 where they are undefined (for
    # example when a prediction could not be parsed and is -1) without
    # emitting a warning for every model/shot combination.
    return {
        "accuracy": accuracy_score(actual_labels, predicted_labels),
        "f1": f1_score(actual_labels, predicted_labels, average='weighted', zero_division=0),
        "precision": precision_score(actual_labels, predicted_labels, average='weighted', zero_division=0),
        "recall": recall_score(actual_labels, predicted_labels, average='weighted', zero_division=0),
    }


def run_analyse_for_sst2():
    """Evaluate every loaded model on SST-2 and CoLA with 0, 3 and 5 shots.

    Despite its name, this covers both tasks. It reads the module-level
    ``sample_count``, ``sst2_test``, ``cola_test`` and ``models``; only the
    first ``sample_count`` test rows of each task are scored.

    Returns:
        A list of dicts, one per (model, shot, task) in that nesting order
        with SST-2 before CoLA. Each dict has the keys ``model``, ``task``,
        ``shot``, ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    # Slice the test columns once; they do not change between models or shots.
    # Each entry: (task name, sentences, gold labels, prediction function).
    tasks = [
        ("SST-2",
         sst2_test["sentence"][:sample_count],
         sst2_test["label"][:sample_count],
         bulk_predict_for_few_shot_sst2),
        ("CoLA",
         cola_test["sentence"][:sample_count],
         cola_test["label"][:sample_count],
         bulk_predict_for_few_shot_cola),
    ]
    print(f"True Labels - sst-2: {tasks[0][2]}")
    print(f"True Labels - cola: {tasks[1][2]}")

    all_results = []

    for model in models:
        print(f"Model: {model}")

        for shot in [0, 3, 5]:
            print(f"Shot: {shot}")

            for task_name, sentences, actual_labels, predict in tasks:
                print(task_name)
                predicted_labels = predict(sentences, shot_count=shot, model_name=model)

                all_results.append({
                    "model": model,
                    "task": task_name,
                    "shot": shot,
                    **_score_predictions(actual_labels, predicted_labels),
                })

    return all_results


# Run the full evaluation and collect one row per (model, task, shot).
results = run_analyse_for_sst2()
results_df = pd.DataFrame(results)

# Heading and table printed on separate lines.
print("\nTüm sonuçlar:", results_df, sep="\n")

# Visualisation: accuracy per model, one bar per shot count.
# results_df holds a row for each task under every (model, shot) pair, so each
# bar aggregates the SST-2 and CoLA rows (seaborn's default estimator is the mean).
plt.figure(figsize=(14, 8))
sns.barplot(x='model', y='accuracy', hue='shot',
            data=results_df, palette='viridis',
            dodge=True, errorbar=None)  # `ci=None` is deprecated since seaborn 0.12
plt.title('LLM Modellerinin Görevlere Göre Doğruluk Performansı')
plt.xlabel('Model')
plt.ylabel('Doğruluk')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Shot Sayısı')
plt.tight_layout()
plt.savefig('model_accuracy_by_shot.png')
plt.show()

# Accuracy comparison per task, one bar per model.
# results_df holds a row for each shot count under every (task, model) pair, so
# each bar aggregates the 0-, 3- and 5-shot rows (seaborn's default estimator is the mean).
plt.figure(figsize=(14, 8))
sns.barplot(x='task', y='accuracy', hue='model',
            data=results_df, palette='Set2',
            dodge=True, errorbar=None)  # `ci=None` is deprecated since seaborn 0.12
plt.title('Görevlere Göre Model Doğruluk Karşılaştırması')
plt.xlabel('Görev')
plt.ylabel('Doğruluk')
plt.legend(title='Model')
plt.tight_layout()
plt.savefig('task_accuracy_by_model.png')
plt.show()

# Effect of the shot count on accuracy: one panel per task, one line per model.
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

for i, task in enumerate(['CoLA', 'SST-2']):
    task_df = results_df[results_df['task'] == task]
    # `markers` and `dashes` only apply to the levels of a `style` variable;
    # without `style='model'` they are silently ignored and no markers are drawn.
    sns.lineplot(x='shot', y='accuracy', hue='model', style='model',
                 data=task_df, palette='Set1',
                 markers=True, dashes=False, ax=axes[i])
    axes[i].set_title(f'{task} Görevinde Shot Sayısının Etkisi')
    axes[i].set_xlabel('Shot Sayısı')
    axes[i].set_ylabel('Doğruluk')
    axes[i].grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.savefig('shot_effect_on_accuracy.png')
plt.show()

# 9. Summary of the results
# =============

print("\n=== ÖZET RAPOR ===")

# For each task, report the row (model + shot count) with the highest accuracy.
for task in ('CoLA', 'SST-2'):
    is_task_row = results_df['task'] == task
    best_row_label = results_df.loc[is_task_row, 'accuracy'].idxmax()
    best_model = results_df.loc[best_row_label]

    print(
        f"\n{task} görevinde en iyi performans:",
        f"Model: {best_model['model']}",
        f"Shot Sayısı: {best_model['shot']}",
        f"Doğruluk: {best_model['accuracy']:.4f}",
        f"F1 Skor: {best_model['f1']:.4f}",
        sep="\n",
    )

