In [9]:
!pip install -q openai google-generativeai anthropic statsmodels scikit-learn pandas tqdm

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from statsmodels.stats.contingency_tables import mcnemar
from tqdm import tqdm
import os

from openai import OpenAI
import google.generativeai as genai
import anthropic

from kaggle_secrets import UserSecretsClient

In [11]:
# ==== Setup API Keys ====
user_secrets = UserSecretsClient()

os.environ["OPENAI_API_KEY"] = user_secrets.get_secret("OPENAI_API_KEY")
os.environ["GOOGLE_API_KEY"] = user_secrets.get_secret("GOOGLE_API_KEY")
os.environ["ANTHROPIC_API_KEY"] = user_secrets.get_secret("ANTHROPIC_API_KEY")

# ==== Initialize Clients ====
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
claude_client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])

# ==== Dataset Paths ====
TRAIN_PATH = "/kaggle/input/paper-kelulusan-eda/train_data.csv"  # for few-shot
TEST_PATH = "/kaggle/input/paper-kelulusan-eda/test_data.csv"    # for evaluation

LLMS = [
    "gpt", 
    "gemini", 
    "claude"
]
APPROACHES = ["zero_shot", "few_shot", "chain_of_thought"]

In [12]:
# ==== LLM Calls ====
def call_gpt(prompt: str) -> str:
    response = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    return response.choices[0].message.content

def call_gemini(prompt: str) -> str:
    model = genai.GenerativeModel("gemini-2.5-flash")
    response = model.generate_content(prompt)
    return response.text

def call_claude(prompt: str) -> str:
    response = claude_client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=300,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.content[0].text

LLM_CALLS = {
    "gpt": call_gpt,
    "gemini": call_gemini,
    "claude": call_claude,
}

In [13]:
def chunk_list(lst, chunk_size=5):
    """Split list into chunks for bulk prediction."""
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

In [14]:
def bulk_predict(llm_name, approach, texts, train_df=None, batch_size=50):
    predictions = []
    for batch in chunk_list(texts, batch_size):
        text_list_str = "\n".join([f"{i+1}. {t}" for i, t in enumerate(batch)])
        
        # Build prompt
        if approach == "zero_shot":
            prompt = (
                "Lakukan klasifikasi sentimen (kita sebut positif sebagai positive dan negatif sebagai negative) untuk setiap teks dibawah ini:\n"
                f"{text_list_str}\n"
                "Berikan hasil dalam format:\n1. positive\n2. negative\n..."
            )
        elif approach == "few_shot":
            fewshot_examples = ""
            for _, row in train_df.sample(n=5, random_state=42).iterrows():
                fewshot_examples += f"Text: '{row['clean_text']}' -> Sentiment: {row['sentiment'].lower()}\n"
            prompt = (
                "Lakukan klasifikasi sentimen, berikut beberapa contoh samplenya:\n"
                f"{fewshot_examples}\n"
                f"Sekarang, lakukan klasifikasi dalam teks berikut:\n{text_list_str}\n"
                "Berikan hasil dalam format:\n1. positive\n2. negative\n..."
            )
        else:  # chain_of_thought
            prompt = (
                "Analisa dan lakukan klasifikasisetiap teks sebagai positif atau negatif:\n"
                f"{text_list_str}\n"
                "Berikan hasil akhir saja dalam format:\n1. positive\n2. negative\n..."
            )

        raw_output = LLM_CALLS[llm_name](prompt)
        raw_lines = [line.strip().lower() for line in raw_output.split("\n") if line.strip()]

        # Parse predictions line-by-line, fallback to neutral
        batch_preds = []
        for i in range(len(batch)):
            if i < len(raw_lines) and ("positive" in raw_lines[i] or "negative" in raw_lines[i]):
                sentiment = "positive" if "positive" in raw_lines[i] else "negative"
            else:
                sentiment = "negative"  # fallback
            batch_preds.append(sentiment)

        predictions.extend(batch_preds)
    
    # Ensure prediction count matches texts count
    assert len(predictions) == len(texts), f"Prediction length mismatch: {len(predictions)} vs {len(texts)}"
    return predictions

In [15]:
def evaluate(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='binary', pos_label='positive'),
        "recall": recall_score(y_true, y_pred, average='binary', pos_label='positive'),
        "f1": f1_score(y_true, y_pred, average='binary', pos_label='positive'),
        "report": classification_report(y_true, y_pred, zero_division=0)
    }

def mcnemar_test(y_true, y_pred_1, y_pred_2):
    table = [[0, 0], [0, 0]]
    for gt, p1, p2 in zip(y_true, y_pred_1, y_pred_2):
        correct1 = p1 == gt
        correct2 = p2 == gt
        table[int(not correct1)][int(not correct2)] += 1
    return mcnemar(table, exact=True).pvalue

In [16]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

In [17]:
# === Predictions ===
y_true = test_df["sentiment"].str.lower()

results = {}
metrics = {}

for llm in LLMS:
    results[llm] = {}
    metrics[llm] = {}
    for approach in APPROACHES:
        print(f"\nRunning {llm.upper()} - {approach.upper()}...")
        preds = bulk_predict(
            llm_name=llm,
            approach=approach,
            texts=test_df["clean_text"].tolist(),
            train_df=train_df
        )
        results[llm][approach] = preds
        metrics[llm][approach] = evaluate(y_true, preds)
        print(metrics[llm][approach]["report"])


Running GPT - ZERO_SHOT...
              precision    recall  f1-score   support

    negative       0.86      0.65      0.74       500
    positive       0.72      0.89      0.80       500

    accuracy                           0.77      1000
   macro avg       0.79      0.77      0.77      1000
weighted avg       0.79      0.77      0.77      1000


Running GPT - FEW_SHOT...
              precision    recall  f1-score   support

    negative       0.87      0.67      0.76       500
    positive       0.73      0.90      0.81       500

    accuracy                           0.79      1000
   macro avg       0.80      0.79      0.78      1000
weighted avg       0.80      0.79      0.78      1000


Running GPT - CHAIN_OF_THOUGHT...
              precision    recall  f1-score   support

    negative       0.88      0.64      0.75       500
    positive       0.72      0.92      0.81       500

    accuracy                           0.78      1000
   macro avg       0.80      0.78     

In [18]:
# === McNemar Tests (LLM vs LLM for each approach) ===
for approach in APPROACHES:
    print(f"\n=== McNemar Tests for {approach.upper()} ===")
    p_gpt_claude = mcnemar_test(y_true, results["gpt"][approach], results["claude"][approach])
    p_gpt_gemini = mcnemar_test(y_true, results["gpt"][approach], results["gemini"][approach])
    p_claude_gemini = mcnemar_test(y_true, results["claude"][approach], results["gemini"][approach])
    print(f"GPT vs Claude: p={p_gpt_claude:.4f}")
    print(f"GPT vs Gemini: p={p_gpt_gemini:.4f}")
    print(f"Claude vs Gemini: p={p_claude_gemini:.4f}")


=== McNemar Tests for ZERO_SHOT ===
GPT vs Claude: p=0.1337
GPT vs Gemini: p=0.0001
Claude vs Gemini: p=0.0595

=== McNemar Tests for FEW_SHOT ===
GPT vs Claude: p=0.0000
GPT vs Gemini: p=0.7838
Claude vs Gemini: p=0.0000

=== McNemar Tests for CHAIN_OF_THOUGHT ===
GPT vs Claude: p=0.2521
GPT vs Gemini: p=0.6866
Claude vs Gemini: p=0.2429
