In [1]:
import json

# JSON is UTF-8 by specification; name the encoding explicitly so decoding
# does not depend on the platform's default locale encoding.
with open('../data/task3.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Inputs and reference labels, read from the 'drafts' and 'labels' keys.
test_texts = data['drafts']
test_labels = data['labels']

In [2]:
from tqdm import tqdm
import random
from openai import OpenAI

# 1. Initialize the API client.
# The key is read from the environment so the secret never lives in the
# notebook or its version-control history. NOTE(review): the key that was
# previously hardcoded here should be considered leaked and be revoked.
import os

_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

client = OpenAI(
    api_key=_api_key,
    # The endpoint can be overridden without editing the notebook.
    base_url=os.environ.get("OPENAI_BASE_URL", "https://hiapi.online/v1"),
)

def _parse_answer(raw_result):
    """Map a normalized model reply to a label, or None when it is undecidable.

    The reply is split into alphanumeric tokens and matched token-by-token, so
    words that merely contain "no" (e.g. "unknown") or numbers that merely
    contain a digit (e.g. "10") are not mistaken for an answer.

    Args:
        raw_result: Lower-cased, stripped reply text (may be empty).

    Returns:
        1 if a "yes" or "1" token is present, 0 if a "no" or "0" token is
        present (positive tokens are checked first), otherwise None.
    """
    tokens = "".join(ch if ch.isalnum() else " " for ch in raw_result).split()
    if "yes" in tokens or "1" in tokens:
        return 1
    if "no" in tokens or "0" in tokens:
        return 0
    return None


def classify_texts(texts):
    """Ask the chat model for a yes/no prediction on each text.

    Uses the module-level ``client`` and shows progress with ``tqdm``.

    Args:
        texts: Iterable of strings; each one is inserted into the prompt.

    Returns:
        A list with one integer per input, in input order: 1 for a "yes"/"1"
        reply and 0 for a "no"/"0" reply. When the request fails or the reply
        cannot be parsed, a message is printed and the label is drawn with
        ``random.choice([0, 1])``. No seed is set in this function, so those
        fallback entries are not reproducible and they add noise to any metric
        computed from the result.
    """
    results = []

    # enumerate() keeps the index so that messages can point at the failing item.
    for i, text in enumerate(tqdm(texts)):
        user_prompt = f"""
        The provided document is a United Nations Security Council's draft resolution. Predict whether the draft resolution will be adopted or not. Answer with 'yes' (1) or 'no' (0) without any explanation.

        Text: "{text}"
        Answer:
        """

        # Only the request and the extraction of the reply are guarded, so that
        # a parsing problem is never reported as an API failure.
        try:
            response = client.chat.completions.create(
                model="gemini-3-flash-preview",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": user_prompt}
                ],
                stream=False,       # explicitly non-streaming
                max_tokens=5,       # cap the output length so the model cannot ramble
                temperature=0.0     # keep at 0.0 for stable, repeatable predictions
            )
            content = response.choices[0].message.content
        except Exception as e:
            # Best-effort: a failed request (timeout, etc.) must not abort the loop.
            print(f"\n[错误] 第 {i} 条数据 API 请求失败: {e}")
            results.append(random.choice([0, 1]))
            continue

        # The reply text may be None; treat that as an empty, unparseable reply
        # instead of crashing on .strip().
        raw_result = (content or "").strip().lower()

        final_res = _parse_answer(raw_result)
        if final_res is None:
            print(f"\n[警告] 第 {i} 条数据格式异常: '{raw_result}'，触发随机兜底")
            final_res = random.choice([0, 1])

        results.append(final_res)

    return results

# Step 1: smoke-test on a small slice first.
# Never start with the full dataset; classify the first 10 items to confirm
# that the whole pipeline works end to end.
pred = classify_texts(test_texts[0:10])

print("\n测试完成，前10条预测结果为: " + str(pred))

100%|██████████| 10/10 [01:12<00:00,  7.23s/it]


测试完成，前10条预测结果为: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]





In [2]:
# calculate metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_recall_fscore_support
from sklearn.metrics import roc_auc_score, average_precision_score, matthews_corrcoef
from imblearn.metrics import geometric_mean_score

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, precision_recall_curve, auc

def calculate_metrics(pred, labels):
    """Print binary-classification metrics, scoring the original class 0 as positive.

    Both sequences are flipped (``1 - x``) before scoring, so every
    "positive class" metric below (precision, recall, F1, PR AUC) refers to
    the items that were labelled 0 in the inputs.

    Args:
        pred: Sequence of predicted labels (0 or 1).
        labels: Sequence of reference labels (0 or 1), aligned with ``pred``.

    Returns:
        None. The metrics are printed one per line, followed by a header line
        and a single space-separated row rounded to 4 decimals.
    """
    # swap 0 and 1
    pred = [1 - x for x in pred]
    labels = [1 - x for x in labels]
    acc = accuracy_score(labels, pred)
    try:
        roc_auc = roc_auc_score(labels, pred)
    except ValueError:
        # roc_auc_score could not compute a value; it is reported as 0.
        roc_auc = 0
    balanced_acc = balanced_accuracy_score(labels, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, pred, average='binary')
    # Alternative PR AUC estimate, kept for reference:
    # pr_auc = average_precision_score(labels, pred)
    precision, recall, _ = precision_recall_curve(labels, pred)
    pr_auc = auc(recall, precision)
    mcc = matthews_corrcoef(labels, pred)
    g_mean = geometric_mean_score(labels, pred)
    # Pin the label set so the matrix is always 2x2. Without it, data that
    # contains a single class yields a 1x1 matrix and the 4-way unpack fails.
    tn, fp, fn, tp = confusion_matrix(labels, pred, labels=[0, 1]).ravel()
    # Specificity is undefined when there are no negatives; report NaN
    # instead of dividing by zero.
    negatives = tn + fp
    specificity = tn / negatives if negatives else float('nan')

    print(f'Accuracy: {acc}')
    print(f'AUC: {roc_auc}')
    print(f'Balanced Accuracy: {balanced_acc}')
    print(f'Precision: {prec}')
    print(f'Recall: {rec}')
    print(f'F1: {f1}')
    print(f'PR AUC: {pr_auc}')
    print(f'MCC: {mcc}')
    print(f'G-Mean: {g_mean}')
    print(f'Specificity: {specificity}')

    # Compact row for copying into a results table.
    print('Accuracy AUC Balanced_Acc Precision Recall F1 PR_AUC MCC G-Mean Specificity')
    print(f'{acc:.4f} {roc_auc:.4f} {balanced_acc:.4f} {prec:.4f} {rec:.4f} {f1:.4f} {pr_auc:.4f} {mcc:.4f} {g_mean:.4f} {specificity:.4f}')



In [None]:
# `pred` may cover only a prefix of the dataset (the smoke test classifies the
# first 10 items), so score it against the labels of those same items; passing
# the full label list would fail on mismatched lengths.
calculate_metrics(pred, test_labels[:len(pred)])