In [None]:
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
import json

def get_result_excel(test_file, pred_file):
    """Build a DataFrame of the test records with the model predictions attached.

    Args:
        test_file: Path to a UTF-8 JSON file. Its parsed content is handed to
            ``pd.DataFrame`` unchanged (e.g. a list of record dicts).
        pred_file: Path to a UTF-8 JSON Lines file. Every non-blank line must
            be a JSON object containing a ``"predict"`` key.

    Returns:
        pd.DataFrame: The test data with an extra ``predict`` column holding
        the predictions in file order. The number of predictions is expected
        to match the number of test records.
    """
    with open(test_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Read predictions inside a context manager so the handle is always closed,
    # and with an explicit encoding so non-ASCII labels decode the same way on
    # every platform. Blank lines (e.g. a trailing empty line) are skipped
    # instead of crashing json.loads.
    with open(pred_file, 'r', encoding='utf-8') as file:
        preds = [json.loads(line)["predict"] for line in file if line.strip()]

    submit_df = pd.DataFrame(data)
    submit_df['predict'] = preds

    return submit_df
    

def calculate_detailed_metrics(y_true, y_pred):
    """Return sklearn's per-class classification report as a dictionary.

    Args:
        y_true: Ground-truth labels, forwarded to ``classification_report``.
        y_pred: Predicted labels, forwarded to ``classification_report``.

    Returns:
        dict: The report produced with ``output_dict=True``.
    """
    return classification_report(y_true, y_pred, output_dict=True)


def calculate_metrics(y_true, y_pred):
    """Compute weighted F1, overall accuracy, weighted precision and weighted recall.

    Args:
        y_true: Ground-truth labels, forwarded to the sklearn scorers.
        y_pred: Predicted labels, forwarded to the sklearn scorers.
            NOTE(review): the original docstring described both arguments as
            List[List[int]] for a multi-label task; the element type actually
            passed by callers is not fixed by this function -- confirm.

    Returns:
        dict: Plain Python floats under the keys "f1", "accuracy",
        "precision" and "recall".
    """
    # Score each metric separately; F1, precision and recall use the
    # "weighted" average, accuracy takes no averaging argument.
    f1_value = f1_score(y_true, y_pred, average="weighted")
    accuracy_value = accuracy_score(y_true, y_pred)
    precision_value = precision_score(y_true, y_pred, average="weighted")
    recall_value = recall_score(y_true, y_pred, average="weighted")

    # Convert to built-in floats so the result holds plain Python numbers.
    scores = {}
    scores["f1"] = float(f1_value)
    scores["accuracy"] = float(accuracy_value)
    scores["precision"] = float(precision_value)
    scores["recall"] = float(recall_value)
    return scores


def get_label_pred(test_file, pred_file):
    """Load the ground-truth labels and the matching predictions.

    Args:
        test_file: Path to a UTF-8 JSON file holding a sequence of records,
            each with an ``"output"`` key (the ground-truth label).
        pred_file: Path to a UTF-8 JSON Lines file; every non-blank line must
            be a JSON object with a ``"predict"`` key.

    Returns:
        tuple: ``(labels, preds)`` -- two lists in file order.
    """
    # Context managers close both handles deterministically; the explicit
    # encoding keeps non-ASCII labels readable regardless of the platform
    # default and matches how get_result_excel reads the test file.
    with open(test_file, "r", encoding="utf-8") as file:
        test_data = json.load(file)
    labels = [item["output"] for item in test_data]

    # Skip blank lines (e.g. a trailing empty line) rather than letting
    # json.loads fail on them.
    with open(pred_file, "r", encoding="utf-8") as file:
        preds = [json.loads(line)["predict"] for line in file if line.strip()]

    return labels, preds


def cal_acc(test_file, pred_file, dlo_count=300):
    """Score the predictions overall and separately for two slices of the data.

    The samples are split by position: the first ``dlo_count`` entries form
    the "dlo" slice and the remainder form the "img" slice (the notebook's
    print statements label these "dialogue intent" and "image scene").

    Args:
        test_file: Path to the test set file containing ground-truth labels.
        pred_file: Path to the corresponding predictions file.
        dlo_count: Number of leading samples that belong to the "dlo" slice.
            Defaults to 300, the split previously hard-coded here.

    Returns:
        tuple: ``(metrics, dlo_metrics, img_metrics, dlo_report, img_report)``
        where the ``*_metrics`` entries come from ``calculate_metrics`` and
        the ``*_report`` entries from ``calculate_detailed_metrics``.
    """
    labels, preds = get_label_pred(test_file, pred_file)

    # Slice once so the split point is defined in a single place.
    dlo_labels, dlo_preds = labels[:dlo_count], preds[:dlo_count]
    img_labels, img_preds = labels[dlo_count:], preds[dlo_count:]

    metrics = calculate_metrics(y_true=labels, y_pred=preds)
    dlo_metrics = calculate_metrics(y_true=dlo_labels, y_pred=dlo_preds)
    img_metrics = calculate_metrics(y_true=img_labels, y_pred=img_preds)
    dlo_report = calculate_detailed_metrics(y_true=dlo_labels, y_pred=dlo_preds)
    img_report = calculate_detailed_metrics(y_true=img_labels, y_pred=img_preds)
    return metrics, dlo_metrics, img_metrics, dlo_report, img_report


if __name__ == "__main__":
    # Inputs: the labelled test set and the generated predictions to score.
    test_file = "./data/mire/train2.json"
    pred_file = "./saves/Qwen2-VL-7B-Instruct/lora/eval_2024-11-26-22-05-54/generated_predictions.jsonl"
    metrics, dlo_metrics, img_metrics, dlo_report, img_report = cal_acc(test_file, pred_file)

    # Per-class report for the "dlo" slice: one row per report entry,
    # rounded to two decimals and ordered by descending F1.
    df = (
        pd.DataFrame(dlo_report)
        .transpose()
        .round(2)
        .sort_values(by='f1-score', ascending=False)
    )

    # Same table for the "img" slice.
    img_df = (
        pd.DataFrame(img_report)
        .transpose()
        .round(2)
        .sort_values(by='f1-score', ascending=False)
    )

    # Test records joined with their predictions, used for the Excel export.
    result_df = get_result_excel(test_file, pred_file)

In [19]:
# Print the overall metrics ("总分" = total score) and the metrics of the
# "dlo" slice ("对话意图得分" = dialogue-intent score).
print("总分：", metrics)
print("对话意图得分：", dlo_metrics)
# Bare last expression: the notebook renders the per-class report table
# built from dlo_report.
df

总分： {'f1': 0.8912284893924897, 'accuracy': 0.894, 'precision': 0.8999297428578823, 'recall': 0.894}
对话意图得分： {'f1': 0.9454438552901191, 'accuracy': 0.9466666666666667, 'precision': 0.9528636733636734, 'recall': 0.9466666666666667}


Unnamed: 0,precision,recall,f1-score,support
反馈密封性不好,1.0,1.0,1.0,15.0
信号情况,1.0,1.0,1.0,9.0
反馈用后症状,1.0,1.0,1.0,4.0
是否易褪色,1.0,1.0,1.0,4.0
适用季节,1.0,1.0,1.0,24.0
能否调光,1.0,1.0,1.0,14.0
版本款型区别,1.0,1.0,1.0,18.0
商品材质,1.0,1.0,1.0,14.0
是否会生锈,1.0,1.0,1.0,8.0
功效功能,1.0,0.94,0.97,17.0


In [24]:
# Print the metrics of the "img" slice ("图片场景得分" = image-scene score).
print("图片场景得分：", img_metrics)
# Bare last expression: the notebook renders the per-class report table
# built from img_report.
img_df

图片场景得分： {'f1': 0.8679933325792196, 'accuracy': 0.8714285714285714, 'precision': 0.8772437726411149, 'recall': 0.8714285714285714}


Unnamed: 0,precision,recall,f1-score,support
店铺页面,1.0,1.0,1.0,30.0
实物拍摄(含售后),1.0,1.0,1.0,23.0
商品分类选项,1.0,0.95,0.97,39.0
投诉举报页面,0.97,0.97,0.97,31.0
购物车页面,0.97,0.97,0.97,37.0
商品头图,0.9,1.0,0.95,37.0
换货页面,0.92,0.96,0.94,23.0
退货页面,0.9,0.97,0.94,37.0
退款页面,0.97,0.92,0.94,36.0
评论区截图页面,0.87,1.0,0.93,27.0


In [28]:
# Export the per-sample predictions; the row index is dropped (index=False).
result_df.to_excel("result.xlsx", index=False)
# Export the two per-class report tables; the index is kept here because,
# after the transpose, it holds the report's row names (class labels).
img_df.to_excel("img_df.xlsx")
df.to_excel("dlo_df.xlsx")