In [1]:
import os
import pickle
import re
from opencc import OpenCC
from seqeval.metrics import classification_report

# OpenCC converters ('t2s': Traditional -> Simplified, 's2t': Simplified -> Traditional).
# process_data uses s2t to convert the model's entity strings to Traditional Chinese;
# t2s is not referenced in the cells visible here.
t2s = OpenCC('t2s')
s2t = OpenCC('s2t')
# Entity strings that process_data discards after splitting the model output
# (the empty string and a single space).
remove_list = ["", " "]

def remove_brackets(data):
    """Delete bracketed segments (brackets included) from a string or list of strings.

    Recognised pairs are (), [], {}, <> and 【】; each match is non-greedy, so
    the shortest span between an opener and its closer is removed. Full-width
    parentheses （） are not part of the pattern and are left untouched.

    Args:
        data: A single string, or a list of strings.

    Returns:
        The cleaned string, or a new list of cleaned strings in the same order.

    Raises:
        ValueError: If ``data`` is neither a ``str`` nor a ``list``.
    """
    bracketed = re.compile(r'\(.*?\)|\[.*?\]|\{.*?\}|<.*?>|【.*?】')

    def strip_one(text):
        return bracketed.sub('', text)

    if isinstance(data, list):
        return list(map(strip_one, data))
    if isinstance(data, str):
        return strip_one(data)
    raise ValueError("Input must be a string or a list of strings.")
    
def split_chinese_english(input_list):
    """Split mixed Chinese/Latin strings into separate runs and de-duplicate.

    A string is split only when a CJK character (U+4E00-U+9FFF) sits directly
    next to an ASCII letter. Such a string is replaced by its maximal runs of
    CJK characters or ASCII letters; any other characters in it (digits,
    spaces, punctuation) are dropped. Strings without such a boundary are kept
    unchanged.

    Args:
        input_list: Iterable of strings.

    Returns:
        A list of the resulting pieces, in first-seen order, with duplicates
        removed across the whole input.
    """
    boundary = re.compile(r'[\u4e00-\u9fff][a-zA-Z]|[a-zA-Z][\u4e00-\u9fff]')
    runs = re.compile(r'[\u4e00-\u9fff]+|[a-zA-Z]+')

    unique_pieces = []
    seen = set()
    for text in input_list:
        pieces = runs.findall(text) if boundary.search(text) else [text]
        for piece in pieces:
            if piece in seen:
                continue
            seen.add(piece)
            unique_pieces.append(piece)
    return unique_pieces

def generate_answer(context, ent):
    """Build a per-character B/I/O tag list marking where entities occur in ``context``.

    Args:
        context: The text to tag; one tag is produced per character.
        ent: List of entity strings to locate in ``context``.

    Returns:
        A list of ``len(context)`` tags. For each entity found, the first
        character of its first occurrence is tagged ``'B'`` and the remaining
        characters ``'I'``; everything else is ``'O'``. If ``ent`` contains the
        element ``"無"`` the result is all ``'O'``. Entities that do not occur
        in ``context`` are ignored. Only the first occurrence of each entity is
        tagged, and a later entity overwrites tags written by an earlier one.
    """
    answer = ['O'] * len(context)
    if "無" in ent:
        return answer

    for e in ent:
        if not e:
            # str.find("") returns 0, which would wrongly tag position 0 as 'B'
            # (and raise IndexError when context is empty), so skip empties.
            continue
        start_idx = context.find(e)
        if start_idx == -1:
            continue
        answer[start_idx] = 'B'
        # Same-length slice assignment: the match lies fully inside context.
        answer[start_idx + 1:start_idx + len(e)] = ['I'] * (len(e) - 1)

    return answer

def remove_subsets(lst):
    """Drop every string that is a proper substring of another string in ``lst``.

    An entry is removed when some different entry contains it. Equal entries do
    not eliminate each other, so duplicates survive. Order is preserved.

    Args:
        lst: List of strings.

    Returns:
        A new list containing only the entries not contained in another entry.
    """
    def is_inside_another(candidate):
        for other in lst:
            if candidate != other and candidate in other:
                return True
        return False

    return [entry for entry in lst if not is_inside_another(entry)]

def process_data(data_list):
    """Convert loaded model conversations into tagged records for the brand question.

    For each item, the text to tag is read from ``data[0]['context']`` and the
    model's reply from ``data[-1]['content']``. The reply is split on "、" into
    candidate entities, which are then cleaned and located in the context.

    Args:
        data_list: Iterable of items; each item must support ``[0]`` and
            ``[-1]`` indexing, with a ``'context'`` key on the first element
            and a ``'content'`` key on the last.

    Returns:
        A list of dicts with keys ``'context'`` (the text), ``'question'``
        (always ``'品牌'``) and ``'answer'`` (the tag list produced by
        ``generate_answer``).
    """
    result_data = []
    for data in data_list:
        last_content = data[-1]['content']
        p_name = data[0]['context']

        ent = last_content.split("、")
        ent = [e for e in ent if e not in remove_list]  # drop empty pieces
        ent = [s2t.convert(e) for e in ent]  # Simplified -> Traditional Chinese
        ent = remove_brackets(ent)  # strip bracketed segments
        # Bracket removal can reduce an entity to "" or " " (e.g. a reply that is
        # entirely bracketed), so filter again before tagging; otherwise an empty
        # entity would be "found" at index 0 of the context.
        ent = [e for e in ent if e not in remove_list]

        ent = split_chinese_english(ent)  # separate Chinese and Latin runs
        ent = remove_subsets(ent)  # drop entities contained in another entity

        answer = generate_answer(p_name, ent)

        result_data.append({
            'context': p_name,
            'question': '品牌',
            'answer': answer
        })
    return result_data

directory_path = "./results_train_11000_brand_prompt_2_7B_fp16"

data_list = []

# Load every brand ("品牌") result pickle found in the directory. A file that
# fails to load is reported and skipped so the remaining files are still read.
# NOTE(review): pickle.load executes arbitrary code; only point this at trusted files.
for filename in os.listdir(directory_path):
    if not (filename.endswith(".pkl") and "品牌" in filename):
        continue

    file_path = os.path.join(directory_path, filename)
    try:
        with open(file_path, "rb") as file:
            data = pickle.load(file)
    except Exception as e:
        print(f"讀取檔案 {filename} 時發生錯誤: {e}")
    else:
        data_list.append(data)


In [2]:
# Turn every loaded conversation into a {'context', 'question', 'answer'} record.
result = process_data(data_list)

In [3]:
def find_answer(data, context, question):
    """Look up the answer stored for a (context, question) pair.

    Args:
        data: Iterable of dicts with ``'context'``, ``'question'`` and
            ``'answer'`` keys.
        context: Context value to match exactly.
        question: Question value to match exactly.

    Returns:
        The ``'answer'`` of the first matching entry, or ``None`` when no
        entry matches.
    """
    return next(
        (
            entry['answer']
            for entry in data
            if entry['context'] == context and entry['question'] == question
        ),
        None,  # no matching entry
    )


# Load the reference annotations that predictions are scored against.
gp_file_path = './data/train.pickle'
with open(gp_file_path, 'rb') as file:
    gp_data = pickle.load(file)

y_pred, y_true = [], []

# Pair each prediction with its reference by (context, question). Predictions
# with no reference are reported and left out of the score; mismatching pairs
# are printed for inspection.
for pred in result:
    context, question, answer = pred['context'], pred['question'], pred['answer']
    true_answer = find_answer(gp_data, context, question)

    if true_answer is None:
        print(f"找不到匹配的答案: {context} {question}")
        continue

    y_pred.append(answer)
    y_true.append(true_answer)
    if answer != true_answer:
        print(f"預測答案: {context}, 屬性: {question}, 答案: {answer}")
        print(f"真實答案: {context}, 屬性: {question}, 答案: {true_answer}")
        print("=" * 150)


print(classification_report(y_true, y_pred, digits = 4, mode='strict'))

預測答案: 【1stChoice 瑪丁】無穀低敏鴨肉成貓（皮膚腸胃敏感貓用）2.72kg, 屬性: 品牌, 答案: ['O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
真實答案: 【1stChoice 瑪丁】無穀低敏鴨肉成貓（皮膚腸胃敏感貓用）2.72kg, 屬性: 品牌, 答案: ['O', 'B', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'O', 'B', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
預測答案: STEIFF德國金耳釦泰迪熊 - Issy Donkey 驢子 (動物王國), 屬性: 品牌, 答案: ['B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
真實答案: STEIFF德國金耳釦泰迪熊 - Issy Donkey 驢子 (動物王國), 屬性: 品牌, 答案: ['B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'B', 'I', 'I', 'I', 'I', 'I', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
預測答案

In [4]:
# Print each item's context followed by the model's final reply, separated by a rule.
for item in data_list:
    first_message, last_message = item[0], item[-1]
    print(first_message['context'])
    print(last_message['content'])
    print("=" * 150)


Y型防鏽晾衣架 
无
【1stChoice 瑪丁】無穀低敏鴨肉成貓（皮膚腸胃敏感貓用）2.72kg
1stChoice
【我愛中華筆莊】萬用羊毛刷-圓頭(3入2組) 特價150元
我爱中华笔庄
【BILLY KING 貝麗晶】天然珍珠鋯石項鍊(NP850)
BILLY KING、贝丽晶
PLAYBOY- NY Apartment 紐約公寓系列 袋式斜背包-灰色
PLAYBOY
STEIFF德國金耳釦泰迪熊 - Issy Donkey 驢子 (動物王國)
STEIFF
NIKE-大童超輕量運動鞋E555090501
NIKE
YES HTC HD7 超高容量防爆( 2入電池 )
YES
【LOTTO】女 Slim Fit 美型健走鞋(深藍-LT2AWR6626)
LOTTO
AQUATEC MK-600 側邊視窗潛水面鏡 藍框透明矽膠
AQUATEC
HIKARI．ST 打蛋器
HIKARI．ST
RedMoon SONY Xperia 1 IV 9H厚版玻璃鏡頭保護貼 手機鏡頭貼 9H玻璃保貼 2入
RedMoon
【海夫健康生活館】恆伸 鋁合金 有靠背 扶手可拆 洗澡椅 沐浴椅(ER5004)
海夫健康生活馆、恒伸
Hi Fasion HTC One M9 熊出沒-發花熊 立體壓紋 硬式超薄保護殼
Hi Fasion
【Caswell-Massey凱瑪氏】歐鈴蘭香氛身體乳(240ml)
Caswell-Massey、凯玛氏
洋彩紅地錦上添花印章
无、
Timerberwolf 草本魔力-自然無穀物天然配方 16.5磅 * 1包
Timerberwolf
ASUS EPad TF700(T) 系列 10.1吋 一指無紋抗刮(霧面)機身正面貼
ASUS
【CHIUCHIU】Google Pixel 3a (5.6吋)復古質感犀牛紋雙卡層可夾式保護皮套
CHIUCHIU
Maya Duxton 智慧型充電器 (809-4)
Maya Duxton
長版風衣
无、
Wincent 7A XL 胡桃木鼓棒
Wincent
【SWAROVSKI 施華洛世奇】Creativity Circle 玫瑰金項鍊
施华洛世奇
【HALLMARK】凱蒂聯名款雙面托特包-咖啡HLKT14B149BN
HALLMARK
【Cap】伸縮摺疊帶蓋咖啡杯395ml(灰色)
Cap
月光下的十字架（