# 多标签打标prompt

In [74]:
multi_label_select_system_prompt = """

    我为你提供了详细的文章类型的定义，请理解每个类型的含义，
    类型定义: {{possible_topic_info}}
    
    然后对以下输入的文章进行分析梳理，根据你分析的结果判断文章属于那个文章类型.
    分析文章选择合理类型时要注意的是：
        1、充分理解每个类型的定义
        2、在处理复杂上下文时，要充分理解上下文再做出判断
        3、避免关键词误导，不要因为聚焦在关键词上给错误的结果
        4、文章类型可以通过以下格式输出多个,但是必须符合类型定义

    通过以下json格式输出结果：
    {
        "result": [
            {
                "topic_relate_analysis":"选择得到合理文章类型的分析过程",
                "reasonable_topic": "第一个符合类型定义的类型名称，如果类型定义中没有符合的类型，则设置为 其他 ",
                "confidence": "你对判断结果的置信度级别(高, 中, 低, 无), 只能从中选择,无表示不相关，低表示较低相关度,高表示高相关度"
            },
            {
                "topic_relate_analysis":"选择得到合理文章类型的分析过程",
                "reasonable_topic": "第二个符合类型定义的类型名称，如果类型定义中没有符合的类型，则设置为 其他 ",
                "confidence": "你对判断结果的置信度级别(高, 中, 低, 无), 只能从中选择,无表示不相关，低表示较低相关度,高表示高相关度"
            }
        ]
    }
    
"""

input_user_message = """
    input:
        {{content}}
    output:
"""

# 通过AsyncOpenAI方式调用deepseek，进行打标

In [21]:
# deepseek需要 openai==1.*版本
import asyncio
import pandas as pd
import numpy as np
import re
import time
import json
from tqdm import tqdm
import datetime
import math
import openai
import copy
from openai import AsyncOpenAI
DEEPSEEK_API_KEY = "sk-f9cccc9420dd49dba5c78f88a466e0f1"
DEREPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL = "deepseek-chat"


deepseek_client = AsyncOpenAI(api_key=DEEPSEEK_API_KEY, base_url = DEREPSEEK_BASE_URL)
async def deepseek_caller(system_message, user_message):
    try:
        response = await deepseek_client.chat.completions.create(
            model=DEEPSEEK_MODEL,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            temperature=0.7,
        )
        response_parse = response.choices[0].message.content
        #print("******* response *******\n", response_parse)
        return response_parse
    except Exception as e:
        print("******* response *******\n", e, "ERROR")
        return np.nan

In [75]:
async def parse_response(response):
    json_str = re.search(r'\{.*\}', response, re.DOTALL)
    # print(json_str)
    if json_str is None:
        return None
    else:
        # 将JSON字符串转换为Python字典
        result = json.loads(json_str.group(0))
        return result
    
    
async def single_row_process(docid, system_message, user_message):
    # print(system_message, user_message)
    response = await deepseek_caller(system_message, user_message)
    if isinstance(response, str):
        result = await parse_response(response)
    elif isinstance(response, dict):
        result = response
    else:
        result = None
    # print(f"{docid}&&{topic_name}", response, result)
    return docid, result

In [103]:
async def multi_label_tag_main(data, topic_define_dict, system_template, user_template, batch_size=20):
    start_time = datetime.datetime.now() # start timing

    docid2evalres = {}
    llm_response_list = []
    # batch_size = 120
    for i in range(math.ceil(data.shape[0] / batch_size)):
        tasks = []
        for index, row in data[i*batch_size:(i+1)*batch_size].iterrows():
            if not isinstance(row['content'], float):
                tasks.append(asyncio.create_task(single_row_process(
                            docid=row['docid'],
                            system_message=system_template.replace("{{possible_topic_info}}", str(topic_define_dict)),
                            user_message=user_template.replace("{{content}}", row['content'])
                        )))
    
        for completed_task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc=f"Awaiting tasks: {i*batch_size} - {(i+1)*batch_size}"):
            try:
                _id, result = await completed_task
                llm_response_list.append({
                    "docid":_id,
                    "result": result["result"]
                })
            except Exception as e:
                print(e)
    end_time = datetime.datetime.now() # start timing
    print(f"开始时间：{start_time},  结束时间：{end_time}, 花费时间：{end_time - start_time}s")
    return llm_response_list  

# 将多标签打标结果解析并合并在原数据中

In [100]:
def parse_tag_result_and_merge_origin_data(origin_data:pd.DataFrame, gpt_tag_res_df:pd.DataFrame, merge_cols=['docid']):
    origin_data_add_tag_res = origin_data.merge(gpt_tag_res_df, on=merge_cols, how='left')
    print(f"origin_data_add_tag_res: {origin_data_add_tag_res.shape}")
    expand_list = []
    for index, row in origin_data_add_tag_res.iterrows():
        single_row = dict(row)
        results = single_row['result']
        # is_exist_topic = set()
        if isinstance(results, list):
            for single_topic in results:
                single_row_backup = copy.copy(single_row)
                single_row_backup['topic_relate_analysis'] = single_topic["topic_relate_analysis"]
                single_row_backup['reasonable_topic'] = single_topic["reasonable_topic"]
                single_row_backup['confidence'] = single_topic["confidence"]
                
                if results.__len__() == 1:
                    expand_list.append(single_row_backup)
                elif results.__len__() > 1:
                    if single_topic["reasonable_topic"] != "其他":
                        expand_list.append(single_row_backup)
                    else:
                        continue
                    
    print(f"扩展之后的数据条数： {len(expand_list)}")
    return pd.DataFrame(expand_list)
            
            

# 读取分类体系的标签定义

In [55]:
topic_define_df = pd.read_csv('topic_define.csv')
topic_define_dict = dict(zip(topic_define_df['topic_name'].tolist(), topic_define_df['topic_define'].tolist()))
topic_define_dict

{'纯招聘类': '纯招聘类是指以招聘职位，职位描述，薪资，待遇，要求等。也包括招聘工人。',
 '促销广告、打折信息类': '促销广告是指为了促进销售而发布的广告，通常包括打折、赠品、抽奖等促销活动信息，打折信息类是指商家为了促销而对商品或服务进行的降价优惠活动，通常以折扣比例或具体金额的方式呈现，例如“全场8折”、“满100元减20元”等.包括有明确的折扣，价格， 优惠券， 折扣，卖多少送多少，赠品，返现，礼包）',
 '大众科普，概念解释类': '大众科普是指将专有名词或者某种概念、知识传播给广大公众，它旨在让公众理解专有名词，增加概念含义的认知。该解释是能获得群众普遍认可的，不能带有个人的主观观点和思考偏向',
 '好物推荐、直播预告类': '好物推荐是实质用户根据根据使用心得推荐一些优质的产品或服务，店铺，影音书籍等，不能有明确的打折信息， 广告促销的内容，主要围绕讲使用，购买经历，晒单，薅羊毛等。直播预告类是指提前宣传即将进行的直播活动，包括时间、主题、嘉宾等信息，吸引观众关注并提前做好准备',
 '租房售房广告': '租房售房广告是指房东、个人、开发商或中介发布的房屋的租售广告，通常包括房屋的位置、面积、租金、房屋设施等信息。',
 '灌水水贴': '小说，段子，故事分享，心情分享，明星打call，八卦讨论， 音乐，影视分享，心灵鸡汤等内容',
 '股市、债市动态': '大宗商品价格,期货行情波动播报,包括大盘 基金综述, 大盘,板块儿等以及个股的涨跌数字的堆积复述,不包括个人或者机构的点评分析',
 '公司公告': '通常是为了向员工、股东、客户或其他相关方面传达特定的信息或消息。公司官方公告可以涉及各种主题，例如公司业务、重要事件、财务状况、人员变动、政策变化、市场趋势',
 '公司财报': '公司财报是指公司在一定时间内的财务状况和经营业绩的报告，通常包括资产负债表、利润表和现金流量表等内容',
 '汽车经销商新闻': '汽车商家以及汽车相关的新闻报道，如新车上市、参数配置、销售数据以及车市动态等',
 '维修服务': '商品产品的维修服务类信息宣传相关内容',
 '提问求答鉴定类': '发表提问，求专业知识解答与物品真伪鉴定，互动交流解疑惑',
 '假冒仿制商品': '在设计、外观或功能上模仿或参照知名品牌、高端产品，但在品牌标识、材料或工艺上

# 读取数据并进行多标签打标

## 假冒仿制商品处理

In [7]:
fangzhi_good_eval_res = pd.read_excel("./data/new_data_tag_res/假冒仿制商品验证结果.xlsx")
fangzhi_good_eval_res.shape, fangzhi_good_eval_res.columns

((24743, 18),
 Index(['Unnamed: 0', '序号', 'docid', '标题', '内文', '媒体名称', '版面', '出版日期', '媒体类型',
        '作者', '情感', '聚类编号', '原文链接', '命中关键字', 'spam', 'content',
        'detail_anlysis', 'is_recruitment'],
       dtype='object'))

In [8]:
need_multi_label_tag_fangshi_good_df = fangzhi_good_eval_res[fangzhi_good_eval_res['is_recruitment']=='no']
need_multi_label_tag_fangshi_good_df.shape

(2782, 18)

In [44]:
multi_label_tag_res = await multi_label_tag_main(
    data=need_multi_label_tag_fangshi_good_df, 
    topic_define_dict=topic_define_dict, 
    system_template=multi_label_select_system_prompt, 
    user_template=input_user_message, 
    batch_size=200)

Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.67it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:51<00:00,  3.87it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.72it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.66it/s]
Awaiting tasks: 100%|██████████| 199/199 [00:19<00:00, 10.01it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:48<00:00,  4.15it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:58<00:00,  3.40it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.58it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:21<00:00,  9.33it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.34it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.79it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:22<00:00,  8.77it/s]
Awaiting tasks: 100%|██████████| 200/200 [01:25<00:00,  2.34it/s]
Awaiting tasks: 100%|██████████| 182/182 [00:50<00:00,  3.64it/s]

开始时间：2024-06-25 18:41:53.090917,  结束时间：2024-06-25 18:49:44.811048, 花费时间：0:07:51.720131s





In [49]:
fangzhi_goods_tag_res = parse_tag_result_and_merge_origin_data(need_multi_label_tag_fangshi_good_df, pd.DataFrame(multi_label_tag_res))


origin_data_add_tag_res: (2782, 19)
扩展之后的数据条数： 5567


In [50]:
fangzhi_goods_tag_res.head()

Unnamed: 0.1,Unnamed: 0,序号,docid,标题,内文,媒体名称,版面,出版日期,媒体类型,作者,...,原文链接,命中关键字,spam,content,detail_anlysis,is_recruitment,result,topic_relate_analysis,reasonable_topic,confidence
0,1,2,3b992cfc6d39f59c0a007d0d38741994,"游戏:流氓软件 平台:steam(未发售,有试玩demo) 游戏里完美复刻我们现实中安装软件...","\n\n 游戏:流氓软件 \n\n 平台:steam(未发售,有试玩demo) \...",TapTap,动态-推荐,2024-06-10 23:59:59,论坛,平A流小碗熊,...,https://www.taptap.cn/moment/549330666517433979,复刻,True,"游戏:流氓软件 平台:steam(未发售,有试玩demo) 游戏里完美复刻我们现实中安装软件...",文章描述了一款名为'流氓软件'的游戏，该游戏在Steam平台上有试玩demo但尚未正式发售。...,no,[{'topic_relate_analysis': '文章主要介绍了一款名为‘流氓软件’的...,文章主要介绍了一款名为‘流氓软件’的游戏，该游戏在Steam平台上提供试玩demo，内容涉及...,好物推荐、直播预告类,高
1,1,2,3b992cfc6d39f59c0a007d0d38741994,"游戏:流氓软件 平台:steam(未发售,有试玩demo) 游戏里完美复刻我们现实中安装软件...","\n\n 游戏:流氓软件 \n\n 平台:steam(未发售,有试玩demo) \...",TapTap,动态-推荐,2024-06-10 23:59:59,论坛,平A流小碗熊,...,https://www.taptap.cn/moment/549330666517433979,复刻,True,"游戏:流氓软件 平台:steam(未发售,有试玩demo) 游戏里完美复刻我们现实中安装软件...",文章描述了一款名为'流氓软件'的游戏，该游戏在Steam平台上有试玩demo但尚未正式发售。...,no,[{'topic_relate_analysis': '文章主要介绍了一款名为‘流氓软件’的...,文章中虽然提到了游戏的具体内容和玩法，但同时也涉及到了现实中软件安装的陷阱和问题，这可以被视...,大众科普，概念解释类,中
2,4,5,df12065f96aa97888a5c376001bd9b7a,“复刻釉”究竟为何物,"\n\n在2024年佛山陶博会、潭州陶瓷展上,有关于“复刻釉”的宣传铺天盖地,尤其在广东、江...",微信,山东地王实业集团,2024-06-10 23:59:52,微信,山东地王实业集团,...,http://mp.weixin.qq.com/s?__biz=MzA4NTcxMzkzMA...,复刻,True,"“复刻釉”究竟为何物\n\n在2024年佛山陶博会、潭州陶瓷展上,有关于“复刻釉”的宣传铺天...",文章主要介绍了'复刻釉'瓷砖的技术和应用，描述了其生产工艺、特点以及市场反馈。文章中提到的'...,no,[{'topic_relate_analysis': '文章主要介绍了复刻釉瓷砖的技术原理、...,文章主要介绍了复刻釉瓷砖的技术原理、特点及其在陶瓷行业的应用和市场反馈。内容涉及复刻釉的定义...,大众科普，概念解释类,高
3,4,5,df12065f96aa97888a5c376001bd9b7a,“复刻釉”究竟为何物,"\n\n在2024年佛山陶博会、潭州陶瓷展上,有关于“复刻釉”的宣传铺天盖地,尤其在广东、江...",微信,山东地王实业集团,2024-06-10 23:59:52,微信,山东地王实业集团,...,http://mp.weixin.qq.com/s?__biz=MzA4NTcxMzkzMA...,复刻,True,"“复刻釉”究竟为何物\n\n在2024年佛山陶博会、潭州陶瓷展上,有关于“复刻釉”的宣传铺天...",文章主要介绍了'复刻釉'瓷砖的技术和应用，描述了其生产工艺、特点以及市场反馈。文章中提到的'...,no,[{'topic_relate_analysis': '文章主要介绍了复刻釉瓷砖的技术原理、...,虽然文章中提到了复刻釉瓷砖的生产和市场情况，但并未涉及具体的促销广告、打折信息或价格优惠等内...,其他,无
4,17,18,46d5daeb9adbd5d2c4acf77ab7cfe995,"哇,这次概念有点意思欸,在现实生活中大家拥有复刻般的的素淡白色外貌,躺在病床上,在濒死前看着...","哇,这次概念有点意思欸,在现实生活中大家拥有复刻般的的素淡白色外貌,躺在病床上,在濒死前看着...",新浪微博,,2024-06-10 23:59:20,微博,海底小甜,...,http://weibo.com/7770420064/OipbRl7Kd,复刻,True,"哇,这次概念有点意思欸,在现实生活中大家拥有复刻般的的素淡白色外貌,躺在病床上,在濒死前看着...",文章描述了一个关于生死、现实与虚拟交织的场景，其中提到了'复刻般的素淡白色外貌'，但这并不是...,no,[{'topic_relate_analysis': '文章描述了一个关于生死、现实与虚拟交...,文章描述了一个关于生死、现实与虚拟交错的概念，通过病床、呼吸机、心电图仪等元素，构建了一个关...,大众科普，概念解释类,高


In [51]:
fangzhi_goods_tag_res.to_excel("./data/new_data_tag_res/仿货混淆数据.xlsx", index=False)

## 维修数据处理

In [53]:
fix_service_eval_res = pd.read_excel("./data/new_data_tag_res/维修服务验证结果.xlsx")
fix_service_eval_res.shape, fix_service_eval_res.columns

((13136, 17),
 Index(['docid', '标题', '内文', '媒体名称', '版面', '出版日期', '媒体类型', '作者', '情感', '聚类编号',
        '原文链接', '命中关键字', 'spam', 'content', 'detail_anlysis', 'is_belong',
        'sign'],
       dtype='object'))

In [54]:
need_multi_label_tag_fix_service_df = fix_service_eval_res[fix_service_eval_res['is_belong']=='no']
need_multi_label_tag_fix_service_df.shape

(11263, 17)

In [68]:
multi_label_select_system_prompt

'\n\n    我为你提供了详细的文章类型的定义，请理解每个类型的含义，\n    类型定义: {{possible_topic_info}}\n    \n    然后对以下输入的文章进行分析梳理，根据你分析的结果判断文章属于那个文章类型.\n    分析文章选择合理类型时要注意的是：\n        1、充分理解每个类型的定义\n        2、在处理复杂上下文时，要充分理解上下文再做出判断\n        3、避免关键词误导，不要因为聚焦在关键词上给错误的结果\n        4、文章类型可以通过以下格式输出多个,但是必须符合类型定义\n\n    通过以下json格式输出结果：\n    {\n        "result": [\n            {\n                "topic_relate_analysis":"选择得到合理文章类型的分析过程",\n                "reasonable_topic": "第一个合理文章类型的名称，如果没有合理的类型，则设置为 其他 ",\n                "confidence": "你对判断结果的置信度级别(高, 中, 低, 无), 只能从中选择,无表示不相关，低表示较低相关度,高表示高相关度"\n            },\n            {\n                "topic_relate_analysis":"选择得到合理文章类型的分析过程",\n                "reasonable_topic": "第二个合理的文章类型名称，如果没有合理的类型，则设置为 其他 ",\n                "confidence": "你对判断结果的置信度级别(高, 中, 低, 无), 只能从中选择,无表示不相关，低表示较低相关度,高表示高相关度"\n            }\n        ]\n    }\n    \n'

In [59]:
fix_service_multi_label_tag_res = await multi_label_tag_main(
    data=need_multi_label_tag_fix_service_df, 
    topic_define_dict=topic_define_dict, 
    system_template=multi_label_select_system_prompt, 
    user_template=input_user_message, 
    batch_size=200
)

Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.89it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.63it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.40it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:47<00:00,  4.22it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.03it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:24<00:00,  8.25it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.73it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:40<00:00,  4.88it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.90it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.93it/s]
Awaiting tasks:  58%|█████▊    | 116/200 [00:12<00:01, 50.96it/s]

Expecting ',' delimiter: line 4 column 60 (char 87)


Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.55it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.18it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.06it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:50<00:00,  3.99it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.70it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.67it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:21<00:00,  9.15it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.70it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.96it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:23<00:00,  8.37it/s]
Awaiting tasks:   0%|          | 1/200 [00:01<04:33,  1.37s/it]

******* response *******
 Error code: 400 - {'detail': 'Content Exists Risk'} ERROR
'NoneType' object is not subscriptable


Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.93it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.49it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.28it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.17it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:59<00:00,  3.34it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.06it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:25<00:00,  7.90it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.52it/s]
Awaiting tasks: 100%|██████████| 200/200 [03:29<00:00,  1.05s/it]
Awaiting tasks:   0%|          | 0/200 [00:00<?, ?it/s]

Expecting ',' delimiter: line 252 column 10 (char 10415)


Awaiting tasks: 100%|██████████| 200/200 [00:48<00:00,  4.10it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:24<00:00,  8.23it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.25it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.19it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.08it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.49it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.06it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:51<00:00,  3.89it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.71it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.05it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 12.92it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.91it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:26<00:00,  7.69it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.96it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.27it/s]
Awaiting t

Expecting ',' delimiter: line 4 column 49 (char 76)


Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.16it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.04it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.76it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:27<00:00,  7.18it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.91it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.65it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:24<00:00,  8.05it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.25it/s]
Awaiting tasks: 100%|██████████| 63/63 [00:15<00:00,  4.10it/s]

开始时间：2024-06-26 11:46:58.085161,  结束时间：2024-06-26 12:11:06.634591, 花费时间：0:24:08.549430s





In [60]:
fix_service_multi_label_tag_res.__len__()

11259

In [66]:
fix_service_add_multi_label_df = parse_tag_result_and_merge_origin_data(need_multi_label_tag_fix_service_df, pd.DataFrame(fix_service_multi_label_tag_res))


origin_data_add_tag_res: (11263, 18)
扩展之后的数据条数： 18473


In [67]:
fix_service_add_multi_label_df.to_excel("./data/new_data_tag_res/维修混淆数据.xlsx", index=False)

# 跨境代购  多标签打标

In [70]:
cross_border_eval_res = pd.read_excel("./data/new_data_tag_res/跨境代购验证结果.xlsx")

In [71]:
need_multi_label_tag_cross_border_df = cross_border_eval_res[cross_border_eval_res['is_belong']=='no']
need_multi_label_tag_cross_border_df.shape

(9764, 22)

In [99]:
cross_border_multi_label_tag_res = await multi_label_tag_main(
    data=need_multi_label_tag_cross_border_df, 
    topic_define_dict=topic_define_dict, 
    system_template=multi_label_select_system_prompt, 
    user_template=input_user_message, 
    batch_size=200
)

Awaiting tasks:   2%|▎         | 5/200 [00:04<10:28,  3.22s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx<

Awaiting tasks:   4%|▍         | 9/200 [00:04<05:08,  1.62s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx<

Awaiting tasks:   6%|▌         | 12/200 [00:05<03:37,  1.16s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks:   7%|▋         | 14/200 [00:05<02:41,  1.15it/s]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks: 100%|██████████| 200/200 [00:21<00:00,  9.31it/s]
Awaiting tasks:   0%|          | 1/200 [00:04<13:53,  4.19s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks:   2%|▏         | 3/200 [00:04<09:43,  2.96s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks:   2%|▎         | 5/200 [00:04<06:52,  2.12s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks: 100%|██████████| 200/200 [00:43<00:00,  4.63it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:24<00:00,  8.11it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.01it/s]
Awaiting tasks:   0%|          | 1/200 [00:04<13:42,  4.13s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks:   2%|▏         | 3/200 [00:04<07:03,  2.15s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable
******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks:   2%|▏         | 4/200 [00:04<05:07,  1.57s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks: 100%|██████████| 200/200 [00:27<00:00,  7.22it/s]
Awaiting tasks:   0%|          | 1/200 [00:04<15:14,  4.60s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks:   1%|          | 2/200 [00:05<11:04,  3.36s/it]

******* response *******
 <html>
<head><title>502 Bad Gateway</title></head>
<body>
<center><h1>502 Bad Gateway</h1></center>
<hr><center>nginx</center>
</body>
</html> ERROR
'NoneType' object is not subscriptable


Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.94it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:22<00:00,  9.08it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:59<00:00,  3.36it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.28it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.32it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.02it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.20it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.46it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.56it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.58it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:23<00:00,  8.61it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.58it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.25it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:27<00:00,  7.34it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.25it/s]
Awaiting t

Expecting ',' delimiter: line 9 column 53 (char 308)


Awaiting tasks: 100%|██████████| 164/164 [00:55<00:00,  2.94it/s]

开始时间：2024-06-26 17:12:04.757156,  结束时间：2024-06-26 17:36:56.167273, 花费时间：0:24:51.410117s





In [88]:
# cross_border_multi_label_tag_res[50:100]

In [104]:
cross_border_add_multi_label_df = parse_tag_result_and_merge_origin_data(need_multi_label_tag_cross_border_df, pd.DataFrame(cross_border_multi_label_tag_res))


origin_data_add_tag_res: (9764, 23)
扩展之后的数据条数： 17881


In [105]:
cross_border_add_multi_label_df.to_excel("./data/new_data_tag_res/跨境购物混淆数据.xlsx", index=False)

In [92]:
pd.DataFrame(cross_border_multi_label_tag_res).shape, need_multi_label_tag_cross_border_df.shape

((11260, 2), (9764, 22))

In [93]:
need_multi_label_tag_cross_border_df.merge(pd.DataFrame(cross_border_multi_label_tag_res), on=['docid'], how='inner').shape

(5, 23)

In [98]:
docids = pd.DataFrame(cross_border_multi_label_tag_res)['docid'].tolist()
need_multi_label_tag_cross_border_df[need_multi_label_tag_cross_border_df['docid'].isin(docids)].shape 

(5, 22)

In [97]:
need_multi_label_tag_cross_border_df.dtypes

序号                      int64
docid                  object
标题                     object
内文                     object
媒体名称                   object
版面                     object
出版日期                   object
媒体类型                   object
作者                     object
情感                     object
聚类编号                   object
原文链接                   object
命中关键字                  object
spam                     bool
content                object
label_id                int64
label_name             object
label_probability     float64
top3_label_entropy    float64
label_probs            object
detail_anlysis         object
is_belong              object
dtype: object