# openai==1.35可以正常使用

In [1]:
# deepseek需要 openai==1.*版本
import asyncio
import pandas as pd
import numpy as np
import re
import time
import json
from tqdm import tqdm
import openai
from openai import AsyncOpenAI
DEEPSEEK_API_KEY = "sk-f9cccc9420dd49dba5c78f88a466e0f1"
DEREPSEEK_BASE_URL = "https://api.deepseek.com"
DEEPSEEK_MODEL = "deepseek-chat"


deepseek_client = AsyncOpenAI(api_key=DEEPSEEK_API_KEY, base_url = DEREPSEEK_BASE_URL)
async def deepseek_caller(system_message, user_message):
    try:
        response = await deepseek_client.chat.completions.create(
            model=DEEPSEEK_MODEL,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_message},
            ],
            temperature=0.7,
        )
        response_parse = response.choices[0].message.content
        #print("******* response *******\n", response_parse)
        return response_parse
    except Exception as e:
        print("******* response *******\n", e, "ERROR")
        return np.nan


# GPT4o 异步处理过程

In [7]:
async def parse_response(response):
    json_str = re.search(r'\{.*\}', response, re.DOTALL)
    # print(json_str)
    if json_str is None:
        return None
    else:
        # 将JSON字符串转换为Python字典
        result = json.loads(json_str.group(0))
        return result
    
    
async def single_row_process(docid, content, topic_name, topic_define, system_message, user_message):
    response = await deepseek_caller(system_message, user_message)
    result = await parse_response(response)
    # print(f"{docid}&&{topic_name}", response, result)
    return f"{docid}&&{topic_name}", result

In [10]:
import math
async def tag_main(data, system_message, user_message):
    start_time = time.time() # start timing
    total_rows = data.shape[0]
    
    count = 1
    batch_size = 30
    start_row = 0

    tasks = []
    
    llm_response_list = []

    for i, row in tqdm(data.iloc[start_row:].iterrows(), total=total_rows - start_row, desc="Processing rows"):
        embedding_recall_topics = row['embedding_recall_topics']
        if isinstance(embedding_recall_topics, list) and len(embedding_recall_topics) > 0:
            for topic_name in row['embedding_recall_topics']:
                task = asyncio.create_task(single_row_process(
                    docid=row['docid'],
                    content=row['content'], 
                    topic_name=topic_name, 
                    topic_define=topic_define_dict[topic_name]
                ))
                tasks.append(task)


        # Await the completion of current batch tasks
        for completed_task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Awaiting tasks"):
            try:
                _id, result = await completed_task
                # print(_id, result, type(result))
                docid, topic_name = _id.split("&&")[0], _id.split("&&")[1]
                llm_response_list.append({
                    "docid":docid,
                    "topic_name": topic_name,
                    "confidence": result["confidence"],
                    "topic_is_right": result["topic_is_right"],
                    "explanation": result["explanation"]
                })
            except Exception as e:
                print("******* response *******\n", result, e, "ERROR")



        count += 1
        tasks.clear()  # Clear the tasks list after processing a batch

    end_time = time.time()  # End timing
    elapsed_time = end_time - start_time
    print("Elapsed time:", elapsed_time, "seconds")
    return llm_response_list

# 测试4o从多个标签中选择一个最正确的标签的效果

In [11]:
# 这个过程必须是openai==0.28
import wiserspromptlayer

openai = wiserspromptlayer.openai
openai.api_type = "azure"
openai.api_base = "https://openai-test-south-central-us.openai.azure.com/"
openai.api_version = "2024-02-01"
openai.api_key = "597bc6a5b23645b59550ddf809db7ef4"
wiserspromptlayer.api_key = "597bc6a5b23645b59550ddf809db7ef4"
 
def call_gpt4(messages):
    rs = openai.ChatCompletion.create(
        engine="model-gpt-4o-0513", #"model-gpt-4-8k"
        messages = messages,
        temperature=0.7,
        # response_format={ "type": "json_object" },
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None)
    try:
        # response = ast.literal_eval(rs.choices[0]['message']['content'])
        response = rs.choices[0]['message']['content']
        print(response)
    except:
        print('gpt ... error, try gpt again..........')
        rs = openai.ChatCompletion.create(
            engine="model-gpt-4o-0513", # "model-gpt-4-8k"
            messages = messages,
            temperature=0.7,
            # response_format={ "type": "json_object" },
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None)
        try:
            # response = ast.literal_eval(rs.choices[0]['message']['content'])
            response = rs.choices[0]['message']['content']
        except Exception as e:
            print(e)
            response = {}
    return response

# 调用GPT 4o, 测试效率

In [12]:
def batch_request_gpt4(data):
    def parse_response(response):
        json_str = re.search(r'\{.*\}', response, re.DOTALL)
        # print(json_str)
        if json_str is None:
            return None
        else:
            # 将JSON字符串转换为Python字典
            result = json.loads(json_str.group(0))
            return result
    response_list = []
    
    for index, row in data.head(100).iterrows():
        message = [
            {"role": "system", "content": multi_label_select_system_prompt.replace("{{possible_topic_info}}", str(topic_define_dict))},
            {"role": "user", "content": input_user_message.replace("{{content}}", row['content'])}
        ]
        response = call_gpt4(message=message)
        result = parse_response(response)
        response_list.append(result)
        
        print(message, response)
        break
    return response_list
        
    

In [14]:

call_gpt4(messages=[
                {"role": "system", "content": """
                    你是专业的文本内容分析师，我为你提供了“假冒仿制商品”的类型定义，请理解类型的含义

类型定义：
    假冒仿制商品： 在设计、外观或功能上模仿或参照知名品牌、高端产品，但在品牌标识、材料或工艺上有所区别的商品，主要包括高仿、水货、复刻及莆田货。
    
我再给你提供一些“假冒仿制商品”的正错的例字和错误的例字，

正确的例子：
    (1) a货lv皮带多少钱，莆田货纯原版去哪里买，圣罗兰女包高仿，「这是全球统一的浪漫吧」市场鱼龙杂乱，一般在这个地方大多数牌子都可以找到。名牌是可以在我们现实生活中的专柜里面买得到的，因为这样的一些产品在专柜里面会拥有着更多的款式，因为它是生产厂家直接进行相应的 ..
    (2) 古驰仿真皮带#成功男人的标配 时尚潮流#真皮牛皮 莆田货
    (3) \n\n\n高仿欧米茄手表购买 一比一复刻劳力士哪里有卖 爱彼皇家橡树精仿手表刚才我给大家介绍完了几种计时方式，接下来就是我们的重要时刻！现在市场上计时腕表太多了，大家都眼花缭乱，不知道有哪些计时腕表值得买呢？我来给大家推荐几款性价比高的、常见的、机芯质量非常在线的计时腕表。\n首先推荐的，是浪琴这只康卡斯计时。这是你可以花很少的钱，买到很高配置的计时码表。因为浪琴L688是柱状轮计时机芯，这是计时表里比较高的配置，通常都很贵，但这枚浪琴很便宜。我个人觉得沉稳的人很适合戴浪琴，当然，也可以戴下面这枚非常有设计感的熊猫盘，我们尊重每个人的喜好。\n第二枚推荐的，是颜值与实力并存的熊猫盘。帝舵表凭借着熊猫盘受到欢迎,今年东奥会没抢到冰墩墩的，可以买块这个，看看像不像冰墩墩。而且，这只表有越级的配置，因为搭载的MT5813机芯和百年灵B01是同款共用，同样很有性价比！\n第三，是泰格豪雅摩纳哥系列的这一只，推荐原因是搭载的02机芯是品牌主推的自产机芯，这只腕表采用大面积的蓝色，辨识度很高。\n那么第四个！同样还是超霸系列，大家印象中的超霸好像都挺贵的，但是no！也有便宜的表款，比如欧米茄超霸3861。这是现在市面上，你能花最少的钱就能买到的水平离合手动上弦计时机芯，它的结构和上世纪60年代的古董计时表没什么区别，机芯很漂亮性价比很高。\n第五，万国飞行员计时。飞计是名表中非常经典的，又热门，并且品牌已经完成了机芯的更新换代，还新增加了背透的设计，绿色表盘搭配棕色表带，复古，颜值高，要啥有啥。\n如果预算再高一点，可以选择百年灵的这款AB0138241C1P1，推荐理由是搭载B01计时机芯，和帝舵的MT5813机芯是共用的，并且在劳力士、欧米茄、百年灵这个级别里，百年灵的综合性价比很好，相对便宜，但是东西一点都不差，全线都拥有天文台认证，五年质保，售后也有保障。这款腕表，拥有冰蓝色盘面，颜值很高，戴上它你就是人群中最靓的仔。\n
    (4) '\n\n\n款号HN604\n✨23年巴黎世家春季新款T恤\n2023早春新品BALENCIGA巴黎世家胸前锁扣双B刺绣短袖，\n细节版本一比一复刻，Oversize版型，还原正版，尺码偏大！\n-面料采用280克纯棉布料，水洗做旧磨破效果\n-衣肩服缝链后接幅采内用包0.8cm捆条，\n-特拉种捆机器边双锁链包线缝纯棉定织面料，\n-田岛机立体刺绣重工2万多针 重工之作！\n-全套定制原版辅料，三针五线工艺，\n-原版包装配品牌防潮纸包装！\n-顶级代级购版本，随卖意无惧品压质力。\n颜色：水洗黑\n尺码：XS-L'

错误的例子：
    （1）'\n游戏:流氓软件\n平台:steam(未发售,有试玩demo)\n游戏里完美复刻我们现实中安装软件的各种流氓套路,比如无法跳过但细看非常坑人的用户协议,或者安装时突然跳出的其他软件选项,一不小心就下了一个全家桶或者下崽器。\n游戏里的套路非常多,各种谐音又很搞笑,我再三小心,却还是不小心装了5个流氓软件,真是防不胜防。\n很适合电脑新手拿来练手玩玩,现实中避开这些坑🕳️感兴趣的小伙伴可以去玩玩看~\n#浅评一下\n#休闲游戏\n#搞笑游戏\n#游戏安利\n#Steam游戏\n你被流氓软件坑过吗?\n(单选)\n被坑过\n没有\n150\n人参与,距离结束还有:\n0\n天\n14\n时\n24\n分\n'
    (2)  '\n在2024年佛山陶博会、潭州陶瓷展上,有关于“复刻釉”的宣传铺天盖地,尤其在广东、江西两产区表现尤为火热。\n复刻釉到底为何物?据介绍,复刻釉瓷砖是采用瓷质坯体表面数字喷码技术,将原有的图案或图像直接印刷到坯体表面,并在烧制过程中加入釉料制成。这种技术使得瓷砖图案真实自然、丰富多彩,具有高度的复刻效果。同时,复刻釉瓷砖表面光滑细腻,不易积污,易于清洁,可用于家庭、办公室、商业场所和公共空间等多种场合。\n也有行业人士介绍,复刻釉瓷砖,名字源于它能复刻大理石、花岗岩、砂岩、洞石等奇峰峻石,定格大自然的永恒之美。从产品的颜色、规格、工艺、光感、设计、质感六个方面出发,还原自然真石的凹凸质感,可以让瓷砖表面呈现出各种不同的图案和颜色,同时通过多工艺组合,提高瓷砖的三维立体效果,增强产品的逼真程度和性能。\n目前,有部分陶瓷厂家将复刻釉用于在木纹砖与大理石纹砖的设计上,借此来提升瓷砖图案的立体感。但根据生产厂家反馈,复刻釉瓷砖的生产工艺复杂,生产难度较高,在当下降本增效的环境下,生产出的复刻釉瓷砖总是差强人意。在复刻釉试销过程中,行业者也表示,复刻釉瓷砖本身并不是一款走量型产品,它代表着生产商的研发和技术实力,但消费者对于这类产品的接受程度还有待市场验证。\n'
    (3)  高仿大牌男装一手货源 , 微商高仿服饰一手货源 , miumiu仿品连衣裙1.首富位置丢给了LV服装的老板 , 高仿 , 男装 , 高仿 ,    高仿男装 高仿男装批发 服装那里有主要是股价大跌 , 顶部下来约60%了。这首富本来就虚得很 , 两年涨十倍不止 , 极度透支。还有X的1200多亿美元的估值 , 不是公开交易的 , 一笔融资就算了。2. 股价大涨 , 中国是最大力量!一个是中国粉丝多 , 销量大涨 , 成最大市场 , 潜力大。再一个是上海工厂给力 , 解决了困扰很久的产能问题 , 格局打开。因此 , 中国总裁新西兰人被提拔为全球总裁 , 成为大陆背景华人职位最高的。3. 但是 , 中国又冒出很多电动车企业。利用中国产业链 , 越南都跑出个企业要上美股 , 对标特斯拉 , 有模有样。中国这边很多企业就觉得电动车就那回事 , 最关键的电池也是买的 , 自动驾驶不靠谱 , 大家都能搞。逼格往下走 , 虽然在中国生产成本下降有利 , 但是被迫降价就不好了。4. 虽然利润率还是较高 , 单车利润是丰田大众传统车企的几倍 , 但能否维持 , 市场不乐观。这边中国电动车企马上出海了 , 销量被群狼争抢不可避免 , 利润率再下降 , 这股价还得崩。 最近纳斯达克没怎么跌 , 不赖美股。这种走势特别危险。学费可以少交点你会发现 , 他并不完美
    (4) 产品名称:hmax邻家若妻飞机杯 福利群:企鹅801945478 推荐理由:邻家若妻丰腴饱满身材曼妙,采用1:1仿真复刻设计,深度还原真人的完美细节,可爱的肚脐眼,脊柱沟美背,S型腰凹凸有致,两颗镶嵌在山峦高耸地带的樱桃,娇艳欲滴、似画龙点睛一般点缀在胶体上。 适合人群:双通道设计 中等刺激程度 适合进阶玩家尝鲜哦!
    

                """},
                {"role": "user", "content": "请你理解“假冒仿制商品” 的定义，并对正确的例子和错误的例子进行分析，分析得到为什么正确的例子可以被正确的判定为纯招聘类，而错误的例子会被错误的判断为纯招聘类，并总结出如何去判断“假冒仿制商品”类文章，请你的结论整理输出"},
            ])

APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742


# 通过验证prompt对招聘类数据进行验证，判断prompt的效果

In [24]:
recruitment_prompt_path = "./prompt/correct_fanghuo_error_example.txt"
with open(recruitment_prompt_path, 'r', encoding='utf-8') as f:
    recruitment_prompt = f.read()
    

## 读取仿货数据

In [18]:
counterfeit_goods_df = pd.read_excel("./data/new_data/仿货废文0622164742_dedup.xlsx")
counterfeit_goods_df['content'] = counterfeit_goods_df.apply(lambda row: str(row['标题']) + "，" + str(row['内文']), axis=1)
counterfeit_goods_df.rename(columns={"文章编号":"docid"}, inplace=True)
counterfeit_goods_df.shape, counterfeit_goods_df.columns

((24743, 15),
 Index(['序号', 'docid', '标题', '内文', '媒体名称', '版面', '出版日期', '媒体类型', '作者', '情感',
        '聚类编号', '原文链接', '命中关键字', 'spam', 'content'],
       dtype='object'))

In [25]:
recruitment_prompt

'你是专业的文本内容分析师，以下给你提供“假冒仿制商品”的定义，帮助你理解类型含义\n\n类型定义：\n    假冒仿制商品,在设计、外观或功能上模仿或参照知名品牌、高端产品，但在品牌标识、材料或工艺上有所区别的商品，主要包括高仿、水货、复刻及莆田货\n\n请你结合定义并分析文章内容是否属于“假冒仿制商品”，判断“假冒仿制商品”类文章的方法有以下关键的几点，\n\n1. **关键术语**：检查文章中是否包含诸如“高仿”、“复刻”、“仿真”、“莆田货”等关键词。\n2. **品牌和商品**：文章是否提及知名品牌的仿冒商品，包括但不限于奢侈品、名表、名牌服饰等。\n3. **商品特点**：描述是否涉及商品的设计、外观或功能的模仿，与品牌标识、材料或工艺的区别。\n4. **市场背景**：是否讨论了假冒仿制商品的市场特点、购买渠道等信息。\n\n\n需要满足以上条件才是正确的“假冒仿制商品”类文章，判断结果通过以下json格式输出。\n{\n    "detail_anlysis": "对文章的分析及判断思路",\n    "is_recruitment": "是否是"假冒仿制商品"类文章,yes代表是，no代表否"\n}\n\n'

In [20]:

async def task_func(docid, recruitment_prompt, input_user_prompt):
    response = await deepseek_caller(recruitment_prompt, input_user_prompt)
    result = await parse_response(response)
    return docid, result


async def eval_process(data, recruitment_prompt):
    docid2evalres = {}
    tasks = []
    llm_response_list = []
    for index, row in data.iterrows():
        tasks.append(asyncio.create_task(task_func(
                    docid=row['docid'],
                    recruitment_prompt=recruitment_prompt,
                    input_user_prompt=f"input: \n\t{row['content']}\noutput:"
                )))
    
    for completed_task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Awaiting tasks"):
        try:
            _id, result = await completed_task
            llm_response_list.append({
                "docid":_id,
                "detail_anlysis": result["detail_anlysis"],
                "is_recruitment": result["is_recruitment"]
            })
        except Exception as e:
            print(e)
    
    return llm_response_list  

In [47]:
import math
async def batch_eval_process(data, recruitment_prompt, batch_size=20):
    docid2evalres = {}
    llm_response_list = []
    # batch_size = 120
    for i in range(math.ceil(data.shape[0] / batch_size)):
        tasks = []
        for index, row in data[i*batch_size:(i+1)*batch_size].iterrows():
            tasks.append(asyncio.create_task(task_func(
                        docid=row['docid'],
                        recruitment_prompt=recruitment_prompt,
                        input_user_prompt=f"input: \n\t{row['content']}\noutput:"
                    )))
    
        for completed_task in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Awaiting tasks"):
            try:
                _id, result = await completed_task
                llm_response_list.append({
                    "docid":_id,
                    "detail_anlysis": result["detail_anlysis"],
                    "is_belong": result["is_belong"]
                })
            except Exception as e:
                print(e)

    return llm_response_list  

In [28]:
llm_response_list = await batch_eval_process(counterfeit_goods_df, recruitment_prompt, batch_size=200)

Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.03it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.89it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.50it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.27it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.66it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.72it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.50it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.54it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:23<00:00,  8.36it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.07it/s]
Awaiting tasks:  44%|████▍     | 88/200 [00:13<00:03, 34.09it/s]

'detail_anlysis'


Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.26it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.95it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.22it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.13it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.16it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.11it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.66it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.85it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.09it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.27it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 12.51it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.57it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.75it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.66it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.13it/s]
Awaiting t

Expecting ',' delimiter: line 2 column 148 (char 149)


Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.78it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.04it/s]
Awaiting tasks:  70%|███████   | 141/200 [00:14<00:03, 15.35it/s]

Expecting ',' delimiter: line 2 column 121 (char 122)


Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.44it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.48it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.93it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.36it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.89it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.66it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.43it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.77it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.78it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.39it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.20it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.06it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.93it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 12.56it/s]
Awaiting tasks:  44%|████▍     | 89/200 [00:12<00:03, 33.11it/s]

Expecting ',' delimiter: line 2 column 74 (char 75)


Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.99it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.44it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.96it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:21<00:00,  9.19it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.05it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.02it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.76it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.15it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.05it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:21<00:00,  9.15it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.35it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.58it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:21<00:00,  9.26it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.28it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.86it/s]
Awaiting t

In [29]:
llm_response_list.__len__(), llm_response_list[:5]

(24739,
 [{'docid': 'd07d27fa77fbc72213a272a1c53307ed',
   'detail_anlysis': "文章中多次提及'高仿'、'广州高仿包包批发'、'广州高仿奢侈品批发'等关键词，明确指向了假冒仿制商品的市场和产品。同时，文章讨论了这些商品的市场位置和相关的市场动态，符合假冒仿制商品类文章的特征。",
   'is_recruitment': 'yes'},
  {'docid': 'b56d33e94273979f815622c9ff226807',
   'detail_anlysis': "文章中明确提到了'gucci双gi链条包高仿'，并详细描述了不同品质的高仿商品的价格范围，包括'高仿'、'精仿'和'复刻'等关键词。此外，文章还讨论了这些仿制品的市场价格混乱情况，以及购买渠道，符合假冒仿制商品的定义。",
   'is_recruitment': 'yes'},
  {'docid': 'b635e08b09a89f84f28ac6bdad412fa0',
   'detail_anlysis': "文章中提到的'无法复刻的音综封神现场'和'一开口就是天籁'等表述，主要是对音乐表演的描述，并没有提及任何商品或品牌，也没有涉及到商品的仿制或假冒。因此，这些内容并不符合假冒仿制商品的定义。",
   'is_recruitment': 'no'},
  {'docid': 'c4596526ad42e41ac6eaa7c1252635cb',
   'detail_anlysis': "文章标题明确提到了'高仿LV衣服'，这是典型的假冒仿制商品的关键词。同时，标题中提到的'淘宝'和'7个平台选购'暗示了这些商品的购买渠道，符合假冒仿制商品的市场背景。因此，根据提供的定义和判断方法，这篇文章讨论的是假冒仿制商品。",
   'is_recruitment': 'yes'},
  {'docid': '910e1580de19b21ae07253fc3e31a34c',
   'detail_anlysis': "文章中明确提到了'gucci男鞋精仿'，并详细描述了不同品质和价格段的仿制品，如'精仿品质'、'复刻品质'等，这些都是典型的假冒仿制商品的关键术语。文章还讨论了这些仿制品的市场价格和购买渠道，符合假冒

In [30]:
import pickle
pickle.dump(llm_response_list, open("./data/防火_gpt_res.pkl", 'wb'))

In [32]:
fangzhi_good_eval_res = fangzhi_good_df.merge(pd.DataFrame(llm_response_list), on='docid', how='left')

In [37]:
fangzhi_good_eval_res['docid'] = fangzhi_good_eval_res['docid'].astype(str)
fangzhi_good_eval_res.to_excel("./data/假冒仿制商品验证结果.xlsx", index=False)

In [38]:
fangzhi_good_eval_res[fangzhi_good_eval_res['is_recruitment']=='no'].shape

(2782, 17)

## 维修服务或客服热线

In [43]:
fix_service_df = pd.read_excel("./data/new_data/维修废文0622132327_dedup.xlsx")
fix_service_df['content'] = fix_service_df.apply(lambda row: str(row['标题']) + "，" + str(row['内文']), axis=1)
fix_service_df.rename(columns={"文章编号":"docid"}, inplace=True)
fix_service_df.shape, fix_service_df.columns

((13136, 15),
 Index(['序号', 'docid', '标题', '内文', '媒体名称', '版面', '出版日期', '媒体类型', '作者', '情感',
        '聚类编号', '原文链接', '命中关键字', 'spam', 'content'],
       dtype='object'))

In [56]:
save_cols = list(fix_service_df.columns)
fix_service_eval_res = pd.read_excel("./data/new_data_tag_res/维修服务验证结果.xlsx")
need_second_eval_fix_service_df = fix_service_eval_res[fix_service_eval_res['is_belong']=='yes'][save_cols]
need_second_eval_fix_service_df.shape, need_second_eval_fix_service_df.columns

((3406, 15),
 Index(['序号', 'docid', '标题', '内文', '媒体名称', '版面', '出版日期', '媒体类型', '作者', '情感',
        '聚类编号', '原文链接', '命中关键字', 'spam', 'content'],
       dtype='object'))

In [57]:
recruitment_prompt_path = "./prompt/correct_fix_error_example.txt"
with open(recruitment_prompt_path, 'r', encoding='utf-8') as f:
    fix_service_prompt = f.read()
fix_service_prompt

'你是专业的文本内容分析师，以下给你提供“维修服务或客服热线”的定义，帮助你理解类型含义\n\n类型定义：\n    维修服务或客服热线: 产品维修及上门信息类宣传广告或者全国各类的客服电话类文章\n\n请你结合定义并分析文章内容是否属于“维修服务或客服热线”，判断“维修服务或客服热线”类文章的方法有以下关键的几点，\n\n1. **明确提及商品和维修**：内容中必须明确提到某个具体商品（如电视、手机、家电等）以及其维修相关的内容。\n2. **描述维修服务的具体细节**：包括但不限于维修电话、服务范围、常见故障及解决方案、维修流程等。\n3. **维修服务的联系方式**：提供维修服务的联系方式或售后电话\n4. **避免与维修无关的服务**：主要内容是商品维修相关，避免与维修无关的其他服务内容\n\n需要满足以上条件才是正确的“假冒仿制商品”类文章，判断结果通过以下json格式输出。\n{\n    "detail_anlysis": "对文章的分析及判断思路",\n    "is_belong": "是否属于"维修服务或客服热线"类文章,yes代表是，no代表否"\n}\n\n'

In [58]:
fix_service_llm_response_list = await batch_eval_process(need_second_eval_fix_service_df, fix_service_prompt, batch_size=200)

Awaiting tasks: 100%|██████████| 200/200 [00:11<00:00, 17.41it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:11<00:00, 16.95it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.80it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:11<00:00, 17.84it/s]
Awaiting tasks:  79%|███████▉  | 158/200 [00:08<00:01, 37.83it/s]

Expecting ',' delimiter: line 2 column 141 (char 142)


Awaiting tasks: 100%|██████████| 200/200 [00:10<00:00, 19.48it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:11<00:00, 17.22it/s]
Awaiting tasks:  40%|████      | 80/200 [00:07<00:03, 31.17it/s]

Expecting ',' delimiter: line 2 column 169 (char 170)


Awaiting tasks: 100%|██████████| 200/200 [00:11<00:00, 16.96it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.99it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:10<00:00, 18.70it/s]
Awaiting tasks:  38%|███▊      | 77/200 [00:07<00:06, 18.84it/s]

Expecting ',' delimiter: line 2 column 169 (char 170)


Awaiting tasks: 100%|██████████| 200/200 [00:10<00:00, 18.59it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:10<00:00, 19.26it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:10<00:00, 18.62it/s]
Awaiting tasks:  58%|█████▊    | 117/200 [00:09<00:03, 27.41it/s]

Expecting ',' delimiter: line 2 column 141 (char 142)


Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.34it/s]
Awaiting tasks:  71%|███████   | 142/200 [00:07<00:01, 49.47it/s]

Expecting ',' delimiter: line 2 column 173 (char 174)


Awaiting tasks: 100%|██████████| 200/200 [00:10<00:00, 18.36it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 16.09it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.61it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 16.17it/s]
Awaiting tasks: 100%|██████████| 6/6 [00:08<00:00,  1.41s/it]


In [59]:
pd.DataFrame(fix_service_llm_response_list).head()

Unnamed: 0,docid,detail_anlysis,is_belong
0,15d93d9bd69bee74d36d3dd7bf6900cd,文章内容主要描述了汽车维修和保养服务，包括更换LED灯、正常保养、加注冷媒等，并提供了服务热...,yes
1,b963a7c2d41acb40b433ac25b3caa771,文章内容主要介绍了新型智能无卡设备的安装服务，包括适用车型、安装地址和联系电话。虽然提供了联...,no
2,aa3c7ed50ae906132a177fe75265b3c3,文章内容明确提到了华为手机维修点，并且详细描述了维修服务的具体细节，包括维修经验、上门服务、...,yes
3,80e0223c080b5cb5153e1009fce14647,文章内容主要介绍了新泰水质检测机构服务中心的服务流程、联系方式、价格以及机构具备的条件。虽然...,no
4,cfaa27e24bdd058ea094bd187ce486e6,文章内容主要介绍了老表匠店铺，由国家标准认证的名表维修高级技师朱悦道师傅主修，并提供了咨询电...,yes


In [52]:
fix_service_eval_res = fix_service_df.merge(pd.DataFrame(fix_service_llm_response_list), on='docid', how='left')

In [60]:
need_second_eval_fix_service_eval_df = need_second_eval_fix_service_df.merge(pd.DataFrame(fix_service_llm_response_list), on='docid', how='left')
need_second_eval_fix_service_eval_df.shape

(3406, 17)

In [53]:
fix_service_eval_res['docid'] = fix_service_eval_res['docid'].astype(str)
fix_service_eval_res.to_excel("./data/new_data_tag_res/维修服务验证结果.xlsx", index=False)

In [64]:
is_fix_service_docids = set(need_second_eval_fix_service_eval_df[need_second_eval_fix_service_eval_df['is_belong']=='yes']['docid'].tolist())
is_fix_service_docids.__len__()

2080

In [65]:
fix_service_eval_res['is_belong'] = fix_service_eval_res['docid'].map(lambda docid: "yes" if docid in is_fix_service_docids else "no")
fix_service_eval_res['docid'] = fix_service_eval_res['docid'].astype(str)
fix_service_eval_res.to_excel("./data/new_data_tag_res/维修服务验证结果.xlsx", index=False)

In [66]:
fix_service_eval_res[fix_service_eval_res['is_belong']=='yes'].shape

(2080, 17)

## 跨境代购

In [67]:
cross_border_purchasing_df = pd.read_excel("./data/new_data/跨境代购0622161740_dedup.xlsx")
cross_border_purchasing_df['content'] = cross_border_purchasing_df.apply(lambda row: str(row['标题']) + "，" + str(row['内文']), axis=1)
cross_border_purchasing_df.rename(columns={"文章编号":"docid"}, inplace=True)
cross_border_purchasing_df.shape, cross_border_purchasing_df.columns

((35546, 15),
 Index(['序号', 'docid', '标题', '内文', '媒体名称', '版面', '出版日期', '媒体类型', '作者', '情感',
        '聚类编号', '原文链接', '命中关键字', 'spam', 'content'],
       dtype='object'))

### 跨境代购类中存在很多的好物推荐、直播预告类，促销广告、打折信息类，通过当前废文模型打标后筛选使用

In [80]:
from ailabuap.io.api_caller import APICaller
import requests

spam_endpoint = "http://ess76.wisers.com:13351/text_content_classification/classification"
spam_api_caller = APICaller(url=spam_endpoint)

In [75]:
def _get_post_data(input_data: pd.DataFrame):
    post_data = []
    for i, row in input_data.iterrows():
        if isinstance(row['content'], str) and row['content'].strip() != "":
            item = dict(
                docid=row['docid'],
                headline="",
                content = row['content'].strip()
            )
            post_data.append(dict(json=item))
    return post_data
cross_border_post_data = _get_post_data(cross_border_purchasing_df)

In [79]:
for input_json in cross_border_post_data:
    print(input_json['json'])
    response = requests.post(url=spam_endpoint, json=input_json['json'])
    if response.status_code == 200:
        print(response.json())
    else: 
        print(response.text)
    break

{'docid': '67345ff99596b56eade80830d6049193', 'headline': '', 'content': '淘宝代购原装屏收到货后被诱导确认收货退运费无法申请退款，\n\n6.11在淘宝上海瑞法代买了一个原装屏,号称原装原厂,但是收到货之后是货到付款方式,然后联系商家怎么回事,被诱导确认收货然后退运费,占用了售后申请通道,再此淘宝没有任何流程上的风险提示,而是在流程结束之后给出提示,可是买家已经入坑,商品已经无法申请退款退货了!联系商家也不回复'}
{'retCode': 'S', 'retInfo': 'OK', 'retData': {'docid': '67345ff99596b56eade80830d6049193', 'label_id': 9, 'label_name': '待分类', 'label_probability': '0.8426108956336975', 'top3_label_entropy': '0.29923347524829735', 'label_probs': '{"待分类": 0.8426108956336975, "促销广告、打折信息类": 0.020459776744246483, "合租&租房广告&售房广告": 0.01902010105550289, "纯招聘类": 0.018574221059679985, "股市、债市动态": 0.018096910789608955, "公司财报": 0.01792268455028534, "大众科普，概念解释类": 0.01746123842895031, "灌水水贴": 0.015424370765686035, "好物推荐、直播预告类": 0.015339568257331848, "公司公告": 0.015090182423591614}'}}


In [82]:
# batch_resp = spam_api_caller.call_batch_async(cross_border_post_data, progress=True)

In [87]:
spam_tag_res = pd.DataFrame([single_res["result"]['retData'] for single_res in batch_resp.success])

In [88]:
spam_tag_res.head()

Unnamed: 0,docid,label_id,label_name,label_probability,top3_label_entropy,label_probs
0,67345ff99596b56eade80830d6049193,9,待分类,0.8426108956336975,0.2992334752482973,"{""待分类"": 0.8426108956336975, ""促销广告、打折信息类"": 0.02..."
1,fd8088fd76f90dc3f385fd0b29216f0d,8,促销广告、打折信息类,0.8730148077011108,0.2446485432629036,"{""促销广告、打折信息类"": 0.8730148077011108, ""纯招聘类"": 0.0..."
2,7c10fa31a2149182e6d2ffbe62e4926b,8,促销广告、打折信息类,0.609581470489502,0.6528978692583551,"{""促销广告、打折信息类"": 0.609581470489502, ""好物推荐、直播预告类""..."
3,36e99674bdbdd94c75ea388acabcc9a9,8,促销广告、打折信息类,0.5175511837005615,0.7605079523748566,"{""促销广告、打折信息类"": 0.5175511837005615, ""好物推荐、直播预告类..."
4,10d16835ed68c2447b1b806b0626107c,8,促销广告、打折信息类,0.816947340965271,0.341254198718126,"{""促销广告、打折信息类"": 0.816947340965271, ""好物推荐、直播预告类""..."


In [89]:
cross_border_purchasing_add_spam_df = cross_border_purchasing_df.merge(spam_tag_res, on=['docid'], how='left')

In [97]:
filter_label = set(['促销广告、打折信息类', '好物推荐、直播预告类'])
cross_border_purchasing_add_spam_df['top3_label_entropy'] = cross_border_purchasing_add_spam_df['top3_label_entropy'].astype(float)
need_eval_cross_border_purshasing_df = pd.concat([
        cross_border_purchasing_add_spam_df[~cross_border_purchasing_add_spam_df['label_name'].isin(filter_label)],
        cross_border_purchasing_add_spam_df[(cross_border_purchasing_add_spam_df['label_name'].isin(filter_label)) & (cross_border_purchasing_add_spam_df['top3_label_entropy']>0.8)]
])
need_eval_cross_border_purshasing_df.shape

(13829, 20)

### 通过废文标签及混淆数值筛选之后的数据进行验证

In [105]:
recruitment_prompt_path = "./prompt/correct_kuajing_error_example.txt"
with open(recruitment_prompt_path, 'r', encoding='utf-8') as f:
    cross_border_purchasing_prompt = f.read()
cross_border_purchasing_prompt

'你是专业的文本内容分析师，以下给你提供“跨境代购”的定义，帮助你理解类型含义\n\n类型定义：\n    跨境代购: 是指通过不同渠道（如海淘、带货、代购）购买其他地区或国外商品并带回某地供个人或他人使用。这涉及到跨地区购买和运输商品的行为，通常为了获取其他地区的特色产品、品牌或优惠价格。\n    \n请你结合定义并分析文章内容是否属于“维修服务或客服热线”，判断“维修服务或客服热线”类文章的方法有以下关键的几点，\n\n1. **明确的跨地区购买行为**：文章应明确提及商品是从其他地区或国外购买或代购的，涉及跨地区交易。\n2. **涉及具体商品**：应描述具体的商品、品牌或产品类别，且这些商品是从其他地区或者国外带回或运回本地的。\n3. **跨境运输或物流**：应提及商品的跨地区运输、转运或物流过程，展示商品如何从其他地区到达本地。\n4. **合法性**：商品应为合法的进口品，非仿制品或侵权商品。\n5. **服务细节**：详细描述代购服务的流程，如订单处理、物流安排等。\n6. **购买目的**: 购买是为了个人使用或转售给他人，而不是为了其他目的（如物流服务或犯罪）。\n\n需要满足以上条件才是正确的“跨境代购”类文章，判断结果通过以下json格式输出。\n{\n    "detail_anlysis": "对文章的分析及判断思路",\n    "is_belong": "是否属于"跨境代购"类文章,yes代表是，no代表否"\n}'

In [110]:
cross_border_llm_response_list = await batch_eval_process(need_eval_cross_border_purshasing_df, cross_border_purchasing_prompt, batch_size=200)

Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.29it/s]
Awaiting tasks:  61%|██████    | 122/200 [00:09<00:01, 53.61it/s]

Expecting ',' delimiter: line 2 column 172 (char 173)


Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.45it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.28it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.12it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.09it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.81it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.58it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.85it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.91it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 12.53it/s]
Awaiting tasks:  22%|██▎       | 45/200 [00:08<00:33,  4.67it/s]

Expecting ',' delimiter: line 2 column 132 (char 133)


Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.38it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.44it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.64it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 14.15it/s]
Awaiting tasks:   0%|          | 1/200 [00:02<06:50,  2.06s/it]

******* response *******
 Error code: 400 - {'detail': 'Content Exists Risk'} ERROR
expected string or bytes-like object


Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.47it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.23it/s]
Awaiting tasks:   0%|          | 1/200 [00:01<04:08,  1.25s/it]

******* response *******
 Error code: 400 - {'detail': 'Content Exists Risk'} ERROR
expected string or bytes-like object


Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.90it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.43it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.02it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.67it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 14.28it/s]
Awaiting tasks:  92%|█████████▎| 185/200 [00:11<00:00, 21.79it/s]

Expecting ',' delimiter: line 2 column 209 (char 210)


Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.44it/s]
Awaiting tasks:   0%|          | 1/200 [00:01<04:35,  1.38s/it]

******* response *******
 Error code: 400 - {'detail': 'Content Exists Risk'} ERROR
expected string or bytes-like object


Awaiting tasks:  98%|█████████▊| 196/200 [00:12<00:00,  9.06it/s]

Expecting ',' delimiter: line 2 column 265 (char 266)


Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 12.72it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.24it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.68it/s]
Awaiting tasks:   0%|          | 1/200 [00:01<03:52,  1.17s/it]

******* response *******
 Error code: 400 - {'detail': 'Content Exists Risk'} ERROR
expected string or bytes-like object


Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.10it/s]
Awaiting tasks:  32%|███▎      | 65/200 [00:08<00:06, 20.76it/s]

Expecting ',' delimiter: line 2 column 46 (char 47)


Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.59it/s]
Awaiting tasks:  70%|███████   | 140/200 [00:10<00:01, 49.36it/s]

Expecting ',' delimiter: line 2 column 158 (char 159)


Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.88it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.40it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:11<00:00, 16.74it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 14.79it/s]
Awaiting tasks:  38%|███▊      | 77/200 [00:10<00:03, 32.67it/s]

Expecting ',' delimiter: line 2 column 163 (char 164)


Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.99it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:12<00:00, 15.91it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:13<00:00, 15.26it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.21it/s]
Awaiting tasks:  86%|████████▌ | 172/200 [00:13<00:03,  8.21it/s]

Expecting ',' delimiter: line 2 column 174 (char 175)


Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.78it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.25it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.20it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.64it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 14.01it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.72it/s]
Awaiting tasks:  77%|███████▋  | 154/200 [00:11<00:01, 41.79it/s]

Expecting ',' delimiter: line 2 column 184 (char 185)


Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.17it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.40it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 14.02it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.40it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 13.75it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:14<00:00, 14.22it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.79it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.61it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 12.74it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.10it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:23<00:00,  8.56it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.76it/s]
Awaiting tasks:   0%|          | 1/200 [00:00<02:51,  1.16it/s]

******* response *******
 Error code: 400 - {'detail': 'Content Exists Risk'} ERROR
expected string or bytes-like object


Awaiting tasks: 100%|██████████| 200/200 [00:15<00:00, 13.25it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:24<00:00,  8.27it/s]
Awaiting tasks:  82%|████████▏ | 163/200 [00:12<00:00, 40.83it/s]

Expecting ',' delimiter: line 2 column 202 (char 203)


Awaiting tasks: 100%|██████████| 200/200 [00:19<00:00, 10.51it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.49it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.56it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.05it/s]
Awaiting tasks:  74%|███████▍  | 148/200 [00:11<00:01, 39.10it/s]

Expecting ',' delimiter: line 2 column 200 (char 201)


Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 11.89it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:20<00:00,  9.80it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 11.08it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.06it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:28<00:00,  7.08it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.78it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:18<00:00, 10.96it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:27<00:00,  7.27it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:16<00:00, 12.08it/s]
Awaiting tasks: 100%|██████████| 200/200 [00:17<00:00, 11.28it/s]
Awaiting tasks: 100%|██████████| 29/29 [00:13<00:00,  2.12it/s]


In [111]:
cross_border_eval_res = need_eval_cross_border_purshasing_df.merge(pd.DataFrame(cross_border_llm_response_list), on='docid', how='left')
cross_border_eval_res.shape

(13829, 22)

In [112]:
cross_border_eval_res.to_excel("./data/new_data_tag_res/跨境代购验证结果.xlsx", index=False)

In [109]:
need_eval_cross_border_purshasing_df.shape

(13829, 20)

In [113]:
cross_border_eval_res[cross_border_eval_res['is_belong']=='no'].shape, cross_border_eval_res[cross_border_eval_res['is_belong']=='yes'].shape

((9764, 22), (4049, 22))