In [263]:
%load_ext autoreload
%autoreload 1
%autoreload utils.tag_char
import os
import pandas as pd
import numpy as np
import argparse
import yaml

from datasets import Dataset
from tqdm.auto import tqdm
from tqdm import tqdm_pandas
from utils import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [264]:
with open("../config.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)
config

{'date': '0606',
 'experiment_name': 'bert_p=0.5_pos=sentence',
 'mode': 'BP',
 'entity_type': ['Company', 'Person'],
 'num_labels': 5,
 'threshold': 0.5,
 'train_data': '/train_dataset',
 'valid_data': '/valid_dataset',
 'test_data': '/test_dataset',
 'pretrained_model': 'hfl/chinese-roberta-wwm-ext',
 'do_training': True,
 'max_len': 512,
 'batch_size': 32,
 'n_epochs': 5,
 'lr': '1e-5',
 'do_prediction': True}

In [265]:
parser = argparse.ArgumentParser(description='') 

parser.set_defaults(**config)
args = parser.parse_args(args = [])

parser.add_argument("--prefix_path", type=str, default=f"../experiments/{args.date}_{args.experiment_name}_{args.mode}")
args = parser.parse_args(args = [])

args

Namespace(batch_size=32, date='0606', do_prediction=True, do_training=True, entity_type=['Company', 'Person'], experiment_name='bert_p=0.5_pos=sentence', lr='1e-5', max_len=512, mode='BP', n_epochs=5, num_labels=5, prefix_path='../experiments/0606_bert_p=0.5_pos=sentence_BP', pretrained_model='hfl/chinese-roberta-wwm-ext', test_data='/test_dataset', threshold=0.5, train_data='/train_dataset', valid_data='/valid_dataset')

In [266]:
labels_to_ids, ids_to_labels = define_labels(param_args=args)
labels_to_ids

{'O': 0, 'B-Company': 1, 'I-Company': 2, 'B-Person': 3, 'I-Person': 4}

## 建立实验文件夹

In [267]:
os.makedirs(args.prefix_path, exist_ok=True)

## split to train, valid, test

In [268]:
# df_data = pd.read_pickle("../data/data_starbucks_yihetang_aligned.pkl")
df_data = pd.read_excel("../bochk/manual_label_data.xlsx")
print(df_data.shape)
df_data.head(4)

(39593, 16)


Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,length,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
0,2023063021774796465&&0,,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[{'label_name': 'Company', 'text_segment': '寶申...",['宝申控股'],True,True,74,0,['宝申控股'],[],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,[],['宝申控股'],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申...,False
1,2023063021774796485&&0,,利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,"[{'label_name': 'Company', 'text_segment': '利駿...","['利骏集团香港', '东特']",True,True,74,0,"['利骏集团', '利骏集团香港', '东特']",[],利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,[],"['利骏集团香港', '东特']","利骏集团香港(08360) 股东特别大会的结果。||可能的实体：利骏集团香港,东特||利骏集...",False
2,2023063021774796605&&0,,推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[{'label_name': 'Company', 'text_segment': '21...","['21世纪经济报道', '陈之常', '杨学鹏', '郭晨', '黄如', '徐光辉', ...",True,True,488,0,['21世纪经济报道'],"['陈之常', '杨学鹏', '郭晨', '黄如', '徐光辉', '胡广杰', '吴刚']",推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"['陈之常', '杨学鹏', '郭晨', '黄如', '徐光辉', '胡广杰', '吴刚']",['21世纪经济报道'],推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,False
3,2023063021774796605&&1,,"江苏省副省长胡广杰对EDA国创中心成立表示祝贺。他说,集成电路产业是我省具有较强竞争力的优势...","[{'label_name': 'Company', 'text_segment': '21...","['陈之常', '胡广杰']",True,True,432,1,[],"['陈之常', '胡广杰']","江苏省副省长胡广杰对eda国创中心成立表示祝贺。他说,集成电路产业是我省具有较强竞争力的优势...","['陈之常', '胡广杰']",[],"江苏省副省长胡广杰对eda国创中心成立表示祝贺。||可能的实体：胡广杰||他说,集成电路产业...",False


In [269]:
df_data['person_matched'] = df_data['person_matched'].map(eval)
df_data['company_matched'] = df_data['company_matched'].map(eval)

In [270]:
df_data['sign'] = df_data.apply(lambda row: row['company_matched'].__len__() > 0 or row['person_matched'].__len__() > 0, axis=1)
df_data[df_data['sign']==False].shape

(0, 16)

In [271]:
df_data['docid'].is_unique

True

In [272]:
dataset = Dataset.from_pandas(df_data[["docid"]])

splitted_dataset = dataset.train_test_split(train_size=0.885, seed=109)
train_ = splitted_dataset['train']
split_ = train_.train_test_split(train_size=0.885, seed=109)

train_set = split_['train']
valid_set = split_['test']
test_set = splitted_dataset['test']

len(train_set), len(valid_set), len(test_set)

(31009, 4030, 4554)

In [273]:
with open(f"{args.prefix_path}/train_docid.txt", "w") as f:
    f.write("\n".join(train_set['docid']))

with open(f"{args.prefix_path}/valid_docid.txt", "w") as f:
    f.write("\n".join(valid_set['docid']))

with open(f"{args.prefix_path}/test_docid.txt", "w") as f:
    f.write("\n".join(test_set['docid']))

## read train, valid, test

In [274]:

df_all = pd.read_excel("../bochk/manual_label_data.xlsx")
df_all['person_matched'] = df_all['person_matched'].map(eval)
df_all['company_matched'] = df_all['company_matched'].map(eval)
df_all['content'] = df_all['content'].astype(str)
df_all.head(1)

Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,length,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
0,2023063021774796465&&0,,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[{'label_name': 'Company', 'text_segment': '寶申...",['宝申控股'],True,True,74,0,['宝申控股'],[],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,[],[宝申控股],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申...,False


In [275]:
df_all.tail()

Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,length,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
39588,998&&0,,原标题：多城“人才争夺战” 房地产市场格局难变。“二十一世纪什么最贵。人才。”直至十余年后，...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...",['方米'],True,True,489,0,['方米'],[],原标题：多城“人才争夺战” 房地产市场格局难变。“二十一世纪什么最贵。人才。”直至十余年后，...,[],[方米],原标题：多城“人才争夺战” 房地产市场格局难变。“二十一世纪什么最贵。人才。”直至十余年后，...,False
39589,998&&1,,工作期间保障交通工具。无独有偶。博州人才政策对于住房方面的关注在其他三四五线城市动作中也有所...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['国际金融报', '交通', '工程', '易居研究院智库中心', '严跃进']",True,True,493,1,"['工程', '国际金融报', '易居研究院智库中心', '易居', '交通', '易居研究院']",['严跃进'],工作期间保障交通工具。无独有偶。博州人才政策对于住房方面的关注在其他三四五线城市动作中也有所...,[严跃进],"[国际金融报, 交通, 工程, 易居研究院智库中心]",工作期间保障交通工具。||可能的实体：交通||无独有偶。博州人才政策对于住房方面的关注在其他...,False
39590,998&&2,,这场对于人才的竞争已不再局限于城市之间。“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...",['国际金融报'],True,True,447,2,['国际金融报'],[],这场对于人才的竞争已不再局限于城市之间。“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一...,[],[国际金融报],这场对于人才的竞争已不再局限于城市之间。“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一...,False
39591,998&&3,,3月18日，河北石家庄推动户口迁入“零门槛”，群众仅凭居民身份证、户口簿就可向落户地派出所申...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['易居', '方米', '严跃进', '沈路']",True,True,496,3,"['易居', '方米']","['严跃进', '沈路']",3月18日，河北石家庄推动户口迁入“零门槛”，群众仅凭居民身份证、户口簿就可向落户地派出所申...,"[严跃进, 沈路]","[易居, 方米]",3月18日，河北石家庄推动户口迁入“零门槛”，群众仅凭居民身份证、户口簿就可向落户地派出所申...,False
39592,998&&4,,中大城市的人才济济和持续人口聚集效应留给小城市的或是人口流出的落寞。易居研究院研究员沈昕对记...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['易居研究院', '国际金融报', '搜狐', '陈晨', '刘民', '刘明', '沈昕']",True,True,486,4,"['国际金融报', '搜狐', '易居', '易居研究院']","['陈晨', '刘民', '刘明', '沈昕']",中大城市的人才济济和持续人口聚集效应留给小城市的或是人口流出的落寞。易居研究院研究员沈昕对记...,"[陈晨, 刘民, 刘明, 沈昕]","[易居研究院, 国际金融报, 搜狐]",中大城市的人才济济和持续人口聚集效应留给小城市的或是人口流出的落寞。易居研究院研究员沈昕对记...,False


In [276]:
df_all[df_all['docid'] == '998&&2']

Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,length,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
39590,998&&2,,这场对于人才的竞争已不再局限于城市之间。“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...",['国际金融报'],True,True,447,2,['国际金融报'],[],这场对于人才的竞争已不再局限于城市之间。“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一...,[],[国际金融报],这场对于人才的竞争已不再局限于城市之间。“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一...,False


In [277]:
df_all['docid'].is_unique

True

In [278]:
with open(f"{args.prefix_path}/train_docid.txt", "r") as f:
    train_docid = f.read().split("\n")

with open(f"{args.prefix_path}/valid_docid.txt", "r") as f:
    valid_docid = f.read().split("\n")

with open(f"{args.prefix_path}/test_docid.txt", "r") as f:
    test_docid = f.read().split("\n")

len(train_docid), len(valid_docid), len(test_docid)

(31009, 4030, 4554)

In [279]:
df_train = df_all[df_all["docid"].isin(train_docid)].reset_index(drop=True)
df_valid = df_all[df_all["docid"].isin(valid_docid)].reset_index(drop=True)
df_test = df_all[df_all["docid"].isin(test_docid)].reset_index(drop=True)

len(df_train), len(df_valid), len(df_test)

(31009, 4030, 4554)

## train, valid

In [280]:
args.threshold == 0.5

True

In [281]:
tqdm.pandas()
df_train["input_text"] = df_train.apply(lambda x: x["context_keywords"] if np.random.random()>args.threshold else x["context_cleaned"], axis = 1)
df_valid["input_text"] = df_valid.apply(lambda x: x["context_keywords"] if np.random.random()>args.threshold else x["context_cleaned"], axis = 1)

df_train['input_text'] = df_train['context_cleaned']
df_valid['input_text'] = df_valid['context_cleaned']



In [282]:
df_train.loc[5556, 'input_text']

'.媒体来源:。apple。2.完整新闻标题:。血汗。海瑞扛丸宁受罚 不给员工加班费。3.完整新闻内文:。新竹十大伴手礼海瑞扛丸遭2员工爆料,指控压榨劳工。一名洪小姐控说,公司让她们超时。工作,加班还没有加班费,只给原本时薪115元,还要她们签下“不得要求加班费”的同意。书,让她觉得非常痛苦,且目前全体员工都采时薪制,一旦遇到小月,员工的薪水就少得可。怜。“公司宁被劳工局开罚,也不肯给我9千块加班费”洪小姐声泪俱下控诉,依劳基法规定员。工一天上班时数为8小时,超过时数必须按照比例支付加班费,公司时常要求他们加班12小。时,但超出来的时数,未依规定加给,只采取支付每小时115元。非但如此,全体员工劳健。保还被低报,目前已被劳工局开罚。当初跟公司调解争取加班费时,公司竟回应说:“你早就知道要加班了,为什么还要加班费。”还呛她:“反正公司已经被罚钱了。”不合理言论让黄小姐哭笑不得。洪小姐说,公司。还要求每个人签“不平等条约”,要他们同意加班,但没有加班费。洪小姐控诉,海瑞扛丸的上司把他们当司机,要求他们骑车或开车载他到火车站或市政府,'

In [283]:
df_train.loc[1, 'input_text']

'利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月三十日举行的股东特别大会投票结果(112kb, pdf)'

In [284]:
df_train.columns

Index(['docid', 'headline', 'content', 'ner', 'matched_keywords',
       'include_company', 'include_person', 'length', 'split_sentence_index',
       'Companys', 'Persons', 'context_cleaned', 'person_matched',
       'company_matched', 'context_keywords', 'sign', 'input_text'],
      dtype='object')

# tag char

In [285]:
def tag_char(df, args):
    def _tag_char(example, param_args):
        content = example['input_text'] if pd.notna(example['input_text']) else ""
        try:
            tag = ['O'] * len(str(content))

            for entity_type in param_args.entity_type:
                pos_list = []
                entity_list = example[f"{entity_type}_matched".lower()]
                if entity_list == []:
                    continue
                else:
                    for entity in entity_list:
                        # try:
                        #     pos_list.extend([(match.start(), match.end()) for match in re.finditer(entity, content)])
                        # except Exception as e:
                        #     print(entity, content)
                        #     continue
                        try:
                            pos_list.extend([(match.start(), match.end()) for match in re.finditer(entity, content)])
                        except Exception as e:
                            print(e)
                            continue
                    for (start, end) in pos_list:
                        tag[start] = f"B-{entity_type}"
                        tag[start+1:end] = [f"I-{entity_type}"] * (end - start - 1)

            assert len(content) == len(tag)
            return tag
        except:
            return None
    
    tqdm.pandas(desc='tagging char-level label')
    df['tag_char'] = df.apply(_tag_char, param_args=args, axis=1)
    return df

In [286]:
df_train = tag_char(df_train, args)


missing ), unterminated subpattern at position 13
nothing to repeat at position 0
unterminated character set at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
expected string or bytes-like object
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
nothing to repeat at position 0
nothing to repeat at position 0
nothing to repeat at position 0
nothing to repeat at position 0
nothing to repeat at position 0
unbalanced parenthesis at position 32
missing ), unterminated subpattern at position 4
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unter

In [287]:
# df_train[df_train['docid']=='995&&2'].to_dict()

In [288]:
df_valid = tag_char(df_valid, args)


unbalanced parenthesis at position 10
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
nothing to repeat at position 0
missing ), unterminated subpattern at position 0
unbalanced parenthesis at position 16


# tokenize and align label

In [289]:
print(df_train.shape)
df_train = df_train[~df_train['tag_char'].isna()]
print(df_train.shape)
df_train = tokenize_and_align_labels(df_train, args)
df_train.head(3)

(31009, 18)
(31006, 18)
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=969), Label(value='0 / 969'))), HB…

  return asarray(a).ndim


Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,length,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign,input_text,tag_char,tokenized_content,token_labels
0,2023063021774796465&&0,,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[{'label_name': 'Company', 'text_segment': '寶申...",['宝申控股'],True,True,74,0,['宝申控股'],[],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,[],[宝申控股],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申...,False,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[B-Company, I-Company, I-Company, I-Company, O...","[宝, 申, 控, 股, (, 0, 8, 1, 5, 1, ), 停, 牌, /, 内, ...","[B-Company, I-Company, I-Company, I-Company, O..."
1,2023063021774796485&&0,,利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,"[{'label_name': 'Company', 'text_segment': '利駿...","['利骏集团香港', '东特']",True,True,74,0,"['利骏集团', '利骏集团香港', '东特']",[],利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,[],"[利骏集团香港, 东特]","利骏集团香港(08360) 股东特别大会的结果。||可能的实体：利骏集团香港,东特||利骏集...",False,利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,"[B-Company, I-Company, I-Company, I-Company, I...","[利, 骏, 集, 团, 香, 港, (, 0, 8, 3, 6, 0, ), 股, 东, ...","[B-Company, I-Company, I-Company, I-Company, I..."
2,2023063021774796605&&0,,推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[{'label_name': 'Company', 'text_segment': '21...","['21世纪经济报道', '陈之常', '杨学鹏', '郭晨', '黄如', '徐光辉', ...",True,True,488,0,['21世纪经济报道'],"['陈之常', '杨学鹏', '郭晨', '黄如', '徐光辉', '胡广杰', '吴刚']",推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[陈之常, 杨学鹏, 郭晨, 黄如, 徐光辉, 胡广杰, 吴刚]",[21世纪经济报道],推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,False,推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[推, 进, 科, 技, 自, 立, 自, 强, ，, 国, 家, 集, 成, 电, 路, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [290]:
print(df_valid.shape)
df_valid = df_valid[~df_valid['tag_char'].isna()]
print(df_valid.shape)
df_valid = tokenize_and_align_labels(df_valid, args)
df_valid.head(3)

(4030, 18)
(4030, 18)
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=126), Label(value='0 / 126'))), HB…

  return asarray(a).ndim


Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,length,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign,input_text,tag_char,tokenized_content,token_labels
0,2023063021774797862&&0,,黑芝麻智能成首间特专科企申请上市 芯片依赖台积电制造。黑芝麻智能成首间特专科企申请上市 芯片...,"[{'label_name': 'Company', 'text_segment': '黑芝...","['台积电', '小米', '中金', '自动', '黑芝麻智能', '腾讯', '388'...",True,True,498,0,"['台积电', '黑芝麻', '小米', '中金', '自动', '黑芝麻智能', '腾讯'...",[],黑芝麻智能成首间特专科企申请上市 芯片依赖台积电制造。黑芝麻智能成首间特专科企申请上市 芯片...,[],"[台积电, 小米, 中金, 自动, 黑芝麻智能, 腾讯, 388, 华泰国际, SoC]","黑芝麻智能成首间特专科企申请上市 芯片依赖台积电制造。||可能的实体：台积电,黑芝麻智能||...",False,黑芝麻智能成首间特专科企申请上市 芯片依赖台积电制造。黑芝麻智能成首间特专科企申请上市 芯片...,"[B-Company, I-Company, I-Company, I-Company, I...","[黑, 芝, 麻, 智, 能, 成, 首, 间, 特, 专, 科, 企, 申, 请, 上, ...","[B-Company, I-Company, I-Company, I-Company, I..."
1,2023063021774807700&&0,,俊盟国际(08062) 财务报表/环境、社会及管治资料 - [年报 / 环境、社会及管治资料...,"[{'label_name': 'Company', 'text_segment': '俊盟...",['俊盟国际'],True,True,80,0,['俊盟国际'],[],俊盟国际(08062) 财务报表/环境、社会及管治资料 - [年报 / 环境、社会及管治资料...,[],[俊盟国际],俊盟国际(08062) 财务报表/环境、社会及管治资料 - [年报 / 环境、社会及管治资料...,False,俊盟国际(08062) 财务报表/环境、社会及管治资料 - [年报 / 环境、社会及管治资料...,"[B-Company, I-Company, I-Company, I-Company, O...","[俊, 盟, 国, 际, (, 0, 8, 0, 6, 2, ), 财, 务, 报, 表, ...","[B-Company, I-Company, I-Company, I-Company, O..."
2,2023063021774822209&&0,,热辣新盘放送｜明隽首轮推售34伙 连天台特色户率先招标沽。热辣新盘放送｜明隽首轮推售34伙 ...,"[{'label_name': 'Company', 'text_segment': '佳明...","['佳明集团', '中原', '明隽', '陈永杰']",True,True,239,0,"['佳明集团', '中原', '明隽']",['陈永杰'],热辣新盘放送｜明隽首轮推售34伙 连天台特色户率先招标沽。热辣新盘放送｜明隽首轮推售34伙 ...,[陈永杰],"[佳明集团, 中原, 明隽]",热辣新盘放送｜明隽首轮推售34伙 连天台特色户率先招标沽。||可能的实体：明隽||热辣新盘放...,False,热辣新盘放送｜明隽首轮推售34伙 连天台特色户率先招标沽。热辣新盘放送｜明隽首轮推售34伙 ...,"[O, O, O, O, O, O, O, B-Company, I-Company, O,...","[热, 辣, 新, 盘, 放, 送, ｜, 明, 隽, 首, 轮, 推, 售, 3, 4, ...","[O, O, O, O, O, O, O, B-Company, I-Company, O,..."


# remove long text

In [291]:
args.max_len

512

In [292]:
df_train['len_tokens'] = df_train['tokenized_content'].apply(lambda x: len(x))
df_train = df_train[df_train['len_tokens'] < args.max_len-2]
df_train = df_train.reset_index(drop=True)
print(df_train['len_tokens'].describe())

count    30794.000000
mean       402.246671
std        122.334387
min          4.000000
25%        387.000000
50%        458.000000
75%        480.000000
max        502.000000
Name: len_tokens, dtype: float64


In [293]:
df_valid['len_tokens'] = df_valid['tokenized_content'].apply(lambda x: len(x))
df_valid = df_valid[df_valid['len_tokens'] < args.max_len-2]
df_valid = df_valid.reset_index(drop=True)
print(df_valid['len_tokens'].describe())

count    4010.000000
mean      400.301247
std       122.649495
min         8.000000
25%       378.000000
50%       458.000000
75%       480.000000
max       499.000000
Name: len_tokens, dtype: float64


In [294]:
print(list(zip(df_train.loc[10300, 'tokenized_content'], df_train.loc[10300, 'token_labels'])))
df_train.shape

[('对', 'O'), ('于', 'O'), ('先', 'O'), ('前', 'O'), ('超', 'O'), ('速', 'O'), ('扩', 'O'), ('张', 'O'), ('引', 'O'), ('发', 'O'), ('的', 'O'), ('种', 'O'), ('种', 'O'), ('问', 'O'), ('题', 'O'), ('，', 'O'), ('体', 'O'), ('育', 'O'), ('品', 'O'), ('牌', 'O'), ('巨', 'O'), ('头', 'O'), ('们', 'O'), ('无', 'O'), ('不', 'O'), ('在', 'O'), ('重', 'O'), ('新', 'O'), ('审', 'O'), ('视', 'O'), ('中', 'O'), ('国', 'O'), ('市', 'O'), ('场', 'O'), ('。', 'O'), ('新', 'O'), ('闻', 'O'), ('链', 'O'), ('接', 'O'), ('。', 'O'), ('1', 'O'), ('.', 'O'), ('6', 'O'), ('5', 'O'), ('亿', 'O'), ('收', 'O'), ('购', 'O'), ('凯', 'B-Company'), ('胜', 'I-Company'), ('李', 'O'), ('宁', 'O'), ('恋', 'O'), ('上', 'O'), ('羽', 'O'), ('毛', 'O'), ('球', 'O'), ('运', 'O'), ('动', 'O'), ('用', 'O'), ('品', 'O'), ('商', 'O'), ('李', 'O'), ('宁', 'O'), ('（', 'O'), ('0', 'O'), ('2', 'O'), ('3', 'O'), ('3', 'O'), ('1', 'O'), ('，', 'O'), ('h', 'O'), ('k', 'O'), ('）', 'O'), ('日', 'O'), ('前', 'O'), ('宣', 'O'), ('布', 'O'), ('1', 'O'), ('.', 'O'), ('6', 'O'), ('5', 'O'), ('亿', 'O'),

(30794, 21)

In [295]:
print(list(zip(df_valid.loc[1310, 'tokenized_content'], df_valid.loc[1310, 'token_labels'])))
df_valid.shape

[('作', 'O'), ('者', 'O'), ('p', 'O'), ('d', 'O'), ('s', 'O'), ('1', 'O'), ('(', 'O'), ('f', 'O'), ('i', 'O'), ('g', 'O'), ('h', 'O'), ('t', 'O'), ('e', 'O'), ('r', 'O'), (')', 'O'), ('。', 'O'), ('看', 'O'), ('板', 'O'), ('h', 'O'), ('o', 'O'), ('m', 'O'), ('e', 'O'), ('-', 'O'), ('s', 'O'), ('a', 'O'), ('l', 'O'), ('e', 'O'), ('。', 'O'), ('标', 'O'), ('题', 'O'), ('[', 'O'), ('新', 'O'), ('闻', 'O'), (']', 'O'), ('宏', 'O'), ('盛', 'O'), ('董', 'O'), ('事', 'O'), ('长', 'O'), ('林', 'B-Person'), ('祖', 'I-Person'), ('郁', 'I-Person'), ('︰', 'O'), ('房', 'O'), ('价', 'O'), ('难', 'O'), ('有', 'O'), ('调', 'O'), ('降', 'O'), ('空', 'O'), ('间', 'O'), ('。', 'O'), ('时', 'O'), ('间', 'O'), ('s', 'O'), ('a', 'O'), ('t', 'O'), ('j', 'O'), ('u', 'O'), ('n', 'O'), ('2', 'O'), ('7', 'O'), ('1', 'O'), ('3', 'O'), (':', 'O'), ('1', 'O'), ('8', 'O'), (':', 'O'), ('5', 'O'), ('5', 'O'), ('2', 'O'), ('0', 'O'), ('1', 'O'), ('5', 'O'), ('。', 'O'), ('内', 'O'), ('文', 'O'), (':', 'O'), ('2', 'O'), ('0', 'O'), ('1', 'O'), ('5', 

(4010, 21)

# 统计有公司的文本量和有人名的文本量

In [296]:
print(df_train[df_train['include_company']==True].shape, df_train[df_train['include_person']==True].shape, df_train.columns)
print(df_valid[df_valid['include_company']==True].shape, df_valid[df_valid['include_person']==True].shape, df_valid.columns)

(28700, 21) (25669, 21) Index(['docid', 'headline', 'content', 'ner', 'matched_keywords',
       'include_company', 'include_person', 'length', 'split_sentence_index',
       'Companys', 'Persons', 'context_cleaned', 'person_matched',
       'company_matched', 'context_keywords', 'sign', 'input_text', 'tag_char',
       'tokenized_content', 'token_labels', 'len_tokens'],
      dtype='object')
(3727, 21) (3348, 21) Index(['docid', 'headline', 'content', 'ner', 'matched_keywords',
       'include_company', 'include_person', 'length', 'split_sentence_index',
       'Companys', 'Persons', 'context_cleaned', 'person_matched',
       'company_matched', 'context_keywords', 'sign', 'input_text', 'tag_char',
       'tokenized_content', 'token_labels', 'len_tokens'],
      dtype='object')


In [297]:
df_train.shape, df_valid.shape, 

((30794, 21), (4010, 21))

# save dataframe

In [298]:
args.prefix_path

'../experiments/0606_bert_p=0.5_pos=sentence_BP'

In [299]:
df_train.to_pickle(f"{args.prefix_path}/train_data.pkl")
df_valid.to_pickle(f"{args.prefix_path}/valid_data.pkl")

In [300]:
# df_train = pd.read_pickle(f"{args.prefix_path}/train_data.pkl")
# df_valid = pd.read_pickle(f"{args.prefix_path}/valid_data.pkl")

In [253]:
for idx, row in df_train.iterrows():
    if "peets" in row['input_text'].lower():
        print(row['input_text'])
        for i, j in zip(row['tokenized_content'], row['token_labels']):
            print(i, j)
        break

# generate dataset

In [301]:
def add_padding_and_mask(example):
    tokenized_context = [tokenizer.cls_token] + example['tokenized_content'] + [tokenizer.sep_token]
    labels = example['token_labels']
    labels.insert(0, 'O')
    labels.insert(len(labels), "O")

    max_len = args.max_len
    if len(tokenized_context) > max_len: 
        tokenized_context = tokenized_context[:max_len]
        labels = labels[:max_len]
    else:
        tokenized_context = tokenized_context + [tokenizer.pad_token] * (max_len - len(tokenized_context))
        labels = labels + ['O'] * (max_len - len(labels))

    attn_mask = [1 if tok != tokenizer.pad_token else 0 for tok in tokenized_context]

    ids = tokenizer.convert_tokens_to_ids(tokenized_context)

    label_ids = [labels_to_ids[label] for label in labels]

    return {
          'ids': torch.tensor(ids, dtype=torch.long),
          'masks': torch.tensor(attn_mask, dtype=torch.long),
          'labels': torch.tensor(label_ids, dtype=torch.long)
        } 

In [302]:
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model)

train_dataset = datasets.Dataset.from_pandas(df_train[['docid', 'tokenized_content', 'token_labels']])
train_dataset = train_dataset.map(add_padding_and_mask, remove_columns=['tokenized_content', 'token_labels'])

HBox(children=(FloatProgress(value=0.0, description='Map', max=30794.0, style=ProgressStyle(description_width=…




In [303]:
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model)

valid_dataset = datasets.Dataset.from_pandas(df_valid[['docid', 'tokenized_content', 'token_labels']])
valid_dataset = valid_dataset.map(add_padding_and_mask, remove_columns=['tokenized_content', 'token_labels'])

HBox(children=(FloatProgress(value=0.0, description='Map', max=4010.0, style=ProgressStyle(description_width='…




In [304]:
train_dataset.save_to_disk(f'{args.prefix_path}/train_dataset')
valid_dataset.save_to_disk(f'{args.prefix_path}/valid_dataset')

HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=30794.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=4010.0, style=Progr…




In [305]:
# datasets.Dataset.load_from_disk(f'{args.pre_path}/dataset/headline_content_keywords/train/train/train_data')

## test

In [306]:
df_test["input_text"] = df_test["context_cleaned"] ## 全不加
# df_test["input_text"] = df_test["context_keywords"] ## 全加keywords
print(df_test.shape)
pandarallel.initialize(nb_workers=32, progress_bar=True, use_memory_fs=False)
df_test[["tokenized_content", "ids", "masks"]] = df_test.parallel_apply(tokenize_test_text, param_args=args, axis=1).to_list()

test_dataset = datasets.Dataset.from_pandas(df_test[["docid", "tokenized_content", "ids", "masks"]])

(4554, 17)
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=143), Label(value='0 / 143'))), HB…

  return asarray(a).ndim


In [307]:
# df_test.to_pickle(f"{args.pre_path}/dataset/headline_content_keywords/test/test_data_{args.experiment_name}_{args.mode}.pkl")
print(df_test.shape)
df_test.to_pickle(f"{args.prefix_path}/test_data.pkl")

(4554, 20)


In [308]:
# test_dataset.save_to_disk(f'{args.pre_path}/dataset/headline_content_keywords/test/test/test_data_new-dict')
test_dataset.save_to_disk(f'{args.prefix_path}/test_dataset')

HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=4554.0, style=Progr…




In [262]:
# df_test.head(1).to_dict('record')

# badcase dataset

In [196]:
from zhconv import convert
badcase_data = pd.read_excel("/workspace/client_project/poc/bochk/badcase/bochk_uat_20240410_ner.xlsx")

In [197]:
print(badcase_data.shape)
# badcase_data.head(), 
set(badcase_data['doc_id'].tolist()).__len__()

(193, 5)


45

In [201]:
def merge_labels(data):
    data_list = []
    for doc_id, groupdf in data.groupby("doc_id"):
        content = convert(str(groupdf['headline'].tolist()[0]) + "。" + str(groupdf['content'].tolist()[0]), 'zh-cn')
        single_dic = {
            "doc_id": doc_id, 
            "input_text": content,
            "person_matched": list(set([row['ner_keyword'] for index, row in groupdf.iterrows() if row['ner_label_type'] == "Person" and row['ner_keyword'] in content])),                 
            "company_matched": list(set([row['ner_keyword'] for index, row in groupdf.iterrows() if row['ner_label_type'] == "Company" and row['ner_keyword'] in content]))
        }
        data_list.append(single_dic)
    return pd.DataFrame(data_list)

In [202]:
badcase_test_data = merge_labels(badcase_data)
badcase_test_data['length'] = badcase_test_data['input_text'].map(len)
badcase_test_data.head()

Unnamed: 0,doc_id,input_text,person_matched,company_matched,length
0,2023120100000939360,英再有城市破产 诺定咸超支2.3亿。\n继英国第二大城市伯明翰市议会9月初宣布破产后，英国中...,[],[Nottingham],176
1,2023120100001090313,中央谷长三角 宁沪盈利复增。 江苏宁沪高速（00177）主业务是收费路桥的投资、建设、营运...,[],[],199
2,2023121900002995017,东软熙康（０９６８６）独董方唯一辞任，齐国先接任。\n\n 《经济通通讯社１９日专讯》东...,[方唯],[],240
3,2023122100001384938,毛记葵涌开市“唛高”近7倍。【明报专讯】向来交投淡静的毛记葵涌（1716），昨日在开市不久即...,[],[Ken Sir],465
4,2023122100002680716,九方财富（０９６３６）首席执行官才子辞任，主席陈文彬兼任。\n\n 《经济通通讯社２１日...,[],[],253


In [203]:
badcase_test_data[badcase_test_data['length']> 512].shape

(12, 5)

In [204]:
from zhconv import convert
import copy
def split_sentences(text):
    
    sentences_ls = re.split(r'[?!;~。？！；～>\n\r]', text)
    sentences = [sent for sent in sentences_ls if sent.strip() != '']
    # sentences = list(zip(list(range(0, len(sentences))), sentences))
    return sentences

def split_single_sentence(data: pd.DataFrame, sentence_length_limit: int=500):
    """
    讲文章按照句号分割成句子
    :param data:
    :return:
    """
    complete_data_list = []
    for index, row in data.iterrows():
        raw_single_dic = dict(row)
        content = convert(row['content'], 'zh-cn')
        processed_content_ls = split_sentences(content)
        last_sentence_index = 0
        sentence_index = 0
        for i in range(processed_content_ls.__len__()+1):
            # print(last_sentence_index, i)
            text = "。".join(processed_content_ls[last_sentence_index:i])
            if text.strip() != "":
                if text.__len__() >= sentence_length_limit:
                    # print(i, "===", text)
                    raw_single_dic_backup = copy.copy(raw_single_dic)
                    raw_single_dic_backup['content'] = "。".join(processed_content_ls[last_sentence_index:i-1])
                    raw_single_dic_backup['split_sentence_index'] = sentence_index
                    complete_data_list.append(raw_single_dic_backup)
                    last_sentence_index = i -1
                    sentence_index += 1
                else:
                    if i == processed_content_ls.__len__():
                        # print(i, "***", text)
                        raw_single_dic_backup = copy.copy(raw_single_dic)
                        raw_single_dic_backup['content'] = text
                        raw_single_dic_backup['split_sentence_index'] = sentence_index
                        complete_data_list.append(raw_single_dic_backup)
                        
                    else:
                        pass
    return pd.DataFrame(complete_data_list)

In [205]:
def get_keyword(row):
    companys = set()
    person = set()
    if isinstance(row['ner'], str):
        ner_list = eval(row['ner'])
    elif isinstance(row['ner'], list):
        ner_list = row['ner']
    # print(ner_list)
    for dic in ner_list:
        entity = convert(dic['text_segment'], 'zh-cn')
        if entity in row['content']:
            if dic['label_name']=='Company':
                companys.add(entity)
            elif dic['label_name']=='Person' :
                person.add(entity) 
    return list(companys), list(person)
    



In [206]:
badcase_test_data['content'] = badcase_test_data['input_text']
split_sentence_df = split_single_sentence(badcase_test_data)
split_sentence_df.columns

Index(['doc_id', 'input_text', 'person_matched', 'company_matched', 'length',
       'content', 'split_sentence_index'],
      dtype='object')

In [207]:
split_sentence_df['doc_id'] = split_sentence_df.apply(lambda row: str(row['doc_id'])+"&&"+str(row['split_sentence_index']), axis=1)

In [208]:
split_sentence_df['length'] = split_sentence_df['content'].map(len)

In [209]:
split_sentence_df.head()

Unnamed: 0,doc_id,input_text,person_matched,company_matched,length,content,split_sentence_index
0,2023120100000939360&&0,英再有城市破产 诺定咸超支2.3亿。\n继英国第二大城市伯明翰市议会9月初宣布破产后，英国中...,[],[Nottingham],168,英再有城市破产 诺定咸超支2.3亿。继英国第二大城市伯明翰市议会9月初宣布破产后，英国中部城...,0
1,2023120100001090313&&0,中央谷长三角 宁沪盈利复增。 江苏宁沪高速（00177）主业务是收费路桥的投资、建设、营运...,[],[],194,中央谷长三角 宁沪盈利复增。 江苏宁沪高速（00177）主业务是收费路桥的投资、建设、营运...,0
2,2023121900002995017&&0,东软熙康（０９６８６）独董方唯一辞任，齐国先接任。\n\n 《经济通通讯社１９日专讯》东...,[方唯],[],229,东软熙康（０９６８６）独董方唯一辞任，齐国先接任。 《经济通通讯社１９日专讯》东软熙康（...,0
3,2023122100001384938&&0,毛记葵涌开市“唛高”近7倍。【明报专讯】向来交投淡静的毛记葵涌（1716），昨日在开市不久即...,[],[Ken Sir],457,毛记葵涌开市“唛高”近7倍。【明报专讯】向来交投淡静的毛记葵涌（1716），昨日在开市不久即...,0
4,2023122100002680716&&0,九方财富（０９６３６）首席执行官才子辞任，主席陈文彬兼任。\n\n 《经济通通讯社２１日...,[],[],242,九方财富（０９６３６）首席执行官才子辞任，主席陈文彬兼任。 《经济通通讯社２１日专讯》九...,0


In [210]:
split_sentence_df['input_text'] = split_sentence_df['content']
pandarallel.initialize(nb_workers=32, progress_bar=True, use_memory_fs=False)
split_sentence_df[["tokenized_content", "ids", "masks"]] = split_sentence_df.parallel_apply(tokenize_test_text, param_args=args, axis=1).to_list()


INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=3), Label(value='0 / 3'))), HBox(c…

  return asarray(a).ndim


In [211]:
split_sentence_df['docid'] = split_sentence_df['doc_id']
badcase_dataset = datasets.Dataset.from_pandas(split_sentence_df[["docid", "tokenized_content", "ids", "masks"]])
badcase_dataset.save_to_disk(f'{args.prefix_path}/badcase_dataset')

HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=70.0, style=Progres…




In [212]:
split_sentence_df.to_pickle(f"{args.prefix_path}/badcase_df.pkl")

In [213]:
split_sentence_df.head()

Unnamed: 0,doc_id,input_text,person_matched,company_matched,length,content,split_sentence_index,tokenized_content,ids,masks,docid
0,2023120100000939360&&0,英再有城市破产 诺定咸超支2.3亿。继英国第二大城市伯明翰市议会9月初宣布破产后，英国中部城...,[],[Nottingham],168,英再有城市破产 诺定咸超支2.3亿。继英国第二大城市伯明翰市议会9月初宣布破产后，英国中部城...,0,"[英, 再, 有, 城, 市, 破, 产, 诺, 定, 咸, 超, 支, 2, ., 3, ...","[101, 5739, 1086, 3300, 1814, 2356, 4788, 772,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2023120100000939360&&0
1,2023120100001090313&&0,中央谷长三角 宁沪盈利复增。 江苏宁沪高速（00177）主业务是收费路桥的投资、建设、营运...,[],[],194,中央谷长三角 宁沪盈利复增。 江苏宁沪高速（00177）主业务是收费路桥的投资、建设、营运...,0,"[中, 央, 谷, 长, 三, 角, 宁, 沪, 盈, 利, 复, 增, 。, 江, 苏, ...","[101, 704, 1925, 6484, 7270, 676, 6235, 2123, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2023120100001090313&&0
2,2023121900002995017&&0,东软熙康（０９６８６）独董方唯一辞任，齐国先接任。 《经济通通讯社１９日专讯》东软熙康（...,[方唯],[],229,东软熙康（０９６８６）独董方唯一辞任，齐国先接任。 《经济通通讯社１９日专讯》东软熙康（...,0,"[东, 软, 熙, 康, （, ０, ９, ６, ８, ６, ）, 独, 董, 方, 唯, ...","[101, 691, 6763, 4224, 2434, 8020, 8028, 8037,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2023121900002995017&&0
3,2023122100001384938&&0,毛记葵涌开市“唛高”近7倍。【明报专讯】向来交投淡静的毛记葵涌（1716），昨日在开市不久即...,[],[Ken Sir],457,毛记葵涌开市“唛高”近7倍。【明报专讯】向来交投淡静的毛记葵涌（1716），昨日在开市不久即...,0,"[毛, 记, 葵, 涌, 开, 市, “, 唛, 高, ”, 近, 7, 倍, 。, 【, ...","[101, 3688, 6381, 5878, 3869, 2458, 2356, 100,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2023122100001384938&&0
4,2023122100002680716&&0,九方财富（０９６３６）首席执行官才子辞任，主席陈文彬兼任。 《经济通通讯社２１日专讯》九...,[],[],242,九方财富（０９６３６）首席执行官才子辞任，主席陈文彬兼任。 《经济通通讯社２１日专讯》九...,0,"[九, 方, 财, 富, （, ０, ９, ６, ３, ６, ）, 首, 席, 执, 行, ...","[101, 736, 3175, 6568, 2168, 8020, 8028, 8037,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2023122100002680716&&0


In [214]:
split_sentence_df[["doc_id", "content", "person_matched", "company_matched", "split_sentence_index"]].to_csv("entity_ner.csv")