In [97]:
%load_ext autoreload
%autoreload 1
%autoreload utils.tag_char
import os
import pandas as pd
import numpy as np
import argparse
import yaml

from datasets import Dataset
from tqdm.auto import tqdm
from tqdm import tqdm_pandas
from utils import *


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [98]:
with open("../config.yaml", 'r') as config_file:
    config = yaml.safe_load(config_file)
config

{'date': '0530',
 'experiment_name': 'bert_p=0.5_pos=sentence',
 'mode': 'BP',
 'entity_type': ['Company', 'Person'],
 'num_labels': 5,
 'threshold': 0.2,
 'train_data': '/train_dataset',
 'valid_data': '/valid_dataset',
 'test_data': '/test_dataset',
 'pretrained_model': 'hfl/chinese-roberta-wwm-ext',
 'do_training': False,
 'max_len': 512,
 'batch_size': 32,
 'n_epochs': 5,
 'lr': '1e-5',
 'do_prediction': True}

In [99]:
parser = argparse.ArgumentParser(description='') 

parser.set_defaults(**config)
args = parser.parse_args(args = [])

parser.add_argument("--prefix_path", type=str, default=f"../experiments/{args.date}_{args.experiment_name}_{args.mode}")
args = parser.parse_args(args = [])

args

Namespace(batch_size=32, date='0530', do_prediction=True, do_training=False, entity_type=['Company', 'Person'], experiment_name='bert_p=0.5_pos=sentence', lr='1e-5', max_len=512, mode='BP', n_epochs=5, num_labels=5, prefix_path='../experiments/0530_bert_p=0.5_pos=sentence_BP', pretrained_model='hfl/chinese-roberta-wwm-ext', test_data='/test_dataset', threshold=0.2, train_data='/train_dataset', valid_data='/valid_dataset')

In [100]:
labels_to_ids, ids_to_labels = define_labels(param_args=args)
labels_to_ids

{'O': 0, 'B-Company': 1, 'I-Company': 2, 'B-Person': 3, 'I-Person': 4}

## 建立实验文件夹

In [101]:
os.makedirs(args.prefix_path, exist_ok=True)

## split to train, valid, test

In [102]:
# df_data = pd.read_pickle("../data/data_starbucks_yihetang_aligned.pkl")
df_data = pd.read_excel("../bochk/manual_label_data.xlsx")
print(df_data.shape)
df_data.head(4)

(32254, 15)


Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
0,2023063021774796465&&0,,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[{'label_name': 'Company', 'text_segment': '寶申...",['宝申控股'],True,True,0,['宝申控股'],[],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,[],['宝申控股'],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申...,False
1,2023063021774796485&&0,,利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,"[{'label_name': 'Company', 'text_segment': '利駿...",['利骏集团香港'],True,True,0,"['利骏集团', '利骏集团香港']",[],利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,[],['利骏集团香港'],利骏集团香港(08360) 股东特别大会的结果。||可能的实体：利骏集团香港||利骏集团香港...,False
2,2023063021774796605&&0,,推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[{'label_name': 'Company', 'text_segment': '21...","['21世纪经济报道', '东南大学', '杨学鹏', '吴刚', '胡广杰', '黄如',...",True,True,0,"['21世纪经济报道', '东南大学']","['杨学鹏', '吴刚', '胡广杰', '黄如', '郭晨', '陈之常', '徐光辉']",推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"['杨学鹏', '吴刚', '胡广杰', '黄如', '郭晨', '陈之常', '徐光辉']","['21世纪经济报道', '东南大学']",推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,False
3,2023063021774796605&&1,,"他说,集成电路产业是我省具有较强竞争力的优势产业之一,已形成覆盖设计、制造、封测、设备、材料...","[{'label_name': 'Company', 'text_segment': '21...","['东南大学', '胡广杰', '陈之常']",True,True,1,['东南大学'],"['陈之常', '胡广杰']","他说,集成电路产业是我省具有较强竞争力的优势产业之一,已形成覆盖设计、制造、封测、设备、材料...","['胡广杰', '陈之常']",['东南大学'],"他说,集成电路产业是我省具有较强竞争力的优势产业之一,已形成覆盖设计、制造、封测、设备、材料...",False


In [103]:
df_data['person_matched'] = df_data['person_matched'].map(eval)
df_data['company_matched'] = df_data['company_matched'].map(eval)

In [104]:
df_data['sign'] = df_data.apply(lambda row: row['company_matched'].__len__() > 0 or row['person_matched'].__len__() > 0, axis=1)
df_data[df_data['sign']==False].shape

(0, 15)

In [105]:
df_data['docid'].is_unique

True

In [106]:
dataset = Dataset.from_pandas(df_data[["docid"]])

splitted_dataset = dataset.train_test_split(train_size=0.885, seed=109)
train_ = splitted_dataset['train']
split_ = train_.train_test_split(train_size=0.885, seed=109)

train_set = split_['train']
valid_set = split_['test']
test_set = splitted_dataset['test']

len(train_set), len(valid_set), len(test_set)

(25261, 3283, 3710)

In [107]:
with open(f"{args.prefix_path}/train_docid.txt", "w") as f:
    f.write("\n".join(train_set['docid']))

with open(f"{args.prefix_path}/valid_docid.txt", "w") as f:
    f.write("\n".join(valid_set['docid']))

with open(f"{args.prefix_path}/test_docid.txt", "w") as f:
    f.write("\n".join(test_set['docid']))

## read train, valid, test

In [108]:

df_all = pd.read_excel("../bochk/manual_label_data.xlsx")
df_all['person_matched'] = df_all['person_matched'].map(eval)
df_all['company_matched'] = df_all['company_matched'].map(eval)
df_all['content'] = df_all['content'].astype(str)
df_all.head(1)

Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
0,2023063021774796465&&0,,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[{'label_name': 'Company', 'text_segment': '寶申...",['宝申控股'],True,True,0,['宝申控股'],[],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,[],[宝申控股],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申...,False


In [109]:
df_all.tail()

Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
32249,997&&1,,针对中央环境保护督察指出的海域岸线自然生态和风貌破坏等问题，海南一是着力解决填海造地破坏海洋...,"[{'label_name': 'Time', 'text_segment': '2019年...",['毛超峰'],False,True,1,[],['毛超峰'],针对中央环境保护督察指出的海域岸线自然生态和风貌破坏等问题，海南一是着力解决填海造地破坏海洋...,[毛超峰],[],针对中央环境保护督察指出的海域岸线自然生态和风貌破坏等问题，海南一是着力解决填海造地破坏海洋...,False
32250,998&&1,,无独有偶。博州人才政策对于住房方面的关注在其他三四五线城市动作中也有所体现。比如，4月22日...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['国际金融报', '易居研究院智库中心', '严跃进']",True,True,1,"['国际金融报', '易居', '易居研究院', '易居研究院智库中心']",['严跃进'],无独有偶。博州人才政策对于住房方面的关注在其他三四五线城市动作中也有所体现。比如，4月22日...,[严跃进],"[国际金融报, 易居研究院智库中心]",无独有偶。博州人才政策对于住房方面的关注在其他三四五线城市动作中也有所体现。比如，4月22日...,False
32251,998&&2,,“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一位三线城市人才申报工作者对记者表示，对于...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['国际金融报', '严跃进']",True,True,2,['国际金融报'],['严跃进'],“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一位三线城市人才申报工作者对记者表示，对于...,[严跃进],[国际金融报],“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一位三线城市人才申报工作者对记者表示，对于...,False
32252,998&&3,,“西安就是一个例子。”沈路(化名)是西安本地人，刚回国就业时正好赶上抢人期。2017年3月，...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['易居研究院', '沈昕', '沈路']",True,True,3,"['易居', '易居研究院']","['沈昕', '沈路']",“西安就是一个例子。”沈路(化名)是西安本地人，刚回国就业时正好赶上抢人期。2017年3月，...,"[沈昕, 沈路]",[易居研究院],“西安就是一个例子。”沈路(化名)是西安本地人，刚回国就业时正好赶上抢人期。||可能的实体：...,False
32253,998&&4,,来自四川自贡富顺县的刘民(化名)目前选择在江苏常州创业，他对记者表示，“虽然我家乡很多人外出...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['国际金融报', '搜狐', '刘民', '刘明', '陈晨']",True,True,4,"['国际金融报', '搜狐']","['刘民', '刘明', '陈晨']",来自四川自贡富顺县的刘民(化名)目前选择在江苏常州创业，他对记者表示，“虽然我家乡很多人外出...,"[刘民, 刘明, 陈晨]","[国际金融报, 搜狐]",来自四川自贡富顺县的刘民(化名)目前选择在江苏常州创业，他对记者表示，“虽然我家乡很多人外出...,False


In [110]:
df_all[df_all['docid'] == '998&&2']

Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign
32251,998&&2,,“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一位三线城市人才申报工作者对记者表示，对于...,"[{'label_name': 'Time', 'text_segment': '二十一世紀...","['国际金融报', '严跃进']",True,True,2,['国际金融报'],['严跃进'],“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一位三线城市人才申报工作者对记者表示，对于...,[严跃进],[国际金融报],“战火”已悄悄弥漫至区、县等更小的行政单位。不过，一位三线城市人才申报工作者对记者表示，对于...,False


In [111]:
df_all['docid'].is_unique

True

In [112]:
with open(f"{args.prefix_path}/train_docid.txt", "r") as f:
    train_docid = f.read().split("\n")

with open(f"{args.prefix_path}/valid_docid.txt", "r") as f:
    valid_docid = f.read().split("\n")

with open(f"{args.prefix_path}/test_docid.txt", "r") as f:
    test_docid = f.read().split("\n")

len(train_docid), len(valid_docid), len(test_docid)

(25261, 3283, 3710)

In [113]:
df_train = df_all[df_all["docid"].isin(train_docid)].reset_index(drop=True)
df_valid = df_all[df_all["docid"].isin(valid_docid)].reset_index(drop=True)
df_test = df_all[df_all["docid"].isin(test_docid)].reset_index(drop=True)

len(df_train), len(df_valid), len(df_test)

(25261, 3283, 3710)

## train, valid

In [114]:
args.threshold == 0.5

False

In [115]:
tqdm.pandas()
df_train["input_text"] = df_train.apply(lambda x: x["context_keywords"] if np.random.random()>args.threshold else x["context_cleaned"], axis = 1)
df_valid["input_text"] = df_valid.apply(lambda x: x["context_keywords"] if np.random.random()>args.threshold else x["context_cleaned"], axis = 1)

df_train['input_text'] = df_train['context_cleaned']
df_valid['input_text'] = df_valid['context_cleaned']



In [116]:
df_train.loc[5556, 'input_text']

'希拉里参加下一任美国总统竞选。美国前国务卿希拉里。克林顿12号正式宣布,参选2016年、下届美国总统选举,表示希望成为普罗美国民众的斗士。希拉里透过官方竞选网站宣布参选,视频表达的要求,触及社会各个社群。这是希拉里继2008年,民主党内初选败于奥巴马后,第二次角逐总统选举提名,在网站发布参选视频前,她的竞选助手,已先向支持者发出电邮。预告她在不久后,将前往初选最早投票的州份,包括艾奥瓦和新罕布什尔州,展开竞选演说等活动,竞选主轴将着重社会经济不平等,以及她致力成为美国首位女总统的历史意义。希拉里:我要竞逐美国总统,美国人正努力从经济困难期恢复过来,但好处仍在向上层阶级倾斜,普罗美国民众需要一个斗士,我希望成为这个斗士。希拉里在美国政坛曾经扮演过许多不同角色,包括第一夫人、参议员和国务卿。现在她将作第二次尝试,希望成为美国首位女总统。现年67岁的希拉里,毫不掩饰希望入主白宫的愿望。希拉里也说,自己可胜任总统。希拉里:我认为这样说才公平,难道你不想有一天看到美国有位女总统。普罗美国民众需要一个斗士,我希望成为这个斗士。希拉里:我确实有独特优势及经历,知道什么便利美国运作,什么阻碍美国运作,也清楚总统可以做什么,总统应做什么'

In [117]:
df_train.loc[1, 'input_text']

'利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月三十日举行的股东特别大会投票结果(112kb, pdf)'

In [118]:
df_train.columns

Index(['docid', 'headline', 'content', 'ner', 'matched_keywords',
       'include_company', 'include_person', 'split_sentence_index', 'Companys',
       'Persons', 'context_cleaned', 'person_matched', 'company_matched',
       'context_keywords', 'sign', 'input_text'],
      dtype='object')

# tag char

In [119]:
def tag_char(df, args):
    def _tag_char(example, param_args):
        content = example['input_text'] if pd.notna(example['input_text']) else ""
        try:
            tag = ['O'] * len(str(content))

            for entity_type in param_args.entity_type:
                pos_list = []
                entity_list = example[f"{entity_type}_matched".lower()]
                if entity_list == []:
                    continue
                else:
                    for entity in entity_list:
                        # try:
                        #     pos_list.extend([(match.start(), match.end()) for match in re.finditer(entity, content)])
                        # except Exception as e:
                        #     print(entity, content)
                        #     continue
                        try:
                            pos_list.extend([(match.start(), match.end()) for match in re.finditer(entity, content)])
                        except Exception as e:
                            print(e)
                            continue
                    for (start, end) in pos_list:
                        tag[start] = f"B-{entity_type}"
                        tag[start+1:end] = [f"I-{entity_type}"] * (end - start - 1)

            assert len(content) == len(tag)
            return tag
        except:
            return None
    
    tqdm.pandas(desc='tagging char-level label')
    df['tag_char'] = df.apply(_tag_char, param_args=args, axis=1)
    return df

In [120]:
df_train = tag_char(df_train, args)


missing ), unterminated subpattern at position 13
missing ), unterminated subpattern at position 1
expected string or bytes-like object
nothing to repeat at position 0
unbalanced parenthesis at position 2
nothing to repeat at position 0
unbalanced parenthesis at position 10
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 3
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
nothing to repeat at position 0
nothing to repeat at position 0
nothing to repeat at position 0
nothing to repeat at position 0
nothing to repeat at position 0
missing ), unterminated subpattern at position 4
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated subpattern at position 0
missing ), unterminated su

In [121]:
# df_train[df_train['docid']=='995&&2'].to_dict()

In [122]:
df_valid = tag_char(df_valid, args)


unterminated character set at position 0
nothing to repeat at position 0
unbalanced parenthesis at position 4
missing ), unterminated subpattern at position 8
missing ), unterminated subpattern at position 8


# tokenize and align label

In [123]:
print(df_train.shape)
df_train = df_train[~df_train['tag_char'].isna()]
print(df_train.shape)
df_train = tokenize_and_align_labels(df_train, args)
df_train.head(3)

(25261, 17)
(25258, 17)
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=790), Label(value='0 / 790'))), HB…

  return asarray(a).ndim


Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign,input_text,tag_char,tokenized_content,token_labels
0,2023063021774796465&&0,,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[{'label_name': 'Company', 'text_segment': '寶申...",['宝申控股'],True,True,0,['宝申控股'],[],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,[],[宝申控股],宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申...,False,宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌...,"[B-Company, I-Company, I-Company, I-Company, O...","[宝, 申, 控, 股, (, 0, 8, 1, 5, 1, ), 停, 牌, /, 内, ...","[B-Company, I-Company, I-Company, I-Company, O..."
1,2023063021774796485&&0,,利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,"[{'label_name': 'Company', 'text_segment': '利駿...",['利骏集团香港'],True,True,0,"['利骏集团', '利骏集团香港']",[],利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,[],[利骏集团香港],利骏集团香港(08360) 股东特别大会的结果。||可能的实体：利骏集团香港||利骏集团香港...,False,利骏集团香港(08360) 股东特别大会的结果。利骏集团香港(08360) 于二零二三年六月...,"[B-Company, I-Company, I-Company, I-Company, I...","[利, 骏, 集, 团, 香, 港, (, 0, 8, 3, 6, 0, ), 股, 东, ...","[B-Company, I-Company, I-Company, I-Company, I..."
2,2023063021774796605&&0,,推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[{'label_name': 'Company', 'text_segment': '21...","['21世纪经济报道', '东南大学', '杨学鹏', '吴刚', '胡广杰', '黄如',...",True,True,0,"['21世纪经济报道', '东南大学']","['杨学鹏', '吴刚', '胡广杰', '黄如', '郭晨', '陈之常', '徐光辉']",推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[杨学鹏, 吴刚, 胡广杰, 黄如, 郭晨, 陈之常, 徐光辉]","[21世纪经济报道, 东南大学]",推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,False,推进科技自立自强， 国家集成电路设计自动化技术创新中心正式揭牌。21世纪经济报道记者 郭晨 ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[推, 进, 科, 技, 自, 立, 自, 强, ，, 国, 家, 集, 成, 电, 路, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [124]:
print(df_valid.shape)
df_valid = df_valid[~df_valid['tag_char'].isna()]
print(df_valid.shape)
df_valid = tokenize_and_align_labels(df_valid, args)
df_valid.head(3)

(3283, 17)
(3283, 17)
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=103), Label(value='0 / 103'))), HB…

  return asarray(a).ndim


Unnamed: 0,docid,headline,content,ner,matched_keywords,include_company,include_person,split_sentence_index,Companys,Persons,context_cleaned,person_matched,company_matched,context_keywords,sign,input_text,tag_char,tokenized_content,token_labels
0,2023063021774831849&&0,,【北水手影】转净入16.6亿 买盈富中移动 沽汇控 腾讯。港股半年结日好淡争持，收市跌17点...,"[{'label_name': 'Company', 'text_segment': '盈富...","['中移动', '美团', '盈富', '腾讯', '兖矿能源', 'JS环球生活', '中...",True,True,0,"['中移动', '美团', '小鹏', '腾讯', '盈富', '兖矿能源', '兖矿', ...",[],【北水手影】转净入16.6亿 买盈富中移动 沽汇控 腾讯。港股半年结日好淡争持，收市跌17点...,[],"[中移动, 美团, 盈富, 腾讯, 兖矿能源, JS环球生活, 中芯, 理想汽车, 工行, ...","【北水手影】转净入16.6亿 买盈富中移动 沽汇控 腾讯。||可能的实体：中移动,盈富,腾讯...",False,【北水手影】转净入16.6亿 买盈富中移动 沽汇控 腾讯。港股半年结日好淡争持，收市跌17点...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[【, 北, 水, 手, 影, 】, 转, 净, 入, 1, 6, ., 6, 亿, 买, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,2023063021774832177&&0,,茂宸集团(00273) 延迟发送通函或其他文件 / 《收购守则》所指的受要约公司刊发的公告 ...,"[{'label_name': 'Company', 'text_segment': '茂宸...",['茂宸集团控股有限公司'],True,True,0,"['茂宸集团', '茂宸集团控股有限公司']",[],茂宸集团(00273) 延迟发送通函或其他文件 / 《收购守则》所指的受要约公司刊发的公告 ...,[],[茂宸集团控股有限公司],茂宸集团(00273) 延迟发送通函或其他文件 / 《收购守则》所指的受要约公司刊发的公告 ...,False,茂宸集团(00273) 延迟发送通函或其他文件 / 《收购守则》所指的受要约公司刊发的公告 ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[茂, 宸, 集, 团, (, 0, 0, 2, 7, 3, ), 延, 迟, 发, 送, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,2023063021774832208&&0,,日本“一口马”公开招股。日本“一口马”公开招股。每年七月都是日本马迷投资“一口马”，入股做马...,"[{'label_name': 'Company', 'text_segment': '賽馬...","['星岛头条', '赛马团体Silk Racing Co. Ltd', '文杰', '国枝荣']",True,True,0,"['星岛头条', '赛马团体Silk Racing Co. Ltd']","['文杰', '国枝荣']",日本“一口马”公开招股。日本“一口马”公开招股。每年七月都是日本马迷投资“一口马”，入股做马...,"[文杰, 国枝荣]","[星岛头条, 赛马团体Silk Racing Co. Ltd]",日本“一口马”公开招股。日本“一口马”公开招股。每年七月都是日本马迷投资“一口马”，入股做马...,False,日本“一口马”公开招股。日本“一口马”公开招股。每年七月都是日本马迷投资“一口马”，入股做马...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[日, 本, [UNK], 一, 口, 马, [UNK], 公, 开, 招, 股, 。, 日...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


# remove long text

In [125]:
args.max_len

512

In [126]:
df_train['len_tokens'] = df_train['tokenized_content'].apply(lambda x: len(x))
df_train = df_train[df_train['len_tokens'] < args.max_len-2]
df_train = df_train.reset_index(drop=True)
print(df_train['len_tokens'].describe())

count    13382.000000
mean       358.544911
std        163.292806
min          4.000000
25%        215.000000
50%        441.000000
75%        501.000000
max        509.000000
Name: len_tokens, dtype: float64


In [127]:
df_valid['len_tokens'] = df_valid['tokenized_content'].apply(lambda x: len(x))
df_valid = df_valid[df_valid['len_tokens'] < args.max_len-2]
df_valid = df_valid.reset_index(drop=True)
print(df_valid['len_tokens'].describe())

count    1674.000000
mean      349.482676
std       168.519374
min         6.000000
25%       191.250000
50%       425.500000
75%       501.000000
max       509.000000
Name: len_tokens, dtype: float64


In [128]:
print(list(zip(df_train.loc[10300, 'tokenized_content'], df_train.loc[10300, 'token_labels'])))
df_train.shape

[('近', 'O'), ('期', 'O'), ('收', 'O'), ('益', 'O'), ('率', 'O'), ('排', 'O'), ('名', 'O'), ('靠', 'O'), ('前', 'O'), ('，', 'O'), ('依', 'O'), ('据', 'O'), ('货', 'O'), ('币', 'O'), ('型', 'O'), ('基', 'O'), ('金', 'O'), ('[UNK]', 'O'), ('买', 'O'), ('涨', 'O'), ('不', 'O'), ('买', 'O'), ('跌', 'O'), ('[UNK]', 'O'), ('的', 'O'), ('原', 'O'), ('则', 'O'), ('，', 'O'), ('短', 'O'), ('期', 'O'), ('可', 'O'), ('关', 'O'), ('注', 'O'), ('。', 'O'), ('低', 'O'), ('风', 'O'), ('险', 'O'), ('。', 'O'), ('立', 'O'), ('即', 'O'), ('购', 'O'), ('买', 'O'), ('。', 'O'), ('2', 'O'), ('、', 'O'), ('新', 'O'), ('发', 'O'), ('基', 'O'), ('金', 'O'), ('专', 'O'), ('区', 'O'), ('。', 'O'), ('基', 'O'), ('金', 'O'), ('代', 'O'), ('码', 'O'), ('。', 'O'), ('基', 'O'), ('金', 'O'), ('名', 'O'), ('称', 'O'), ('。', 'O'), ('开', 'O'), ('放', 'O'), ('日', 'O'), ('期', 'O'), ('。', 'O'), ('截', 'O'), ('止', 'O'), ('日', 'O'), ('期', 'O'), ('。', 'O'), ('操', 'O'), ('作', 'O'), ('。', 'O'), ('0', 'O'), ('0', 'O'), ('2', 'O'), ('7', 'O'), ('0', 'O'), ('8', 'O'), ('。', 'O'), ('大', '

(13382, 20)

In [129]:
print(list(zip(df_valid.loc[1310, 'tokenized_content'], df_valid.loc[1310, 'token_labels'])))
df_valid.shape

[('内', 'O'), ('容', 'O'), ('推', 'O'), ('荐', 'O'), ('。', 'O'), ('死', 'O'), ('得', 'O'), ('漂', 'O'), ('亮', 'O'), ('的', 'O'), ('方', 'O'), ('法', 'O'), ('是', 'O'), ('什', 'O'), ('么', 'O'), ('。', 'O'), ('总', 'O'), ('之', 'O'), ('纳', 'B-Person'), ('斯', 'I-Person'), ('丽', 'I-Person'), ('安', 'O'), ('静', 'O'), ('地', 'O'), ('死', 'O'), ('去', 'O'), ('了', 'O'), ('。', 'O'), ('本', 'O'), ('来', 'O'), ('，', 'O'), ('我', 'O'), ('想', 'O'), ('让', 'O'), ('她', 'O'), ('的', 'O'), ('尸', 'O'), ('体', 'O'), ('笔', 'O'), ('直', 'O'), ('伸', 'O'), ('展', 'O'), ('，', 'O'), ('躺', 'O'), ('在', 'O'), ('里', 'O'), ('面', 'O'), ('，', 'O'), ('不', 'O'), ('料', 'O'), ('冰', 'O'), ('柜', 'O'), ('的', 'O'), ('尺', 'O'), ('寸', 'O'), ('有', 'O'), ('点', 'O'), ('小', 'O'), ('，', 'O'), ('不', 'O'), ('弯', 'O'), ('起', 'O'), ('她', 'O'), ('的', 'O'), ('膝', 'O'), ('盖', 'O'), ('就', 'O'), ('收', 'O'), ('不', 'O'), ('进', 'O'), ('去', 'O'), ('。', 'O'), ('那', 'O'), ('模', 'O'), ('样', 'O'), ('说', 'O'), ('是', 'O'), ('吸', 'O'), ('血', 'O'), ('鬼', 'O'), ('的', 'O'), ('新', 

(1674, 20)

# 统计有公司的文本量和有人名的文本量

In [130]:
print(df_train[df_train['include_company']==True].shape, df_train[df_train['include_person']==True].shape, df_train.columns)
print(df_valid[df_valid['include_company']==True].shape, df_valid[df_valid['include_person']==True].shape, df_valid.columns)

(12329, 20) (11109, 20) Index(['docid', 'headline', 'content', 'ner', 'matched_keywords',
       'include_company', 'include_person', 'split_sentence_index', 'Companys',
       'Persons', 'context_cleaned', 'person_matched', 'company_matched',
       'context_keywords', 'sign', 'input_text', 'tag_char',
       'tokenized_content', 'token_labels', 'len_tokens'],
      dtype='object')
(1533, 20) (1391, 20) Index(['docid', 'headline', 'content', 'ner', 'matched_keywords',
       'include_company', 'include_person', 'split_sentence_index', 'Companys',
       'Persons', 'context_cleaned', 'person_matched', 'company_matched',
       'context_keywords', 'sign', 'input_text', 'tag_char',
       'tokenized_content', 'token_labels', 'len_tokens'],
      dtype='object')


In [131]:
df_train.shape, df_valid.shape, 

((13382, 20), (1674, 20))

# save dataframe

In [132]:
args.prefix_path

'../experiments/0530_bert_p=0.5_pos=sentence_BP'

In [133]:
df_train.to_pickle(f"{args.prefix_path}/train_data.pkl")
df_valid.to_pickle(f"{args.prefix_path}/valid_data.pkl")

In [134]:
# df_train = pd.read_pickle(f"{args.prefix_path}/train_data.pkl")
# df_valid = pd.read_pickle(f"{args.prefix_path}/valid_data.pkl")

In [135]:
for idx, row in df_train.iterrows():
    if "peets" in row['input_text'].lower():
        print(row['input_text'])
        for i, j in zip(row['tokenized_content'], row['token_labels']):
            print(i, j)
        break

generate dataset

In [136]:
def add_padding_and_mask(example):
    tokenized_context = [tokenizer.cls_token] + example['tokenized_content'] + [tokenizer.sep_token]
    labels = example['token_labels']
    labels.insert(0, 'O')
    labels.insert(len(labels), "O")

    max_len = args.max_len
    if len(tokenized_context) > max_len: 
        tokenized_context = tokenized_context[:max_len]
        labels = labels[:max_len]
    else:
        tokenized_context = tokenized_context + [tokenizer.pad_token] * (max_len - len(tokenized_context))
        labels = labels + ['O'] * (max_len - len(labels))

    attn_mask = [1 if tok != tokenizer.pad_token else 0 for tok in tokenized_context]

    ids = tokenizer.convert_tokens_to_ids(tokenized_context)

    label_ids = [labels_to_ids[label] for label in labels]

    return {
          'ids': torch.tensor(ids, dtype=torch.long),
          'masks': torch.tensor(attn_mask, dtype=torch.long),
          'labels': torch.tensor(label_ids, dtype=torch.long)
        } 

In [137]:
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model)

train_dataset = datasets.Dataset.from_pandas(df_train[['docid', 'tokenized_content', 'token_labels']])
train_dataset = train_dataset.map(add_padding_and_mask, remove_columns=['tokenized_content', 'token_labels'])

HBox(children=(FloatProgress(value=0.0, description='Map', max=13382.0, style=ProgressStyle(description_width=…




In [138]:
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model)

valid_dataset = datasets.Dataset.from_pandas(df_valid[['docid', 'tokenized_content', 'token_labels']])
valid_dataset = valid_dataset.map(add_padding_and_mask, remove_columns=['tokenized_content', 'token_labels'])

HBox(children=(FloatProgress(value=0.0, description='Map', max=1674.0, style=ProgressStyle(description_width='…




In [139]:
train_dataset.save_to_disk(f'{args.prefix_path}/train_dataset')
valid_dataset.save_to_disk(f'{args.prefix_path}/valid_dataset')

HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=13382.0, style=Prog…




HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=1674.0, style=Progr…




In [140]:
# datasets.Dataset.load_from_disk(f'{args.pre_path}/dataset/headline_content_keywords/train/train/train_data')

## test

In [141]:
df_test["input_text"] = df_test["context_cleaned"] ## 全不加
# df_test["input_text"] = df_test["context_keywords"] ## 全加keywords
print(df_test.shape)
pandarallel.initialize(nb_workers=32, progress_bar=True, use_memory_fs=False)
df_test[["tokenized_content", "ids", "masks"]] = df_test.parallel_apply(tokenize_test_text, param_args=args, axis=1).to_list()

test_dataset = datasets.Dataset.from_pandas(df_test[["docid", "tokenized_content", "ids", "masks"]])

(3710, 16)
INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=116), Label(value='0 / 116'))), HB…

  return asarray(a).ndim


In [145]:
# df_test.to_pickle(f"{args.pre_path}/dataset/headline_content_keywords/test/test_data_{args.experiment_name}_{args.mode}.pkl")
print(df_test.shape)
df_test.to_pickle(f"{args.prefix_path}/test_data.pkl")

(3710, 19)


In [146]:
# test_dataset.save_to_disk(f'{args.pre_path}/dataset/headline_content_keywords/test/test/test_data_new-dict')
test_dataset.save_to_disk(f'{args.prefix_path}/test_dataset')

HBox(children=(FloatProgress(value=0.0, description='Saving the dataset (0/1 shards)', max=3710.0, style=Progr…




In [147]:
df_train.head(1).to_dict('record')

  df_train.head(1).to_dict('record')


[{'docid': '2023063021774796465&&0',
  'headline': nan,
  'content': '宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌状况及继续暂停买卖之季度情况更新(558KB, pdf)',
  'ner': "[{'label_name': 'Company', 'text_segment': '寶申控股', 'start_ind': 0, 'end_ind': 4}, {'label_name': 'Company', 'text_segment': '寶申控股', 'start_ind': 31, 'end_ind': 35}]",
  'matched_keywords': "['宝申控股']",
  'include_company': True,
  'include_person': True,
  'split_sentence_index': 0,
  'Companys': "['宝申控股']",
  'Persons': '[]',
  'context_cleaned': '宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌状况及继续暂停买卖之季度情况更新(558kb, pdf)',
  'person_matched': [],
  'company_matched': ['宝申控股'],
  'context_keywords': '宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。||可能的实体：宝申控股||宝申控股(08151) 有关复牌状况及继续暂停买卖之季度情况更新(558kb, pdf)||可能的实体：宝申控股||',
  'sign': False,
  'input_text': '宝申控股(08151) 停牌 / 内幕消息 / 其他-杂项。宝申控股(08151) 有关复牌状况及继续暂停买卖之季度情况更新(558kb, pdf)',
  'tag_char': ['B-Company',
   'I-Company',
   'I-Company',
   'I-Company',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'