# hr 数据预处理


In [27]:
import re
from my_py_toolkit.file.file_toolkit import read_file, readjson, writejson, get_file_paths

In [20]:
# Sample annotated line: one relation wrapping two entity annotations.
txt = '1、[@[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]#Relation_ide*]'
# Raw string: `\[`, `\#`, `\*` and `\]` are not valid Python string escapes, so
# a plain literal triggers an invalid-escape warning on newer interpreters.
reg_rel = r'\[@(?P<rel>.+)\#Relation_ide\*\]'

In [22]:
# Apply the relation pattern to the sample line, then show the named groups
# next to the full matched text.
match = re.search(reg_rel, txt)
match.groupdict(), match.group()

({'rel': '[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]'},
 '[@[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]#Relation_ide*]')

In [12]:
# Sample text with two entity annotations followed by plain trailing text.
txt = '[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]222'
# Raw string: `\[`, `\#`, `\*` and `\]` are not valid Python string escapes, so
# a plain literal triggers an invalid-escape warning on newer interpreters.
reg_ner = r'\[@(?P<ner_content>.*?)\#(?P<ner_name>.*?)\*\]'
# Print the captured content/label pair of every entity annotation.
for match in re.finditer(reg_ner, txt):
    print(match.groupdict())

{'ner_content': '投资计划名称', 'ner_name': 'Common_att'}
{'ner_content': '百年一武汉襄阳大厦不动产债权投资计划', 'ner_name': 'Product'}


In [6]:
def handle_ner(txt):
    """Strip ``[@content#Label*]`` entity markup from ``txt``.

    Args:
        txt: Text that may contain entity annotations of the form
            ``[@content#Label*]``.

    Returns:
        A ``(plain_text, labels)`` tuple. ``plain_text`` is ``txt`` with every
        annotation replaced by its content. ``labels`` lists, in order of
        appearance, ``(ner_name, ner_content, (start, end))`` where ``start``
        and ``end`` are offsets into ``plain_text`` (``end`` exclusive).
    """
    # Raw string: `\[`, `\#`, `\*` and `\]` are not valid Python string
    # escapes, so a plain literal triggers an invalid-escape warning.
    reg_ner = r'\[@(?P<ner_content>.*?)\#(?P<ner_name>.*?)\*\]'
    pieces = []     # fragments of the plain text, joined once at the end
    plain_len = 0   # length of the plain text assembled so far
    pre_end = 0     # end of the previous annotation in the annotated text
    labels = []
    for match in re.finditer(reg_ner, txt):
        s, e = match.span()
        # Copy the unannotated text between the previous annotation and this one.
        gap = txt[pre_end:s]
        pieces.append(gap)
        plain_len += len(gap)
        pre_end = e

        ner_name = match['ner_name']
        ner_content = match['ner_content']
        # The entity occupies the next len(ner_content) characters of the output.
        labels.append((ner_name, ner_content, (plain_len, plain_len + len(ner_content))))
        pieces.append(ner_content)
        plain_len += len(ner_content)
    # Whatever follows the last annotation is plain text.
    pieces.append(txt[pre_end:])
    return ''.join(pieces), labels



In [13]:
# Run the entity stripper on the current sample string and display the
# plain text together with the extracted labels.
handle_ner(txt)

('投资计划名称：百年一武汉襄阳大厦不动产债权投资计划222',
 [('Common_att', '投资计划名称', (0, 6)),
  ('Product', '百年一武汉襄阳大厦不动产债权投资计划', (7, 25))])

In [27]:
# Inspect whatever match object is currently bound to `match`: its span, and
# the `ner_name` group read both by subscript and through `.group()`.
match.span(), match['ner_name'], match.group('ner_name')

((22, 52), 'Product', 'Product')

In [21]:
def handle(txt):
    """Strip relation and entity markup from one annotated line.

    A relation is written ``[@ ... #Relation_ide*]`` and wraps entity
    annotations of the form ``[@content#Label*]``; entities may also appear
    outside of any relation.

    Relations are located by pairing every ``[@`` opener with its closer, so
    several relations on one line, or a standalone entity placed before a
    relation, are kept apart. (A greedy ``[@.+#Relation_ide*]`` search would
    merge them into a single span running from the first ``[@`` to the last
    relation closer.)

    Args:
        txt: One annotated line.

    Returns:
        A ``(plain_text, groups)`` tuple. ``plain_text`` is ``txt`` without
        markup. ``groups`` is a list of lists of
        ``(ner_name, ner_content, [start, end])`` items: one list per relation
        holding the entities found inside it (the list may be empty), and a
        one-item list for every entity outside a relation. Offsets index into
        ``plain_text`` with ``end`` exclusive.
    """
    rel_name = 'Relation_ide'
    # One token per markup delimiter: an opener `[@` or a closer `#<label>*]`.
    # The `label` group is None for openers.
    reg_token = r'\[@|\#(?P<label>[^\[\]#]*)\*\]'

    open_starts = []  # start offsets of openers still waiting for a closer
    rel_spans = []    # (outer_start, inner_end, outer_end) per outermost relation
    for token in re.finditer(reg_token, txt):
        label = token['label']
        if label is None:
            open_starts.append(token.start())
        elif open_starts:  # a closer without a pending opener is left as text
            start = open_starts.pop()
            if label == rel_name:
                # Keep only the outermost relation: drop any relation that
                # closed earlier but lies inside this one.
                while rel_spans and rel_spans[-1][0] >= start:
                    rel_spans.pop()
                rel_spans.append((start, token.start(), token.end()))

    # Cut the line into alternating pieces: text outside relations and the
    # inner text of each relation (without its own `[@` / `#Relation_ide*]`).
    segments = []
    pre_end = 0
    for start, inner_end, end in rel_spans:
        segments.append((txt[pre_end:start], False))
        segments.append((txt[start + len('[@'):inner_end], True))
        pre_end = end
    segments.append((txt[pre_end:], False))

    txt_handle = ''
    labels = []
    for segment, is_relation in segments:
        txt_ner, label_sub = handle_ner(segment)
        # handle_ner offsets are relative to the segment; shift them so they
        # index into the text assembled so far.
        offset = len(txt_handle)
        shifted = [
            (name, content, [pos + offset for pos in scope])
            for name, content, scope in label_sub
        ]
        if is_relation:
            labels.append(shifted)
        else:
            labels.extend([item] for item in shifted)
        txt_handle += txt_ner

    return txt_handle, labels
        


In [16]:
# Sample annotated line: one relation wrapping two entity annotations.
txt = '1、[@[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]#Relation_ide*]'

In [22]:
# Run the full relation + entity stripper on the sample line.
handle(txt)

('1、投资计划名称：百年一武汉襄阳大厦不动产债权投资计划',
 [[('Common_att', '投资计划名称', [2, 8]),
   ('Product', '百年一武汉襄阳大厦不动产债权投资计划', [9, 27])]])

In [24]:
# Load one annotated document and show how many items came back.
# NOTE(review): the '\n' argument is presumably the separator `read_file`
# splits on, giving one item per line — confirm against my_py_toolkit.
p = '../resources/dataset/hr/61_6.1 百年-武汉襄阳大厦不动产债权投资计划-募集说明书.pdf.txt.ann'
datas = read_file(p, '\n')
len(datas)

1267

In [25]:
# Run every loaded line through `handle` and keep only the lines that
# produced at least one label group.
# The comprehension keeps its loop variables local, so the notebook-level
# `txt` that other cells pass to `handle` is not overwritten by this cell.
datas_handle = [
    (line_txt, line_labels)
    for line_txt, line_labels in map(handle, datas)
    if line_labels
]
len(datas_handle)

433

In [30]:
# Collect annotation file paths under the dataset directory and count them.
# NOTE(review): ['ann'] is presumably an extension filter — confirm against
# my_py_toolkit's `get_file_paths`.
paths = get_file_paths('../resources/dataset/hr/', ['ann'])
len(paths)

351

In [26]:
# Preview the first two processed (text, labels) pairs.
datas_handle[:2]

[('百年一武汉襄阳大厦不动产债权投资计划募集说明书',
  [[('document', '百年一武汉襄阳大厦不动产债权投资计划募集说明书', [0, 23])]]),
 ('百年一武汉襄阳大厦不动产债权投资计划募集说明书',
  [[('document', '百年一武汉襄阳大厦不动产债权投资计划募集说明书', [0, 23])]])]

In [18]:
# Keep the result of `handle` on the current sample so later cells can
# inspect it.
# NOTE(review): this cell's execution count (18) is lower than that of the
# cell defining `handle` (21), so the output recorded below appears to come
# from an earlier version of the function — re-run to refresh it.
res = handle(txt)
res

rel: [@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]


('1、投资计划名称：百年一武汉襄阳大厦不动产债权投资计划[@投资计划名称：百年一武汉襄阳大厦不动产债权投资计划#Relation_ide*]',
 [[('Common_att', '投资计划名称', [2, 8]),
   ('Product', '百年一武汉襄阳大厦不动产债权投资计划', [9, 27])],
  [('Common_att', '[@投资计划名称', [27, 35])],
  [('Product', '百年一武汉襄阳大厦不动产债权投资计划', [36, 54])]])

In [19]:
# Number of label groups in the stored result.
len(res[1])

3

In [20]:
# Show the label groups of the stored result.
res[1]

[[('Common_att', '投资计划名称', [2, 8]),
  ('Product', '百年一武汉襄阳大厦不动产债权投资计划', [9, 27])],
 [('Common_att', '[@投资计划名称', [27, 35])],
 [('Product', '百年一武汉襄阳大厦不动产债权投资计划', [36, 54])]]

In [8]:
# Raw string: `\[`, `\#`, `\*` and `\]` are not valid Python string escapes, so
# a plain literal triggers an invalid-escape warning on newer interpreters.
reg_rel = r'\[@(?P<rel>.+)\#Relation_ide\*\]'
# Show the match object for the first relation found in the sample line.
re.search(reg_rel, txt)

<_sre.SRE_Match object; span=(2, 71), match='[@[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Prod>

In [9]:
# Print the full text of every relation match; `match` stays bound to the
# last match once the loop finishes.
for match in re.finditer(reg_rel, txt):
    print(match.group())

[@[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]#Relation_ide*]


In [10]:
# Inner text of the relation captured by the match object currently bound
# to `match`.
match['rel']

'[@投资计划名称#Common_att*]：[@百年一武汉襄阳大厦不动产债权投资计划#Product*]'