In [13]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline


# The model weights and the tokenizer are both loaded from the same checkpoint
# identifier, so it is named once here.
checkpoint = "RoBERTa-ext-large-chinese-finetuned-ner"

model = AutoModelForTokenClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, model_max_length=512)

# Token-classification pipeline built from the objects above.
# NOTE(review): device=1 is hard-coded; confirm the host actually exposes a
# device with that index before running this cell.
pipe = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    device=1,
    aggregation_strategy='simple',
)

In [15]:
def transform_output(entities, _id):
    """Flatten aggregated entity predictions into submission-style rows.

    Args:
        entities: Iterable of mappings, each providing the keys
            'entity_group', 'word', 'start' and 'end'.
        _id: Identifier copied unchanged into the 'ID' field of every row.

    Returns:
        A list with one dict per entity, in input order, with the keys
        'ID', 'Category', 'Pos_b', 'Pos_e' and 'Privacy' (in that order).
    """
    # Pull the four fields out of each entity lazily. Every space character is
    # removed from the word, and 'end' is lowered by one before being stored.
    extracted = (
        (item['entity_group'], item['word'].replace(' ', ''), item['start'], item['end'] - 1)
        for item in entities
    )
    return [
        {
            'ID': _id,
            'Category': label,
            'Pos_b': first,
            'Pos_e': last,
            'Privacy': surface,
        }
        for label, surface, first, last in extracted
    ]

In [16]:
import pandas as pd
import os

In [19]:
# Directory holding the test inputs; named once so the listing and the reads
# below cannot drift apart.
test_text_dir = 'data/original/test/text'

text_list = []
id_list = []
res = []
# Order files numerically by the part of the name before the first '.', so
# that e.g. '10.txt' sorts after '2.txt'. A name whose prefix is not an
# integer raises ValueError here.
for filename in sorted(os.listdir(test_text_dir), key=lambda x: int(x.split('.')[0])):
    # Read with an explicit encoding instead of the platform default, which is
    # not UTF-8 on every system; this matches the UTF-8 used for the CSV below.
    with open(os.path.join(test_text_dir, filename), 'r', encoding='utf-8') as f:
        text = f.read()
    # The sample ID is the filename prefix, kept as a string.
    id_list.append(filename.split('.')[0])
    text_list.append(text)

# Run every text through the pipeline in batches of 64; the result is consumed
# below as one list of entities per input text, in the same order.
output_list = pipe(text_list, batch_size=64)
for _id, output in zip(id_list, output_list):
    output = transform_output(output, _id)
    res.extend(output)

# One row per predicted entity across all files, written without the index column.
df = pd.DataFrame(res)
df.to_csv('data/original/test/predict.csv', encoding='utf-8', index=False)

In [20]:
# Preview the first five prediction rows (five is the default row count of head()).
df.head()

Unnamed: 0,ID,Category,Pos_b,Pos_e,Privacy
0,0,book,0,10,《别告诉我你懂ppt》
1,0,book,11,23,《不懂项目管理还敢拼职场》
2,0,book,24,31,《让营销更性感》
3,0,position,33,34,作者
4,0,name,35,36,李治
