In [29]:
from colorama import Fore

In [14]:
!pip install --user xlrd



In [15]:
!tree -h --sort=size 開放權威檔/DDBC-Place

!tree -h --sort=size 開放權威檔/CBDB-Place

# Setup functions

In [181]:
import pandas as pd
import os
import re

def _filter_empty(s):
    """Return True when `s` is not equal to the empty string."""
    return s != ''


def filter_empty(l):
    """Return a new list holding the items of `l` that are not equal to ''."""
    return list(filter(_filter_empty, l))


# Sanity check: splitting on parentheses leaves a trailing '' that must be dropped.
assert filter_empty(re.split('[()]', '某氏(徐庶母)')) == ['某氏', '徐庶母']

def split_aliases(names):
    """Split each name into its alias parts and return them as one flat list.

    Every string in `names` is split on '=', on ASCII parentheses and on
    full-width parentheses; empty pieces produced by the split are dropped.

    Args:
        names: iterable of values, normally strings.  Values that are not
            strings (for instance the NaN floats pandas typically yields for
            empty spreadsheet cells) are skipped.

    Returns:
        list of non-empty alias strings, in input order.
    """
    results = []
    for name in names:
        # Skip non-text values explicitly instead of catching the TypeError
        # that re.split would raise for them.
        if not isinstance(name, str):
            continue
        results.extend(part for part in re.split('[=()（）]', name) if part)
    return results

def get_entities(data_dirpath, col_name):
    """Gather the alias-split values of column `col_name` from every spreadsheet under `data_dirpath`.

    Sub-directories are visited recursively.  Directory entries that are
    neither a directory nor a regular file are reported and skipped.  Every
    regular file is read with `pd.read_excel`, and the values of `col_name`
    are passed through `split_aliases`.

    Returns:
        list with the entities of all files, in directory-listing order.
    """
    collected = []

    for entry in os.listdir(data_dirpath):
        entry_path = os.path.join(data_dirpath, entry)

        # Nested folder: recurse and merge whatever it contains.
        if os.path.isdir(entry_path):
            print(entry_path, 'is a directory. Go inside!')
            collected += get_entities(entry_path, col_name)
            continue

        # Anything that is not a regular file is ignored.
        if not os.path.isfile(entry_path):
            print(entry_path, 'is not a file. Skipped')
            continue

        sheet = pd.read_excel(entry_path)
        collected += split_aliases(sheet[col_name].to_list())

    return collected

def filter_symbols(es):
    """Return the entries of `es` that contain none of the special symbols.

    Args:
        es: iterable of strings.

    Returns:
        list of the strings that share no character with the symbol set
        below, in input order.  The empty string is kept.
    """
    # Build the lookup set once instead of once per entity.
    special_symbols = frozenset('！？!?［］[]{}｛｝＠＃＄％︿＆＊＿＋｜＂：﹀＜～／－~@#$%^&*_-+=\\|"\';:></')

    return [e for e in es if special_symbols.isdisjoint(e)]

def main(data_dirpath, col_name, segmenter_output_fname, regexner_output_fname, tag, overridden_tag, min_len=2):
    all_entities = get_entities(data_dirpath, col_name)
    print('(info) total', len(all_entities))
    all_entities = filter_symbols(all_entities)
    print('(proc) after removing entites with symbols:', len(all_entities))
    all_entities = list(set(all_entities))
    print('(proc) after removing duplicates:', len(all_entities))
    from collections import Counter
    print('(info)', Counter(map(len, all_entities)))
    df = pd.DataFrame({"entity": all_entities, "tag": [tag] * len(all_entities), "override": [overridden_tag] * len(all_entities)})
    print('(proc) built data frame', df.head())
    print('(info) long entity', df.query('entity.str.len() > 5').head())
    
    if min_len:
        print(f'(proc) removing length < {min_len}:', df.query(f'entity.str.len() < {min_len}').head())
        df = df.query(f'entity.str.len() >= {min_len}')
        print('(info)', f'after removing length < {min_len}:', len(df))

    df.to_csv(regexner_output_fname, header=None, index=None, sep='\t')
    print(f'(proc) saving file to {regexner_output_fname}...')
    df.to_csv(segmenter_output_fname, header=None, index=None, columns=['entity'], sep='\t')
    print(f'(proc) saving file to {segmenter_output_fname}...')
    
    # copy to inside docker container
    print(f'(proc) copying {regexner_output_fname} to inside docker container ...')
    !docker cp {regexner_output_fname} corenlp_zh:/stanford-corenlp-full-2018-10-05


    print(f'(proc) copying {segmenter_output_fname} to inside docker container ...')
    !docker cp {segmenter_output_fname} corenlp_zh:/stanford-corenlp-full-2018-10-05
        
    # check
    print('(check) if existing in local ...')
    !head -n 2 {regexner_output_fname}
    !wc -l {regexner_output_fname}

    !head -n 2 {segmenter_output_fname}
    !wc -l {segmenter_output_fname}
    
    # check in docker
    print('(check) if existing in docker container ...')
    !docker exec corenlp_zh head -n 2 {regexner_output_fname}
    !docker exec corenlp_zh wc -l {regexner_output_fname}

    !docker exec corenlp_zh head -n 2 {segmenter_output_fname}
    !docker exec corenlp_zh wc -l {segmenter_output_fname}

# Main Process

In [184]:
# Base directory of the spreadsheets; each data set's folder name is joined onto it below.
root_dir = '開放權威檔'

## CBDB-Place

# input: read the '行政區' column of the CBDB-Place spreadsheets and tag the entries as GPE
data_dir = 'CBDB-Place'
col_name = '行政區'
tag = 'GPE'
overridden_tag = 'O,PERSON'
segmenter_output_fname = 'cbdb-gpe-segmenter.txt'
regexner_output_fname = 'cbdb-gpe-regexner.txt'

data_dirpath = os.path.join(root_dir, data_dir)
main(
    data_dirpath=data_dirpath,
    col_name=col_name,
    segmenter_output_fname=segmenter_output_fname,
    regexner_output_fname=regexner_output_fname,
    tag=tag,
    overridden_tag=overridden_tag,
)

## DDBC-Place

# input: read the '名' column of the DDBC-Place spreadsheets and tag the entries as LOCATION
data_dir = 'DDBC-Place'
col_name = '名'
tag = 'LOCATION'
overridden_tag = 'O,PERSON'
segmenter_output_fname = 'ddbc-place-segmenter.txt'
regexner_output_fname = 'ddbc-place-regexner.txt'

data_dirpath = os.path.join(root_dir, data_dir)
main(
    data_dirpath=data_dirpath,
    col_name=col_name,
    segmenter_output_fname=segmenter_output_fname,
    regexner_output_fname=regexner_output_fname,
    tag=tag,
    overridden_tag=overridden_tag,
)

## Generate properties file

In [124]:
# Dictionary files to register in the properties file.  Both lists share the
# same prefixes and differ only in the suffix.
# NOTE(review): the "*-person-*" files are not written by the cells visible
# here - presumably they come from another run; confirm they exist.
_dictionary_prefixes = ['cbdb-person', 'ddbc-person', 'cbdb-gpe', 'ddbc-place']
regexner_files = [f'{prefix}-regexner.txt' for prefix in _dictionary_prefixes]
segmenter_files = [f'{prefix}-segmenter.txt' for prefix in _dictionary_prefixes]

In [193]:
props_inpath = 'StanfordCoreNLP-chinese-fgc-template.properties'
props_outpath = 'StanfordCoreNLP-chinese-fgc.properties'

print('(proc) generating properties file ...')
!cp {props_inpath} {props_outpath}

old_str = '#ner.additional.regexner.mapping = ...'
new_str = 'ner.additional.regexner.mapping = ' + ','.join(regexner_files)
!sed -i 's/{old_str}/{new_str}/g' {props_outpath}

old_str = 'segment.serDictionary = edu\/stanford\/nlp\/models\/segmenter\/chinese\/dict-chris6.ser.gz'
new_str = old_str + ',' + ','.join(segmenter_files)
!sed -i 's/{old_str}/{new_str}/g' {props_outpath}

print('(modified part)')
!diff --color {props_inpath} {props_outpath}

print('(proc) moving to docker container')
!docker cp {props_outpath} corenlp_zh:/stanford-corenlp-full-2018-10-05/{props_outpath}
print('(check) if moved')
!docker exec corenlp_zh grep 'segment.serDictionary' {props_outpath}
!docker exec corenlp_zh grep 'ner.additional.regexner.mapping' {props_outpath}

# Restart CoreNLPServer

In [6]:
# kill existent server
ps_str = !docker exec corenlp_zh ps | grep StanfordCoreNLPServer
if ps_str:
    print('(proc) killing server ps:', ps_str[0])
    pid = int(ps_str[0].strip().split(' ')[0])
    p = !docker exec corenlp_zh kill {pid} && ps | grep StanfordCoreNLPServer
    assert not p, p

# start a new server
print('(proc) starting a new server')
!docker exec -d corenlp_zh ash -c "java -mx8g -cp '*' edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000 -serverProperties StanfordCoreNLP-chinese-fgc.properties &> run.log" && docker exec corenlp_zh ps -a | grep StanfordCoreNLPServer

(proc) starting a new server
   11 root      0:00 java -mx8g -cp * edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000 -serverProperties StanfordCoreNLP-chinese-fgc.properties


# Test Results

In [5]:
!pip install stanfordnlp

Collecting stanfordnlp
[?25l  Downloading https://files.pythonhosted.org/packages/41/bf/5d2898febb6e993fcccd90484cba3c46353658511a41430012e901824e94/stanfordnlp-0.2.0-py3-none-any.whl (158kB)
[K     |████████████████████████████████| 163kB 189kB/s eta 0:00:01
[?25hCollecting protobuf (from stanfordnlp)
[?25l  Downloading https://files.pythonhosted.org/packages/ff/f1/8dcd4219bbae8aa44fe8871a89f05eca2dca9c04f8dbfed8a82b7be97a88/protobuf-3.11.3-cp37-cp37m-manylinux1_x86_64.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 71.6MB/s eta 0:00:01
Collecting torch>=1.0.0 (from stanfordnlp)
[?25l  Downloading https://files.pythonhosted.org/packages/1a/3b/fa92ece1e58a6a48ec598bab327f39d69808133e5b2fb33002ca754e381e/torch-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 6.7kB/s eta 0:00:010   |▏                               | 4.2MB 48.6MB/s eta 0:00:16
Installing collected packages: protobuf, torch, stanfordnlp
Successfully insta

In [2]:
import sys
sys.path.append('../wiki_kb_inference')
from stanfordnlp_utils import *
from fgc_utils import *
from utils import load_json
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

from stanfordnlp.server import CoreNLPClient

In [3]:
# Load the documents used by the tagging checks below (load_json is imported from the local `utils` module).
docs = load_json('../wiki_kb_inference/FGC_release_all(cn).json')

In [11]:
# Collect the 'DID' value of every loaded document.
dids = [doc['DID'] for doc in docs]
# Display the lookup result for the first id as a quick look at the data.
# NOTE(review): get_doc comes from one of the star imports; it appears to
# return the document dict matching the given DID - confirm.
get_doc(dids[0], docs)
# dids

{'DID': 'D001',
 'DTEXT': '蘇軾（1037年1月8日－1101年8月24日），眉州眉山（今四川省眉山市）人，北宋時著名的文學家、政治家、藝術家、醫學家。字子瞻，一字和仲，號東坡居士、鐵冠道人。嘉佑二年進士，累官至端明殿學士兼翰林學士，禮部尚書。南宋理學方熾時，加賜諡號文忠，複追贈太師。有《東坡先生大全集》及《東坡樂府》詞集傳世，宋人王宗稷收其作品，編有《蘇文忠公全集》。\n其散文、詩、詞、賦均有成就，且善書法和繪畫，是文學藝術史上的通才，也是公認韻文散文造詣皆比較傑出的大家。蘇軾的散文為唐宋四家（韓愈、柳宗元、歐蘇）之末，與唐代的古文運動發起者韓愈並稱為「韓潮蘇海」，也與歐陽修並稱「歐蘇」；更與父親蘇洵、弟蘇轍合稱「三蘇」，父子三人，同列唐宋八大家。蘇軾之詩與黃庭堅並稱「蘇黃」，又與陸游並稱「蘇陸」；其詞「以詩入詞」，首開詞壇「豪放」一派，振作了晚唐、五代以來綺靡的西崑體餘風。後世與南宋辛棄疾並稱「蘇辛」，惟蘇軾故作豪放，其實清朗；其賦亦頗有名氣，最知名者為貶謫期間借題發揮寫的前後《赤壁賦》。宋代每逢科考常出現其文命題之考試，故當時學者曰：「蘇文熟，喫羊肉、蘇文生，嚼菜羹」。藝術方面，書法名列「蘇、黃、米、蔡」北宋四大書法家（宋四家）之首；其畫則開創了湖州畫派；並在題畫文學史上佔有舉足輕重的地位。',
 'QUESTIONS': [{'QID': 'D001Q01',
   'QTYPE': '基础题',
   'QTEXT': '蘇東坡在中國歷史上，是哪一個朝代的人？',
   'SENTS': [{'text': '苏东坡在中国历史上，', 'start': 0, 'end': 10},
    {'text': '是哪一个朝代的人？', 'start': 10, 'end': 19}],
   'ANSWER': [{'ATEXT': '北宋',
     'ATOKEN': [{'text': '北宋', 'start': 40}],
     'ATEXT_CN': '北宋'}],
   'ATYPE': 'Date-Duration',
   'AMODE': 'Single-Span-Extraction',
   'ASPAN': [{'text': '苏轼', 'start': 0, 'end': 2},
    {'te

In [15]:

def _print_tagged_sentences(endpoint, text, properties):
    """Annotate `text` on the CoreNLP server at `endpoint` and print each sentence.

    Every sentence is prefixed with its index and rendered through
    `snp_pprint` in 'custom' mode with the entity classes listed below.
    The client attaches to an already running server (`start_server=False`).
    """
    with CoreNLPClient(endpoint=endpoint, start_server=False) as nlp:
        doc = nlp.annotate(text, properties=properties)

        for sent in doc.sentence:
            print(f'(s{sent.sentenceIndex})', end=' ')
            snp_pprint(sent, mode='custom', classes_w_color=['PERSON', 'GPE', 'LOCATION', 'MISC', 'TITLE'])


@interact_manual
def pretty_corenlp(did=dids):
    """Print the tagging of one document from the local server and from the remote server.

    Args:
        did: document id; the default `dids` makes the widget offer all ids
            as choices.
    """
    doc_dic = get_doc(did, docs)
    print('[New]')
    # Local server: only the sentence-boundary regex is passed here.
    _print_tagged_sentences(
        'http://localhost:9000',
        doc_dic['DTEXT_CN'],
        {'ssplit.boundaryTokenRegex': '[。]|[!?！？]+'},
    )
    print('-----------------------------------------------------')
    print('[Old]')
    # Remote server: the pipeline language is passed explicitly as well.
    _print_tagged_sentences(
        'http://140.109.19.191:9000',
        doc_dic['DTEXT_CN'],
        {'ssplit.boundaryTokenRegex': '[。]|[!?！？]+',
         'pipelineLanguage': 'zh'},
    )
        

interactive(children=(Dropdown(description='did', options=('D001', 'D002', 'D003', 'D004', 'D006', 'D007', 'D0…

In [3]:
# Annotate the first document on the server at 140.109.19.191 with an explicit
# annotator list.  The client is used as a context manager so it is closed
# once the annotation has been fetched.
with CoreNLPClient(endpoint='http://140.109.19.191:9000', start_server=False) as nlp:
    doc = nlp.annotate(docs[0]['DTEXT_CN'],
                       properties={'ssplit.boundaryTokenRegex': '[。]|[!?！？]+',
                                   'pipelineLanguage': 'zh'},
                       annotators='tokenize,ssplit,pos,lemma,ner,depparse,parse,coref,entitylink')

for sent in doc.sentence:
    print(f'(s{sent.sentenceIndex})', end=' ')
    snp_pprint(sent)

(s0) 

TypeError: object of type 'NoneType' has no len()

In [4]:
# Annotate the fourth document on the local server.  The client is used as a
# context manager so it is closed once the annotation has been fetched.
with CoreNLPClient(endpoint='http://localhost:9000', start_server=False) as nlp:
    doc = nlp.annotate(docs[3]['DTEXT_CN'], properties={'ssplit.boundaryTokenRegex': '[。]|[!?！？]+'})

for sent in doc.sentence:
    print(f'(s{sent.sentenceIndex})', end=' ')
    snp_pprint(sent)

(s0) 

TypeError: object of type 'NoneType' has no len()

In [68]:
# Annotate the fourth document on the server at 140.109.19.191.  The client is
# used as a context manager so it is closed once the annotation has been fetched.
with CoreNLPClient(endpoint='http://140.109.19.191:9000', start_server=False) as nlp:
    doc = nlp.annotate(docs[3]['DTEXT_CN'], properties={'ssplit.boundaryTokenRegex': '[。]|[!?！？]+',
                                                        'pipelineLanguage': 'zh'
#                                                  'segment.serDictionary': 'edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz,ddbc-person-segmenter.txt,cbdb-person-segmenter.txt',
#                                                     'ner.additional.regexner.mapping': 'ddbc-person-regexner.txt,cbdb-person-regexner.txt'
                                                       })

for sent in doc.sentence:
    print(f'(s{sent.sentenceIndex})', end=' ')
    snp_pprint(sent)

(s0) [35m元[0m [35m祐[0m [44m元年[0m [39m（[0m [44m1086年[0m [39m）[0m [39m，[0m [41m宋哲宗[0m [39m即位[0m [39m，[0m [39m高[0m [39m太[0m [39m皇太[0m [39m后[0m [39m垂帘听政[0m [39m，[0m [39m回朝[0m [39m任[0m [45m礼部[0m [45m郎中[0m [39m、[0m [1;40m中书舍人[0m [39m、[0m [39m翰林[0m [39m学士[0m [39m，[0m [35m元祐[0m [33m四[0m [35m年[0m [39m（[0m [44m1089年[0m [39m）[0m [39m拜[0m [39m龙图阁[0m [39m学士[0m [39m，[0m [39m曾[0m [39m出任[0m [42m杭州[0m [39m、[0m [32m颍州[0m [39m等[0m [39m知州[0m [39m职务[0m [39m，[0m [39m官[0m [39m至[0m [39m礼部[0m [39m尚书[0m [39m。[0m
(s1) [35m绍圣[0m [44m元年[0m [39m（[0m [44m1094年[0m [39m）[0m [39m被[0m [39m哲宗[0m [39m贬谪[0m [39m至[0m [42m惠州[0m [39m、[0m [42m儋州[0m [39m（[0m [1;32m[46m海南岛[0m[0m [39m）[0m [39m。[0m
(s2) [35m元符[0m [33m三[0m [35m年[0m [39m（[0m [44m1100年[0m [39m）[0m [39m，[0m [39m宋徽宗[0m [39m即位[0m [39m，[0m [39m向[0m [39m太后[0m [39m垂帘听政[0m [39m，[0m [39m下诏[0m [39m让[0m [41m苏轼北[

In [106]:
# Annotate document index 27 on the local server.  The client is used as a
# context manager so it is closed once the annotation has been fetched.
with CoreNLPClient(endpoint='http://localhost:9000', start_server=False) as nlp:
    doc = nlp.annotate(docs[27]['DTEXT_CN'], properties={'ssplit.boundaryTokenRegex': '[。]|[!?！？]+',
#                                                  'segment.serDictionary': 'edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz,ddbc-person-segmenter.txt,cbdb-person-segmenter.txt',
#                                                     'ner.additional.regexner.mapping': 'ddbc-person-regexner.txt,cbdb-person-regexner.txt'
                                                        })

for sent in doc.sentence:
    print(f'(s{sent.sentenceIndex})', end=' ')
    snp_pprint(sent)

(s0) [39m「[0m [39mPTT[0m [39m创世神[0m [39m」[0m [41m杜奕瑾[0m [39m创办[0m [39m的[0m [45m台湾[0m [45m人工[0m [45m智慧[0m [45m实验室[0m [39m推出[0m [39m「[0m [39m雅婷[0m [39m逐字稿[0m [39m」[0m [39mApp[0m [39m，[0m [39m已[0m [39m在[0m [39miOS[0m [39m与[0m [39mAndroid[0m [39m平台[0m [39m上[0m [39m线[0m [39m，[0m [39m官方[0m [39m表示[0m [39m能[0m [39m节省[0m [39m至少[0m [34m[43m60%[0m[0m [39m的[0m [39m听打[0m [39m时间[0m [39m，[0m [39m还[0m [39m听[0m [39m得[0m [39m懂[0m [1;40m台湾[0m [1;40m国语[0m [39m和[0m [39m中英[0m [39m夹杂[0m [39m。[0m
(s1) [39m根据[0m [39mApp[0m [39m官方[0m [39m介绍[0m [39m，[0m [39m由[0m [45m台湾[0m [45m人工[0m [45m智慧[0m [45m实验室[0m [39m（[0m [39mAILabs[0m [39m）[0m [39m推出[0m [39m的[0m [39m「[0m [45m雅婷[0m [45m逐字稿[0m [39m」[0m [39mApp[0m [39m除了[0m [39m能[0m [39m即时[0m [39m做[0m [39m语音[0m [39m转[0m [39m文字[0m [39m，[0m [39m也可以[0m [39m用来[0m [39m提升[0m [39m听障人[0m [39m沟通[0m [39m效率[0m [39m。[0m
(s2

In [104]:
# Annotate document index 25 on the server at 140.109.19.191.  The client is
# used as a context manager so it is closed once the annotation has been fetched.
with CoreNLPClient(endpoint='http://140.109.19.191:9000', start_server=False) as nlp:
    doc = nlp.annotate(docs[25]['DTEXT_CN'], properties={'ssplit.boundaryTokenRegex': '[。]|[!?！？]+',
                                                         'pipelineLanguage': 'zh'
#                                                  'segment.serDictionary': 'edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz,ddbc-person-segmenter.txt,cbdb-person-segmenter.txt',
#                                                     'ner.additional.regexner.mapping': 'ddbc-person-regexner.txt,cbdb-person-regexner.txt'
                                                        })

for sent in doc.sentence:
    print(f'(s{sent.sentenceIndex})', end=' ')
    snp_pprint(sent)

(s0) [39m根据[0m [39m「[0m [39m悬浮[0m [39m微粒[0m [39m特征[0m [39m对[0m [39m民众[0m [39m健康[0m [39m影响[0m [39m之[0m [39m研究[0m [39m」[0m [39m，[0m [39m捷运[0m [39m竟是[0m [39mPM2.5[0m [39m浓度[0m [39m暴露[0m [39m最[0m [39m高[0m [39m的[0m [39m交通[0m [39m工具[0m [39m。[0m
(s1) [45m行政院[0m [39m环境[0m [39m保护[0m [39m署[0m [44m今天[0m [39m表示[0m [39m，[0m [39m这[0m [39m项[0m [39m结果[0m [39m只是[0m [39m瞬间[0m [39m数值[0m [39m，[0m [39m平均[0m [39m浓度[0m [39m最[0m [39m高[0m [39m的[0m [39m仍然[0m [39m是[0m [39m机车[0m [39m。[0m
(s2) [45m环保署[0m [39m与[0m [45m国卫院[0m [45m国家[0m [45m环境[0m [45m医学[0m [45m研究所[0m [44m106年[0m [39m度[0m [39m针对[0m [39m针对[0m [39m捷运[0m [39m、[0m [39m公车[0m [39m、[0m [39m汽车[0m [39m、[0m [39m机车[0m [39m、[0m [39m步行[0m [39m、[0m [39m脚踏车[0m [39m等[0m [33m6[0m [39m大[0m [39m交通[0m [39m方式[0m [39m，[0m [39m进行[0m [39m「[0m [39m悬浮[0m [39m微粒[0m [39m特征[0m [39m对[0m [39m民众[0m [39m健康[0m [39m影响[0m