In [7]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib
import matplotlib.pyplot as plt
import srsly
import seaborn as sns
import re

# read the annotation table from the assets directory
df = pd.read_csv('../assets/annotations.csv')

# apply the plot-wide font settings in one call: a CJK-capable family so
# chinese characters display correctly, and a larger base font size
matplotlib.rcParams.update({
    'font.family': "Heiti TC",
    'font.size': 14,
})

# Entity Detection

After preprocessing using heuristics, what are the most common spans that occur just before the markers "云" and "作"?

In [13]:
from scripts.lib.components import doc_to_spans
from spacy.matcher import PhraseMatcher
from pathlib import Path
import spacy
from collections import defaultdict

# Tally the text of every unlabeled span that sits immediately before a
# marker span (a span containing "云" or one labeled "GRAPHIC").
ents = Counter()

for annotation in df['annotation']:
    spans = doc_to_spans(annotation)
    for i, span in enumerate(spans):
        # The first span has no predecessor. Without this guard, spans[i - 1]
        # would be spans[-1] and silently wrap around to the LAST span of the
        # annotation, counting it as if it preceded the marker.
        if i == 0:
            continue
        # NOTE(review): if doc_to_spans returns spaCy Span objects, `.label` is
        # an integer hash and this string comparison is never true (`.label_`
        # holds the string) — confirm against scripts.lib.components.
        if "云" in span.text or span.label == "GRAPHIC":
            previous = spans[i - 1]
            # only count predecessors that have no (falsy) label yet
            if not previous.label:
                # show the full annotation so the candidate can be inspected in context
                print(annotation)
                ents[previous.text] += 1

# bucket every pattern from the JSONL file under its entity label
patterns = defaultdict(list)
for entry in srsly.read_jsonl('../assets/ner_patterns.jsonl'):
    bucket = patterns[entry['label']]
    bucket.append(entry['pattern'])

# blank Chinese pipeline; used below for nlp.make_doc and its vocab
nlp = spacy.blank("zh")
matcher = PhraseMatcher(nlp.vocab)

# register one group of pattern docs per label
for label_name, pattern_texts in patterns.items():
    pattern_docs = list(map(nlp.make_doc, pattern_texts))
    matcher.add(label_name, pattern_docs)

# number of distinct candidate spans in which the phrase matcher finds at
# least one known pattern
hits = sum(1 for candidate in ents if matcher(nlp.make_doc(candidate)))

# how many distinct candidates were collected, then the 20 most frequent
print(len(ents))
print(ents.most_common(20))



精領反雜卦云通也彖云養而不窮周書云黃帝穿井世本云化益作井宋衷云化益伯益也堯臣廣雅云井深也鄭云井法也字林作井子挺反周云井以不變更爲義師說井以淸絜爲義震宫五世卦
音橘徐又居密反鄭云綆也方言云關西謂綆爲繘郭璞云汲水索也又其律反又音述
側舊反馬云爲瓦裹下達上也子夏傳云脩治也干云以􁒑壘井曰甃字林云井壁也
薦絜反止也明禮有制度之名一云分段支節之義坎宫一世卦
徐胡詣反本系也又音係續也字從􁀎若直作􁀎下糸者音口奚反非
本又作磨末何反京云相磑切也磑音古代反馬云摩切也鄭注禮記云迫也迫音百
虞陸董皆云鼓鼓動也
如字京云明也虞董姚顧蜀才並云别也音彼列反
音界注同王肅干韓云纎介也
鄭云範法也馬王肅張作犯違張云犯違猶裁成也
本亦无功字一本功作迹
衆家本並然鄭本作至賾云賾當爲動九家亦作冊
陸姚桓玄荀柔之作儀之
鄭陸蜀才作置鄭云置當爲德
本又云作易者
音也鄭陸虞姚王肅作野言妖野容儀敎誨淫泆也王肅云作野音也
延善反又注演同鄭云衍演也干云合也王廙蜀才云廣也
劉瓛悉殄反盡也王肅韓悉禮反京荀虞董張蜀才作先石經同
音尸說文云蒿屬生千歳三百莖易以爲數天子九尺諸侯七尺大夫五尺士三尺毛詩草木䟽云似藾蕭靑色科生鴻範五行傳云蓍百年一本生百莖論衡云七十歳生一莖七百歳生十莖神靈之物故生遲也史記云生滿百莖者其下必有神龜守之其上常有雲氣覆之淮南子云上有叢蓍下有伏龜
音尸說文云蒿屬生千歳三百莖易以爲數天子九尺諸侯七尺大夫五尺士三尺毛詩草木䟽云似藾蕭靑色科生鴻範五行傳云蓍百年一本生百莖論衡云七十歳生一莖七百歳生十莖神靈之物故生遲也史記云生滿百莖者其下必有神龜守之其上常有雲氣覆之淮南子云上有叢蓍下有伏龜
音尸說文云蒿屬生千歳三百莖易以爲數天子九尺諸侯七尺大夫五尺士三尺毛詩草木䟽云似藾蕭靑色科生鴻範五行傳云蓍百年一本生百莖論衡云七十歳生一莖七百歳生十莖神靈之物故生遲也史記云生滿百莖者其下必有神龜守之其上常有雲氣覆之淮南子云上有叢蓍下有伏龜
如九反王肅奴又反又女九反又如又反馬鄭陸王肅本作此宋衷王廙作揉宋云使曲者直直者曲爲揉京作柔荀作橈
力火反馬云果桃李之屬蓏瓜瓠之屬應劭云木實曰果草實曰蓏說文云在木曰果在地曰蓏張晏云有核曰果無核曰蓏京本作果墮之字
力火反馬云果桃李之屬蓏瓜瓠之屬應劭云木實曰果草實曰蓏說文云在木曰果在地曰蓏張晏云有核曰果無核曰蓏京本作果墮之字
王肅卜伯玊桓玄明僧紹作仁
許冝反字又作羲鄭云鳥獸全具曰犧孟京作戲云伏服也戲化也
力