## Check dataset existence

In [35]:
# check data existence
from pathlib import Path


data_folder = Path.cwd().parent.joinpath("data/processed")

def check_data_existence(folder):
    file_count = len(list(folder.glob("e*_ann.json")))
    if  file_count == 0:
        raise Exception("Processed Data does not exist.")
    else:
        print("{} files exist.".format(file_count))


check_data_existence(data_folder)

230 files exist.


## Read Slot2 data to DataFrame

In [36]:
label_kinds = []

# make labels (exclude NULL and OOD)
for e in ["market", "company", "business", "product"]:
    for a in ["general", "sales", "profit", "amount", "price", "cost"]:
        label_kinds.append(e + "#" + a)
        if e in ["market"]:
            break;

print(label_kinds)

['market#general', 'company#general', 'company#sales', 'company#profit', 'company#amount', 'company#price', 'company#cost', 'business#general', 'business#sales', 'business#profit', 'business#amount', 'business#price', 'business#cost', 'product#general', 'product#sales', 'product#profit', 'product#amount', 'product#price', 'product#cost']


In [37]:
import json
import pandas as pd
from collections import Counter
from janome.tokenizer import Tokenizer


dataset = []
labels = []

for f in data_folder.glob("e*_ann.json"):
    with f.open(encoding="utf-8") as j:
        d = json.load(j)
        for s in d["sentences"]:
            os = []
            cs = []
            ws = []
            for o in s["opinions"]:
                category = o["category"]
                word = o["target"]
                if category in label_kinds and category not in cs:
                    cs.append(category)
                    ws.append(word)
                    os.append((o["from"], o["to"]))
            
            if len(cs) > 0:
                dataset.append(
                    {"sentence": s["sentence"], 
                     "categories": cs,
                     "words": ws}
                )
                labels.append(os)

dataset = pd.DataFrame(dataset)
print(dataset.head(5))
print(labels[:5])

                        categories  \
0  [company#sales, company#profit]   
1                 [product#amount]   
2                 [product#amount]   
3                 [product#amount]   
4                  [product#price]   

                                            sentence             words  
0  以上の結果、当連結会計年度の当社グループの業績は、売上高631億19百万円（前期比3.5％増...  [当社グループ, 当社グループ]  
1  なお、当連結会計年度の生産量は、ブナピーを含めブナシメジ42,602ｔ（同5.5％増）、エリ...           [ブナシメジ]  
2  平成27年４月の火災により生産を休止していた苫小牧第一きのこセンターが、工場を再建し、平成2...           [ブナシメジ]  
3  また、改修のため一時生産を休止しておりました広川きのこセンターにおきまして、平成28年９月上...             [きのこ]  
4      春から夏にかけましては個人消費の低迷などにより、きのこの価格は厳しい状況で推移いたしました             [きのこ]  
[[(14, 20), (14, 20)], [(23, 28)], [(131, 136)], [(75, 78)], [(24, 27)]]


## Make Baseline Model

In [87]:
import numpy as np


class DetectEntities():
    
    def __init__(self):
        self.category_dict = {}
    
    def fit(self, X, y=None):
        self.category_dict = {}
        for index, row in X.iterrows():
            for c, w in zip(row["categories"], row["words"]):
                if c not in self.category_dict:
                    self.category_dict[c] = []
                self.category_dict[c].append(w)

        for c in self.category_dict:
            cnt = Counter(self.category_dict[c])
            freq = sorted(cnt.most_common(), key=lambda x: x[1])
            self.category_dict[c] = [f[0] for f in freq]

        return self
    
    def search(self, sentence, word):
        from_index = sentence.find(word)
        if from_index > -1:
            to_index = from_index + len(word)
            return (from_index, to_index)
        else:
            return None
    
    def predict(self, X, copy=True):
        predictions = []
        for index, row in X.iterrows():
            s = row["sentence"]
            preds = []
            for c in row["categories"]:
                if c not in self.category_dict:
                    continue
                for w in self.category_dict[c]:
                    p = self.search(s, w)
                    if p is not None:
                        preds.append(p)
                        break
            predictions.append(preds)

        return np.array(predictions)


model = DetectEntities()

In [112]:
model.fit(dataset)
sample_indices = np.random.randint(0, len(dataset), 3)
samples = dataset.iloc[sample_indices, :]
pred_samples = model.predict(samples)
true_samples = np.array(labels)[sample_indices]

cnt = 0
for i, row in samples.iterrows():
    print("sentence: {}".format(row["sentence"]))
    print("predicted: {}".format(pred_samples[cnt]))
    print("true: {}".format(true_samples[cnt]))
    cnt += 1
    print("---------------------------------------")

sentence: 事業ごとの状況をみますと、電線については、巻線や建設向けが減少しましたが、注力分野である鉄道車両用電線が中国向けを中心に大きく伸長しました
predicted: [(21, 23)]
true: [(21, 23)]
---------------------------------------
sentence: これらの結果、国際貨物取扱業の営業収益は6,260,681千円（前期比1.1％増）、セグメント利益（営業利益）は431,616千円（前期比13.6％増）となりました
predicted: [(7, 14), (7, 14)]
true: [(7, 14), (7, 14)]
---------------------------------------
sentence: 当社グループが係わる法人向けICT(*1)関連市場は、クラウドコンピューティングの普及を始めとする企業情報システムの変化、企業活動におけるビッグデータやIoT(*)等のICT利活用の進展、情報漏洩等に対応するセキュリティ需要の高まり、4K(*)配信等に伴うネットワーク利用の増大等により、継続的に拡大していくものと認識しております
predicted: [(21, 25)]
true: [(10, 25)]
---------------------------------------


In [113]:
from sklearn.model_selection import ShuffleSplit


def f1_score(preds, trues):
    # calculate by micro
    matches = 0
    for_recall = 0
    for_precision = 0

    def safe_div(x1, x2):
        return 0 if x2 == 0 else x1 / x2

    for p, t in zip(preds, trues):
        for_precision += len(p)
        for_recall += len(t)
        for _p in p:
            if _p in t:
                matches += 1

    recall = safe_div(matches, for_recall)
    precision = safe_div(matches, for_precision)
    f1 = safe_div(2 * (precision * recall), (precision + recall))
    return f1, precision, recall


X = dataset
y = np.array(labels)
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)


f1s = []
prs = []
rcs = []
for train_index, test_index in cv.split(X):
    model.fit(X.iloc[train_index, :], y[train_index])
    y_pred = model.predict(X.iloc[test_index, :])
    y_true = y[test_index]
    f1, pr, rc = f1_score(y_true, y_pred)
    f1s.append(f1)
    prs.append(pr)
    rcs.append(rc)

print("F1 is {} (+/-{})".format(np.mean(f1s), np.std(f1s)))
print("Precision is {} (+/-{})".format(np.mean(prs), np.std(prs)))
print("Recall is {} (+/-{})".format(np.mean(rcs), np.std(rcs)))

F1 is 0.15256170217061774 (+/-0.01623943392960128)
Precision is 0.1235013134597035 (+/-0.012661984288266091)
Recall is 0.19964089725109377 (+/-0.023017783159678427)


In [115]:
# Show upper accuracy

for train_index, test_index in cv.split(X):
    model.fit(X, y)
    y_pred = model.predict(X.iloc[test_index, :])
    y_true = y[test_index]
    f1, pr, rc = f1_score(y_true, y_pred)
    f1s.append(f1)
    prs.append(pr)
    rcs.append(rc)

print("F1 is {} (+/-{})".format(np.mean(f1s), np.std(f1s)))
print("Precision is {} (+/-{})".format(np.mean(prs), np.std(prs)))
print("Recall is {} (+/-{})".format(np.mean(rcs), np.std(rcs)))

F1 is 0.5477298340779891 (+/-0.2798488854225377)
Precision is 0.5380430378410177 (+/-0.29346963830197725)
Recall is 0.5634228991048145 (+/-0.2578640040410514)
