# Baseline for Slot2

## Check dataset existence

In [1]:
# check data existence
from pathlib import Path


data_folder = Path.cwd().parent.joinpath("data/processed")

def check_data_existence(folder):
    file_count = len(list(folder.glob("e*_ann.json")))
    if  file_count == 0:
        raise Exception("Processed Data does not exist.")
    else:
        print("{} files exist.".format(file_count))


check_data_existence(data_folder)

230 files exist.


## Read Slot2 data to DataFrame

At the Slot2 task, sentence and categori is given, then predict each target(entity) positions.

* X: sentence, categories, target words (to create category: words dictionary)
* y: target positions

In [2]:
label_kinds = []

# make labels (exclude NULL and OOD)
for e in ["market", "company", "business", "product"]:
    for a in ["general", "sales", "profit", "amount", "price", "cost"]:
        label_kinds.append(e + "#" + a)
        if e in ["market"]:
            break;

print(label_kinds)

['market#general', 'company#general', 'company#sales', 'company#profit', 'company#amount', 'company#price', 'company#cost', 'business#general', 'business#sales', 'business#profit', 'business#amount', 'business#price', 'business#cost', 'product#general', 'product#sales', 'product#profit', 'product#amount', 'product#price', 'product#cost']


In [3]:
import json
import pandas as pd
from collections import Counter
from janome.tokenizer import Tokenizer


dataset = []
labels = []

for f in data_folder.glob("e*_ann.json"):
    with f.open(encoding="utf-8") as j:
        d = json.load(j)
        for s in d["sentences"]:
            os = []
            cs = []
            ws = []
            for o in s["opinions"]:
                category = o["category"]
                word = o["target"]
                if category in label_kinds and category not in cs:
                    cs.append(category)
                    ws.append(word)
                    os.append((o["from"], o["to"]))
            
            if len(cs) > 0:
                dataset.append(
                    {"sentence": s["sentence"], 
                     "categories": cs,
                     "words": ws}
                )
                labels.append(os)

dataset = pd.DataFrame(dataset)
print(dataset.head(5))
print(labels[:5])

                          categories  \
0                   [market#general]   
1                    [product#sales]   
2                   [business#sales]   
3  [business#sales, business#profit]   
4                   [market#general]   

                                            sentence                 words  
0  当歯科業界におきましては、デジタル化の進展により市場環境は大きく変化しており、世界規模で企業...                [歯科業界]  
1  これらの新製品に加え、当社の注力分野である化工品やＣＡＤ/ＣＡＭ関連製品が売上に寄与しました...                 [化工品]  
2  海外では、積極的な拡販戦略が功を奏し、北米や中国で売上が堅調に推移しましたが、為替の円高の影...                  [海外]  
3  これらの結果、デンタル関連事業の売上高は、20,267百万円と前年同期比732百万円(3.5...  [デンタル関連事業, デンタル関連事業]  
4  ネイル業界におきましては、市場は緩やかな拡大傾向を維持しているものの、ユーザーの低価格志向の...               [ネイル業界]  
[[(1, 5)], [(21, 24)], [(0, 2)], [(7, 15), (7, 15)], [(0, 5)]]


## Make Baseline Model

In [4]:
import numpy as np


class DetectEntities():
    
    def __init__(self):
        self.category_dict = {}
    
    def fit(self, X, y=None):
        self.category_dict = {}
        for index, row in X.iterrows():
            for c, w in zip(row["categories"], row["words"]):
                if c not in self.category_dict:
                    self.category_dict[c] = []
                self.category_dict[c].append(w)

        for c in self.category_dict:
            cnt = Counter(self.category_dict[c])
            freq = sorted(cnt.most_common(), key=lambda x: x[1])
            self.category_dict[c] = [f[0] for f in freq]

        return self
    
    def search(self, sentence, word):
        from_index = sentence.find(word)
        if from_index > -1:
            to_index = from_index + len(word)
            return (from_index, to_index)
        else:
            return None
    
    def predict(self, X, copy=True):
        predictions = []
        for index, row in X.iterrows():
            s = row["sentence"]
            preds = []
            for c in row["categories"]:
                if c not in self.category_dict:
                    continue
                for w in self.category_dict[c]:
                    p = self.search(s, w)
                    if p is not None:
                        preds.append(p)
                        break
            predictions.append(preds)

        return np.array(predictions)


model = DetectEntities()

In [5]:
model.fit(dataset)
sample_indices = np.random.randint(0, len(dataset), 3)
samples = dataset.iloc[sample_indices, :]
pred_samples = model.predict(samples)
true_samples = np.array(labels)[sample_indices]

cnt = 0
for i, row in samples.iterrows():
    print("sentence: {}".format(row["sentence"]))
    print("predicted: {}".format(pred_samples[cnt]))
    print("true: {}".format(true_samples[cnt]))
    cnt += 1
    print("---------------------------------------")

sentence: 医薬品事業の売上高は、614億５千４百万円（前連結会計年度比0.6％減）となりました
predicted: [(0, 5)]
true: [(0, 5)]
---------------------------------------
sentence: 流通業におきましては、マダムジョイ店舗では直営部門は利用客、売上ともに増加したものの、軽油単価の下落による商事部門の売上高減少やテナント売上高減少の影響により、減収となりました
predicted: [(0, 3), (11, 19)]
true: [(0, 3), (11, 19)]
---------------------------------------
sentence: 報告セグメントに含まれない不動産賃貸料の収入など、その他事業の当連結会計年度の売上高は 200百万円 (前年同期比 1.6％増)、営業利益は 51百万円 (前年同期比 26.1％増) となりました
predicted: [(13, 18), (28, 30)]
true: [(25, 30), (25, 30)]
---------------------------------------


In [12]:
from sklearn.model_selection import ShuffleSplit


def f1_score(preds, trues, digit=-1):
    # calculate by micro
    matches = 0
    for_recall = 0
    for_precision = 0

    def safe_div(x1, x2):
        return 0 if x2 == 0 else x1 / x2

    for p, t in zip(preds, trues):
        for_precision += len(p)
        for_recall += len(t)
        for _p in p:
            if _p in t:
                matches += 1

    recall = safe_div(matches, for_recall)
    precision = safe_div(matches, for_precision)
    if digit >= 0:
        precision, recall = np.round(precision, digit + 1), np.round(recall, digit + 1)
    f1 =  safe_div(2 * (precision * recall), (precision + recall))
    if digit >= 0:
        f1 = np.round(f1, digit)
    return f1, precision, recall


X = dataset
y = np.array(labels)
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)


f1s = []
prs = []
rcs = []
for train_index, test_index in cv.split(X):
    model.fit(X.iloc[train_index, :], y[train_index])
    y_pred = model.predict(X.iloc[test_index, :])
    y_true = y[test_index]
    f1, pr, rc = f1_score(y_true, y_pred, digit=5)
    f1s.append(f1)
    prs.append(pr)
    rcs.append(rc)

print("F1 is {} (+/-{})".format(np.mean(f1s), np.std(f1s)))
print("Precision is {} (+/-{})".format(np.mean(prs), np.std(prs)))
print("Recall is {} (+/-{})".format(np.mean(rcs), np.std(rcs)))
print("F1: {}".format(np.median(f1s)))

F1 is 0.17668666666666666 (+/-0.03402188543994717)
Precision is 0.14409166666666665 (+/-0.029496274593393802)
Recall is 0.2285366666666667 (+/-0.039427497057538695)
F1: 0.19414


In [13]:
# Show upper accuracy

for train_index, test_index in cv.split(X):
    model.fit(X, y)
    y_pred = model.predict(X.iloc[test_index, :])
    y_true = y[test_index]
    f1, pr, rc = f1_score(y_true, y_pred)
    f1s.append(f1)
    prs.append(pr)
    rcs.append(rc)

print("F1 is {} (+/-{})".format(np.mean(f1s), np.std(f1s)))
print("Precision is {} (+/-{})".format(np.mean(prs), np.std(prs)))
print("Recall is {} (+/-{})".format(np.mean(rcs), np.std(rcs)))

F1 is 0.4603425607707288 (+/-0.2850266240770949)
Precision is 0.4440450607707289 (+/-0.3010113279768014)
Recall is 0.4862675607707288 (+/-0.2596213461296213)
