# CRFを用いたスロットフィリング(スロット識別)

CRFアルゴリズムを用いてスロットフィリングを行う. データセットはSNIPSというデータを用いる.

In [1]:
import json
import os

import nltk
import numpy as np
import scipy
from nltk.tag import pos_tag
from seqeval.metrics import classification_report, f1_score
from sklearn_crfsuite import CRF
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## データセット読み込み

In [2]:
!head -34 ./data/train_PlayMusic_full.json

{
  "PlayMusic": [
    {
      "data": [
        {
          "text": "I need to hear the "
        },
        {
          "text": "song",
          "entity": "music_item"
        },
        {
          "text": " "
        },
        {
          "text": "Aspro Mavro",
          "entity": "track"
        },
        {
          "text": " from "
        },
        {
          "text": "Bill Szymczyk",
          "entity": "artist"
        },
        {
          "text": " on "
        },
        {
          "text": "Youtube",
          "entity": "service"
        }
      ]
    },


In [3]:
def load_data(filename):
    with open(filename, encoding="iso-8859-2") as f:
        datalist = json.load(f)
    output = []
    for data in datalist["PlayMusic"]:
        sent = []
        tags = []
        for phrase in data["data"]:
            words = phrase["text"].strip().split()
            if "entity" in phrase:
                label = phrase["entity"]
                labels = [f"B-{label}"] + [f"I-{label}"] * (len(words) - 1)
            else:
                labels = ["O"] * len(words)
            sent.extend(words)
            tags.extend(labels)
        output.append([sent, tags])
    return output

In [4]:
train_file = "./data/train_PlayMusic_full.json"
test_file = "./data/validate_PlayMusic.json"

train_data = load_data(train_file)
test_data = load_data(test_file)
train_data[0]

[['I',
  'need',
  'to',
  'hear',
  'the',
  'song',
  'Aspro',
  'Mavro',
  'from',
  'Bill',
  'Szymczyk',
  'on',
  'Youtube'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'B-music_item',
  'B-track',
  'I-track',
  'O',
  'B-artist',
  'I-artist',
  'O',
  'B-service']]

単語埋め込みとしてGloVeを用いる.

In [5]:
BASE_DIR = "data"
GLOVE_FILE = os.path.join(BASE_DIR, "glove.6B.100d.txt")
EMBEDDING_DIM = 100

In [6]:
print("Preparing embedding matrix.")
embeddings_index = {}
with open(GLOVE_FILE, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
print("Found %s word vectors in Glove embeddings." % len(embeddings_index))

def get_embeddings(word):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in embedding index will be all-zeros.
        embedding_vector = np.zeros(shape=(EMBEDDING_DIM,))
    return embedding_vector

Preparing embedding matrix.
Found 400000 word vectors in Glove embeddings.


## 前処理

In [7]:
# CRF特徴量を作成する関数
def sent2feats(sentence):
    feats = []
    sen_tags = pos_tag(sentence)  # This format is specific to this POS tagger!
    for i in range(0, len(sentence)):
        word = sentence[i]
        wordfeats = {}
        # word features: word, prev 2 words, next 2 words in the sentence.
        wordfeats["word"] = word
        if i == 0:
            wordfeats["prevWord"] = wordfeats["prevSecondWord"] = "<S>"
        elif i == 1:
            wordfeats["prevWord"] = sentence[0]
            wordfeats["prevSecondWord"] = "</S>"
        else:
            wordfeats["prevWord"] = sentence[i - 1]
            wordfeats["prevSecondWord"] = sentence[i - 2]
        # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextWord"] = sentence[i + 1]
            wordfeats["nextNextWord"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextWord"] = "</S>"
            wordfeats["nextNextWord"] = "</S>"
        else:
            wordfeats["nextWord"] = sentence[i + 1]
            wordfeats["nextNextWord"] = sentence[i + 2]

        # POS tag features: current tag, previous and next 2 tags.
        wordfeats["tag"] = sen_tags[i][1]
        if i == 0:
            wordfeats["prevTag"] = wordfeats["prevSecondTag"] = "<S>"
        elif i == 1:
            wordfeats["prevTag"] = sen_tags[0][1]
            wordfeats["prevSecondTag"] = "</S>"
        else:
            wordfeats["prevTag"] = sen_tags[i - 1][1]

            wordfeats["prevSecondTag"] = sen_tags[i - 2][1]
            # next two words as features
        if i == len(sentence) - 2:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = "</S>"
        elif i == len(sentence) - 1:
            wordfeats["nextTag"] = "</S>"
            wordfeats["nextNextTag"] = "</S>"
        else:
            wordfeats["nextTag"] = sen_tags[i + 1][1]
            wordfeats["nextNextTag"] = sen_tags[i + 2][1]

        # Adding word vectors
        vector = get_embeddings(word.lower())
        for iv, value in enumerate(vector):
            wordfeats["v{}".format(iv)] = value
        feats.append(wordfeats)
    return feats

In [8]:
# Extract features from the conll data, after loading it.
def get_feats_conll(conll_data):
    feats = []
    labels = []
    for sentence in conll_data:
        feats.append(sent2feats(sentence[0]))
        labels.append(sentence[1])
    return feats, labels

In [9]:
x_train, y_train = get_feats_conll(train_data)
x_valid, y_valid = get_feats_conll(test_data)
print(x_train[0])

[{'word': 'I', 'prevWord': '<S>', 'prevSecondWord': '<S>', 'nextWord': 'need', 'nextNextWord': 'to', 'tag': 'PRP', 'prevTag': '<S>', 'prevSecondTag': '<S>', 'nextTag': 'VBP', 'nextNextTag': 'TO', 'v0': -0.046539, 'v1': 0.61966, 'v2': 0.56647, 'v3': -0.46584, 'v4': -1.189, 'v5': 0.44599, 'v6': 0.066035, 'v7': 0.3191, 'v8': 0.14679, 'v9': -0.22119, 'v10': 0.79239, 'v11': 0.29905, 'v12': 0.16073, 'v13': 0.025324, 'v14': 0.18678, 'v15': -0.31001, 'v16': -0.28108, 'v17': 0.60515, 'v18': -1.0654, 'v19': 0.52476, 'v20': 0.064152, 'v21': 1.0358, 'v22': -0.40779, 'v23': -0.38011, 'v24': 0.30801, 'v25': 0.59964, 'v26': -0.26991, 'v27': -0.76035, 'v28': 0.94222, 'v29': -0.46919, 'v30': -0.18278, 'v31': 0.90652, 'v32': 0.79671, 'v33': 0.24825, 'v34': 0.25713, 'v35': 0.6232, 'v36': -0.44768, 'v37': 0.65357, 'v38': 0.76902, 'v39': -0.51229, 'v40': -0.44333, 'v41': -0.21867, 'v42': 0.3837, 'v43': -1.1483, 'v44': -0.94398, 'v45': -0.15062, 'v46': 0.30012, 'v47': -0.57806, 'v48': 0.20175, 'v49': -1.659

## Modeling

In [11]:
model = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=False
)
try:
    model.fit(x_train, y_train)
except AttributeError:
    pass

In [12]:
y_pred = model.predict(x_valid)
print(classification_report(y_valid, y_pred, digits=4))

              precision    recall  f1-score   support

       album     0.3333    0.0769    0.1250        13
      artist     0.8939    0.9365    0.9147        63
       genre     0.6667    0.6667    0.6667         3
  music_item     0.9375    0.9677    0.9524        31
    playlist     0.7143    0.5556    0.6250         9
     service     0.9487    0.9487    0.9487        39
        sort     0.9375    0.8824    0.9091        17
       track     0.4000    0.6667    0.5000         6
        year     0.9615    1.0000    0.9804        25

   micro avg     0.8812    0.8641    0.8725       206
   macro avg     0.7548    0.7446    0.7358       206
weighted avg     0.8617    0.8641    0.8561       206

