In [1]:
import MeCab

# 分かち書き


In [2]:
tagger = MeCab.Tagger()
mytext = 'こんにちは元木です元気ですか'

In [3]:
print(tagger.parse(mytext))

こんにちは	感動詞,*,*,*,*,*,こんにちは,コンニチハ,コンニチワ
元木	名詞,固有名詞,人名,姓,*,*,元木,モトキ,モトキ
です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
元気	名詞,形容動詞語幹,*,*,*,*,元気,ゲンキ,ゲンキ
です	助動詞,*,*,*,特殊・デス,基本形,です,デス,デス
か	助詞,副助詞／並立助詞／終助詞,*,*,*,*,か,カ,カ
EOS



In [4]:
def tokenize(text):
    node = tagger.parseToNode(text)
    tokens = []
    while node:
        if node.surface != '':
            tokens.append(node.surface)
        node = node.next
    return tokens

In [5]:
tokenize(mytext)

['こんにちは', '元木', 'です', '元気', 'です', 'か']

In [6]:
tagger = MeCab.Tagger('-Owakati')

In [7]:
print(tagger.parse(mytext
                  ))

こんにちは 元木 です 元気 です か 



# Bag of words（Bow）

In [38]:
texts = [
    mytext
    
]

In [41]:
texts = [
    '私は私のことが好きなあなたが好きです',
    '私はラーメンが好きです',
    '富士山は日本一高い山です',
]

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
from tokenizer import tokenize

vectorizer = CountVectorizer(tokenizer = tokenize)
vectorizer.fit(texts)
bow = vectorizer.transform(texts)



In [44]:
import pandas as pd
bow_table = pd.DataFrame(bow.toarray(), columns=vectorizer.get_feature_names())
print('Shape: {}'.format(bow.shape))
bow_table

Shape: (3, 14)


Unnamed: 0,あなた,が,こと,です,な,の,は,ラーメン,好き,富士山,山,日本一,私,高い
0,1,2,1,1,1,1,1,0,2,0,0,0,2,0
1,0,1,0,1,0,0,1,1,1,0,0,0,1,0
2,0,0,0,1,0,0,1,0,0,1,1,1,0,1


In [16]:
!pip install tokenizer

Collecting tokenizer
  Downloading tokenizer-2.0.4-py2.py3-none-any.whl (104 kB)
[K     |████████████████████████████████| 104 kB 5.6 MB/s eta 0:00:01
[?25hInstalling collected packages: tokenizer
Successfully installed tokenizer-2.0.4


In [11]:
from collections import Counter

from tokenizer import tokenize


def calc_bow(tokenized_texts):
    counts = [Counter(tokenized_text)
              for tokenized_text in tokenized_texts]  # <1>
    sum_counts = sum(counts, Counter())  # <2>
    vocabulary = sum_counts.keys()

    bow = [[count[word] for word in vocabulary]
           for count in counts]  # <3>

    return vocabulary, bow


# 入力文のlist
texts = [
    '私は私のことが好きなあなたが好きです',
    '私はラーメンが好きです',
    '富士山は日本一高い山です',
]

tokenized_texts = [tokenize(text) for text in texts]
bow = calc_bow(tokenized_texts)
vocabulary, bow

NameError: name 'vocabulary' is not defined

In [12]:
# 入力文のlist
texts = [
    mytext
]

tokenized_texts = [tokenize(text) for text in texts]
bow = calc_bow(tokenized_texts)
bow

(dict_keys([Tok(kind=11001, txt=None, val=(0, None)), Tok(kind=6, txt='こんにちは元木です元気ですか', val=None), Tok(kind=11002, txt=None, val=None)]),
 [[1, 1, 1]])

In [20]:
from tokenizer import tokenize  # <1>


def calc_bow(tokenized_texts):  # <2>
    # Build vocabulary <3>
    vocabulary = {}
    for tokenized_text in tokenized_texts:
        for token in tokenized_text:
            if token not in vocabulary:
                vocabulary[token] = len(vocabulary)

    n_vocab = len(vocabulary)

    # Build BoW Feature Vector <4>
    bow = [[0] * n_vocab for i in range(len(tokenized_texts))]
    for i, tokenized_text in enumerate(tokenized_texts):
        for token in tokenized_text:
            index = vocabulary[token]
            bow[i][index] += 1

    return vocabulary, bow




# 入力文のlist
texts = [
    '私は私のことが好きなあなたが好きです',
    '私はラーメンが好きです',
    '富士山は日本一高い山です',
]

tokenized_texts = [tokenize(text) for text in texts]
vocabulary, bow = calc_bow(tokenized_texts)

In [21]:
print(vocabulary, bow)

{'私': 0, 'は': 1, 'の': 2, 'こと': 3, 'が': 4, '好き': 5, 'な': 6, 'あなた': 7, 'です': 8, 'ラーメン': 9, '富士山': 10, '日本一': 11, '高い': 12, '山': 13} [[2, 1, 1, 1, 2, 2, 1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1]]


In [24]:
# 入力文のlist
texts = [
    mytext
]

tokenized_texts = [tokenize(text) for text in texts]
vocabulary, bow = calc_bow(tokenized_texts)
vocabulary, bow

({'こんにちは': 0, '元木': 1, 'です': 2, '元気': 3, 'か': 4}, [[1, 1, 2, 1, 1]])

In [26]:
print(vocabulary)

{'こんにちは': 0, '元木': 1, 'です': 2, '元気': 3, 'か': 4}


In [28]:
aaa = [1, 2, 3]
df = pd.DataFrame({'word':aaa})
df

Unnamed: 0,word
0,1
1,2
2,3


In [30]:
all_words_df = pd.DataFrame({'word':vocabulary})
all_words_df

Unnamed: 0,word
か,4
こんにちは,0
です,2
元木,1
元気,3


In [22]:
from sklearn.feature_extraction.text import CountVectorizer

from tokenizer import tokenize  # <1>

texts = [
    '私は私のことが好きなあなたが好きです',
    '私はラーメンが好きです。',
    '富士山は日本一高い山です',
]

# Bag of Words計算
vectorizer = CountVectorizer(tokenizer=tokenize)  # <2>
vectorizer.fit(texts)  # <3>
bow = vectorizer.transform(texts)  # <4>
print(bow)

  (0, 1)	1
  (0, 2)	2
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 9)	2
  (0, 13)	2
  (1, 0)	1
  (1, 2)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 9)	1
  (1, 13)	1
  (2, 4)	1
  (2, 7)	1
  (2, 10)	1
  (2, 11)	1
  (2, 12)	1
  (2, 14)	1




# 識別器

In [17]:
from os.path import dirname, join, normpath

import MeCab
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC


class DialogueAgent:
    def __init__(self):
        self.tagger = MeCab.Tagger()

    def _tokenize(self, text):
        node = self.tagger.parseToNode(text)

        tokens = []
        while node:
            if node.surface != '':
                tokens.append(node.surface)

            node = node.next

        return tokens

    def train(self, texts, labels):
        vectorizer = CountVectorizer(tokenizer=self._tokenize)
        bow = vectorizer.fit_transform(texts)  # <1>

        classifier = SVC()
        classifier.fit(bow, labels)

        # <2>
        self.vectorizer = vectorizer
        self.classifier = classifier

    def predict(self, texts):
        bow = self.vectorizer.transform(texts)
        return self.classifier.predict(bow)


if __name__ == '__main__':
    BASE_DIR = normpath(dirname(__file__))

    training_data = pd.read_csv(join(BASE_DIR, './training_data.csv'))  # <3>

    dialogue_agent = DialogueAgent()
    dialogue_agent.train(training_data['text'], training_data['label'])

    with open(join(BASE_DIR, './replies.csv')) as f:  # <4>
        replies = f.read().split('\n')

    input_text = '名前を教えてよ'
    predictions = dialogue_agent.predict([input_text])  # <5>
    predicted_class_id = predictions[0]  # <6>

    print(replies[predicted_class_id])

    while True:
        input_text = input()
        predictions = dialogue_agent.predict([input_text])
        predicted_class_id = predictions[0]

        print(replies[predicted_class_id])

NameError: name '__file__' is not defined

In [18]:
!python src/sec30_lets_create_a_dialogue_agent/sampleapp/dialogue_agent.py

python: can't open file 'src/sec30_lets_create_a_dialogue_agent/sampleapp/dialogue_agent.py': [Errno 2] No such file or directory
