### Import libraries

In [1]:
import json
from pyvi import ViTokenizer
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

### Preprocess data for training

In [2]:
# Load the training intents. The context manager closes the file even when
# json.load raises (e.g. on malformed JSON), which the manual open/close
# sequence did not guarantee.
with open('Data.json', encoding="utf-8") as f:
    data = json.load(f)

In [3]:
def Get_Parterns_Tags(data):
    """Flatten intent records into two parallel lists.

    Args:
        data: iterable of mappings, each providing a 'parterns' sequence
            and a 'tag' value.

    Returns:
        A tuple ``(parterns, tags)`` where ``parterns`` holds every pattern
        of every record in order, and ``tags[i]`` is the tag of the record
        that ``parterns[i]`` came from.

    Raises:
        KeyError: if a record lacks the 'parterns' or 'tag' key.
    """
    parterns, tags = [], []
    for intent in data:
        samples = intent['parterns']
        parterns += samples
        # Repeat the tag once per pattern so both lists stay aligned.
        tags += [intent['tag']] * len(samples)
    return parterns, tags

In [4]:
# Flatten the loaded training records into aligned pattern / tag lists.
parterns, tags = Get_Parterns_Tags(data)

In [5]:
# Number of distinct tags, i.e. how many classes the models will be trained on.
print(len(set(tags)))

50


In [6]:
# Map each tag to its responses; records without a 'responses' key are skipped.
d = {
    intent['tag']: intent['responses']
    for intent in data
    if 'responses' in intent
}

In [7]:
def preprocess(sent):
    """Normalize one sentence for vectorization.

    The sentence is lower-cased, segmented with ``ViTokenizer.tokenize``,
    split on whitespace, and every token that is exactly one of
    ``, . ! ?`` is dropped.

    Args:
        sent: the raw sentence (str).

    Returns:
        The remaining tokens joined by single spaces.
    """
    punct = {',', '.', '!', '?'}
    tokens = ViTokenizer.tokenize(sent.lower()).split()
    # Build a filtered sequence instead of calling tokens.remove() while
    # iterating over tokens: removing during iteration skips the element that
    # follows each removal, so consecutive punctuation tokens were left in.
    return ' '.join(tok for tok in tokens if tok not in punct)

In [8]:
def preprocess_sents(sents):
    """Preprocess every sentence of ``sents`` in place.

    Args:
        sents: mutable sequence of raw sentences; each element is replaced
            by ``preprocess(element)``.

    Returns:
        The same ``sents`` object, now holding the preprocessed sentences.
    """
    for position, sentence in enumerate(sents):
        sents[position] = preprocess(sentence)
    return sents

In [9]:
# Normalize every training pattern. preprocess_sents rewrites the list in
# place and returns that same list, so `parterns` keeps its identity.
parterns = preprocess_sents(parterns)

### Train models

In [10]:
# Fit the vectorizer on the preprocessed training patterns and encode them
# as the training feature matrix in one step.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(parterns)
# Labels are the tags, aligned index-by-index with `parterns`.
y_train = tags

#### 1. Using a neural network

In [11]:
# Train the MLP classifier; random_state is fixed so repeated runs give the
# same model, and max_iter is set to 1000 iterations.
clf = MLPClassifier(random_state=1, max_iter=1000).fit(X_train, y_train)

#### 2. Using Naive Bayes

In [12]:
# Train a multinomial Naive Bayes classifier (default settings) on the same
# features and labels as the MLP, for comparison.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

MultinomialNB()

### Test model

In [13]:
# Load the test records. The context manager closes the file even when
# json.load raises, which the manual open/close sequence did not guarantee.
with open('test.json', encoding="utf-8") as ftest:
    test_data = json.load(ftest)

In [14]:
# Flatten the test records the same way as the training data.
test_parterns, test_tags = Get_Parterns_Tags(test_data)
# preprocess_sents rewrites its argument in place and returns it, so
# test_questions and test_parterns are the same (now preprocessed) list.
test_questions = preprocess_sents(test_parterns)

In [15]:
# Encode the test questions with the vectorizer fitted on the training data
# (transform only, no refit) so the features line up with X_train.
X_test = vectorizer.transform(test_questions)

#### 1. Neural network

In [16]:
# Predict a tag for every test question with the MLP.
y1_test_pred = clf.predict(X_test)
# Macro-averaged F1: every tag contributes equally regardless of its frequency.
f1 = f1_score(test_tags, y1_test_pred, average='macro')
print(f1)

0.9479999999999998


#### 2. Naive Bayes

In [17]:
# Predict all test questions in one batch from the already-vectorized X_test
# rather than re-vectorizing and predicting one question at a time. This
# mirrors the neural-network evaluation and avoids redundant transform calls.
# list(...) keeps y2_test_pred a plain list, as before.
y2_test_pred = list(mnb.predict(X_test))
# Macro-averaged F1: every tag contributes equally regardless of its frequency.
f1 = f1_score(test_tags, y2_test_pred, average='macro')
print(f1)

0.6549523809523808


### Conclusion

We decided to use the neural network model because it gives a higher macro F1-score on the test set (about 0.948, versus about 0.655 for Naive Bayes).

In [18]:
# Persist the fitted vectorizer together with the MLP classifier as a single
# (vectorizer, clf) tuple, so a loader gets both objects from one file.
# NOTE: unpickling can execute arbitrary code; only load model.pkl from a
# trusted source.
with open('model.pkl', 'wb') as fout:
    pickle.dump((vectorizer, clf), fout)