/
test_text_models.py
184 lines (163 loc) · 5.9 KB
/
test_text_models.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
test_text_models
----------------------------------
Tests for the `text_models` module.
"""
# pylint: disable=locally-disabled,redefined-outer-name
import os
import pytest
from mindmeld import markup
from mindmeld.models import CLASS_LABEL_TYPE, QUERY_EXAMPLE_TYPE, ModelConfig
from mindmeld.models.text_models import TextModel
from mindmeld.query_factory import QueryFactory
from mindmeld.resource_loader import ResourceLoader
from mindmeld.tokenizer import Tokenizer
# Name of the application directory the resource loader is pointed at.
APP_NAME = "kwik_e_mart"
# Directory one level above the directory that contains this file.
_PARENT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Absolute path of the application directory, a sibling of this file's directory.
APP_PATH = os.path.join(_PARENT_DIR, APP_NAME)
@pytest.fixture
def tokenizer():
    """Provide a ``Tokenizer`` built with its default arguments."""
    default_tokenizer = Tokenizer()
    return default_tokenizer
@pytest.fixture
def query_factory(tokenizer):
    """Provide a ``QueryFactory`` constructed from the ``tokenizer`` fixture."""
    factory = QueryFactory(tokenizer)
    return factory
@pytest.fixture
def resource_loader(query_factory):
    """Provide a ``ResourceLoader`` for ``APP_PATH`` using the ``query_factory`` fixture."""
    loader = ResourceLoader(APP_PATH, query_factory)
    return loader
class TestTextModel:
    """Tests for ``TextModel`` using a small two-intent (greet/exit) data set."""

    @classmethod
    def setup_class(cls):
        """Load the labeled queries shared by every test in this class."""
        # Every phrase must be its own list element: adjacent string literals
        # without a separating comma are silently concatenated by Python.
        data_dict = {
            "greet": [
                "Hello",
                "Hello!",
                "hey",
                "what's up",
                "greetings",
                "yo",
                "hi",
                "hey, how are you?",
                "hola",
                "start",
            ],
            "exit": [
                "bye",
                "goodbye",
                "until next time",
                "see ya later",
                "ttyl",
                "talk to you later",
                "later",
                "have a nice day",
                "finish",
                "gotta go",
                "I'm leaving",
                "I'm done",
                "that's all",
            ],
        }
        cls.labeled_data = [
            markup.load_query(text, intent=intent)
            for intent, texts in data_dict.items()
            for text in texts
        ]

    def _fit_model(self, resource_loader, config):
        """Build a ``TextModel`` from ``config`` and fit it on the shared data.

        Args:
            resource_loader: The resource loader passed to
                ``TextModel.initialize_resources``.
            config (dict): Keyword arguments used to build the ``ModelConfig``.

        Returns:
            TextModel: The model after ``fit`` has been called.
        """
        model = TextModel(ModelConfig(**config))
        examples = [q.query for q in self.labeled_data]
        labels = [q.intent for q in self.labeled_data]
        model.initialize_resources(resource_loader, examples, labels)
        model.fit(examples, labels)
        return model

    def test_fit(self, resource_loader):
        """Tests that a basic fit succeeds"""
        model = self._fit_model(
            resource_loader,
            {
                "model_type": "text",
                "example_type": QUERY_EXAMPLE_TYPE,
                "label_type": CLASS_LABEL_TYPE,
                "model_settings": {"classifier_type": "logreg"},
                "params": {"fit_intercept": True, "C": 100},
                "features": {
                    "bag-of-words": {"lengths": [1]},
                    "freq": {"bins": 5},
                    "length": {},
                },
            },
        )

        # With explicit params and no param selection, the configured params
        # are expected to be the ones the model ends up using.
        assert model._current_params == {"fit_intercept": True, "C": 100}

    def test_fit_cv(self, resource_loader):
        """Tests fitting with param selection"""
        model = self._fit_model(
            resource_loader,
            {
                "model_type": "text",
                "example_type": QUERY_EXAMPLE_TYPE,
                "label_type": CLASS_LABEL_TYPE,
                "model_settings": {"classifier_type": "logreg"},
                "param_selection": {
                    "type": "k-fold",
                    "k": 10,
                    "grid": {"C": [10, 100, 1000], "fit_intercept": [True, False]},
                },
                "features": {
                    "bag-of-words": {"lengths": [1]},
                    "freq": {"bins": 5},
                    "length": {},
                },
            },
        )

        # Only checks that some (non-empty) parameter set was selected.
        assert model._current_params

    def test_fit_predict(self, resource_loader):
        """Tests prediction after a fit"""
        model = self._fit_model(
            resource_loader,
            {
                "model_type": "text",
                "example_type": QUERY_EXAMPLE_TYPE,
                "label_type": CLASS_LABEL_TYPE,
                "model_settings": {"classifier_type": "logreg"},
                "params": {"fit_intercept": True, "C": 100},
                "features": {
                    "bag-of-words": {"lengths": [1]},
                    "freq": {"bins": 5},
                    "length": {},
                },
            },
        )

        assert model.predict([markup.load_query("hi").query]) == "greet"
        assert model.predict([markup.load_query("bye").query]) == "exit"

    def test_extract_features(self, resource_loader):
        """Tests extracted features after a fit"""
        model = self._fit_model(
            resource_loader,
            {
                "model_type": "text",
                "example_type": QUERY_EXAMPLE_TYPE,
                "label_type": CLASS_LABEL_TYPE,
                "model_settings": {"classifier_type": "logreg"},
                "params": {"fit_intercept": True, "C": 100},
                "features": {"bag-of-words": {"lengths": [1]}},
            },
        )

        expected_features = {
            "bag_of_words|length:1|ngram:hi": 1,
            "bag_of_words|length:1|ngram:there": 1,
        }
        extracted_features = model.view_extracted_features(
            markup.load_query("hi there").query
        )
        assert extracted_features == expected_features