-
Notifications
You must be signed in to change notification settings - Fork 3
/
test_model.py
286 lines (216 loc) · 9.7 KB
/
test_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
import pytest
import lxml
import numpy as np
import pandas as pd
import cophi
DOCUMENT = "AAABBCCCDEF"
TOKENS = list(DOCUMENT)
LOWERCASE_TOKENS = [token.lower() for token in TOKENS]
def make_file(tmpdir, fname, content):
p = tmpdir.mkdir("sub").join(fname)
p.write(content)
return p
@pytest.fixture
def textfile_suffix(tmpdir):
p = make_file(tmpdir, "document.txt", DOCUMENT)
return cophi.model.Textfile(str(p), treat_as=None)
@pytest.fixture
def textfile_txt(tmpdir):
p = make_file(tmpdir, "document.txt", DOCUMENT)
return cophi.model.Textfile(str(p), treat_as=".txt")
@pytest.fixture
def textfile_xml(tmpdir):
p = make_file(tmpdir, "document.xml", "<xml>{}</xml>".format(DOCUMENT))
return cophi.model.Textfile(str(p), treat_as=".xml")
@pytest.fixture
def document():
return cophi.model.Document(DOCUMENT, "document", r"\w")
@pytest.fixture
def corpus(document):
return cophi.model.Corpus([document])
class TestTextfile:
def test_suffix(self, textfile_suffix, tmpdir):
assert str(textfile_suffix.filepath) == str(tmpdir.join("sub", "document.txt"))
assert textfile_suffix.title == "document"
assert textfile_suffix.suffix == ".txt"
assert textfile_suffix.parent == str(tmpdir.join("sub"))
assert textfile_suffix.encoding == "utf-8"
assert textfile_suffix.treat_as == None
assert textfile_suffix.content == DOCUMENT
assert textfile_suffix.size == len(DOCUMENT)
def test_txt(self, textfile_txt, tmpdir):
assert str(textfile_txt.filepath) == str(tmpdir.join("sub", "document.txt"))
assert textfile_txt.title == "document"
assert textfile_txt.suffix == ".txt"
assert textfile_txt.parent == str(tmpdir.join("sub"))
assert textfile_txt.encoding == "utf-8"
assert textfile_txt.treat_as == ".txt"
assert textfile_txt.content == DOCUMENT
assert textfile_txt.size == len(DOCUMENT)
def test_xml(self, textfile_xml, tmpdir):
assert str(textfile_xml.filepath) == str(tmpdir.join("sub", "document.xml"))
assert textfile_xml.title == "document"
assert textfile_xml.suffix == ".xml"
assert textfile_xml.parent == str(tmpdir.join("sub"))
assert textfile_xml.encoding == "utf-8"
assert textfile_xml.treat_as == ".xml"
assert textfile_xml.content == DOCUMENT
assert textfile_xml.size == len(DOCUMENT)
def test_parse_xml(self, textfile_xml):
assert isinstance(textfile_xml.parse_xml(), lxml.etree._ElementTree)
def test_stringify(self, textfile_xml):
tree = textfile_xml.parse_xml()
assert textfile_xml.stringify(tree) == "AAABBCCCDEF"
def test_value_error(self, tmpdir):
with pytest.raises(ValueError):
cophi.model.Textfile("raises", treat_as="error")
class TestDocument:
def test_attributes(self, document):
assert document.text == DOCUMENT
assert document.title == "document"
assert document.lowercase == True
assert document.n == None
assert document.token_pattern == r"\w"
assert document.maximum == None
assert document.tokens == LOWERCASE_TOKENS
def test_ngram_value_error(self):
with pytest.raises(ValueError):
cophi.model.Document(DOCUMENT, n=0)
def test_ngrams(self):
document = cophi.model.Document(DOCUMENT, token_pattern=r"\w", n=2)
assert list(document.ngrams)[0] == "a a"
document = cophi.model.Document(DOCUMENT, token_pattern=r"\w", n=1)
assert document.ngrams == LOWERCASE_TOKENS
with pytest.raises(ValueError):
document = cophi.model.Document(DOCUMENT, token_pattern=r"\w", n=None)
document.ngrams == LOWERCASE_TOKENS
def test_types(self, document):
assert len(document.types) == len(set(TOKENS))
def test_lengths(self, document):
assert len(document.lengths) == len(TOKENS)
def test_mean_length(self, document):
assert document.mean_length == 1
def test_num_tokens(self, document):
assert document.num_tokens == len(TOKENS)
def test_num_types(self, document):
assert document.num_types == len(set(TOKENS))
def test_bow(self, document):
assert document.bow.sum() == 11
def test_rel(self, document):
assert round(document.rel.sum()) == 1
def test_mfw(self, document):
assert document.mfw(1, as_list=False).sum() == 3
assert round(document.mfw(1, rel=True, as_list=False).sum()) == 0
assert len(document.mfw(1, as_list=True)) == 1
def test_hapax(self, document):
assert document.hapax == ["d", "e", "f"]
def test_window(self, document):
for expected, chunk in zip(LOWERCASE_TOKENS, document.window(1)):
assert expected == chunk[0]
def test_freq_spectrum(self, document):
assert document.freq_spectrum.sum() == 6
def test_drop(self, document):
features = ["a", "b", "c"]
tokens = document.drop(LOWERCASE_TOKENS, features)
assert list(tokens) == ["d", "e", "f"]
def test_paragraphs(self, document):
assert list(document.paragraphs()) == [DOCUMENT]
def test_segments(self, document):
assert list(document.segments(1))[0] == ["a"]
assert list(document.segments(1, flatten=False))[0] == [["a"]]
def test_bootstrap(self, document):
assert list(document.bootstrap(window=5)) == [0.4, 0.6]
def test_complexity(self, document):
ttr = document.complexity("ttr")
assert ttr == 0.5454545454545454
ttr, ci = document.complexity("ttr", window=3)
assert ttr == 0.5555555555555555
assert ci == 0.17781481095759366
orlov_z = document.complexity("orlov_z", max_iterations=1)
assert orlov_z == 7.461820552205992
orlov_z = document.complexity("orlov_z", window=3, max_iterations=1)
assert orlov_z == 0.3333333333333333
cttr = document.complexity("cttr")
assert cttr == 1.2792042981336627
cttr = document.complexity("cttr", window=3)
assert cttr == 0.6804138174397717
class TestCorpus:
def test_sparse_error(self, document):
with pytest.raises(NotImplementedError):
cophi.model.Corpus([document], sparse=True)
def test_dtm(self, corpus):
assert corpus.dtm.sum().sum() == len(TOKENS)
def test_map_metadata(self, corpus):
metadata = pd.DataFrame({"uuid": "document", "A": "metadata"}, index=[1])
matrix = corpus.map_metadata(corpus.dtm, metadata, fields=["A"])
assert matrix.sum().sum() == len(TOKENS)
assert "metadata" in matrix.index
assert "document" not in matrix.index
def test_stats(self, corpus):
assert corpus.stats.sum() == 21
def test_freq_spectrum(self, corpus):
assert corpus.freq_spectrum.sum() == len(set(TOKENS))
def test_types(self, corpus):
assert len(corpus.types) == len(set(TOKENS))
def test_sort(self, corpus):
assert corpus.sort(corpus.dtm).sum().sum() == len(TOKENS)
def test_mfw(self, corpus):
assert corpus.mfw(1, as_list=False, rel=False).sum() == 3
assert round(corpus.mfw(1, rel=True, as_list=False).sum()) == 0
assert len(corpus.mfw(1, as_list=True, rel=False)) == 1
def test_drop(self, corpus):
matrix = corpus.drop(corpus.dtm, ["a"])
assert "a" not in matrix.columns
def test_cull(self, corpus):
assert corpus.cull(corpus.dtm, 1).sum().sum() == len(TOKENS)
def test_zscores(self, corpus):
assert corpus.zscores.sum().sum() == 0
def test_rel(self, corpus):
assert round(corpus.rel.sum().sum()) == 1
def test_tfidf(self, corpus):
assert corpus.tfidf.sum().sum() == 0
def test_num_types(self, corpus):
assert corpus.num_types.sum().sum() == len(set(TOKENS))
def test_num_tokens(self, corpus):
assert corpus.num_tokens.sum().sum() == len(TOKENS)
def test_complexity(self, corpus):
ttr = corpus.complexity(window=3, measure="ttr")
assert ttr.sum().sum() == 0.7333703665131491
cttr = corpus.complexity(window=3, measure="cttr")
assert cttr.sum().sum() == 0.6804138174397717
def test_ttr(self, corpus):
assert corpus.ttr == 0.5454545454545454
def test_guiraud_r(self, corpus):
assert corpus.guiraud_r == 1.8090680674665818
def test_herdan_c(self, corpus):
assert corpus.herdan_c == 0.7472217363092141
def test_dugast_k(self, corpus):
assert corpus.dugast_k == 2.0486818235486686
def test_dugast_u(self, corpus):
assert corpus.dugast_u == 2.0486818235486686
def test_maas_a2(self, corpus):
assert corpus.maas_a2 == 0.1054167238070372
def test_tuldava_ln(self, corpus):
assert corpus.tuldava_ln == -0.40544815832912834
def test_brunet_w(self, corpus):
assert corpus.brunet_w == 26.138632903383154
def test_cttr(self, corpus):
assert corpus.cttr == 1.2792042981336627
def test_summer_s(self, corpus):
assert corpus.summer_s == 0.6668234928556862
def test_sichel_s(self, corpus):
assert corpus.sichel_s == 0.16666666666666666
def test_michea_m(self, corpus):
assert corpus.michea_m == 6
def test_honore_h(self, corpus):
assert corpus.honore_h == 479.57905455967415
def test_entropy(self, corpus):
assert corpus.entropy == 1.6726254461503205
def test_yule_k(self, corpus):
assert corpus.yule_k == -661.1570247933887
def test_simpson_d(self, corpus):
assert corpus.simpson_d == 0.08181818181818183
def test_herdan_vm(self, corpus):
assert corpus.herdan_vm == 0.1998622114889836
def test_orlov_z(self, corpus):
assert corpus.orlov_z(max_iterations=1) == 7.461820552205992