#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Contains code for parsing and building a dictionary from text.
"""
from parlai.core.opt import Opt
from parlai.core.build_data import modelzoo_path
from parlai.utils.bpe import bpe_factory, BPEHelper
from .agents import Agent
from .build_data import make_dir
from collections import defaultdict
import codecs
import copy
import numpy as np
import os
import json
import re


RETOK = re.compile(r'\w+|[^\w\s]|\n', re.UNICODE)

def escape(s):
r"""
Replace potential special characters with escaped version.
For example, \n => \\n and \t => \\t
:param s:
string to escape
"""
return s.replace('\n', '\\n').replace('\t', '\\t').replace('\r', '\\r')


def unescape(s):
r"""
Revert escaped characters back to their special version.
For example, \\n => \n and \\t => \t
:param s:
string to unescape
"""
return s.replace('\\n', '\n').replace('\\t', '\t').replace('\\r', '\r')
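
# Example (for illustration): escape and unescape are inverses, so a token
# containing a newline or tab survives a round trip through the
# tab-separated dictionary file format:
#   >>> escape('new\nline')
#   'new\\nline'
#   >>> unescape(escape('new\nline'))
#   'new\nline'
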
def find_ngrams(token_dict, text, n):
"""
Break text into ngrams that appear in ``token_dict``.
:param token_dict:
``dict`` to check for ngrams
:param text:
``str`` to look for ngrams in
:param n:
``int`` max size of ngrams
"""
# base case
if n <= 1:
return text
# tokens committed to output
saved_tokens = []
# tokens remaining to be searched in sentence
search_tokens = text[:]
# tokens stored until next ngram found
next_search = []
while len(search_tokens) >= n:
ngram = ' '.join(search_tokens[:n])
if ngram in token_dict:
# first, search previous unmatched words for smaller ngrams
sub_n = min(len(next_search), n - 1)
saved_tokens.extend(find_ngrams(token_dict, next_search, sub_n))
next_search.clear()
# then add this ngram
saved_tokens.append(ngram)
# then pop this ngram from the remaining words to search
search_tokens = search_tokens[n:]
else:
next_search.append(search_tokens.pop(0))
remainder = next_search + search_tokens
sub_n = min(len(remainder), n - 1)
saved_tokens.extend(find_ngrams(token_dict, remainder, sub_n))
return saved_tokens
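
# Example (for illustration): with a dictionary that contains the bigram
# 'new york', find_ngrams merges matching spans left-to-right and leaves
# the other tokens untouched:
#   >>> find_ngrams({'new york': 1}, ['i', 'love', 'new', 'york'], 2)
#   ['i', 'love', 'new york']
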
class DictionaryAgent(Agent):
"""
Builds and/or loads a dictionary.
The dictionary provides access to the frequency of each token, functions to
translate sentences from tokens to their vectors (list of ints, each int is the
index of a token in the dictionary) and back from vectors to tokenized text.
"""
default_lang = 'english'
default_maxngram = -1
default_minfreq = 0
default_maxtokens = -1
default_null = '__null__'
default_start = '__start__'
default_end = '__end__'
default_unk = '__unk__'
default_tok = 're'
default_lower = False
default_textfields = 'text,labels'
@staticmethod
def add_cmdline_args(argparser):
"""
Add commandline arguments related to the dictionary.
"""
dictionary = argparser.add_argument_group('Dictionary Arguments')
dictionary.add_argument(
'-df',
'--dict-file',
help='path to dictionary file. defaults to [model_file].dict if '
'not set and model_file is set.',
hidden=True,
)
dictionary.add_argument(
'--dict-initpath',
hidden=True,
help='path to a saved dictionary to load tokens / counts from to '
'seed the dictionary with initial tokens and/or frequencies',
)
dictionary.add_argument(
'--dict-language',
default=DictionaryAgent.default_lang,
hidden=True,
help='sets language for the punkt sentence tokenizer',
)
dictionary.add_argument(
'--dict-max-ngram-size',
type=int,
hidden=True,
default=DictionaryAgent.default_maxngram,
            help='looks for ngrams of up to this size. this is ignored when '
            'building the dictionary. note: the runtime is approximately '
            'len(sentence)^max_ngram_size',
)
dictionary.add_argument(
'--dict-minfreq',
default=DictionaryAgent.default_minfreq,
type=int,
help='minimum frequency of words to include them in sorted '
'dict or minimum frequency of bpe codecs',
hidden=True,
)
dictionary.add_argument(
'--dict-maxtokens',
default=DictionaryAgent.default_maxtokens,
type=int,
help='max number of tokens to include in dictionary or bpe codecs',
hidden=True,
)
dictionary.add_argument(
'--dict-nulltoken',
default=DictionaryAgent.default_null,
hidden=True,
help='empty token, can be used for padding or just empty values',
)
dictionary.add_argument(
'--dict-starttoken',
default=DictionaryAgent.default_start,
hidden=True,
help='token for starting sentence generation, if needed',
)
dictionary.add_argument(
'--dict-endtoken',
default=DictionaryAgent.default_end,
hidden=True,
help='token for end of sentence markers, if needed',
)
dictionary.add_argument(
'--dict-unktoken',
default=DictionaryAgent.default_unk,
hidden=True,
help='token to return for unavailable words',
)
dictionary.add_argument(
'-tok',
'--dict-tokenizer',
default=DictionaryAgent.default_tok,
            help='Which tokenizer to use. Defaults to "re", which splits '
            'on whitespace as well as recognizing basic punctuation. '
            'Other options include nltk, gpt2 and bytelevelbpe.',
hidden=True,
)
dictionary.add_argument(
'--dict-lower',
default=DictionaryAgent.default_lower,
type='bool',
help='Whether or not to lowercase all text seen.',
hidden=True,
)
dictionary.add_argument(
'--bpe-debug',
action='store_true',
hidden=True,
help='Leave BPE tokens untouched in output. Useful for debugging.',
)
dictionary.add_argument(
'--dict-textfields',
default=DictionaryAgent.default_textfields,
hidden=True,
help='Observation fields which dictionary learns vocabulary from. '
'Tasks with additional fields may add to this list to handle '
'any extra vocabulary.',
)
dictionary = BPEHelper.add_cmdline_args(dictionary)
return dictionary
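
    # For illustration, the arguments above surface on the command line as,
    # e.g. `--dict-tokenizer re --dict-minfreq 2 --dict-lower true`
    # (the flag names follow the add_argument calls above; the values here
    # are arbitrary examples).
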
def __init__(self, opt: Opt, shared=None):
"""
Initialize DictionaryAgent.
"""
self.opt = copy.deepcopy(opt)
self.minfreq = opt.get('dict_minfreq', DictionaryAgent.default_minfreq)
self.null_token = opt.get('dict_nulltoken', DictionaryAgent.default_null)
self.end_token = opt.get('dict_endtoken', DictionaryAgent.default_end)
self.unk_token = opt.get('dict_unktoken', DictionaryAgent.default_unk)
self.start_token = opt.get('dict_starttoken', DictionaryAgent.default_start)
self.max_ngram_size = opt.get(
'dict_max_ngram_size', DictionaryAgent.default_maxngram
)
self.tokenizer = opt.get('dict_tokenizer', DictionaryAgent.default_tok)
self.lower = opt.get('dict_lower', DictionaryAgent.default_lower)
self.maxtokens = opt.get('dict_maxtokens', DictionaryAgent.default_maxtokens)
self.textfields = opt.get(
'dict_textfields', DictionaryAgent.default_textfields
).split(",")
try:
self.tokenizer_fun = getattr(self, self.tokenizer + '_tokenize')
except AttributeError:
raise AttributeError(
'tokenizer type {} not yet supported'.format(self.tokenizer)
)
if shared:
self.freq = shared.get('freq', {})
self.tok2ind = shared.get('tok2ind', {})
self.ind2tok = shared.get('ind2tok', {})
else:
self.freq = defaultdict(int)
self.tok2ind = {}
self.ind2tok = {}
if self.null_token:
self.add_token(self.null_token)
if self.start_token:
# set special start of sentence word token
self.add_token(self.start_token)
if self.end_token:
# set special end of sentence word token
self.add_token(self.end_token)
if self.unk_token:
# set special unknown word token
self.add_token(self.unk_token)
loaded = False
        # if a dict file is specified and exists, load the pre-built dictionary
if opt.get('dict_file'):
opt['dict_file'] = modelzoo_path(opt.get('datapath'), opt['dict_file'])
if os.path.isfile(opt['dict_file']):
# load pre-existing dictionary
self.load(opt['dict_file'])
loaded = True
if not loaded and opt.get('dict_initpath'):
# load seed dictionary
opt['dict_initpath'] = modelzoo_path(
opt.get('datapath'), opt['dict_initpath']
)
# don't check isfile first, should fail if file not found
self.load(opt['dict_initpath'])
opt['dict_loaded'] = loaded
# cache unk token for later
self._unk_token_idx = self.tok2ind.get(self.unk_token)
# initialize tokenizers
if self.tokenizer == 'nltk':
try:
import nltk
except ImportError:
raise ImportError('Please install nltk (pip install nltk)')
# nltk-specific setup
st_path = 'tokenizers/punkt/{0}.pickle'.format(opt['dict_language'])
try:
self.sent_tok = nltk.data.load(st_path)
except LookupError:
nltk.download('punkt')
self.sent_tok = nltk.data.load(st_path)
self.word_tok = nltk.tokenize.treebank.TreebankWordTokenizer()
elif self.tokenizer in ['bpe', 'gpt2', 'bytelevelbpe', 'slow_bytelevel_bpe']:
self.bpe = bpe_factory(opt, shared)
self.bpe.sync_with_dict(self)
if not shared:
if self.null_token:
# fix count for null token to one billion and three
self.freq[self.null_token] = 1000000003
if self.start_token:
# fix count for start of sentence token to one billion and two
self.freq[self.start_token] = 1000000002
if self.end_token:
# fix count for end of sentence token to one billion and one
self.freq[self.end_token] = 1000000001
if self.unk_token:
# fix count for unknown token to one billion
self.freq[self.unk_token] = 1000000000
if opt.get('dict_file'):
self.save_path = opt['dict_file']
def add_token(self, word):
"""
Add a single token to the dictionary.
"""
if word not in self.tok2ind:
index = len(self.tok2ind)
self.tok2ind[word] = index
self.ind2tok[index] = word
def __contains__(self, key):
"""
Return if the dictionary contains the key.
If key is an int, returns whether the key is in the indices. If key is a str,
return if the token is in the dict of tokens.
"""
if type(key) == int:
return key in self.ind2tok
elif type(key) == str:
return key in self.tok2ind
def _word_lookup(self, key):
# return index from token, or unk_token's index, or None
return self.tok2ind.get(key, self._unk_token_idx)
def _index_lookup(self, key):
# return token from index, or unk_token
return self.ind2tok.get(key, self.unk_token)
def __getitem__(self, key):
"""
Lookup the word or ID.
If key is an int, returns the corresponding token. If it does not exist, return
the unknown token. If key is a str, return the token's index. If the token is
not in the dictionary, return the index of the unknown token. If there is no
unknown token, return ``None``.
"""
if type(key) == str:
return self._word_lookup(key)
if type(key) == int:
return self._index_lookup(key)
def __len__(self):
return len(self.tok2ind)
def __setitem__(self, key, value):
"""
Set the frequency for a word to a value.
If the key is not in the dictionary, add it to the dictionary and set its
frequency to value.
"""
key = str(key)
if self.lower:
key = key.lower()
self.freq[key] = int(value)
self.add_token(key)
def keys(self):
"""
Return all the words in the dictionary.
"""
return self.tok2ind.keys()
def nltk_tokenize(self, text, building=False):
"""
Tokenize using NLTK PunktTokenizer.
Uses nltk-trained PunktTokenizer for sentence tokenization and Treebank Word
Tokenizer for tokenizing words within sentences.
"""
return (
token
for sent in self.sent_tok.tokenize(text)
for token in self.word_tok.tokenize(sent)
)
def gpt2_tokenize(self, text):
"""
Tokenize using Gpt2 BPE tokenizer.
"""
return self.bpe_tokenize(text)
def slow_bytelevel_bpe_tokenize(self, text):
"""
        Tokenize using the slow byte-level BPE tokenizer.
"""
return self.bpe_tokenize(text)
def bytelevelbpe_tokenize(self, text):
"""
Tokenize using Gpt2 BPE tokenizer.
"""
return self.bpe_tokenize(text)
@staticmethod
def re_tokenize(text):
r"""
Tokenize using a liberal regular expression.
        Find boundaries between word characters, newlines, and non-word
        non-whitespace tokens ``(r'\w+|[^\w\s]|\n')``.
This splits along whitespace and punctuation and keeps the newline as
a token in the returned list.
"""
return RETOK.findall(text)
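
    # Example (for illustration): punctuation and newlines come back as
    # separate tokens:
    #   >>> DictionaryAgent.re_tokenize("Hello, world!\nBye")
    #   ['Hello', ',', 'world', '!', '\n', 'Bye']
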
@staticmethod
def split_tokenize(text):
"""
Tokenize on whitespace and some limited punctuation.
Splits tokens based on whitespace after adding whitespace around
punctuation.
Use re_tokenize if you want more robust handling of punctuation.
"""
return (
text.replace('.', ' . ')
.replace(',', ' , ')
.replace(';', ' ; ')
.replace(':', ' : ')
.replace('!', ' ! ')
.replace('?', ' ? ')
.split()
)
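
    # Example (for illustration): only the six punctuation marks listed
    # above are padded with spaces before splitting:
    #   >>> DictionaryAgent.split_tokenize('hello, world?')
    #   ['hello', ',', 'world', '?']
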
@staticmethod
def space_tokenize(text):
"""
Tokenize exactly on spaces.
Useful when text is pre-tokenized.
"""
return text.strip().split(' ')
def span_tokenize(self, text):
"""
Tokenize and find starting index of each token in the original string.
"""
tokens = self.tokenize(text)
curr_idx = 0
indices = []
for t in tokens:
while text[curr_idx] != t[0]:
curr_idx += 1
indices.append((curr_idx, curr_idx + len(t)))
curr_idx += len(t)
return tokens, indices
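
    # Example (for illustration, assuming an agent `d` built with the
    # default 're' tokenizer): spans are (start, end) character offsets
    # into the original string:
    #   >>> d.span_tokenize('hello world')
    #   (['hello', 'world'], [(0, 5), (6, 11)])
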
def tokenize(self, text, building=False):
"""
        Return a sequence of tokens from the given text.
"""
if self.lower:
text = text.lower()
# calls the selected tokenizer function e.g. 're' => re_tokenize(text)
word_tokens = self.tokenizer_fun(text)
if not building and self.max_ngram_size > 1:
# search for ngrams during parse-time
# TODO(ahm): support build-time ngrams using word2vec heuristic?
word_tokens = find_ngrams(self.tok2ind, word_tokens, self.max_ngram_size)
return word_tokens
def bpe_tokenize(self, text):
"""
Return a sequence of BPE-tokens from the text.
"""
return self.bpe.encode(text)
def add_to_dict(self, tokens):
"""
Build dictionary from the list of provided tokens.
"""
self.built = False
for token in tokens:
self.add_token(token)
self.freq[token] += 1
def remove_tail(self, min_freq):
"""
Remove elements below the frequency cutoff from the dictionary.
"""
to_remove = []
for token, freq in self.freq.items():
if freq < min_freq:
# queue up removals since can't mutate dict during iteration
to_remove.append(token)
for token in to_remove:
del self.freq[token]
idx = self.tok2ind.pop(token)
del self.ind2tok[idx]
def _remove_non_bpe(self):
"""
Set the dictionary vocab to the bpe vocab, merging counts.
"""
to_remove = []
to_add = []
for token, freq in self.freq.items():
tokens = self.bpe_tokenize(token)
if len(tokens) != 1:
for t in tokens:
to_add.append((t, freq))
to_remove.append(token)
for token in to_remove:
del self.freq[token]
idx = self.tok2ind.pop(token)
del self.ind2tok[idx]
for token, freq in to_add:
self.add_token(token)
self.freq[token] += freq
def resize_to_max(self, maxtokens):
"""
Trims the dictionary to the maximum number of tokens.
"""
if maxtokens >= 0 and len(self.tok2ind) > maxtokens:
for k in range(maxtokens, len(self.ind2tok)):
v = self.ind2tok[k]
del self.ind2tok[k]
del self.tok2ind[v]
del self.freq[v]
def load(self, filename):
"""
Load pre-existing dictionary in 'token[<TAB>count]' format.
        Initializes token counts from the loaded file, defaulting to 0 when counts
        aren't included.
"""
print('Dictionary: loading dictionary from {}'.format(filename))
lower_special = self.null_token == self.null_token.lower()
SPECIAL_TOKENS = {'__UNK__', '__NULL__', '__END__', '__START__'}
with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as read:
for line in read:
split = line.strip().split('\t')
token = unescape(split[0])
if lower_special and token in SPECIAL_TOKENS:
token = token.lower()
cnt = int(split[1]) if len(split) > 1 else 0
self.freq[token] = cnt
self.add_token(token)
print('[ num words = %d ]' % len(self))
def save(self, filename=None, append=False, sort=True):
"""
Save dictionary to file.
Format is 'token<TAB>count' for every token in the dictionary, sorted
by count with the most frequent words first.
If ``append`` (default ``False``) is set to ``True``, appends instead of
overwriting.
If ``sort`` (default ``True``), then first sort the dictionary before saving.
"""
filename = self.opt['dict_file'] if filename is None else filename
if self.tokenizer in ['bpe', 'gpt2', 'bytelevelbpe', 'slow_bytelevel_bpe']:
needs_removal = self.bpe.finalize(
self.freq, num_symbols=self.maxtokens, minfreq=self.minfreq
)
if needs_removal:
self._remove_non_bpe()
elif filename != self.opt.get('dict_file'):
# need to copy over the old codecs file
self.bpe.copy_codecs_file(filename + '.codecs')
if sort and self.bpe.should_sort():
self.sort(trim=False)
elif sort:
self.sort(trim=True)
print('Dictionary: saving dictionary to {}'.format(filename))
make_dir(os.path.dirname(filename))
mode = 'a' if append else 'w'
with open(filename, mode, encoding='utf-8') as write:
for i in self.ind2tok.keys():
tok = self.ind2tok[i]
cnt = self.freq[tok]
write.write('{tok}\t{cnt}\n'.format(tok=escape(tok), cnt=cnt))
# save opt file
with open(filename + '.opt', 'w', encoding='utf-8') as handle:
json.dump(self.opt, handle, indent=4)
# save the byte level bpe model file as well
if self.tokenizer == 'bytelevelbpe':
# This saves filename-vocab.json and filename-merges.txt as
# hugging face tokenizer does
self.bpe.save(os.path.dirname(filename), os.path.basename(filename))
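
    # For illustration, a saved dictionary file is tab-separated, one token
    # per line, most frequent first; the special tokens carry the fixed
    # counts assigned in __init__ ('hello\t42' is a made-up entry):
    #   __null__	1000000003
    #   __start__	1000000002
    #   __end__	1000000001
    #   __unk__	1000000000
    #   hello	42
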
def sort(self, trim=True):
"""
Sort the dictionary.
Inline operation. Rearranges the dictionary so that the elements with
the lowest index have the highest counts. This reindexes the dictionary
according to the sorted frequencies, breaking ties alphabetically by
token.
:param bool trim:
If True, truncate the dictionary based on minfreq and maxtokens.
"""
if trim and self.tokenizer == 'gpt2':
raise RuntimeError("You should not trim the dictionary when using gpt-2.")
if trim and self.tokenizer == 'bytelevelbpe':
raise RuntimeError(
"You should not trim the dictionary when using bytelevelbpe."
)
# sort first by count, then alphabetically
if trim:
self.remove_tail(self.minfreq)
sorted_pairs = sorted(self.freq.items(), key=lambda x: (-x[1], x[0]))
new_tok2ind = {}
new_ind2tok = {}
for i, (tok, _) in enumerate(sorted_pairs):
new_tok2ind[tok] = i
new_ind2tok[i] = tok
self.tok2ind = new_tok2ind
self.ind2tok = new_ind2tok
if trim:
self.resize_to_max(self.maxtokens)
assert len(self.freq) == len(self.ind2tok) == len(self.tok2ind)
return sorted_pairs
def parse(self, txt_or_vec, vec_type=list):
"""
Parse either text or a vector of indices.
        Calls `~txt2vec` if `txt_or_vec` is a string, or `~vec2txt` otherwise.
:param vec_type:
type of the returned vector if the input is a string.
"""
# TODO: try to deprecate this, preferring straight txt2vec
if type(txt_or_vec) == str:
return self.txt2vec(txt_or_vec, vec_type)
else:
return self.vec2txt(txt_or_vec)
def txt2vec(self, text, vec_type=list):
"""
Convert a string to a vector (list of ints).
First runs a sentence tokenizer, then a word tokenizer.
:param type vec_type:
The type of the returned vector if the input is a string. Suggested
``list``, ``tuple``, ``set``, or ``np.ndarray``.
"""
itr = (self._word_lookup(token) for token in self.tokenize(str(text)))
if vec_type == list or vec_type == tuple or vec_type == set:
res = vec_type(itr)
elif vec_type == np.ndarray:
            res = np.fromiter(itr, int)
else:
raise RuntimeError('Type {} not supported by dict'.format(vec_type))
return res
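
    # Example (for illustration, assuming a freshly built agent `d` with the
    # four default special tokens, which occupy indices 0-3):
    #   >>> d.add_to_dict(d.tokenize('hello world'))
    #   >>> d.txt2vec('hello world')
    #   [4, 5]
    #   >>> d.txt2vec('hello unseen')   # unknown words map to __unk__
    #   [4, 3]
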
def vec2txt(self, vector, delimiter=' '):
"""
Convert a vector of IDs to a string.
Converts a vector (iterable of ints) into a string, with each token separated by
the delimiter (default ``' '``).
"""
tokens = [self[int(idx)] for idx in vector]
if self.tokenizer in ['gpt2', 'bpe', 'slow_bytelevel_bpe']:
# if we used a BPE tokenizer we need to rejoin the encodings
text = self.bpe.decode(tokens, vector, delimiter)
elif self.tokenizer == 'bytelevelbpe':
            # We add special tokens at the beginning of the ParlAI dict, but
            # Hugging Face adds them at the end, so there is an offset of 4
            # between them.
vector = [
idx + len(self.tok2ind) - 4 if idx < 4 else idx - 4 for idx in vector
]
tokens = [self[int(idx)] for idx in vector]
text = self.bpe.decode(tokens, vector, delimiter)
else:
text = delimiter.join(self[int(idx)] for idx in vector)
return text
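
    # Example (for illustration): for whitespace-style tokenizers, vec2txt
    # inverts txt2vec up to tokenization and the join delimiter:
    #   >>> d.vec2txt(d.txt2vec('hello world'))
    #   'hello world'
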
def act(self):
"""
Add words in the last observation to the dictionary.
This checks any fields in the message present in the --dict-textfields argument
(e.g. "text,labels").
"""
for textfield in self.textfields:
source = self.observation.get(textfield)
if source is None:
continue
# fields may be singleton strings or lists of strings.
# wrap the singleton strings in a list to iterate over them
if type(source) is str:
source = [source]
for text in source:
if text:
self.add_to_dict(self.tokenize(text))
return {'id': 'Dictionary'}
def share(self):
"""
Share internal dicts.
"""
shared = super().share()
shared['freq'] = self.freq
shared['tok2ind'] = self.tok2ind
shared['ind2tok'] = self.ind2tok
return shared
def shutdown(self):
"""
Save on shutdown if ``save_path`` is set.
"""
if hasattr(self, 'save_path'):
self.save(self.save_path)
def __str__(self):
"""
Return string representation of frequencies in dictionary.
"""
return str(self.freq)
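

# A minimal end-to-end sketch (illustrative; the option values are
# assumptions, not part of this file): build a small dictionary, sort it,
# then vectorize new text.
#
#   from parlai.core.opt import Opt
#   from parlai.core.dict import DictionaryAgent
#
#   opt = Opt({'dict_tokenizer': 're', 'dict_lower': True})
#   d = DictionaryAgent(opt)
#   for line in ['Hello world.', 'Hello again!']:
#       d.add_to_dict(d.tokenize(line))
#   d.sort()
#   vec = d.txt2vec('hello world')   # a list of token indices
#   print(d.vec2txt(vec))            # 'hello world'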