forked from piskvorky/gensim
/
phrases.py
263 lines (206 loc) · 10.2 KB
/
phrases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Automatically detect common phrases (multiword expressions) from a stream of sentences.
The phrases are collocations (frequently co-occurring tokens). See [1]_ for the
exact formula.
For example, if your input stream (=an iterable, with each value a list of token strings) looks like:
>>> print(list(sentence_stream))
[[u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'],
[u'machine', u'learning', u'can', u'be', u'useful', u'sometimes'],
...,
]
you'd train the detector with:
>>> bigram = Phrases(sentence_stream)
and then transform any sentence (list of token strings) using the standard gensim syntax:
>>> sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
>>> print(bigram[sent])
[u'the', u'mayor', u'of', u'new_york', u'was', u'there']
(note `new_york` became a single token). As usual, you can also transform an entire
sentence stream using:
>>> print(list(bigram[any_sentence_stream]))
[[u'the', u'mayor', u'of', u'new_york', u'was', u'there'],
[u'machine_learning', u'can', u'be', u'useful', u'sometimes'],
...,
]
You can also continue updating the collocation counts with new sentences, by:
>>> bigram.add_vocab(new_sentence_stream)
These **phrase streams are meant to be used during text preprocessing, before
converting the resulting tokens into vectors using `Dictionary`**. See the
:mod:`gensim.models.word2vec` module for an example application of using phrase detection.
The detection can also be **run repeatedly**, to get phrases longer than
two tokens (e.g. `new_york_times`):
>>> trigram = Phrases(bigram[sentence_stream])
>>> sent = [u'the', u'new', u'york', u'times', u'is', u'a', u'newspaper']
>>> print(trigram[bigram[sent]])
[u'the', u'new_york_times', u'is', u'a', u'newspaper']
.. [1] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
Distributed Representations of Words and Phrases and their Compositionality.
In Proceedings of NIPS, 2013.
"""
import sys
import os
import logging
from collections import defaultdict
from six import iteritems, string_types
from gensim import utils, interfaces
logger = logging.getLogger(__name__)
class Phrases(interfaces.TransformationABC):
    """
    Detect phrases, based on collected collocation counts. Adjacent words that appear
    together more frequently than expected are joined together with the `_` character.

    It can be used to generate phrases on the fly, using the `phrases[sentence]`
    and `phrases[corpus]` syntax.

    """
    def __init__(self, sentences=None, min_count=5, threshold=10.0,
                 max_vocab_size=40000000, delimiter=b'_'):
        """
        Initialize the model from an iterable of `sentences`. Each sentence must be
        a list of words (unicode strings) that will be used for training.

        The `sentences` iterable can be simply a list, but for larger corpora,
        consider a generator that streams the sentences directly from disk/network,
        without storing everything in RAM. See :class:`BrownCorpus`,
        :class:`Text8Corpus` or :class:`LineSentence` in the :mod:`gensim.models.word2vec`
        module for such examples.

        `min_count` ignore all words and bigrams with total collected count lower
        than this.

        `threshold` represents a threshold for forming the phrases (higher means
        fewer phrases). A phrase of words `a` and `b` is accepted if
        `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
        total vocabulary size.

        `max_vocab_size` is the maximum size of the vocabulary. Used to control
        pruning of less common words, to keep memory under control. The default
        of 40M needs about 3.6GB of RAM; increase/decrease `max_vocab_size` depending
        on how much available memory you have.

        `delimiter` is the glue character used to join collocation tokens, and
        should be a byte string (e.g. b'_').

        """
        if min_count <= 0:
            raise ValueError("min_count should be at least 1")
        if threshold <= 0:
            raise ValueError("threshold should be positive")

        self.min_count = min_count
        self.threshold = threshold
        self.max_vocab_size = max_vocab_size
        self.vocab = defaultdict(int)  # mapping between utf8 token => its count
        self.min_reduce = 1  # ignore any tokens with count smaller than this
        self.delimiter = delimiter

        if sentences is not None:
            self.add_vocab(sentences)

    def __str__(self):
        """Get short string representation of this phrase detector."""
        return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
            self.__class__.__name__, len(self.vocab), self.min_count,
            self.threshold, self.max_vocab_size)

    @staticmethod
    def learn_vocab(sentences, max_vocab_size, delimiter=b'_'):
        """
        Collect unigram/bigram counts from the `sentences` iterable.

        Returns a `(min_reduce, vocab)` tuple, where `vocab` maps each utf8
        unigram/bigram token to its count and `min_reduce` is the prune
        threshold that was reached while keeping `vocab` under `max_vocab_size`.

        """
        sentence_no = -1
        total_words = 0
        logger.info("collecting all words and their counts")
        vocab = defaultdict(int)
        min_reduce = 1
        for sentence_no, sentence in enumerate(sentences):
            if sentence_no % 10000 == 0:
                # lazy %-args: the message is only formatted if INFO is enabled
                logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types",
                            sentence_no, total_words, len(vocab))
            sentence = [utils.any2utf8(w) for w in sentence]
            for bigram in zip(sentence, sentence[1:]):
                vocab[bigram[0]] += 1
                vocab[delimiter.join(bigram)] += 1
                total_words += 1

            if sentence:  # add last word skipped by previous loop
                word = sentence[-1]
                vocab[word] += 1

            # keep memory bounded: drop the rarest tokens once the cap is hit
            if len(vocab) > max_vocab_size:
                utils.prune_vocab(vocab, min_reduce)
                min_reduce += 1

        logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences",
                    len(vocab), total_words, sentence_no + 1)
        return min_reduce, vocab

    def add_vocab(self, sentences):
        """
        Merge the collected counts `vocab` into this phrase detector.

        """
        # uses a separate vocab to collect the token counts from `sentences`.
        # this consumes more RAM than merging new sentences into `self.vocab`
        # directly, but gives the new sentences a fighting chance to collect
        # sufficient counts, before being pruned out by the (large) accummulated
        # counts collected in previous learn_vocab runs.
        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter)

        logger.info("merging %i counts into %s", len(vocab), self)
        self.min_reduce = max(self.min_reduce, min_reduce)
        for word, count in iteritems(vocab):
            self.vocab[word] += count
        if len(self.vocab) > self.max_vocab_size:
            utils.prune_vocab(self.vocab, self.min_reduce)
            self.min_reduce += 1
        logger.info("merged %s", self)

    def __getitem__(self, sentence):
        """
        Convert the input tokens `sentence` (=list of unicode strings) into phrase
        tokens (=list of unicode strings, where detected phrases are joined by u'_').

        If `sentence` is an entire corpus (iterable of sentences rather than a single
        sentence), return an iterable that converts each of the corpus' sentences
        into phrases on the fly, one after another.

        Example::

          >>> sentences = Text8Corpus(path_to_corpus)
          >>> bigram = Phrases(sentences, min_count=5, threshold=100)
          >>> for sentence in bigram[sentences]:
          ...     print(u' '.join(sentence))
          he refuted nechaev other anarchists sometimes identified as pacifist anarchists advocated complete
          nonviolence leo_tolstoy

        """
        try:
            # a single sentence is a (possibly empty) sequence of strings;
            # anything non-indexable (e.g. a generator of sentences) is a corpus
            is_single = not sentence or isinstance(sentence[0], string_types)
        except Exception:
            is_single = False
        if not is_single:
            # if the input is an entire corpus (rather than a single sentence),
            # return an iterable stream.
            return self._apply(sentence)

        s, new_s = [utils.any2utf8(w) for w in sentence], []
        last_bigram = False
        # hoist attribute lookups out of the hot loop
        vocab = self.vocab
        threshold = self.threshold
        delimiter = self.delimiter
        min_count = self.min_count
        for word_a, word_b in zip(s, s[1:]):
            if word_a in vocab and word_b in vocab:
                bigram_word = delimiter.join((word_a, word_b))
                if bigram_word in vocab and not last_bigram:
                    pa = float(vocab[word_a])
                    pb = float(vocab[word_b])
                    pab = float(vocab[bigram_word])
                    # collocation score from Mikolov et al. (NIPS 2013)
                    score = (pab - min_count) / pa / pb * len(vocab)
                    if score > threshold:
                        new_s.append(bigram_word)
                        last_bigram = True
                        continue

            if not last_bigram:
                new_s.append(word_a)
            last_bigram = False

        if s:  # add last word skipped by previous loop
            last_token = s[-1]
            if not last_bigram:
                new_s.append(last_token)

        return [utils.to_unicode(w) for w in new_s]
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', level=logging.INFO)
    # lazy %-args: defer formatting to the logging framework
    logging.info("running %s", " ".join(sys.argv))

    # check and process cmdline input
    program = os.path.basename(sys.argv[0])
    if len(sys.argv) < 2:
        # no input corpus given: print module usage and bail out
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    infile = sys.argv[1]

    # re-import Phrases from its canonical module path so pickled models
    # reference gensim.models.Phrases rather than __main__.Phrases
    from gensim.models import Phrases
    from gensim.models.word2vec import Text8Corpus
    sentences = Text8Corpus(infile)

    # train the bigram detector, then emit each phrase-joined sentence as utf8
    bigram = Phrases(sentences, min_count=5, threshold=100)
    for s in bigram[sentences]:
        print(utils.to_utf8(u' '.join(s)))