#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Copyright (C) 2012 Lars Buitinck <larsmans@gmail.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Construct a corpus from a Wikipedia (or other MediaWiki-based) database dump.
If you have the `pattern` package installed, this module will use fancy
lemmatization (instead of the plain alphabetic tokenizer) to get a lemma
of each token. The package is available at https://github.com/clips/pattern .
See scripts/process_wiki.py for a canned (example) script based on this
module.
"""
import bz2
import logging
import re
from xml.etree.ElementTree import iterparse # cElementTree was removed in Python 3.9; LXML isn't faster, so let's go with the built-in solution
import multiprocessing
from gensim import utils
# cannot import whole gensim.corpora, because that imports wikicorpus...
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus
logger = logging.getLogger('gensim.corpora.wikicorpus')
# ignore articles shorter than ARTICLE_MIN_WORDS tokens (after full preprocessing)
ARTICLE_MIN_WORDS = 50
RE_P0 = re.compile(r'<!--.*?-->', re.DOTALL | re.UNICODE) # comments
RE_P1 = re.compile(r'<ref([> ].*?)(</ref>|/>)', re.DOTALL | re.UNICODE) # footnotes
RE_P2 = re.compile(r"(\n\[\[[a-z][a-z][\w-]*:[^:\]]+\]\])+$", re.UNICODE) # links to languages
RE_P3 = re.compile(r"{{([^}{]*)}}", re.DOTALL | re.UNICODE) # template (no nested braces in body)
RE_P4 = re.compile(r"{{([^}]*)}}", re.DOTALL | re.UNICODE) # template (body may contain '{')
RE_P5 = re.compile(r'\[(\w+):\/\/(.*?)(( (.*?))|())\]', re.UNICODE) # remove URL, keep description
RE_P6 = re.compile(r"\[([^][]*)\|([^][]*)\]", re.DOTALL | re.UNICODE) # simplify links, keep description
RE_P7 = re.compile(r'\n\[\[[iI]mage(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of images
RE_P8 = re.compile(r'\n\[\[[fF]ile(.*?)(\|.*?)*\|(.*?)\]\]', re.UNICODE) # keep description of files
RE_P9 = re.compile(r'<nowiki([> ].*?)(</nowiki>|/>)', re.DOTALL | re.UNICODE) # outside links
RE_P10 = re.compile(r'<math([> ].*?)(</math>|/>)', re.DOTALL | re.UNICODE) # math content
RE_P11 = re.compile(r'<(.*?)>', re.DOTALL | re.UNICODE) # all other tags
RE_P12 = re.compile(r'\n(({\|)|(\|-)|(\|}))(.*?)(?=\n)', re.UNICODE) # table formatting
RE_P13 = re.compile(r'\n(\||\!)(.*?\|)*([^|]*?)', re.UNICODE) # table cell formatting
RE_P14 = re.compile(r'\[\[Category:[^][]*\]\]', re.UNICODE) # categories
# Remove 'File:' and 'Image:' markup, keeping the caption (see remove_file())
RE_P15 = re.compile(r'\[\[([fF]ile:|[iI]mage)[^]]*(\]\])', re.UNICODE)
# MediaWiki namespaces (https://www.mediawiki.org/wiki/Manual:Namespace) that
# ought to be ignored
IGNORED_NAMESPACES = ['Wikipedia', 'Category', 'File', 'Portal', 'Template',
'MediaWiki', 'User', 'Help', 'Book', 'Draft',
'WikiProject', 'Special', 'Talk']
def filter_wiki(raw):
"""
Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
or a utf-8 encoded string.
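A minimal sketch of what to expect (the input snippet is made up):
>>> print(filter_wiki('the capital of [[France|the French Republic]]'))
the capital of the French Republic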
"""
# parsing of the wiki markup is not perfect, but sufficient for our purposes
# contributions to improving this code are welcome :)
text = utils.to_unicode(raw, 'utf8', errors='ignore')
text = utils.decode_htmlentities(text) # '&nbsp;' --> '\xa0'
return remove_markup(text)
def remove_markup(text):
text = re.sub(RE_P2, "", text) # remove the last list (=languages)
# the wiki markup is recursive (markup inside markup etc)
# instead of writing a recursive grammar, here we deal with that by removing
# markup in a loop, starting with inner-most expressions and working outwards,
# for as long as something changes.
text = remove_template(text)
text = remove_file(text)
iters = 0
while True:
old, iters = text, iters + 1
text = re.sub(RE_P0, "", text) # remove comments
text = re.sub(RE_P1, '', text) # remove footnotes
text = re.sub(RE_P9, "", text) # remove outside links
text = re.sub(RE_P10, "", text) # remove math content
text = re.sub(RE_P11, "", text) # remove all remaining tags
text = re.sub(RE_P14, '', text) # remove categories
text = re.sub(RE_P5, '\\3', text) # remove urls, keep description
text = re.sub(RE_P6, '\\2', text) # simplify links, keep description only
# remove table markup
text = text.replace('||', '\n|') # each table cell on a separate line
text = re.sub(RE_P12, '\n', text) # remove formatting lines
text = re.sub(RE_P13, '\n\\3', text) # leave only cell content
# remove empty mark-up
text = text.replace('[]', '')
if old == text or iters > 2: # stop if nothing changed between two iterations or after a fixed number of iterations
break
# the following is needed to make the tokenizer see '[[socialist]]s' as a single word 'socialists'
# TODO is this really desirable?
text = text.replace('[', '').replace(']', '') # promote all remaining markup to plain text
return text
def remove_template(s):
"""Remove template wikimedia markup.
Return a copy of `s` with all the wikimedia markup template removed. See
http://meta.wikimedia.org/wiki/Help:Template for wikimedia templates
details.
Note: Since template can be nested, it is difficult remove them using
regular expresssions.
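A small sketch of the behaviour (the nested template below is made up):
>>> remove_template('A{{t1|{{t2}}}}B')
'AB'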
"""
# Find the start and end position of each template by finding the opening
# '{{' and closing '}}'
n_open, n_close = 0, 0
starts, ends = [], []
in_template = False
prev_c = None
for i, c in enumerate(s):
if not in_template:
if c == '{' and c == prev_c:
starts.append(i - 1)
in_template = True
n_open = 1
if in_template:
if c == '{':
n_open += 1
elif c == '}':
n_close += 1
if n_open == n_close:
ends.append(i)
in_template = False
n_open, n_close = 0, 0
prev_c = c
# Remove all the templates
s = ''.join([s[end + 1:start] for start, end in
zip(starts + [None], [-1] + ends)])
return s
def remove_file(s):
"""Remove the 'File:' and 'Image:' markup, keeping the file caption.
Return a copy of `s` with all the 'File:' and 'Image:' markup replaced by
their corresponding captions. See http://www.mediawiki.org/wiki/Help:Images
for the markup details.
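A quick sketch (file name and caption are made up):
>>> remove_file('Intro. [[File:Example.jpg|thumb|An example caption]] Outro.')
'Intro. An example caption Outro.'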
"""
# RE_P15 matches a whole 'File:' or 'Image:' block
for match in re.finditer(RE_P15, s):
m = match.group(0)
# strip the trailing ']]' and keep the last '|'-separated field (the caption)
caption = m[:-2].split('|')[-1]
s = s.replace(m, caption, 1)
return s
def tokenize(content):
"""
Tokenize a piece of text from wikipedia. The input string `content` is assumed
to be mark-up free (see `filter_wiki()`).
Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or longer
than 15 characters (not bytes!).
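A sketch of the behaviour (output shown as Python 3 bytes literals; the
one-letter token 'a' is dropped as too short):
>>> tokenize('A simple sentence, tokenized!')
[b'simple', b'sentence', b'tokenized']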
"""
# TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
return [token.encode('utf8') for token in utils.tokenize(content, lower=True, errors='ignore')
if 2 <= len(token) <= 15 and not token.startswith('_')]
def get_namespace(tag):
"""Returns the namespace of tag."""
m = re.match("^{(.*?)}", tag)
namespace = m.group(1) if m else ""
if not namespace.startswith("http://www.mediawiki.org/xml/export-"):
raise ValueError("%s not recognized as MediaWiki dump namespace"
% namespace)
return namespace
_get_namespace = get_namespace # for backward compatibility
def extract_pages(f, filter_namespaces=False):
"""
Extract pages from a MediaWiki database dump, given an open file-like object `f`.
Return an iterable of (str, str, str) triplets: (title, content, pageid).
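A usage sketch (the dump filename is illustrative; namespace '0' is the
main article namespace):
>>> with bz2.BZ2File('enwiki-latest-pages-articles.xml.bz2') as f:
...     for title, text, pageid in extract_pages(f, filter_namespaces=('0',)):
...         pass # process each (title, text, pageid) triplet here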
"""
elems = (elem for _, elem in iterparse(f, events=("end",)))
# We can't rely on the namespace for database dumps, since it changes
# every time a small modification to the format is made. So, determine
# it from the first element we find, which will be part of the metadata,
# and construct element paths.
elem = next(elems)
namespace = get_namespace(elem.tag)
ns_mapping = {"ns": namespace}
page_tag = "{%(ns)s}page" % ns_mapping
text_path = "./{%(ns)s}revision/{%(ns)s}text" % ns_mapping
title_path = "./{%(ns)s}title" % ns_mapping
ns_path = "./{%(ns)s}ns" % ns_mapping
pageid_path = "./{%(ns)s}id" % ns_mapping
for elem in elems:
if elem.tag == page_tag:
title = elem.find(title_path).text
text = elem.find(text_path).text
ns = elem.find(ns_path).text
if filter_namespaces and ns not in filter_namespaces:
text = None
pageid = elem.find(pageid_path).text
yield title, text or "", pageid # empty pages have text=None; substitute ""
# Prune the element tree, as per
# http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
# except that we don't need to prune backlinks from the parent
# because we don't use LXML.
# We do this only for <page>s, since we need to inspect the
# ./revision/text element. The pages comprise the bulk of the
# file, so in practice we prune away enough.
elem.clear()
_extract_pages = extract_pages # for backward compatibility
def process_article(args):
"""
Parse a wikipedia article, returning its content as a list of tokens
(utf8-encoded strings).
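A sketch with made-up arguments (output shown as Python 3 bytes literals):
>>> process_article(('A [[political philosophy]] stub.', False, 'Anarchism', '12'))
([b'political', b'philosophy', b'stub'], 'Anarchism', '12')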
"""
text, lemmatize, title, pageid = args
text = filter_wiki(text)
if lemmatize:
result = utils.lemmatize(text)
else:
result = tokenize(text)
return result, title, pageid
class WikiCorpus(TextCorpus):
"""
Treat a wikipedia articles dump (\*articles.xml.bz2) as a (read-only) corpus.
The documents are extracted on-the-fly, so that the whole (massive) dump
can stay compressed on disk.
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 8h
>>> MmCorpus.serialize('wiki_en_vocab200k.mm', wiki) # another 8h, creates a file in MatrixMarket format plus file with id->word
"""
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
"""
Initialize the corpus. Unless a dictionary is provided, this scans the
corpus once, to determine its vocabulary.
If the `pattern` package is installed, use fancier shallow parsing to get
token lemmas. Otherwise, use simple regexp tokenization. You can override
this automatic logic by passing the `lemmatize` parameter explicitly.
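A construction sketch (the dump filename is illustrative):
>>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2', lemmatize=False) # force plain tokenization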
"""
self.fname = fname
self.filter_namespaces = filter_namespaces
self.metadata = False
if processes is None:
processes = max(1, multiprocessing.cpu_count() - 1)
self.processes = processes
self.lemmatize = lemmatize
if dictionary is None:
self.dictionary = Dictionary(self.get_texts())
else:
self.dictionary = dictionary
def get_texts(self):
"""
Iterate over the dump, returning text version of each article as a list
of tokens.
Only articles of sufficient length are returned (short articles, redirects,
etc. are ignored).
Note that this iterates over the **texts**; if you want vectors, just use
the standard corpus interface instead of this function::
>>> for vec in wiki_corpus:
...     print(vec)
"""
articles, articles_all = 0, 0
positions, positions_all = 0, 0
texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
pool = multiprocessing.Pool(self.processes)
# process the corpus in smaller chunks of docs, because multiprocessing.Pool
# is dumb and would load the entire input into RAM at once...
for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10
articles_all += 1
positions_all += len(tokens)
# article redirects and short stubs are pruned here
if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
continue
articles += 1
positions += len(tokens)
if self.metadata:
yield (tokens, (pageid, title))
else:
yield tokens
pool.terminate()
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions"
" (total %i articles, %i positions before pruning articles shorter than %i words)",
articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
self.length = articles # cache corpus length
# endclass WikiCorpus