Skip to content

Commit

Permalink
Merge pull request #219 from ckot/feat-wikiprojects
Browse files Browse the repository at this point in the history
New feature allowing working with Wikinews media dumps
  • Loading branch information
bdewilde committed Jan 8, 2019
2 parents d770f30 + 4baa237 commit db63765
Showing 1 changed file with 47 additions and 5 deletions.
52 changes: 47 additions & 5 deletions textacy/datasets/wikipedia.py
Expand Up @@ -29,6 +29,7 @@
import mwparserfromhell
except ImportError:
pass
import requests

from .. import compat
from .. import data_dir
Expand All @@ -51,6 +52,14 @@
'de': 'Kategorie:'
}

# project name -> canonical wikimedia dump project name
PROJECTS = {
'wikipedia': 'wiki',
'wikinews': 'wikinews',
}
# useful for error messages
PROJECTS_INV = {v: k for k, v in PROJECTS.items()}

# nowiki tags: take contents verbatim
re_nowiki = re.compile(r'<nowiki>(.*?)</nowiki>', flags=re.UNICODE)

Expand Down Expand Up @@ -138,36 +147,48 @@ class Wikipedia(Dataset):
Args:
data_dir (str): Path to directory on disk under which database dump
files are stored. Each file is expected as
``{lang}wiki/{version}/{lang}wiki-{version}-pages-articles.xml.bz2``
``{lang}{project}/{version}/{lang}{project}-{version}-pages-articles.xml.bz2``
immediately under this directory.
lang (str): Standard two-letter language code, e.g. "en" => "English",
"de" => "German". https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
version (str): Database dump version to use. Either "latest" for the
most recently available version or a date formatted as "YYYYMMDD".
Dumps are produced intermittently; check for available versions at
https://meta.wikimedia.org/wiki/Data_dumps.
project (str): Name of wikimedia project. Currently, only 'wikipedia'
(default) and 'wikinews' are supported.
namespace (str): Allows one to specify a namespace other than '0'
(the default) to extract content from
Attributes:
lang (str): Standard two-letter language code used in instantiation.
version (str): Database dump version used in instantiation.
filestub (str): The component of ``filename`` that is unique to this
lang- and version-specific database dump.
filename (str): Full path on disk for the lang- and version-specific
Wikipedia database dump, found under the ``data_dir`` directory.
project (str): the canonical name for the wikimedia project which is
used as a component of ``filestub``.
Notes:
Namespace values can be discovered via ``bzcat dumpfile.xml.bz2 | head``
"""

def __init__(self, data_dir=DATA_DIR, lang='en', version='latest', namespace='0'):

def __init__(self, data_dir=DATA_DIR, lang='en', version='latest', project='wikipedia', namespace='0'):
super(Wikipedia, self).__init__(
name=NAME, description=DESCRIPTION, site_url=SITE_URL, data_dir=data_dir)
self.lang = lang
self.version = version
self.filestub = '{lang}wiki/{version}/{lang}wiki-{version}-pages-articles.xml.bz2'.format(
version=self.version, lang=self.lang)
if project not in PROJECTS:
raise ValueError(
'Invalid or unsupported wikimedia project: {project}'.format(
project=project
))
self.project = PROJECTS[project]
self.filestub = '{lang}{project}/{version}/{lang}{project}-{version}-pages-articles.xml.bz2'.format(
version=self.version, lang=self.lang, project=self.project)
self._filename = os.path.join(data_dir, self.filestub)
self._namespace = namespace

Expand All @@ -183,6 +204,27 @@ def filename(self):
else:
return None

def _verify_dump_file_url(self, dumpfile_url):
res = requests.head(dumpfile_url)
if res.status_code != 200:
# invalid dump file url. check that {lang}{project} exists
lang_project_url = compat.urljoin(
DOWNLOAD_ROOT,
"{lang}{project}/".format(lang=self.lang, project=self.project)
)
res2 = requests.head(lang_project_url)
if res2.status_code != 200:
raise ValueError(
"'{lang}' variant of '{project}' does not exist".format(
lang=self.lang, project=PROJECTS_INV[self.project]
)
)
else:
# lang/project exists, just bad dumpfile url
raise requests.HTTPError(
"404: Not Found. {url}".format(url=dumpfile_url)
)

def download(self, force=False):
"""
Download the data corresponding to the given ``lang`` and ``version``
Expand All @@ -200,6 +242,7 @@ def download(self, force=False):
return
LOGGER.info(
'Downloading data from %s and writing it to %s', url, fname)
self._verify_dump_file_url(url)
io.write_http_stream(
url, fname, mode='wb', encoding=None,
make_dirs=True, chunk_size=1024)
Expand Down Expand Up @@ -377,7 +420,6 @@ def records(self, min_len=100, limit=-1, fast=False):

n_pages = 0
for page_id, title, content in self:

page = self._parse_content(content, parser, fast)
if len(page['text']) < min_len:
continue
Expand Down

0 comments on commit db63765

Please sign in to comment.