Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added a named param to records() to allow stripping of section headings #224

Merged
merged 3 commits into the base branch from the pull request branch
Feb 2, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 8 additions & 5 deletions textacy/datasets/wikipedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def __iter__(self):
yield page_id, title, content
elem.clear()

def _parse_content(self, content, parser, fast):
def _parse_content(self, content, parser, fast, include_headings):
unicode_ = compat.unicode_ # for performance
wikicode = parser.parse(content)

Expand Down Expand Up @@ -345,7 +345,7 @@ def is_bad_template(obj):
texts = []
# strip out references, tables, and file/image links
# then concatenate the stripped text of each section
for i, section in enumerate(wikicode.get_sections(flat=True, include_lead=True, include_headings=True)):
for i, section in enumerate(wikicode.get_sections(flat=True, include_lead=True, include_headings=include_headings)):
for obj in section.ifilter_wikilinks(matches=is_bad_wikilink, recursive=True):
try:
section.remove(obj)
Expand Down Expand Up @@ -396,7 +396,7 @@ def texts(self, min_len=100, limit=-1):
if n_pages == limit:
break

def records(self, min_len=100, limit=-1, fast=False):
def records(self, min_len=100, limit=-1, fast=False, include_headings=True):
"""
Iterate over pages (parsed text and metadata) in a Wikipedia database dump,
optionally filtering by text length.
Expand All @@ -409,6 +409,8 @@ def records(self, min_len=100, limit=-1, fast=False):
fast (bool): If True, text is extracted using a faster method but
which gives lower quality results. Otherwise, a slower but better
method is used to extract article text.
include_headings (bool): Whether to include section headings and
the page title as part of the page's text. Default: True
Yields:
dict: Parsed text and metadata of next page in the Wikipedia database dump.

Expand All @@ -427,12 +429,13 @@ def records(self, min_len=100, limit=-1, fast=False):

n_pages = 0
for page_id, title, content in self:
page = self._parse_content(content, parser, fast)
page = self._parse_content(content, parser, fast, include_headings)
if len(page['text']) < min_len:
continue
page['title'] = title
page['page_id'] = page_id
page['text'] = title + '\n\n' + page['text']
if include_headings:
page['text'] = title + '\n\n' + page['text']

yield page

Expand Down