Commit
Merge pull request #224 from ckot/feat-strip-section-headings
added a named param to records() to allow stripping of section headings
bdewilde committed Feb 2, 2019
2 parents bdbb4a8 + b28478c commit 7e1f3a8
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions textacy/datasets/wikipedia.py
@@ -299,7 +299,7 @@ def __iter__(self):
                 yield page_id, title, content
             elem.clear()

-    def _parse_content(self, content, parser, fast):
+    def _parse_content(self, content, parser, fast, include_headings):
         unicode_ = compat.unicode_  # for performance
         wikicode = parser.parse(content)

@@ -345,7 +345,7 @@ def is_bad_template(obj):
         texts = []
         # strip out references, tables, and file/image links
         # then concatenate the stripped text of each section
-        for i, section in enumerate(wikicode.get_sections(flat=True, include_lead=True, include_headings=True)):
+        for i, section in enumerate(wikicode.get_sections(flat=True, include_lead=True, include_headings=include_headings)):
             for obj in section.ifilter_wikilinks(matches=is_bad_wikilink, recursive=True):
                 try:
                     section.remove(obj)
@@ -396,7 +396,7 @@ def texts(self, min_len=100, limit=-1):
             if n_pages == limit:
                 break

-    def records(self, min_len=100, limit=-1, fast=False):
+    def records(self, min_len=100, limit=-1, fast=False, include_headings=True):
         """
         Iterate over pages (parsed text and metadata) in a Wikipedia database dump,
         optionally filtering by text length.
@@ -409,6 +409,8 @@ def records(self, min_len=100, limit=-1, fast=False):
             fast (bool): If True, text is extracted using a faster method but
                 which gives lower quality results. Otherwise, a slower but better
                 method is used to extract article text.
+            include_headings (bool): Whether to include section headings and
+                the page title as part of the page's text. Default: True

         Yields:
             dict: Parsed text and metadata of next page in the Wikipedia database dump.
@@ -427,12 +429,13 @@ def records(self, min_len=100, limit=-1, fast=False):

         n_pages = 0
         for page_id, title, content in self:
-            page = self._parse_content(content, parser, fast)
+            page = self._parse_content(content, parser, fast, include_headings)
             if len(page['text']) < min_len:
                 continue
             page['title'] = title
             page['page_id'] = page_id
-            page['text'] = title + '\n\n' + page['text']
+            if include_headings:
+                page['text'] = title + '\n\n' + page['text']

             yield page
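
For context, a minimal usage sketch of the new parameter. The Wikipedia dataset constructor arguments and the download() call below are assumed from textacy's datasets API of this period and are not part of this diff; the commit itself only adds the include_headings flag to records().

import textacy.datasets

# assumed constructor arguments; only records() changes in this commit
wp = textacy.datasets.Wikipedia(lang='en', version='latest')
wp.download()  # fetch the database dump if it is not already on disk

# default behavior is unchanged: the page title and section headings
# remain part of each record's text
for record in wp.records(min_len=100, limit=3):
    print(record['title'], len(record['text']))

# new in this commit: strip section headings and the title prefix
for record in wp.records(min_len=100, limit=3, include_headings=False):
    print(record['page_id'], record['text'][:80])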

Expand Down
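The flag is passed straight through to the parser's get_sections(); assuming mwparserfromhell (the wikitext parser this module uses), here is a small sketch of the difference it makes there, using made-up wikitext:

import mwparserfromhell

# invented sample wikitext, for illustration only
wikitext = 'Lead paragraph.\n\n== History ==\nSome history text.\n'
wikicode = mwparserfromhell.parse(wikitext)

with_headings = wikicode.get_sections(
    flat=True, include_lead=True, include_headings=True)
without_headings = wikicode.get_sections(
    flat=True, include_lead=True, include_headings=False)

print('== History ==' in str(with_headings[1]))     # True
print('== History ==' in str(without_headings[1]))  # False
print('Some history' in str(without_headings[1]))   # True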
