Commit
Merge pull request #224 from ckot/feat-strip-section-headings
added a named param to records() to allow stripping of section headings
bdewilde committed Feb 2, 2019
2 parents bdbb4a8 + b28478c commit 7e1f3a8
Showing 1 changed file with 8 additions and 5 deletions.
13 changes: 8 additions & 5 deletions textacy/datasets/wikipedia.py
@@ -299,7 +299,7 @@ def __iter__(self):
                 yield page_id, title, content
             elem.clear()

-    def _parse_content(self, content, parser, fast):
+    def _parse_content(self, content, parser, fast, include_headings):
         unicode_ = compat.unicode_  # for performance
         wikicode = parser.parse(content)

@@ -345,7 +345,7 @@ def is_bad_template(obj):
         texts = []
         # strip out references, tables, and file/image links
         # then concatenate the stripped text of each section
-        for i, section in enumerate(wikicode.get_sections(flat=True, include_lead=True, include_headings=True)):
+        for i, section in enumerate(wikicode.get_sections(flat=True, include_lead=True, include_headings=include_headings)):
             for obj in section.ifilter_wikilinks(matches=is_bad_wikilink, recursive=True):
                 try:
                     section.remove(obj)
@@ -396,7 +396,7 @@ def texts(self, min_len=100, limit=-1):
             if n_pages == limit:
                 break

-    def records(self, min_len=100, limit=-1, fast=False):
+    def records(self, min_len=100, limit=-1, fast=False, include_headings=True):
         """
         Iterate over pages (parsed text and metadata) in a Wikipedia database dump,
         optionally filtering by text length.
@@ -409,6 +409,8 @@ def records(self, min_len=100, limit=-1, fast=False):
             fast (bool): If True, text is extracted using a faster method but
                 which gives lower quality results. Otherwise, a slower but better
                 method is used to extract article text.
+            include_headings (bool): Whether to include section headings and
+                the page title as part of the page's text. Default: True

         Yields:
             dict: Parsed text and metadata of next page in the Wikipedia database dump.
@@ -427,12 +429,13 @@ def records(self, min_len=100, limit=-1, fast=False):

         n_pages = 0
         for page_id, title, content in self:
-            page = self._parse_content(content, parser, fast)
+            page = self._parse_content(content, parser, fast, include_headings)
             if len(page['text']) < min_len:
                 continue
             page['title'] = title
             page['page_id'] = page_id
-            page['text'] = title + '\n\n' + page['text']
+            if include_headings:
+                page['text'] = title + '\n\n' + page['text']

             yield page
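
For context, a minimal usage sketch of the new parameter. The Wikipedia dataset constructor arguments and the download() call below are assumed from textacy's datasets API of this period and are not part of this diff; the commit itself only adds the include_headings flag to records().

import textacy.datasets

# assumed constructor arguments; only records() changes in this commit
wp = textacy.datasets.Wikipedia(lang='en', version='latest')
wp.download()  # fetch the database dump if it is not already on disk

# default behavior is unchanged: the page title and section headings
# remain part of each record's text
for record in wp.records(min_len=100, limit=3):
    print(record['title'], len(record['text']))

# new in this commit: strip section headings and the title prefix
for record in wp.records(min_len=100, limit=3, include_headings=False):
    print(record['page_id'], record['text'][:80])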

Expand Down
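The flag is passed straight through to the parser's get_sections(); assuming mwparserfromhell (the wikitext parser this module uses), here is a small sketch of the difference it makes there, using made-up wikitext:

import mwparserfromhell

# invented sample wikitext, for illustration only
wikitext = 'Lead paragraph.\n\n== History ==\nSome history text.\n'
wikicode = mwparserfromhell.parse(wikitext)

with_headings = wikicode.get_sections(
    flat=True, include_lead=True, include_headings=True)
without_headings = wikicode.get_sections(
    flat=True, include_lead=True, include_headings=False)

print('== History ==' in str(with_headings[1]))     # True
print('== History ==' in str(without_headings[1]))  # False
print('Some history' in str(without_headings[1]))   # True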
