diff --git a/newspaper/article.py b/newspaper/article.py index df0d9c435..2cc01bb32 100644 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -270,13 +270,15 @@ def parse(self): # Before any computations on the body, clean DOM object self.doc = document_cleaner.clean(self.doc) + # Extract clean_top_node from clean_doc + self.clean_top_node = self.extractor.calculate_best_node(self.clean_doc) + self.top_node = self.extractor.calculate_best_node(self.doc) if self.top_node is not None: video_extractor = VideoExtractor(self.config, self.top_node) self.set_movies(video_extractor.get_videos()) self.top_node = self.extractor.post_cleanup(self.top_node) - self.clean_top_node = copy.deepcopy(self.top_node) text, article_html = output_formatter.get_formatted( self.top_node) diff --git a/tests/unit_tests.py b/tests/unit_tests.py index 69c05adfa..aef740239 100644 --- a/tests/unit_tests.py +++ b/tests/unit_tests.py @@ -244,6 +244,12 @@ def test_parse_html(self): self.assertEqual(META_SITE_NAME, self.article.meta_site_name) self.assertEqual('2013-11-27 00:00:00', str(self.article.publish_date)) + @print_test + def test_clean_top_node_exists_in_clean_doc(self): + self.setup_stage('parse') + self.article.parse() + self.assertTrue(self.article.clean_doc.getroottree().getpath(self.article.clean_top_node)) + @print_test def test_meta_type_extraction(self): self.setup_stage('meta')