Skip to content

Commit

Permalink
Fix None title
Browse files: browse the repository at this point in the history
  • Loading branch information
slitayem committed Feb 11, 2015
1 parent 45c9acd commit 51e89df
Showing 1 changed file with 21 additions and 20 deletions.
41 changes: 21 additions & 20 deletions goose/extractors/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,28 +76,29 @@ def get_title(self):
title_ = self.article.opengraph.get('title', '')
if title_:
# handle tags without any title: <meta property="og:title" />
return self.clean_title(title_)

# try to fetch the meta headline
meta_headline = self.parser.getElementsByTag(
self.article.doc,
tag="meta",
attr="name",
value="headline")
if meta_headline:
title_ = self.parser.getAttribute(meta_headline[0], 'content')
if title_:
return self.clean_title(title_)

# otherwise use the title meta
title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
if title_element:
title_ = self.parser.getText(title_element[0])
if title_:
return self.clean_title(title_)
title = self.clean_title(title_)
else:
# try to fetch the meta headline
meta_headline = self.parser.getElementsByTag(
self.article.doc,
tag="meta",
attr="name",
value="headline")
if meta_headline:
title_ = self.parser.getAttribute(meta_headline[0], 'content')
if title_:
title = self.clean_title(title_)
else:
# otherwise use the title meta
title_element = self.parser.getElementsByTag(self.article.doc, tag='title')
if title_element:
title_ = self.parser.getText(title_element[0])
if title_:
title = self.clean_title(title_)
except:
print >> sys.stderr, 'ERROR when getting title: ', traceback.format_exec()
return title

return title

def extract(self):
    """Return the article title by delegating to ``self.get_title()``."""
    title = self.get_title()
    return title

0 comments on commit 51e89df

Please sign in to comment.