Add python 3 support #220

Open · wants to merge 21 commits into develop
.travis.yml — 3 changes: 2 additions & 1 deletion

@@ -3,9 +3,10 @@ language: python
 python:
   - 2.6
   - 2.7
+  - 3.4

 install:
-  - pip install -r requirements.txt --use-mirrors
+  - pip install jieba
   - python setup.py install

 script: python setup.py test
goose/__init__.py — 10 changes: 6 additions & 4 deletions

@@ -21,7 +21,6 @@
 limitations under the License.
 """
 import os
-import platform
 from tempfile import mkstemp

 from goose.version import version_info, __version__

@@ -64,9 +63,12 @@ def crawl(self, crawl_candiate):
         try:
             crawler = Crawler(self.config)
             article = crawler.crawl(crawl_candiate)
-        except (UnicodeDecodeError, ValueError):
-            self.config.parser_class = parsers[0]
-            return self.crawl(crawl_candiate)
+        except (UnicodeDecodeError, ValueError) as e:
+            if parsers:
+                self.config.parser_class = parsers[0]
+                return self.crawl(crawl_candiate)
+            else:
+                raise e
         return article

     def initialize(self):
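The change above makes the parser fallback terminate: once the list of remaining parsers is exhausted, the error is re-raised instead of recursing with the same parser forever. A minimal sketch of the pattern, with illustrative names (`parse_with_fallback`, `parser_names`, and `parse` are not goose APIs):

def parse_with_fallback(html, parser_names, parse):
    # Try each parser in order; on a decode failure, move to the next one.
    last_error = None
    for name in parser_names:
        try:
            return parse(html, name)
        except (UnicodeDecodeError, ValueError) as e:
            last_error = e
    # No parser left: propagate the last failure, as the diff now does.
    raise last_error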
goose/cleaners.py — 2 changes: 2 additions & 0 deletions

@@ -20,6 +20,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+from __future__ import unicode_literals
+
 from goose.utils import ReplaceSequence
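For context, `unicode_literals` makes every bare string literal in the module a text string on Python 2, matching Python 3 semantics. A quick illustrative check (not from the diff):

from __future__ import unicode_literals

s = 'caf\xe9'
assert isinstance(s, type(u''))  # text type on both Python 2 and 3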
goose/configuration.py — 9 changes: 7 additions & 2 deletions

@@ -22,6 +22,9 @@
 """
 import os
 import tempfile
+
+import six
+
 from goose.text import StopWords
 from goose.parsers import Parser
 from goose.parsers import ParserSoup

@@ -30,10 +33,12 @@
 HTTP_DEFAULT_TIMEOUT = 30

 AVAILABLE_PARSERS = {
-    'lxml': Parser,
-    'soup': ParserSoup,
+    'lxml': Parser
 }

+if six.PY2:
+    AVAILABLE_PARSERS['soup'] = ParserSoup
+

 class Configuration(object):
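The registry change above only exposes the BeautifulSoup-backed parser on Python 2, presumably because the soup parser path did not yet work under Python 3. A hedged sketch of how callers would then pick a parser (stand-in values, not goose API):

import six

AVAILABLE_PARSERS = {'lxml': 'Parser'}           # stand-ins for the classes
if six.PY2:
    AVAILABLE_PARSERS['soup'] = 'ParserSoup'     # registered on Python 2 only

# Fall back to lxml when 'soup' is unavailable on this interpreter.
parser_class = AVAILABLE_PARSERS.get('soup', AVAILABLE_PARSERS['lxml'])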
goose/extractors/content.py — 2 changes: 1 addition & 1 deletion

@@ -260,7 +260,7 @@ def update_score(self, node, addToScore):
         if score_string:
             current_score = int(score_string)

-        new_score = current_score + addToScore
+        new_score = current_score + int(addToScore)
         self.parser.setAttribute(node, "gravityScore", str(new_score))

     def update_node_count(self, node, add_to_count):
goose/extractors/images.py — 2 changes: 1 addition & 1 deletion

@@ -23,7 +23,7 @@
 import re
 import os

-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin

 from goose.extractors import BaseExtractor
 from goose.image import Image
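`six.moves.urllib.parse` aliases Python 2's `urlparse` module and Python 3's `urllib.parse` under one import path, so this extractor (and metas.py below, which gets the same fix) runs unchanged on both interpreters. A small sanity check with an illustrative URL:

from six.moves.urllib.parse import urlparse, urljoin

base = 'http://example.com/articles/story.html'
print(urljoin(base, '../img/photo.jpg'))  # http://example.com/img/photo.jpg
print(urlparse(base).netloc)              # example.com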
goose/extractors/metas.py — 4 changes: 2 additions & 2 deletions

@@ -22,8 +22,8 @@
 """

 import re
-from urlparse import urljoin
-from urlparse import urlparse
+
+from six.moves.urllib.parse import urlparse, urljoin

 from goose.extractors import BaseExtractor
goose/image.py — 4 changes: 2 additions & 2 deletions

@@ -46,7 +46,7 @@ def __init__(self):
         self.extraction_type = "NA"

         # stores how many bytes this image is.
-        self.bytes = long(0)
+        self.bytes = 0

     def get_src(self):
         return self.src

@@ -87,7 +87,7 @@ def set_mime_type(self, mime_type):
 class LocallyStoredImage(object):

     def __init__(self, src='', local_filename='',
-                 link_hash='', bytes=long(0), file_extension='', height=0, width=0):
+                 link_hash='', bytes=0, file_extension='', height=0, width=0):
         self.src = src
         self.local_filename = local_filename
         self.link_hash = link_hash
goose/network.py — 17 changes: 9 additions & 8 deletions

@@ -20,7 +20,12 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-import urllib2
+import six
+
+try:
+    from urllib2 import urlopen, Request
+except ImportError:
+    from urllib.request import urlopen, Request


 class HtmlFetcher(object):

@@ -39,18 +44,14 @@ def get_url(self):

     def get_html(self, url):
         # utf-8 encode unicode url
-        if isinstance(url, unicode):
+        if isinstance(url, six.text_type) and six.PY2:
             url = url.encode('utf-8')

         # set request
-        self.request = urllib2.Request(
-            url,
-            headers=self.headers)
+        self.request = Request(url, headers=self.headers)
         # do request
         try:
-            self.result = urllib2.urlopen(
-                self.request,
-                timeout=self.config.http_timeout)
+            self.result = urlopen(self.request, timeout=self.config.http_timeout)
         except Exception:
             self.result = None
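The try/except import mirrors the module move between interpreters: `Request` and `urlopen` live in `urllib2` on Python 2 and in `urllib.request` on Python 3. A hedged usage sketch (the URL and header values are illustrative):

try:
    from urllib2 import urlopen, Request           # Python 2
except ImportError:
    from urllib.request import urlopen, Request    # Python 3

request = Request('http://example.com', headers={'User-Agent': 'goose'})
try:
    body = urlopen(request, timeout=30).read()  # bytes on both versions
except Exception:
    body = None  # mirrors HtmlFetcher: swallow the error, return None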
goose/outputformatters.py — 3 changes: 2 additions & 1 deletion

@@ -20,7 +20,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
+
 from goose.text import innerTrim
goose/parsers.py — 21 changes: 15 additions & 6 deletions

@@ -21,11 +21,12 @@
 limitations under the License.
 """
 import lxml.html
-from lxml.html import soupparser
+
+import six
+
 from lxml import etree
 from copy import deepcopy
-from goose.text import innerTrim
-from goose.text import encodeValue
+from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str


 class Parser(object):

@@ -50,13 +51,20 @@ def css_select(self, node, selector):

     @classmethod
     def fromstring(self, html):
-        html = encodeValue(html)
-        self.doc = lxml.html.fromstring(html)
+        encoding = get_encodings_from_content(html)
+        encoding = encoding and encoding[0] or None
+        if not encoding:
+            html = encodeValue(html)
+            self.doc = lxml.html.fromstring(html)
+        else:
+            html = smart_str(html, encoding=encoding)
+            parser = lxml.html.HTMLParser(encoding=encoding)
+            self.doc = lxml.html.fromstring(html, parser=parser)
         return self.doc

     @classmethod
     def nodeToString(self, node):
-        return etree.tostring(node)
+        return etree.tostring(node, encoding=six.text_type)

     @classmethod
     def replaceTag(self, node, tag):

@@ -239,6 +247,7 @@ class ParserSoup(Parser):

     @classmethod
     def fromstring(self, html):
+        from lxml.html import soupparser
         html = encodeValue(html)
         self.doc = soupparser.fromstring(html)
         return self.doc
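The new `fromstring` prefers a charset declared in the markup itself and only falls back to goose's generic re-encoding when none is found. A minimal sketch of that flow under stated assumptions: the inline regex stands in for `get_encodings_from_content`, and the byte-encoding step plays the role of `smart_str`.

import re
import lxml.html

html = '<html><head><meta charset="iso-8859-1"></head><body>caf\xe9</body></html>'
declared = re.findall(r'<meta.*?charset=["\']*(.+?)["\'>]', html, flags=re.I)
encoding = declared[0] if declared else None
if encoding:
    # Hand lxml the bytes plus the declared charset and let it decode.
    parser = lxml.html.HTMLParser(encoding=encoding)
    doc = lxml.html.fromstring(html.encode(encoding), parser=parser)
else:
    doc = lxml.html.fromstring(html)  # no declared charset: lxml's default path
print(doc.findtext('body'))  # café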
goose/text.py — 47 changes: 42 additions & 5 deletions

@@ -23,6 +23,9 @@
 import os
 import re
 import string
+
+import six
+
 from goose.utils import FileHelper
 from goose.utils.encoding import smart_unicode
 from goose.utils.encoding import smart_str

@@ -31,8 +34,42 @@
 TABSSPACE = re.compile(r'[\s\t]+')


+def get_encodings_from_content(content):
+    """
+    Code from:
+    https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py
+    Return encodings from given content string.
+    :param content: string to extract encodings from.
+    """
+    if isinstance(content, six.binary_type) and six.PY3:
+        find_charset = re.compile(
+            br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            br'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
+    else:
+        find_charset = re.compile(
+            r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
+    return find_charset(content) + find_pragma(content) + find_xml(content)
+
+
 def innerTrim(value):
-    if isinstance(value, (unicode, str)):
+    if isinstance(value, (six.text_type, six.string_types)):
         # remove tab and white space
         value = re.sub(TABSSPACE, ' ', value)
         value = ''.join(value.splitlines())

@@ -87,7 +124,6 @@ def set_word_count(self, cnt):
 class StopWords(object):

     PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
-    TRANS_TABLE = string.maketrans('', '')
     _cached_stop_words = {}

     def __init__(self, language='en'):

@@ -106,9 +142,10 @@ def __init__(self, language='en'):
     def remove_punctuation(self, content):
         # code taken form
         # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
-        if isinstance(content, unicode):
-            content = content.encode('utf-8')
-        return content.translate(self.TRANS_TABLE, string.punctuation)
+        if not isinstance(content, six.text_type):
+            content = content.decode('utf-8')
+        tbl = dict.fromkeys(ord(x) for x in string.punctuation)
+        return content.translate(tbl)

     def candiate_words(self, stripped_input):
         return stripped_input.split(' ')
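`remove_punctuation` now uses the text-type `translate` signature that works on Python 3: a mapping of codepoints where `None` means "delete this character", replacing the old two-argument bytes form fed by `string.maketrans`. A quick check:

import string

tbl = dict.fromkeys(ord(ch) for ch in string.punctuation)  # codepoint -> None
print('Hello, world! (test)'.translate(tbl))  # Hello world test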
goose/utils/__init__.py — 13 changes: 10 additions & 3 deletions

@@ -26,7 +26,13 @@
 import os
 import goose
 import codecs
-import urlparse
+
+import six
+
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse


 class BuildURL(object):

@@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash):
 class RawHelper(object):
     @classmethod
     def get_parsing_candidate(self, url, raw_html):
-        if isinstance(raw_html, unicode):
+        if isinstance(raw_html, six.text_type):
             raw_html = raw_html.encode('utf-8')
         link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
         return ParsingCandidate(url, link_hash)

@@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl):
         # replace shebang is urls
         final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
             if '#!' in url_to_crawl else url_to_crawl
-        link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
+        url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url
+        link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time())
         return ParsingCandidate(final_url, link_hash)
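The extra encode before hashing matters because `hashlib.md5` rejects text on Python 3; only bytes can be hashed. A one-line illustration with a made-up URL:

import hashlib

url = u'http://example.com/caf\xe9'
print(hashlib.md5(url.encode('utf-8')).hexdigest())  # same digest on 2 and 3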