Add python 3 support #220

Open · wants to merge 21 commits into develop
.travis.yml — 3 changes: 2 additions & 1 deletion

@@ -3,9 +3,10 @@ language: python
 python:
   - 2.6
   - 2.7
+  - 3.4

 install:
-  - pip install -r requirements.txt --use-mirrors
+  - pip install jieba
   - python setup.py install

 script: python setup.py test
goose/__init__.py — 10 changes: 6 additions & 4 deletions

@@ -21,7 +21,6 @@
 limitations under the License.
 """
 import os
-import platform
 from tempfile import mkstemp

 from goose.version import version_info, __version__

@@ -64,9 +63,12 @@ def crawl(self, crawl_candiate):
         try:
             crawler = Crawler(self.config)
             article = crawler.crawl(crawl_candiate)
-        except (UnicodeDecodeError, ValueError):
-            self.config.parser_class = parsers[0]
-            return self.crawl(crawl_candiate)
+        except (UnicodeDecodeError, ValueError) as e:
+            if parsers:
+                self.config.parser_class = parsers[0]
+                return self.crawl(crawl_candiate)
+            else:
+                raise e
         return article

     def initialize(self):
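The change above makes the parser fallback terminate: once the list of remaining parsers is exhausted, the error is re-raised instead of recursing with the same parser forever. A minimal sketch of the pattern, with illustrative names (`parse_with_fallback`, `parser_names`, and `parse` are not goose APIs):

def parse_with_fallback(html, parser_names, parse):
    # Try each parser in order; on a decode failure, move to the next one.
    last_error = None
    for name in parser_names:
        try:
            return parse(html, name)
        except (UnicodeDecodeError, ValueError) as e:
            last_error = e
    # No parser left: propagate the last failure, as the diff now does.
    raise last_error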
goose/cleaners.py — 2 changes: 2 additions & 0 deletions

@@ -20,6 +20,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
+from __future__ import unicode_literals
+
 from goose.utils import ReplaceSequence
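For context, `unicode_literals` makes every bare string literal in the module a text string on Python 2, matching Python 3 semantics. A quick illustrative check (not from the diff):

from __future__ import unicode_literals

s = 'caf\xe9'
assert isinstance(s, type(u''))  # text type on both Python 2 and 3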
goose/configuration.py — 9 changes: 7 additions & 2 deletions

@@ -22,6 +22,9 @@
 """
 import os
 import tempfile
+
+import six
+
 from goose.text import StopWords
 from goose.parsers import Parser
 from goose.parsers import ParserSoup

@@ -30,10 +33,12 @@
 HTTP_DEFAULT_TIMEOUT = 30

 AVAILABLE_PARSERS = {
-    'lxml': Parser,
-    'soup': ParserSoup,
+    'lxml': Parser
 }

+if six.PY2:
+    AVAILABLE_PARSERS['soup'] = ParserSoup
+

 class Configuration(object):
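The registry change above only exposes the BeautifulSoup-backed parser on Python 2, presumably because the soup parser path did not yet work under Python 3. A hedged sketch of how callers would then pick a parser (stand-in values, not goose API):

import six

AVAILABLE_PARSERS = {'lxml': 'Parser'}           # stand-ins for the classes
if six.PY2:
    AVAILABLE_PARSERS['soup'] = 'ParserSoup'     # registered on Python 2 only

# Fall back to lxml when 'soup' is unavailable on this interpreter.
parser_class = AVAILABLE_PARSERS.get('soup', AVAILABLE_PARSERS['lxml'])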
goose/extractors/content.py — 2 changes: 1 addition & 1 deletion

@@ -260,7 +260,7 @@ def update_score(self, node, addToScore):
         if score_string:
             current_score = int(score_string)

-        new_score = current_score + addToScore
+        new_score = current_score + int(addToScore)
         self.parser.setAttribute(node, "gravityScore", str(new_score))

     def update_node_count(self, node, add_to_count):
goose/extractors/images.py — 2 changes: 1 addition & 1 deletion

@@ -23,7 +23,7 @@
 import re
 import os

-from urlparse import urlparse, urljoin
+from six.moves.urllib.parse import urlparse, urljoin

 from goose.extractors import BaseExtractor
 from goose.image import Image
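`six.moves.urllib.parse` aliases Python 2's `urlparse` module and Python 3's `urllib.parse` under one import path, so this extractor (and metas.py below, which gets the same fix) runs unchanged on both interpreters. A small sanity check with an illustrative URL:

from six.moves.urllib.parse import urlparse, urljoin

base = 'http://example.com/articles/story.html'
print(urljoin(base, '../img/photo.jpg'))  # http://example.com/img/photo.jpg
print(urlparse(base).netloc)              # example.com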
goose/extractors/metas.py — 4 changes: 2 additions & 2 deletions

@@ -22,8 +22,8 @@
 """

 import re
-from urlparse import urljoin
-from urlparse import urlparse
+
+from six.moves.urllib.parse import urlparse, urljoin

 from goose.extractors import BaseExtractor
goose/image.py — 4 changes: 2 additions & 2 deletions

@@ -46,7 +46,7 @@ def __init__(self):
         self.extraction_type = "NA"

         # stores how many bytes this image is.
-        self.bytes = long(0)
+        self.bytes = 0

     def get_src(self):
         return self.src

@@ -87,7 +87,7 @@ def set_mime_type(self, mime_type):
 class LocallyStoredImage(object):

     def __init__(self, src='', local_filename='',
-                 link_hash='', bytes=long(0), file_extension='', height=0, width=0):
+                 link_hash='', bytes=0, file_extension='', height=0, width=0):
         self.src = src
         self.local_filename = local_filename
         self.link_hash = link_hash
goose/network.py — 17 changes: 9 additions & 8 deletions

@@ -20,7 +20,12 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-import urllib2
+import six
+
+try:
+    from urllib2 import urlopen, Request
+except ImportError:
+    from urllib.request import urlopen, Request


 class HtmlFetcher(object):

@@ -39,18 +44,14 @@ def get_url(self):

     def get_html(self, url):
         # utf-8 encode unicode url
-        if isinstance(url, unicode):
+        if isinstance(url, six.text_type) and six.PY2:
             url = url.encode('utf-8')

         # set request
-        self.request = urllib2.Request(
-            url,
-            headers=self.headers)
+        self.request = Request(url, headers=self.headers)
         # do request
         try:
-            self.result = urllib2.urlopen(
-                self.request,
-                timeout=self.config.http_timeout)
+            self.result = urlopen(self.request, timeout=self.config.http_timeout)
         except Exception:
             self.result = None
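The try/except import mirrors the module move between interpreters: `Request` and `urlopen` live in `urllib2` on Python 2 and in `urllib.request` on Python 3. A hedged usage sketch (the URL and header values are illustrative):

try:
    from urllib2 import urlopen, Request           # Python 2
except ImportError:
    from urllib.request import urlopen, Request    # Python 3

request = Request('http://example.com', headers={'User-Agent': 'goose'})
try:
    body = urlopen(request, timeout=30).read()  # bytes on both versions
except Exception:
    body = None  # mirrors HtmlFetcher: swallow the error, return None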
goose/outputformatters.py — 3 changes: 2 additions & 1 deletion

@@ -20,7 +20,8 @@
 See the License for the specific language governing permissions and
 limitations under the License.
 """
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
+
 from goose.text import innerTrim
goose/parsers.py — 21 changes: 15 additions & 6 deletions

@@ -21,11 +21,12 @@
 limitations under the License.
 """
 import lxml.html
-from lxml.html import soupparser
+
+import six
+
 from lxml import etree
 from copy import deepcopy
-from goose.text import innerTrim
-from goose.text import encodeValue
+from goose.text import innerTrim, encodeValue, get_encodings_from_content, smart_str


 class Parser(object):

@@ -50,13 +51,20 @@ def css_select(self, node, selector):

     @classmethod
     def fromstring(self, html):
-        html = encodeValue(html)
-        self.doc = lxml.html.fromstring(html)
+        encoding = get_encodings_from_content(html)
+        encoding = encoding and encoding[0] or None
+        if not encoding:
+            html = encodeValue(html)
+            self.doc = lxml.html.fromstring(html)
+        else:
+            html = smart_str(html, encoding=encoding)
+            parser = lxml.html.HTMLParser(encoding=encoding)
+            self.doc = lxml.html.fromstring(html, parser=parser)
         return self.doc

     @classmethod
     def nodeToString(self, node):
-        return etree.tostring(node)
+        return etree.tostring(node, encoding=six.text_type)

     @classmethod
     def replaceTag(self, node, tag):

@@ -239,6 +247,7 @@ class ParserSoup(Parser):

     @classmethod
     def fromstring(self, html):
+        from lxml.html import soupparser
         html = encodeValue(html)
         self.doc = soupparser.fromstring(html)
         return self.doc
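The new `fromstring` prefers a charset declared in the markup itself and only falls back to goose's generic re-encoding when none is found. A minimal sketch of that flow under stated assumptions: the inline regex stands in for `get_encodings_from_content`, and the byte-encoding step plays the role of `smart_str`.

import re
import lxml.html

html = '<html><head><meta charset="iso-8859-1"></head><body>caf\xe9</body></html>'
declared = re.findall(r'<meta.*?charset=["\']*(.+?)["\'>]', html, flags=re.I)
encoding = declared[0] if declared else None
if encoding:
    # Hand lxml the bytes plus the declared charset and let it decode.
    parser = lxml.html.HTMLParser(encoding=encoding)
    doc = lxml.html.fromstring(html.encode(encoding), parser=parser)
else:
    doc = lxml.html.fromstring(html)  # no declared charset: lxml's default path
print(doc.findtext('body'))  # café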
goose/text.py — 47 changes: 42 additions & 5 deletions

@@ -23,6 +23,9 @@
 import os
 import re
 import string
+
+import six
+
 from goose.utils import FileHelper
 from goose.utils.encoding import smart_unicode
 from goose.utils.encoding import smart_str

@@ -31,8 +34,42 @@
 TABSSPACE = re.compile(r'[\s\t]+')


+def get_encodings_from_content(content):
+    """
+    Code from:
+    https://github.com/sigmavirus24/requests-toolbelt/blob/master/requests_toolbelt/utils/deprecated.py
+    Return encodings from given content string.
+    :param content: string to extract encodings from.
+    """
+    if isinstance(content, six.binary_type) and six.PY3:
+        find_charset = re.compile(
+            br'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            br'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            br'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
+    else:
+        find_charset = re.compile(
+            r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_pragma = re.compile(
+            r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I
+        ).findall
+
+        find_xml = re.compile(
+            r'^<\?xml.*?encoding=["\']*(.+?)["\'>]'
+        ).findall
+    return find_charset(content) + find_pragma(content) + find_xml(content)
+
+
 def innerTrim(value):
-    if isinstance(value, (unicode, str)):
+    if isinstance(value, (six.text_type, six.string_types)):
         # remove tab and white space
         value = re.sub(TABSSPACE, ' ', value)
         value = ''.join(value.splitlines())

@@ -87,7 +124,6 @@ def set_word_count(self, cnt):
 class StopWords(object):

     PUNCTUATION = re.compile("[^\\p{Ll}\\p{Lu}\\p{Lt}\\p{Lo}\\p{Nd}\\p{Pc}\\s]")
-    TRANS_TABLE = string.maketrans('', '')
     _cached_stop_words = {}

     def __init__(self, language='en'):

@@ -106,9 +142,10 @@ def __init__(self, language='en'):
     def remove_punctuation(self, content):
         # code taken form
         # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
-        if isinstance(content, unicode):
-            content = content.encode('utf-8')
-        return content.translate(self.TRANS_TABLE, string.punctuation)
+        if not isinstance(content, six.text_type):
+            content = content.decode('utf-8')
+        tbl = dict.fromkeys(ord(x) for x in string.punctuation)
+        return content.translate(tbl)

     def candiate_words(self, stripped_input):
         return stripped_input.split(' ')
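`remove_punctuation` now uses the text-type `translate` signature that works on Python 3: a mapping of codepoints where `None` means "delete this character", replacing the old two-argument bytes form fed by `string.maketrans`. A quick check:

import string

tbl = dict.fromkeys(ord(ch) for ch in string.punctuation)  # codepoint -> None
print('Hello, world! (test)'.translate(tbl))  # Hello world test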
goose/utils/__init__.py — 13 changes: 10 additions & 3 deletions

@@ -26,7 +26,13 @@
 import os
 import goose
 import codecs
-import urlparse
+
+import six
+
+try:
+    from urlparse import urlparse
+except ImportError:
+    from urllib.parse import urlparse


 class BuildURL(object):

@@ -89,7 +95,7 @@ def __init__(self, urlString, link_hash):
 class RawHelper(object):
     @classmethod
     def get_parsing_candidate(self, url, raw_html):
-        if isinstance(raw_html, unicode):
+        if isinstance(raw_html, six.text_type):
             raw_html = raw_html.encode('utf-8')
         link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())
         return ParsingCandidate(url, link_hash)

@@ -101,7 +107,8 @@ def get_parsing_candidate(self, url_to_crawl):
         # replace shebang is urls
         final_url = url_to_crawl.replace('#!', '?_escaped_fragment_=') \
             if '#!' in url_to_crawl else url_to_crawl
-        link_hash = '%s.%s' % (hashlib.md5(final_url).hexdigest(), time.time())
+        url = final_url.encode("utf-8") if isinstance(final_url, six.text_type) else final_url
+        link_hash = '%s.%s' % (hashlib.md5(url).hexdigest(), time.time())
         return ParsingCandidate(final_url, link_hash)
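The extra encode before hashing matters because `hashlib.md5` rejects text on Python 3; only bytes can be hashed. A one-line illustration with a made-up URL:

import hashlib

url = u'http://example.com/caf\xe9'
print(hashlib.md5(url.encode('utf-8')).hexdigest())  # same digest on 2 and 3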