Skip to content

Commit

Permalink
pattern.web: improved timeout errors, DOM CSS selectors bug fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom De Smedt committed Sep 12, 2014
1 parent 5eb34c1 commit be792b0
Showing 1 changed file with 47 additions and 9 deletions.
56 changes: 47 additions & 9 deletions pattern/web/__init__.py
Expand Up @@ -15,14 +15,15 @@
import sys
import threading
import time
import socket, urlparse, urllib, urllib2
import socket, urlparse, urllib, urllib2, ssl
import base64
import htmlentitydefs
import httplib
import sgmllib
import cookielib
import re
import xml.dom.minidom
import unicodedata
import string
import StringIO
import bisect
Expand Down Expand Up @@ -72,13 +73,16 @@
0x2020, 0x2021, 0x2022, 0x2026, 0x2030, 0x2039, 0x203A, 0x20AC, 0x2122
])

def fix(s, ignore=u""):
def fix(s, ignore=""):
""" Returns a Unicode string that fixes common encoding problems (Latin-1, Windows-1252).
For example: fix("clichÃ©") => u"cliché".
"""
# http://blog.luminoso.com/2012/08/20/fix-unicode-mistakes-with-python/
if not isinstance(s, unicode):
s = s.decode("utf-8")
# If decoding fails,
# save the string in a UTF-8 encoded .txt file
# and pass open(f).read() to fix() instead.
u = []
i = 0
for j, ch in enumerate(s):
Expand Down Expand Up @@ -110,6 +114,14 @@ def fix(s, ignore=u""):
u = u.replace("\n ", "\n")
return u

def latin(s):
    """ Returns True if every alphabetic character in the string is a Latin letter,
        i.e., its Unicode name starts with "LATIN"
        (no Chinese, Japanese, Arabic, Cyrillic, Hebrew, Greek, ...).
        Byte strings are decoded as UTF-8 first.
        Non-alphabetic characters (digits, punctuation, whitespace) are ignored,
        so a string without any letters yields True.
    """
    # bytes is an alias of str in Python 2, so this check matches byte strings
    # on Python 2 and Python 3 alike without needing the unicode builtin.
    if isinstance(s, bytes):
        s = s.decode("utf-8")
    # The "" default avoids a ValueError for letters that have no name
    # in the Unicode database in use; such letters count as non-Latin.
    return all(unicodedata.name(ch, "").startswith("LATIN") for ch in s if ch.isalpha())

def decode_string(v, encoding="utf-8"):
""" Returns the given value as a Unicode string (if possible).
"""
Expand Down Expand Up @@ -454,11 +466,12 @@ def open(self, timeout=10, proxy=None, user_agent=USER_AGENT, referrer=REFERRER,
except socket.timeout as e:
raise URLTimeout(src=e, url=url)
except socket.error as e:
if "timed out" in e.args[1]:
if "timed out" in str((e.args + ("", ""))[0]) \
or "timed out" in str((e.args + ("", ""))[1]):
raise URLTimeout(src=e, url=url)
raise URLError(str(e), src=e, url=url)
except urllib2.URLError as e:
if "timed out" in e.reason:
if "timed out" in str(e.reason):
raise URLTimeout(src=e, url=url)
raise URLError(str(e), src=e, url=url)
except ValueError as e:
Expand Down Expand Up @@ -1189,6 +1202,8 @@ def translate(self, string, input="en", output="fr", **kwargs):
kwargs.setdefault("cached", False)
kwargs.setdefault("unicode", True)
kwargs.setdefault("throttle", self.throttle)
if input == output:
return string
try:
data = url.download(**kwargs)
except HTTP403Forbidden:
Expand Down Expand Up @@ -1370,7 +1385,7 @@ def search(self, query, type=SEARCH, start=1, count=10, sort=RELEVANCY, size=Non
data = data.get("d", {})
data = data.get("results", [{}])[0]
results = Results(BING, query, type)
results.total = int(data.get(src+"Total", 0))
results.total = int(data.get(src+"Total") or 0)
for x in data.get(src, []):
r = Result(url=None)
r.url = self.format(x.get("MediaUrl", x.get("Url")))
Expand Down Expand Up @@ -2048,6 +2063,10 @@ def plaintext(self, **kwargs):
@property
def html(self):
    """ Alias: returns self.source.
    """
    return self.source

@property
def src(self):
    """ Alias: returns self.source.
    """
    return self.source

@property
def string(self):
Expand Down Expand Up @@ -2081,6 +2100,10 @@ def source(self):
@property
def html(self):
    """ Alias: returns self.source.
    """
    return self.source

@property
def src(self):
    """ Alias: returns self.source.
    """
    return self.source

@property
def string(self):
Expand Down Expand Up @@ -2163,6 +2186,10 @@ def plaintext(self, **kwargs):
@property
def html(self):
    """ Alias: returns self.source.
    """
    return self.source

@property
def src(self):
    """ Alias: returns self.source.
    """
    return self.source

@property
def string(self):
Expand Down Expand Up @@ -3015,6 +3042,9 @@ def __str__(self):
return bytestring(self.__unicode__())
def __unicode__(self):
    # Returns u(self._p); u() is defined elsewhere in this file
    # (presumably it converts its argument to a Unicode string -- confirm).
    return u(self._p)

def __call__(self, *args, **kwargs):
    # No-op: accepts any arguments, does nothing and returns None.
    pass

#--- TEXT ------------------------------------------------------------------------------------------

Expand Down Expand Up @@ -3262,15 +3292,23 @@ def _first_child(self, e):
for e in e.children:
if isinstance(e, Element):
return e

def _first_sibling(self, e):
def _next_sibling(self, e):
    """ Follows the .next links starting from the given element and
        returns the first following sibling that is an Element
        (None if the chain ends before one is found).
    """
    node = e
    while isinstance(node, Node):
        node = node.next
        if isinstance(node, Element):
            return node
    return None

def _previous_sibling(self, e):
    """ Follows the .previous links starting from the given element and
        returns the nearest preceding sibling that is an Element
        (None if the chain ends before one is found).
    """
    node = e
    while isinstance(node, Node):
        node = node.previous
        if isinstance(node, Element):
            return node
    return None

def _contains(self, e, s):
""" Returns True if string s occurs in the given element (case-insensitive).
"""
Expand Down Expand Up @@ -3305,7 +3343,7 @@ def search(self, e):
# Map tag to True if it is "*".
tag = self.tag == "*" or self.tag
# Map id into a case-insensitive **kwargs dict.
i = lambda s: re.compile(r"\b%s\b" % s, re.I)
i = lambda s: re.compile(r"\b%s(?=$|\s)" % s, re.I)
a = {"id": i(self.id)} if self.id else {}
a.update(map(lambda kv: (kv[0], kv[1]), self.attributes.items()))
# Match tag + id + all classes + relevant pseudo-elements.
Expand Down Expand Up @@ -3377,7 +3415,7 @@ def search(self, e):
e = filter(s.match, e)
if combinator == "+":
# X + Y => X directly precedes Y
e = map(s._first_sibling, e)
e = map(s._next_sibling, e)
e = filter(s.match, e)
m.extend(e)
return m
Expand Down

0 comments on commit be792b0

Please sign in to comment.