1717from .html_parser import HTMLParseError , HTMLParser
1818
1919# Configuration for urlize() function.
20- TRAILING_PUNCTUATION_RE = re .compile (
21- '^' # Beginning of word
22- '(.*?)' # The URL in word
23- '([.,:;!]+)' # Allowed non-wrapping, trailing punctuation
24- '$' # End of word
25- )
20+ TRAILING_PUNCTUATION_CHARS = '.,:;!'
2621WRAPPING_PUNCTUATION = [('(' , ')' ), ('<' , '>' ), ('[' , ']' ), ('<' , '>' ), ('"' , '"' ), ('\' ' , '\' ' )]
2722
2823# List of possible strings used for bullets in bulleted lists.
3227word_split_re = re .compile (r'''([\s<>"']+)''' )
3328simple_url_re = re .compile (r'^https?://\[?\w' , re .IGNORECASE )
3429simple_url_2_re = re .compile (r'^www\.|^(?!http)\w[^@]+\.(com|edu|gov|int|mil|net|org)($|/.*)$' , re .IGNORECASE )
35- simple_email_re = re .compile (r'^\S+@\S+\.\S+$' )
3630
3731
3832@keep_lazy (six .text_type , SafeText )
@@ -280,10 +274,10 @@ def trim_punctuation(lead, middle, trail):
280274 trimmed_something = False
281275
282276 # Trim trailing punctuation.
283- match = TRAILING_PUNCTUATION_RE . match ( middle )
284- if match :
285- middle = match . group ( 1 )
286- trail = match . group ( 2 ) + trail
277+ stripped = middle . rstrip ( TRAILING_PUNCTUATION_CHARS )
278+ if middle != stripped :
279+ trail = middle [ len ( stripped ):] + trail
280+ middle = stripped
287281 trimmed_something = True
288282
289283 # Trim wrapping punctuation.
@@ -300,6 +294,21 @@ def trim_punctuation(lead, middle, trail):
300294 trimmed_something = True
301295 return lead , middle , trail
302296
def is_email_simple(value):
    """Return True if value looks like an email address.

    This is a cheap structural heuristic built on plain string operations,
    not a full address validator. A value is accepted when:

    * it contains exactly one ``@``;
    * the ``@`` is neither the first nor the last character; and
    * the part after the ``@`` contains a dot and does not start with one
      (e.g. ``example.com``).
    """
    # Zero '@' characters means there is nothing to split on; more than one
    # means the value is ambiguous. Both cases are rejected.
    if value.count('@') != 1:
        return False
    local_part, _, domain = value.partition('@')
    # The '@' must sit in the middle of the value, so both sides are non-empty.
    if not local_part or not domain:
        return False
    # A dot must be in the domain, but not as its first character.
    return '.' in domain and not domain.startswith('.')
311+
303312 words = word_split_re .split (force_text (text ))
304313 for i , word in enumerate (words ):
305314 if '.' in word or '@' in word or ':' in word :
@@ -319,7 +328,7 @@ def trim_punctuation(lead, middle, trail):
319328 elif simple_url_2_re .match (middle ):
320329 middle , middle_unescaped , trail = unescape (middle , trail )
321330 url = smart_urlquote ('http://%s' % middle_unescaped )
322- elif ':' not in middle and simple_email_re . match (middle ):
331+ elif ':' not in middle and is_email_simple (middle ):
323332 local , domain = middle .rsplit ('@' , 1 )
324333 try :
325334 domain = domain .encode ('idna' ).decode ('ascii' )
0 commit comments