use requests package for util.urlopen
Now a requests.Response is returned instead of the file-like object from urllib.
Fixed all usages of util.urlopen: this simplifies getting JSON and detecting text encoding.
In particular, feedcore (responsible for fetching feeds) is simplified.

This is a first pass and could benefit from better use of the requests API
(Sessions, for instance, to keep connection pools).

TODO: download.py
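
For illustration, a rough sketch of how call sites change with the new return type (the URL is the one used by util.get_update_info; this is an editorial example, not code from the commit):

```python
from gpodder import util

url = 'https://api.github.com/repos/gpodder/gpodder/releases/latest'

# Before: urllib returned a file-like object, so callers had to read and decode bytes:
#     info = json.loads(util.urlopen(url).read().decode('utf-8'))

# After: util.urlopen() returns a requests.Response with built-in helpers.
response = util.urlopen(url, timeout=10)
info = response.json()      # JSON decoding handled by requests
text = response.text        # str, decoded using headers / apparent encoding
raw = response.content      # bytes, e.g. for cover art downloads
```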
elelay committed Jul 11, 2020
1 parent 9fd26f2 commit 3babb86
Showing 11 changed files with 78 additions and 114 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -29,6 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
- [Python 3.5](http://python.org/) or newer
- [Podcastparser](http://gpodder.org/podcastparser/) 0.6.0 or newer
- [mygpoclient](http://gpodder.org/mygpoclient/) 1.7 or newer
- [requests](https://requests.readthedocs.io) 2.24.0 or newer
- Python D-Bus bindings

As an alternative to python-dbus on Mac OS X and Windows, you can use
2 changes: 1 addition & 1 deletion src/gpodder/coverart.py
@@ -85,7 +85,7 @@ def get_cover(self, filename, cover_url, feed_url, title,

try:
logger.info('Downloading cover art: %s', cover_url)
data = util.urlopen(cover_url, timeout=self.TIMEOUT).read()
data = util.urlopen(cover_url, timeout=self.TIMEOUT).content
except Exception as e:
logger.warn('Cover art download failed: %s', e)
return self._fallback_filename(title)
4 changes: 2 additions & 2 deletions src/gpodder/directory.py
@@ -89,7 +89,7 @@ def directory_entry_from_opml(url):

def directory_entry_from_mygpo_json(url):
return [DirectoryEntry(d['title'], d['url'], d['logo_url'], d['subscribers'], d['description'])
for d in json.load(util.urlopen(url))]
for d in util.urlopen(url).json()]


class GPodderNetSearchProvider(Provider):
@@ -152,7 +152,7 @@ def on_tag(self, tag):
return directory_entry_from_mygpo_json('http://gpodder.net/api/2/tag/%s/50.json' % urllib.parse.quote(tag))

def get_tags(self):
return [DirectoryTag(d['tag'], d['usage']) for d in json.load(util.urlopen('http://gpodder.net/api/2/tags/40.json'))]
return [DirectoryTag(d['tag'], d['usage']) for d in util.urlopen('http://gpodder.net/api/2/tags/40.json').json()]


class SoundcloudSearchProvider(Provider):
8 changes: 3 additions & 5 deletions src/gpodder/escapist_videos.py
@@ -73,7 +73,7 @@ def get_real_download_url(url):

logger.debug('Config URL: %s', data_config_url)

data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
data_config_data = util.urlopen(data_config_url).text

# TODO: This second argument should get a real name
real_url = get_escapist_real_url(data_config_data, data_config_frag.group(1))
@@ -120,8 +120,7 @@ def get_real_cover(url):
if rss_url is None:
return None

# FIXME: can I be sure to decode it as utf-8?
rss_data = util.urlopen(rss_url).read()
rss_data = util.urlopen(rss_url).text
rss_data_frag = DATA_COVERART_RE.search(rss_data)

if rss_data_frag is None:
@@ -134,9 +133,8 @@ def get_escapist_web(video_id):
if video_id is None:
return None

# FIXME: must check if it's utf-8
web_url = 'http://www.escapistmagazine.com/videos/view/%s' % video_id
return util.urlopen(web_url).read()
return util.urlopen(web_url).text


def get_escapist_config_url(data):
118 changes: 49 additions & 69 deletions src/gpodder/feedcore.py
@@ -25,9 +25,9 @@
import logging
import urllib.parse
from html.parser import HTMLParser
from urllib.error import HTTPError

import podcastparser
from requests.exceptions import RequestException

from gpodder import util

@@ -132,78 +132,71 @@ def _resolve_url(self, url):
"""
return None

def _normalize_status(self, status):
# Based on Mark Pilgrim's "Atom aggregator behaviour" article
if status in (200, 301, 302, 304, 400, 401, 403, 404, 410, 500):
return status
elif status >= 200 and status < 300:
return 200
elif status >= 300 and status < 400:
return 302
elif status >= 400 and status < 500:
return 400
elif status >= 500 and status < 600:
return 500
else:
return status

def _check_statuscode(self, response, feed):
status = self._normalize_status(response.getcode())

if status == 200:
return Result(UPDATED_FEED, feed)
elif status == 301:
return Result(NEW_LOCATION, feed)
elif status == 302:
return Result(UPDATED_FEED, feed)
@staticmethod
def _check_statuscode(status, url):
if status >= 200 and status < 300:
return UPDATED_FEED
elif status == 304:
return Result(NOT_MODIFIED, feed)
return NOT_MODIFIED
# redirects are handled by requests directly
# => the status should never be 301, 302, 303, 307, 308

if status == 400:
raise BadRequest('bad request')
elif status == 401:
raise AuthenticationRequired('authentication required', feed)
if status == 401:
raise AuthenticationRequired('authentication required', url)
elif status == 403:
raise Unsubscribe('forbidden')
elif status == 404:
raise NotFound('not found')
elif status == 410:
raise Unsubscribe('resource is gone')
elif status == 500:
elif status >= 400 and status < 500:
raise BadRequest('bad request')
elif status >= 500 and status < 600:
raise InternalServerError('internal server error')
else:
raise UnknownStatusCode(status)

@staticmethod
def _podcastparse_feed(url, data_stream):
try:
feed = podcastparser.parse(url, data_stream)
feed['url'] = url
return feed
except ValueError as e:
raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

def _parse_feed(self, url, etag, modified, autodiscovery=True, max_episodes=0):
# handle local file first
if url.startswith('file://'):
url = url[len('file://'):]
stream = open(url)
feed = self._podcastparse_feed(url, stream)
feed['headers'] = {}
return Result(UPDATED_FEED, feed)

# remote feed
headers = {}
if modified is not None:
headers['If-Modified-Since'] = modified
if etag is not None:
headers['If-None-Match'] = etag

if url.startswith('file://'):
is_local = True
url = url[len('file://'):]
stream = open(url)
else:
is_local = False
try:
stream = util.urlopen(url, headers)
except HTTPError as e:
return self._check_statuscode(e, e.geturl())

data = stream
if autodiscovery and not is_local and stream.headers.get('content-type', '').startswith('text/html'):
# Not very robust attempt to detect encoding: http://stackoverflow.com/a/1495675/1072626
charset = stream.headers.get_param('charset')
if charset is None:
charset = 'utf-8' # utf-8 appears hard-coded elsewhere in this codebase

# We use StringIO in case the stream needs to be read again
data = StringIO(stream.read().decode(charset))
ad = FeedAutodiscovery(url)
stream = util.urlopen(url, headers)

responses = stream.history + [stream]
for i, resp in enumerate(responses):
if resp.is_permanent_redirect:
# there should always be a next response when a redirect is encountered
# If max redirects is reached, TooManyRedirects is raised
# TODO: since we've got the end contents anyway, modify model.py to accept contents on NEW_LOCATION
return Result(NEW_LOCATION, responses[i+1].url)
res = self._check_statuscode(stream.status_code, stream.url)
if res == NOT_MODIFIED:
return Result(res, stream.url)

ad.feed(data.getvalue())
if autodiscovery and stream.headers.get('content-type', '').startswith('text/html'):
ad = FeedAutodiscovery(url)
ad.feed(stream.text) # uses headers, then apparent encoding
if ad._resolved_url:
try:
self._parse_feed(ad._resolved_url, None, None, False)
@@ -215,22 +208,9 @@ def _parse_feed(self, url, etag, modified, autodiscovery=True, max_episodes=0):
url = self._resolve_url(url)
if url:
return Result(NEW_LOCATION, url)

# Reset the stream so podcastparser can give it a go
data.seek(0)

try:
feed = podcastparser.parse(url, data)
feed['url'] = url
except ValueError as e:
raise InvalidFeed('Could not parse feed: {msg}'.format(msg=e))

if is_local:
feed['headers'] = {}
return Result(UPDATED_FEED, feed)
else:
feed['headers'] = stream.headers
return self._check_statuscode(stream, feed)
feed = self._podcastparse_feed(url, StringIO(stream.text))
feed['headers'] = stream.headers
return Result(UPDATED_FEED, feed)

def fetch(self, url, etag=None, modified=None, max_episodes=0):
return self._parse_feed(url, etag, modified, max_episodes)
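
As background for the redirect handling in the feedcore hunk above, a small standalone sketch of the requests behaviour it relies on (the URL is hypothetical):

```python
import requests

# requests follows redirects itself and keeps each intermediate response in .history,
# so a permanent redirect (301/308) can still be detected after the fact.
response = requests.get('http://example.org/old-feed-location')  # hypothetical URL
for resp in response.history:
    if resp.is_permanent_redirect:
        # feedcore maps this case to NEW_LOCATION so the subscription URL gets updated.
        print('permanently moved, new location:', resp.headers['location'])
        break
print('final URL:', response.url, 'status:', response.status_code)
```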
1 change: 1 addition & 0 deletions src/gpodder/model.py
@@ -1138,6 +1138,7 @@ def update(self, max_episodes=0):
if result.status == feedcore.UPDATED_FEED:
self._consume_updated_feed(result.feed, max_episodes)
elif result.status == feedcore.NEW_LOCATION:
# FIXME: could return the feed because in autodiscovery it is parsed already
url = result.feed
logger.info('New feed location: %s => %s', self.url, url)
if url in set(x.url for x in self.model.get_podcasts()):
3 changes: 1 addition & 2 deletions src/gpodder/opml.py
@@ -69,8 +69,7 @@ def __init__(self, url):
if os.path.exists(url):
doc = xml.dom.minidom.parse(url)
else:
# FIXME: is it ok to pass bytes to parseString?
doc = xml.dom.minidom.parseString(util.urlopen(url).read())
doc = xml.dom.minidom.parseString(util.urlopen(url).text)

for outline in doc.getElementsByTagName('outline'):
# Make sure we are dealing with a valid link type (ignore case)
15 changes: 7 additions & 8 deletions src/gpodder/plugins/soundcloud.py
@@ -83,11 +83,10 @@ def get_metadata(url):
URL. Will use the network connection to determine the
metadata via the HTTP header fields.
"""
track_fp = util.urlopen(url)
headers = track_fp.info()
filesize = headers['content-length'] or '0'
filetype = headers['content-type'] or 'application/octet-stream'
headers_s = '\n'.join('%s:%s' % (k, v) for k, v in list(headers.items()))
track_response = util.urlopen(url)
filesize = track_response.headers['content-length'] or '0'
filetype = track_response.headers['content-type'] or 'application/octet-stream'
headers_s = '\n'.join('%s:%s' % (k, v) for k, v in list(track_response.headers.items()))
filename = get_param(headers_s) or os.path.basename(os.path.dirname(url))
track_fp.close()
return filesize, filetype, filename
@@ -116,7 +115,7 @@ def get_user_info(self):

try:
json_url = 'https://api.soundcloud.com/users/%s.json?consumer_key=%s' % (self.username, CONSUMER_KEY)
user_info = json.loads(util.urlopen(json_url).read().decode('utf-8'))
user_info = util.urlopen(json_url).json()
self.cache[key] = user_info
finally:
self.commit_cache()
@@ -146,7 +145,7 @@ def get_tracks(self, feed):
"consumer_key": CONSUMER_KEY})
logger.debug("loading %s", json_url)

json_tracks = json.loads(util.urlopen(json_url).read().decode('utf-8'))
json_tracks = util.urlopen(json_url).json()
tracks = [track for track in json_tracks if track['streamable'] or track['downloadable']]
total_count = len(json_tracks)

@@ -265,4 +264,4 @@ def get_new_episodes(self, channel, existing_guids):

def search_for_user(query):
json_url = 'https://api.soundcloud.com/users.json?q=%s&consumer_key=%s' % (urllib.parse.quote(query), CONSUMER_KEY)
return json.loads(util.urlopen(json_url).read().decode('utf-8'))
return util.urlopen(json_url).json()
31 changes: 9 additions & 22 deletions src/gpodder/util.py
@@ -55,12 +55,14 @@
import time
import urllib.error
import urllib.parse
import urllib.request
import webbrowser
import xml.dom.minidom
from html.entities import entitydefs, name2codepoint
from html.parser import HTMLParser

import requests
import requests.exceptions

import gpodder

logger = logging.getLogger(__name__)
@@ -1181,39 +1183,25 @@ def url_add_authentication(url, username, password):
return urllib.parse.urlunsplit(url_parts)


def urlopen(url, headers=None, data=None, timeout=None):
def urlopen(url, headers=None, data=None, timeout=None, **kwargs):
"""
An URL opener with the User-agent set to gPodder (with version)
"""
username, password = username_password_from_url(url)
if username is not None or password is not None:
url = url_strip_authentication(url)
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, url, username, password)
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
opener = urllib.request.build_opener(handler)
else:
opener = urllib.request.build_opener()

if headers is None:
headers = {}
else:
headers = dict(headers)

headers.update({'User-agent': gpodder.user_agent})
request = urllib.request.Request(url, data=data, headers=headers)
if timeout is None:
return opener.open(request)
else:
return opener.open(request, timeout=timeout)
return requests.get(url, headers=headers, data=data, timeout=timeout, **kwargs)


def get_real_url(url):
"""
Gets the real URL of a file and resolves all redirects.
"""
try:
return urlopen(url).geturl()
return urlopen(url).url
except:
logger.error('Getting real url for %s', url, exc_info=True)
return url
@@ -1799,8 +1787,7 @@ def get_update_info():
(False, '3.0.5', '2012-02-29', 10)
"""
url = 'https://api.github.com/repos/gpodder/gpodder/releases/latest'
data = urlopen(url).read().decode('utf-8')
info = json.loads(data)
info = urlopen(url).json()

latest_version = info.get('tag_name', '').replace('gpodder-', '')
release_date = info['published_at']
@@ -1916,9 +1903,9 @@ def website_reachable(url):
return (False, None)

try:
response = urllib.request.urlopen(url, timeout=1)
response = requests.get(url, timeout=1)
return (True, response)
except urllib.error.URLError as err:
except requests.exceptions.RequestException:
pass

return (False, None)
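
The new **kwargs pass-through in urlopen() lets call sites hand extra options straight to requests.get(). A hypothetical use for the download.py TODO mentioned in the commit message (stream=True, the URL, and the file name are assumptions, not part of this commit):

```python
from gpodder import util

episode_url = 'https://example.org/episode.mp3'  # placeholder URL

# Hypothetical: stream the response body instead of loading it into memory at once.
response = util.urlopen(episode_url, timeout=30, stream=True)
with open('episode.mp3', 'wb') as f:
    for chunk in response.iter_content(chunk_size=128 * 1024):
        f.write(chunk)
```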
3 changes: 1 addition & 2 deletions src/gpodder/vimeo.py
@@ -65,8 +65,7 @@ def get_real_download_url(url, preferred_fileformat=None):
data_config_url = 'https://player.vimeo.com/video/%s/config' % (video_id,)

def get_urls(data_config_url):
data_config_data = util.urlopen(data_config_url).read().decode('utf-8')
data_config = json.loads(data_config_data)
data_config = util.urlopen(data_config_url).json()
for fileinfo in list(data_config['request']['files'].values()):
if not isinstance(fileinfo, list):
continue
