Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: fix memory leak while downloading large files #1707

Closed
wants to merge 2 commits into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 63 additions & 60 deletions googleapiclient/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,37 +661,32 @@ def __init__(


class MediaIoBaseDownload(object):
""" "Download media resources.
"""Download media resources.

Note that the Python file object is compatible with io.Base and can be used
with this class also.


Example:
request = farms.animals().get_media(id='cow')
fh = io.FileIO('cow.png', mode='wb')
downloader = MediaIoBaseDownload(fh, request, chunksize=1024*1024)

done = False
while done is False:
status, done = downloader.next_chunk()
if status:
print "Download %d%%." % int(status.progress() * 100)
print "Download Complete!"
request = farms.animals().get_media(id='cow')
downloader = MediaIoBaseDownload(request, chunksize=1024*1024)

for chunk, status, done in downloader.next_chunk():
with open('cow.png', 'ab') as cow_file:
cow_file.write(chunk)
print("Download %d%%." % int(status.progress() * 100))

print("Download Complete!")
"""

@util.positional(3)
def __init__(self, fd, request, chunksize=DEFAULT_CHUNK_SIZE):
@util.positional(2)
def __init__(self, request, chunksize=DEFAULT_CHUNK_SIZE):
"""Constructor.

Args:
fd: io.Base or file object, The stream in which to write the downloaded
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid breaking existing users, ideally we should continue to support the existing behaviour as well as the new desired behaviour.

bytes.
request: googleapiclient.http.HttpRequest, the media request to perform in
chunks.
chunksize: int, File will be downloaded in chunks of this many bytes.
"""
self._fd = fd
self._request = request
self._uri = request.uri
self._chunksize = chunksize
Expand Down Expand Up @@ -722,61 +717,69 @@ def next_chunk(self, num_retries=0):
request only once.

Returns:
(status, done): (MediaDownloadProgress, boolean)
(chunk, status, done): (bytes, MediaDownloadProgress, boolean)
The value of 'done' will be True when the media has been fully
downloaded or the total size of the media is unknown.

Raises:
googleapiclient.errors.HttpError if the response was not a 2xx.
httplib2.HttpLib2Error if a transport error has occurred.
"""
headers = self._headers.copy()
headers["range"] = "bytes=%d-%d" % (
self._progress,
self._progress + self._chunksize - 1,
)
http = self._request.http

resp, content = _retry_request(
http,
num_retries,
"media download",
self._sleep,
self._rand,
self._uri,
"GET",
headers=headers,
)
while self._done is False:
headers = self._headers.copy()
headers["range"] = "bytes=%d-%d" % (
self._progress,
self._progress + self._chunksize - 1,
)
http = self._request.http

if resp.status in [200, 206]:
if "content-location" in resp and resp["content-location"] != self._uri:
self._uri = resp["content-location"]
self._progress += len(content)
self._fd.write(content)
resp, chunk = _retry_request(
http,
num_retries,
"media download",
self._sleep,
self._rand,
self._uri,
"GET",
headers=headers,
)

if "content-range" in resp:
if resp.status in [200, 206]:
if "content-location" in resp and resp["content-location"] != self._uri:
self._uri = resp["content-location"]
self._progress += len(chunk)

if "content-range" in resp:
content_range = resp["content-range"]
length = content_range.rsplit("/", 1)[1]
self._total_size = int(length)
elif "content-length" in resp:
self._total_size = int(resp["content-length"])

if self._total_size is None or self._progress == self._total_size:
self._done = True
yield (
chunk,
MediaDownloadProgress(self._progress, self._total_size),
self._done
)
elif resp.status == 416:
# 416 is Range Not Satisfiable
# This typically occurs with a zero byte file
content_range = resp["content-range"]
length = content_range.rsplit("/", 1)[1]
self._total_size = int(length)
elif "content-length" in resp:
self._total_size = int(resp["content-length"])

if self._total_size is None or self._progress == self._total_size:
self._done = True
return MediaDownloadProgress(self._progress, self._total_size), self._done
elif resp.status == 416:
# 416 is Range Not Satisfiable
# This typically occurs with a zero byte file
content_range = resp["content-range"]
length = content_range.rsplit("/", 1)[1]
self._total_size = int(length)
if self._total_size == 0:
self._done = True
return (
MediaDownloadProgress(self._progress, self._total_size),
self._done,
)
raise HttpError(resp, content, uri=self._uri)
if self._total_size == 0:
self._done = True
yield (
chunk,
MediaDownloadProgress(self._progress, self._total_size),
self._done,
)
else:
raise HttpError(resp, content, uri=self._uri)
else:
raise HttpError(resp, content, uri=self._uri)


class _StreamSlice(object):
Expand Down