
Commit

add process

cyang812 committed May 7, 2018
1 parent 581d481 commit 450c484
Showing 3 changed files with 86 additions and 2 deletions.
2 changes: 2 additions & 0 deletions README.md
@@ -21,6 +21,8 @@

- `download_thread.py` downloads with multiple threads and should in theory be faster than `download.py`, so step 3 can use `download_thread.py` instead

- `download_process.py` downloads concurrently with multiple processes and should in theory be faster than `download_thread.py`, so step 3 can also use `download_process.py`

# 3. Miscellaneous

- For well-known reasons, tumblr resource URLs cannot be downloaded directly, so a proxy must be configured. Testing was done over a local ssr proxy, which is why `download.py` contains `PROXIES = { "http": "http://127.0.0.1:1080", "https": "https://127.0.0.1:1080" } `. If you run on a VPS that can reach tumblr directly, modify the code as follows.
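  A minimal sketch of that change, assuming nothing beyond the commented-out alternative already present in the scripts:

  ```python
  # On a VPS with direct access to tumblr, an empty proxy mapping makes
  # requests connect directly instead of through the local ssr proxy.
  PROXIES = {}
  ```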
9 changes: 7 additions & 2 deletions download.py
@@ -3,6 +3,7 @@
import os
import urllib
import requests
import time

url_filename = 'url_list.txt'

@@ -40,9 +41,9 @@ def download(url):
            print('downloading ->', name)

            with open(name, "wb") as code:
                code.write(r.content)
        except Exception as e:
            print('downloading err ->', name)
            pass
    else:
        print("file exist")
@@ -60,5 +61,9 @@ def chdir():

url_list = get_url()
chdir()
start_time = time.time()

for i in range(0, len(url_list)):
    download(url_list[i])
end_time = time.time()
print('Download complete, elapsed: %s seconds' % (end_time - start_time))
77 changes: 77 additions & 0 deletions download_process.py
@@ -0,0 +1,77 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import time
import os
import urllib.request
import requests
from multiprocessing import Process, Queue, Pool

url_filename = 'url_list.txt'

PROXIES = { "http": "http://127.0.0.1:1080", "https": "https://127.0.0.1:1080" }
# PROXIES = {}

def get_url():
    with open(url_filename, "r") as f:
        raw_sites = f.read()

    raw_sites = raw_sites.replace("\n", ",")
    raw_sites = raw_sites.split(",")

    sites = list()
    for raw_site in raw_sites:
        site = raw_site.lstrip().rstrip()
        if site:
            sites.append(site)

    print('list_len = ', len(sites))
    return sites
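# get_url expects url_list.txt to hold one resource URL per line (commas
# also act as separators); blank entries are skipped. Hypothetical contents:
#   https://vtt.tumblr.com/tumblr_abc123.mp4
#   https://68.media.tumblr.com/tumblr_xyz789.jpg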

def get_filename(url):
    name = url.split("/")[-1].split("?")[0]
    return name
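# For instance, with a hypothetical URL, the last path segment is kept and
# the query string is dropped:
#   get_filename('https://vtt.tumblr.com/tumblr_abc123.mp4?play_key=1')
#   # -> 'tumblr_abc123.mp4'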

def download_one(url):

    name = get_filename(url)

    file_path = os.path.join(name)
    if not os.path.isfile(file_path):
        try:
            r = requests.get(url, proxies=PROXIES)  # use proxy
            print('downloading ->', name)

            with open(name, "wb") as code:
                code.write(r.content)
        except Exception as e:
            print('downloading err ->', name)
            pass
    else:
        print("file exist")

def chdir():

    current_folder = os.getcwd()
    print(current_folder)
    target_folder = os.path.join(current_folder, 'download')
    if not os.path.isdir(target_folder):
        os.mkdir(target_folder)
    os.chdir(target_folder)

def download(imgs, processes=10):
    """Download all images concurrently."""
    start_time = time.time()
    pool = Pool(processes)
    for img in imgs:
        pool.apply_async(download_one, (img, ))

    pool.close()
    pool.join()
    end_time = time.time()
    print('Download complete, elapsed: %s seconds' % (end_time - start_time))
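# Design note: apply_async fans the URLs out without blocking, which is why
# close() and join() are needed to wait for the workers. A Pool.map variant
# (a sketch, not part of this commit) would block until every URL is handled
# and make the close()/join() pair implicit:
#
#     with Pool(processes) as pool:
#         pool.map(download_one, imgs)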

if __name__ == '__main__':
    url = get_url()
    chdir()
    download(url)
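# The __main__ guard matters for multiprocessing: worker processes re-import
# this module (always on Windows, and under the spawn start method generally),
# so unguarded top-level code would run again in every worker.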
