-
Notifications
You must be signed in to change notification settings - Fork 1
/
91hanman-list-new.py
125 lines (113 loc) · 5.28 KB
/
91hanman-list-new.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time
import pdb
import socket
from python_lib import BaseCommon
if __name__ == '__main__':
    # Root output directory for all downloads; must end with '/'.
    base_folder = 'F:/py3project/91hanman/'
    base_url = 'https://www.91hanman.com'

    # Each entry is '<detail-page-url>=<manga-name>'.  The detail URLs here
    # contain no '=', so a single split('=') cleanly separates the fields.
    manhua_list = [
        'https://www.91hanman.com/book/webBookDetail/61=从今天开始当城主',
        'https://www.91hanman.com/book/webBookDetail/62=影帝X影帝',
        'https://www.91hanman.com/book/webBookDetail/63=快意十三刀',
        'https://www.91hanman.com/book/webBookDetail/64=废柴特工',
    ]
    print(manhua_list)

    # Spoofed browser headers, reused for every request (the original
    # defined this identical dict twice inside the loops).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Referer": "http://www.google.com/bot.html"
    }

    for each_manhua in manhua_list:
        list_url = []
        url, base_manga_name = each_manhua.split('=')
        base_log = base_folder + base_manga_name + ".txt"
        print(base_manga_name)
        print(url)

        # Number of already-downloaded chapters to skip (manual resume point).
        status_count = 11 if base_manga_name == '从今天开始当城主' else 0
        print(status_count)

        # BUGFIX: the manga folder used to be created in the current working
        # directory (os.makedirs(base_manga_name)) while the images were
        # written under base_folder — create it where downloads actually go.
        os.makedirs(base_folder + base_manga_name, exist_ok=True)

        # stream=True so the long detail page is downloaded completely.
        req = requests.get(url=url, headers=headers, stream=True)
        req.encoding = 'utf-8'
        bf = BeautifulSoup(req.text, 'lxml')

        # Collect '<chapter-title>=<absolute-chapter-url>' entries, logging
        # each one for later inspection.
        for each_chapter in bf.find_all(class_='detail-chapters-list-item'):
            bf_2 = BeautifulSoup(str(each_chapter), 'lxml')
            href = bf_2.a.get('href')
            title = bf_2.a.span.span.get_text()
            print(href)
            print(title)
            entry = title + '=' + base_url + href
            BaseCommon.loglist(base_log, entry)
            list_url.append(entry)

        print('连接采集完成')
        print(list_url)
        print(len(list_url))

        count = 0
        for each_img in list_url:
            count += 1
            # Skip chapters below the manual resume point.
            if count <= status_count:
                print(count)
                continue

            folder = base_folder + base_manga_name + '/' + str(count)
            print(os.path.exists(folder))
            # exist_ok=True replaces the original's redundant
            # exists()/listdir() double check (the listdir() branch compared
            # a full path against bare entry names and was always true).
            os.makedirs(folder, exist_ok=True)

            # Split only on the first '=' so a chapter URL containing '='
            # cannot corrupt the (title, url) pair.
            filename, target_url = each_img.split('=', 1)
            print(str(count) + '下载:' + filename)

            img_req = requests.get(url=target_url, headers=headers, stream=True)
            img_req.encoding = 'utf-8'
            img_bf_1 = BeautifulSoup(img_req.text, 'lxml')
            # str() flattens the ResultSet back to markup so it can be
            # re-parsed and searched for the per-page image containers.
            page_divs = img_bf_1.find_all('div', class_='page-img-list')
            img_bf_2 = BeautifulSoup(str(page_divs), 'lxml')
            img_divs = img_bf_2.find_all('div', class_='nav-chapter-new')

            for i, each_div in enumerate(img_divs, start=1):
                img_bf_4 = BeautifulSoup(str(each_div), 'lxml')
                # Lazy-loaded images keep the real URL in 'data-original'.
                img_url = img_bf_4.div.img.get('data-original')
                target_file = folder + "/" + str(i).zfill(2) + '.jpg'
                # Skip files already on disk so interrupted runs can resume.
                if not os.path.exists(target_file):
                    BaseCommon.download(url=img_url, filename=target_file)
                    time.sleep(1)  # be polite to the server

    print('下载完成!')
    # NOTE(review): shuts this Windows machine down 59 seconds after the
    # crawl finishes — intentional unattended-run behavior, kept as-is.
    os.system('shutdown -s -f -t 59')