# Urban-and-rural-statistics-spider.py
# Library imports
import csv
import time
from queue import Empty
from queue import Queue
from threading import Thread
from urllib.parse import urljoin

import pandas as pd
import requests
from lxml import etree
# Page download helper.
# Original author's note: num_retries was added after testing; on a healthy
# network a single retry is usually enough to get a result.
def getUrl(url, num_retries=5, timeout=30):
    """Download *url* and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        num_retries: How many additional attempts to make after a failed
            request. Each retry is preceded by a 10 second pause.
        timeout: Seconds to wait for the server before the attempt counts as
            failed. Without a timeout a stalled connection would block the
            caller forever and the retry logic would never run.

    Returns:
        The response text, or None when every attempt failed. Callers that
        hand the result straight to a parser must be prepared for None.

    Note:
        The HTTP status code is not inspected; an error page returned by the
        server is handed back like any other body.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            if num_retries <= 0:
                print("retry fail!")
                print("error: %s" % e + " " + url)
                return None
            time.sleep(10)
            print(url)
            print("requests fail, retry!")
            num_retries -= 1
            continue
        # The pages are decoded as GBK, so force the encoding instead of
        # trusting whatever requests guesses from the headers.
        response.encoding = 'GBK'
        return response.text
# Province-level scraper
def getProvince(url):
    """Return the provinces listed on the index page at *url*.

    Every ``<tr class="provincetr">`` row is scanned for anchors; each anchor
    yields one dict ``{'name': <anchor text>, 'link': <absolute URL>}``.

    Args:
        url: Address of the province index page.

    Returns:
        A list of dicts, in page order.

    Note:
        The page text comes from getUrl(), which returns None when the
        download fails; that value is passed to the parser unchanged, so a
        failed download surfaces as an exception here.
    """
    data = getUrl(url)
    selector = etree.HTML(data)
    province = []
    for row in selector.xpath('//tr[@class="provincetr"]'):
        # Keep these paths relative to the row: a leading '//' is an absolute
        # XPath and would select anchors from the whole document instead.
        names = row.xpath('td/a/text()')
        links = row.xpath('td/a/@href')
        for name, link in zip(names, links):
            # Resolve the relative href against the page it came from rather
            # than chopping a fixed number of characters off the page URL.
            province.append({'name': name, 'link': urljoin(url, link)})
    return province
# City-level scraper
def getCity(url_list):
    """Return the cities listed on each province page in *url_list*.

    Every ``<tr class="citytr">`` row contributes one dict
    ``{'name': ..., 'code': ..., 'link': <absolute URL>}``; the code and link
    are read from the first cell's anchor and the name from the second cell's.

    Args:
        url_list: Iterable of province page URLs.

    Returns:
        One list with the cities of all provinces, in input order.

    Note:
        The page text comes from getUrl(), which returns None when the
        download fails; that value is passed to the parser unchanged, so a
        failed download surfaces as an exception here.
    """
    city_all = []
    for url in url_list:
        data = getUrl(url)
        selector = etree.HTML(data)
        for row in selector.xpath('//tr[@class="citytr"]'):
            codes = row.xpath('td[1]/a/text()')
            links = row.xpath('td[1]/a/@href')
            names = row.xpath('td[2]/a/text()')
            for name, code, link in zip(names, codes, links):
                # Resolve the relative href against the page it came from
                # rather than slicing a fixed-length suffix off the URL.
                city_all.append({'name': name, 'code': code, 'link': urljoin(url, link)})
    return city_all
# County-level scraper (multithreaded)
def getCounty(url_list):
    """Return the counties listed on each city page in *url_list*.

    The URLs are placed on a queue and consumed by a fixed number of worker
    threads. Every ``<tr class="countytr">`` row that carries a link
    contributes one dict ``{'code': ..., 'link': <absolute URL>, 'name': ...}``.

    Args:
        url_list: Iterable of city page URLs.

    Returns:
        A list of dicts. The order depends on thread scheduling, so callers
        that need a stable order must sort the result.
    """
    queue_county = Queue()
    thread_num = 10  # number of worker threads
    county = []      # result list shared by all workers

    # Fill the queue completely before any worker starts, so an empty queue
    # always means "no work left".
    for url in url_list:
        queue_county.put(url)

    def getData():
        while True:
            # A non-blocking get avoids the check-then-act race of
            # `while not empty(): get()`, where another worker can take the
            # last item in between and leave this one blocked forever.
            try:
                url = queue_county.get_nowait()
            except Empty:
                return
            data = getUrl(url)
            if not data:
                # getUrl gave up (it returns None and has already reported
                # the URL) or the body was empty; skip this page instead of
                # letting the parser kill the worker thread.
                continue
            selector = etree.HTML(data)
            for row in selector.xpath('//tr[@class="countytr"]'):
                codes = row.xpath('td[1]/a/text()')
                links = row.xpath('td[1]/a/@href')
                names = row.xpath('td[2]/a/text()')
                for code, link, name in zip(codes, links, names):
                    # Resolve the relative href against the page it came from.
                    county.append({'code': code, 'link': urljoin(url, link), 'name': name})

    ths = [Thread(target=getData) for _ in range(thread_num)]
    for th in ths:
        th.start()
    for th in ths:
        th.join()
    return county
# Town/street-level scraper (multithreaded)
def getTown(url_list):
    """Return the towns listed on each page in *url_list*.

    The URLs are placed on a queue and consumed by a fixed number of worker
    threads. Every ``<tr class="towntr">`` row that carries a link
    contributes one dict ``{'code': ..., 'link': <absolute URL>, 'name': ...}``.

    Town links are resolved relative to the page they were found on. This
    also covers city pages that list towns directly (the script feeds the
    Dongguan and Zhongshan city pages in this way), so no per-URL special
    case is needed.

    Args:
        url_list: Iterable of page URLs to scan for town rows.

    Returns:
        A list of dicts. The order depends on thread scheduling, so callers
        that need a stable order must sort the result.
    """
    queue_town = Queue()
    thread_num = 50  # number of worker threads
    town = []        # result list shared by all workers

    # Fill the queue completely before any worker starts, so an empty queue
    # always means "no work left".
    for url in url_list:
        queue_town.put(url)

    def getData():
        while True:
            # A non-blocking get avoids the check-then-act race of
            # `while not empty(): get()`, where another worker can take the
            # last item in between and leave this one blocked forever.
            try:
                url = queue_town.get_nowait()
            except Empty:
                return
            data = getUrl(url)
            if not data:
                # getUrl gave up (it returns None and has already reported
                # the URL) or the body was empty; skip this page instead of
                # letting the parser kill the worker thread.
                continue
            selector = etree.HTML(data)
            for row in selector.xpath('//tr[@class="towntr"]'):
                codes = row.xpath('td[1]/a/text()')
                links = row.xpath('td[1]/a/@href')
                names = row.xpath('td[2]/a/text()')
                for code, link, name in zip(codes, links, names):
                    # urljoin replaces the last path segment of the page URL
                    # with the href, whatever the length of that segment.
                    town.append({'code': code, 'link': urljoin(url, link), 'name': name})

    ths = [Thread(target=getData) for _ in range(thread_num)]
    for th in ths:
        th.start()
    for th in ths:
        th.join()
    return town
# Village/neighbourhood-committee scraper (multithreaded)
def getVillage(url_list):
    """Return the villages listed on each town page in *url_list*.

    The URLs are placed on a queue and consumed by a fixed number of worker
    threads. Every ``<tr class="villagetr">`` row contributes one dict
    ``{'code': ..., 'UrbanRuralCode': ..., 'name': ...}`` built from the text
    of its first, second and third cells.

    Args:
        url_list: Iterable of town page URLs.

    Returns:
        A list of dicts. The order depends on thread scheduling, so callers
        that need a stable order must sort the result.
    """
    queue_village = Queue()
    thread_num = 200  # number of worker threads
    village = []      # result list shared by all workers

    # Fill the queue completely before any worker starts, so an empty queue
    # always means "no work left".
    for url in url_list:
        queue_village.put(url)

    def getData():
        while True:
            # A non-blocking get avoids the check-then-act race of
            # `while not empty(): get()`, where another worker can take the
            # last item in between and leave this one blocked forever.
            try:
                url = queue_village.get_nowait()
            except Empty:
                return
            data = getUrl(url)
            if not data:
                # getUrl gave up (it returns None and has already reported
                # the URL) or the body was empty; skip this page instead of
                # letting the parser kill the worker thread.
                continue
            selector = etree.HTML(data)
            for row in selector.xpath('//tr[@class="villagetr"]'):
                codes = row.xpath('td[1]/text()')
                urban_rural_codes = row.xpath('td[2]/text()')
                names = row.xpath('td[3]/text()')
                for code, urban_rural, name in zip(codes, urban_rural_codes, names):
                    village.append({'code': code, 'UrbanRuralCode': urban_rural, 'name': name})

    ths = [Thread(target=getData) for _ in range(thread_num)]
    for th in ths:
        th.start()
    for th in ths:
        th.join()
    return village
###########################
# Driver: crawl one administrative level after another, feeding the links
# found at each level into the next, and dump every level to its own CSV.
###########################
# Province level
pro = getProvince("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html")
df_province = pd.DataFrame(pro)
df_province.info()
df_province.to_csv('province.csv', index=False)

###########################
# City level
city = getCity(df_province['link'])
df_city = pd.DataFrame(city)
df_city.info()
df_city.to_csv('city.csv', index=False)

###########################
# County level
county = getCounty(df_city['link'])
df_county = pd.DataFrame(county)
# The worker threads return rows in arbitrary order, so sort ascending by code.
df_county_sorted = df_county.sort_values('code')
df_county_sorted.info()
df_county_sorted.to_csv('county.csv', index=False)

###########################
# Town level
# Zhongshan and Dongguan get special treatment: their town pages are linked
# from the city table (df_city), so those city links are crawled as well.
is_special_city = df_city['name'].isin(['中山市', '东莞市'])
town_link_list = df_city.loc[is_special_city, 'link'].values
url_list = list(df_county['link'])
url_list.extend(town_link_list)
town = getTown(url_list)
df_town = pd.DataFrame(town)
# The worker threads return rows in arbitrary order, so sort ascending by code.
df_town_sorted = df_town.sort_values('code')
df_town_sorted.info()
df_town_sorted.to_csv('town.csv', index=False)

###########################
# Village level
village = getVillage(df_town['link'])
df_village = pd.DataFrame(village)
# The worker threads return rows in arbitrary order, so sort ascending by code.
df_village_sorted = df_village.sort_values('code')
df_village_sorted.info()
df_village_sorted.to_csv('village.csv', index=False)