/
拉钩网爬虫.py
80 lines (73 loc) · 3.36 KB
/
拉钩网爬虫.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import requests
import json
import pymongo
import time
url = 'https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
'Cookie': 'JSESSIONID=ABAAABAAAIAACBI3AB0C3F59B10F66D078B4F5E4A09A94A; _ga=GA1.2.1430593343.1519094020; '
'_gid=GA1.2.64420250.1519094020; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1519094020; user_trace_'
'token=20180220103341-76b7725e-15e6-11e8-8bee-525400f775ce; LGSID=20180220103341-76b773c3-15e6-11e8'
'-8bee-525400f775ce; LGUID=20180220103341-76b77696-15e6-11e8-8bee-525400f775ce; index_location_city'
'=%E5%8C%97%E4%BA%AC; _qddaz=QD.w1jd11.3q49jy.jdv1sb7o; X_HTTP_TOKEN=575b8945f60f6114bab0e6948e343416;'
' TG-TRACK-CODE=search_code; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1519096728; LGRID=20180220112806-'
'1104e7a5-15ee-11e8-b074-5254005c3644; '
'SEARCH_ID=5f3081935a1147ccb3333f910d2b7f01',
'Connection':'keep-alive',
'Referer': 'https://www.lagou.com/jobs/list_java?city=%E5%8C%97%E4%BA%AC&cl=false&fromSearch=true&labelWords=&suginput='
}
#初始化MongoDB参数
client = pymongo.MongoClient('localhost',27017)
LaGou = client['LaGou']
infos = LaGou['infos']
#提取最大页数
params = {
'first': 'true',
'pn':1,
'kd':'python'
}
html = requests.post(url,data = params,headers = headers)
json_data = json.loads(html.text)
items = json_data['content']['positionResult']['totalCount']
totalPages = int(items/15) if (items/15) < 30 else 30 #限制页数:若总页数>30,取30, 否则取最大页数。除以15是因为每页显示15个
#构造post的params参数
def loadPage():
for i in range(1,totalPages):
list = {
'first': 'true',
'pn': i,
'kd': 'python'
}
getInfo(list)
print("Page %d finished. "%i)
time.sleep(1) #暂停1s,防止因过快访问被ban IP
#从返回的response的json data中提取关键信息,存入MongoDB
def getInfo(params):
html = requests.post(url, data=params, headers=headers)
json_data = json.loads(html.text)
collections = json_data['content']['positionResult']['result']
for col in collections:
comment = ""
for i in col['companyLabelList']:
comment = comment + " " + i
aggregation = {
'city': col['city'],
'shortName': col['companyShortName'],
'fullName': col['companyFullName'],
'companySize': col['companySize'],
'education': col['education'],
'industryField': col['industryField'],
'jobNature': col['jobNature'],
'address': col['linestaion'],
'positionAdvantage': col['positionAdvantage'],
'positionName': col['positionName'],
'salary': col['salary'],
'workYear': col['workYear'],
'formatCreateTime': col['formatCreateTime'],
'description': comment
}
infos.insert_one(aggregation)
#程序入口
if __name__ == '__main__':
loadPage()