Hospital_information_system_crawler_for_Gleason_Score.py
# coding=utf-8
# For information security, the headers and some POST fields have been removed; fill them in and test before use.
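# A minimal sketch (an assumption, not part of the original script): the deleted
# session-specific values could be kept in an untracked local file, e.g.
#   import json
#   with open('secrets.json', encoding='utf-8') as f:
#       secrets = json.load(f)  # {"Cookie": "...", "Host": "...", "Referer": "...", "Origin": "..."}
# and spliced into the headers/post_data below instead of being hardcoded.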
import requests
from bs4 import BeautifulSoup
import re
import xlrd
from xlutils.copy import copy
xls_file = 'Prostate_Cancer_Jan_Nov.xls'  # change this file name before running
def get_Gleason(blh):  # fetch the pathology report for pathology number blh and extract its Gleason score
    headers = {  # copied from Chrome DevTools (F12)
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Connection': 'keep-alive',
        'Cookie': '',  # obtain from Chrome DevTools (F12)
        'Host': '',  # obtain from Chrome DevTools (F12)
        'Referer': '',  # obtain from Chrome DevTools (F12)
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
    request_url = ''  # obtain from Chrome DevTools (F12)
    # pass the query as URL parameters: a GET request should not carry a body
    response = requests.get(request_url, params={'blh': blh}, headers=headers)
    response.encoding = 'GB2312'
    text = response.text
    if re.search('收到日期', text) is not None:  # '收到日期' = "date received" on the report page
        receive_date = re.compile(r'收到日期:</font>\W*?<font size="3">(.*?)</font>').findall(text)[0]
Gleason_Expressions = re.compile(r'Gleason.*?(\d.*?[=]\d{1,2})', re.I).findall(text.replace(' ', ''))
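        # e.g. a (hypothetical) report line 'Gleason评分：4+3=7' yields '4+3=7'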
if len(Gleason_Expressions) >= 1:
Gleason_Expression = Gleason_Expressions[0]
Gleason_Score = re.compile(r'=(\d{1,2})').findall(Gleason_Expression)[0]
            # keep the highest-scoring expression; compare numerically, since
            # string comparison would rank '7' above '10'
            for Expression in Gleason_Expressions:
                Score = re.compile(r'=(\d{1,2})').findall(Expression)[0]
                if int(Score) > int(Gleason_Score):
                    Gleason_Expression = Expression
                    Gleason_Score = Score
            return blh, receive_date, Gleason_Expression, Gleason_Score
        else:
            return blh, 'No Gleason'
    else:
        return blh, 'No Report'  # the report has not been reviewed yet
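# Hypothetical examples of the three return shapes (pathology numbers invented):
#   get_Gleason('P2017-0001') -> ('P2017-0001', '2017-05-03', '3+4=7', '7')  # report with a Gleason line
#   get_Gleason('P2017-0002') -> ('P2017-0002', 'No Gleason')                # report without one
#   get_Gleason('P2017-0003') -> ('P2017-0003', 'No Report')                 # report not yet reviewed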
def get_blh(zhy):  # look up pathology numbers (blh) for an admission number (zhy)
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        # Content-Length is omitted on purpose: requests computes it, and a hardcoded value can corrupt the POST
        'Cookie': '',  # any valid session cookie works
        'Host': '',  # obtain from Chrome DevTools (F12)
        'Origin': '',  # obtain from Chrome DevTools (F12)
        'Referer': '',  # obtain from Chrome DevTools (F12)
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
    post_data = {  # copy these form fields from the DevTools (F12) network panel
'keywords_name': '',
'keywords_brbh': '',
'keywords_zyh': zhy,
'keywords_mzh': '',
'keywords_blh': '',
'keywords_sjks': '',
        'keywords_rqxz': '',  # obtain from Chrome DevTools (F12)
'keywords_bgrq1': '',
'keywords_bgrq2': '',
        'Submit': '查询'.encode('gb2312'),  # "查询" (query) as raw GB2312 bytes; requests percent-encodes them to %B2%E9%D1%AF
}
    request_url = ''  # obtain from Chrome DevTools (F12)
    response = requests.post(request_url, data=post_data, headers=headers)
    response.encoding = 'GB2312'
soup = BeautifulSoup(response.text, 'html.parser')
    page_sum = re.search(r'第\d+/\d+页', soup.text)  # pager text, e.g. '第1/2页' = "page 1 of 2"
    if page_sum is None:
        return None  # the query returned no results
else:
all_blh = soup.find_all(target='index_m', id=True)
        return (page_sum, all_blh)
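# Return shape (under the page structure this script assumes): page_sum is the
# regex match for the pager text, and all_blh is the list of <a target="index_m">
# result links whose .text is a pathology number (blh) to feed into get_Gleason().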
def write_Gleason_data(row, initial_col, data, s):  # write the tuple 'data' into sheet 's', starting at (row, initial_col)
col = initial_col
for cell_data in data:
print(cell_data)
s.write(row, col, cell_data)
col = col + 1
return None
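# e.g. write_Gleason_data(3, 5, ('P2017-0001', '2017-05-03', '3+4=7', '7'), s)
# fills columns 5-8 of row 3 (the tuple shown is hypothetical)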
book = xlrd.open_workbook(xls_file)
sh = book.sheet_by_index(0)
print('Sheet {0}: {1} rows, {2} columns'.format(sh.name, sh.nrows, sh.ncols))
wb = copy(book)
s = wb.get_sheet(0)
data_col = sh.ncols + 1  # first column for the crawled data
for rownum in range(1, sh.nrows):
    zhy = sh.cell(rownum, 1).value
    print(zhy)
    blh_result = get_blh(zhy)  # query once and reuse: each call is a network round trip
    if blh_result is None:
        print('No blh')
        s.write(rownum, data_col, 'No blh')
    else:
        page_sum, all_blh = blh_result
        print(page_sum)
        for blh in all_blh:
            Gleason_data = get_Gleason(blh.text)
            print(Gleason_data)
            write_Gleason_data(rownum, data_col, Gleason_data, s)
wb.save(xls_file)