forked from zt0910/dp_scripy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
page_content.py
198 lines (166 loc) · 6.09 KB
/
page_content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from html_parser.common import *
from bs4 import BeautifulSoup
from hdfs.client import Client
import re
# Shared hdfs Client instance, created at import time and used by read_html().
# NOTE(review): the endpoint URL is hard-coded; presumably the cluster's
# WebHDFS address -- confirm, and consider moving it to configuration.
client = Client("http://192.168.31.51:50070")
def get_file_info(filepath):
    """Split a '/'-separated page path into its identifying components.

    Segments 2, 3 and 4 of the path are returned as city, region and area;
    segment 5 is the file name, whose part before the first '.' is the
    shop id.

    Returns:
        Tuple ``(shopid, city, region, area, filepath)``.

    Raises:
        IndexError: if the path has fewer than six '/'-separated segments.
    """
    segments = filepath.split('/')
    file_name = segments[5]
    shop_id = file_name.split('.')[0]
    return shop_id, segments[2], segments[3], segments[4], filepath
def read_html(filepath):
    """Read *filepath* through the module-level ``client`` and parse it.

    The file is read as UTF-8 text and handed to BeautifulSoup using the
    'lxml' parser.

    Returns:
        The parsed BeautifulSoup document.
    """
    with client.read(filepath, encoding='utf-8') as handle:
        markup = handle.read()
    return BeautifulSoup(markup, 'lxml')
def get_shop_name(soup):
    """Return the shop name from the page's ``<h1>`` heading.

    Two page layouts are supported: the heading with class
    'mutilPics-shop-name' is preferred, and 'shopName' is the fallback.

    Returns:
        The ``.string`` of the first heading found.

    Raises:
        AttributeError: if neither heading is present.
    """
    heading = soup.find('h1', class_='mutilPics-shop-name')
    if heading is None:
        # Explicit fallback instead of a bare ``except:`` used as control flow.
        heading = soup.find('h1', class_='shopName')
    return heading.string
def get_baseinfo(soup):
    """Extract shop id, city id and category from the 'address_left' div.

    The div's markup is expected to embed a nested ``{...{k:'v',k:'v'}}``
    structure; the innermost braces are split into ``key:value`` pairs.

    Returns:
        Tuple ``(shop_id, city_id, category)`` of strings.

    Raises:
        IndexError: if the embedded structure is not found in the markup.
        KeyError: if one of 'shopid', 'city_id' or 'category' is missing.
    """
    # Raw string: '\{' in a normal literal is an invalid escape sequence.
    id_pattern = re.compile(r'.*\{.*{(.*?)}}')
    matches = id_pattern.findall(str(soup.find('div', class_='address_left')))

    fields = {}
    for pair in matches[0].split(','):
        # partition() keeps values that themselves contain ':' intact.
        key, sep, value = pair.partition(':')
        if not sep:
            # Skip fragments without a ':' (e.g. produced by a trailing comma).
            continue
        # Strip surrounding whitespace so "a:'1', b:'2'" yields key 'b', not ' b'.
        fields[key.strip()] = value.strip().strip("'")

    return fields['shopid'], fields['city_id'], fields['category']
def get_comment_count(soup, comment_TTGlyphs, comment_dict):
    """Return the number of comments shown on the page, or 0 if unavailable.

    The count is read from the first ``<div>`` found with class
    'Multi-itemNum' or 'itemNum'. Its stripped text is passed through
    ``woff_change`` together with *comment_TTGlyphs* and *comment_dict*;
    the last character of the result is dropped and the rest converted
    to ``int``.
    NOTE(review): ``woff_change`` comes from ``html_parser.common``;
    presumably it decodes font-obfuscated glyphs -- confirm there.

    Returns:
        The comment count as ``int``; 0 when no tag is found or decoding fails.
    """
    try:
        count_tag = None
        for css_class in ('Multi-itemNum', 'itemNum'):
            count_tag = soup.find('div', class_=css_class)
            if count_tag is not None:
                break
        if count_tag is None:
            # Nothing to decode; avoid calling the helper with None.
            return 0
        decoded = woff_change(count_tag.text.strip(), comment_TTGlyphs, comment_dict)
        return int(decoded[:-1])
    except Exception:
        # Best-effort scrape: any parsing/decoding failure yields 0, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return 0
def get_merchant_score(soup, number_TTGlyphs, number_dict):
    """Return the star rating and the taste/environment/service scores.

    The star value is the suffix of the ``star starBig star-<value>`` class
    found in the page markup. The three sub-scores are read from the first
    ``<div>`` with class 'Multi-description' or 'description': each line of
    its text is split on ':' and '.', passed through ``woff_change`` and the
    resulting integer divided by 10.
    NOTE(review): ``woff_change`` comes from ``html_parser.common``;
    presumably it decodes font-obfuscated digits -- confirm there.

    Returns:
        Tuple ``(star, taste, environment, service)``. ``star`` is a string;
        the scores default to 0 when they cannot be parsed.

    Raises:
        IndexError: if the star class is not present in the markup.
    """
    # '[^"]*' instead of a greedy '.*': on single-line HTML the greedy form
    # runs on to the last '">' of the line and captures unrelated markup.
    star = re.findall(r'class="star starBig star-([^"]*)">', str(soup))[0]

    taste, environment, service = 0, 0, 0
    try:
        for css_class in ('Multi-description', 'description'):
            score_box = soup.find('div', class_=css_class)
            if score_box is None:
                continue
            scores = []
            for line in score_box.text.strip().split('\n'):
                glyph_parts = line.split(':')[1].split('.')
                decoded = woff_change(glyph_parts, number_TTGlyphs, number_dict)
                scores.append(int(decoded) / 10)
            # The original had a trailing comma here, making taste a 1-tuple.
            taste = scores[0]
            environment = scores[1]
            service = scores[2]
            break
    except Exception:
        # Best-effort scrape: keep whatever was parsed so far, but do not
        # swallow KeyboardInterrupt/SystemExit like a bare ``except:``.
        return star, taste, environment, service
    return star, taste, environment, service
def get_telphonenumber(soup):
    """Return the shop's phone number, or '' when it cannot be found.

    The number is the part after the first ':' in the ``href`` of the
    ``<a class="tel">`` link inside the ``<div class="aboutPhoneNum">``.
    """
    try:
        phone_box = soup.find('div', class_='aboutPhoneNum')
        href = phone_box.find('a', class_='tel')['href']
        return href.split(':')[1]
    except (AttributeError, TypeError, KeyError, IndexError):
        # Missing container (AttributeError), missing link (TypeError),
        # missing href (KeyError) or href without ':' (IndexError).
        return ''
def get_picture_coount(soup):
    """Return the content of the ``<div class="picCount">`` element.

    Returns:
        The element's ``.string`` as found in the page (not converted to a
        number), or the integer 0 when the element is missing.
    """
    try:
        return soup.find('div', class_='picCount').string
    except AttributeError:
        # find() returned None (or soup itself is unusable): no picture count.
        return 0
def get_price(soup):
    """Return the average-price text from the page, or 0 when absent.

    Looks for ``<div class="Multi-price">`` first and falls back to
    ``<span class="price">``.

    Returns:
        The ``.string`` of the first matching element, or the integer 0
        when neither element exists.
    """
    try:
        for tag_name, css_class in (('div', 'Multi-price'), ('span', 'price')):
            price_tag = soup.find(tag_name, class_=css_class)
            if price_tag is not None:
                # The leftover debug print() of the value was removed.
                return price_tag.string
    except AttributeError:
        # soup is not a searchable document (e.g. None): treat as "no price".
        return 0
    return 0
def get_adress(soup, address_TTGlyphs, address_dict):
    """Return the shop address decoded from the first 'addressText' span.

    Every text fragment found between '>' and '<' in the span's markup is
    collected and handed to ``woff_change`` together with
    *address_TTGlyphs* and *address_dict*; its result is returned as is.

    Raises:
        IndexError: if the page has no ``<span class="addressText">``.
    """
    first_span = soup.find_all('span', class_='addressText')[0]
    fragments = re.findall('>(.*?)<', str(first_span))
    return woff_change(fragments, address_TTGlyphs, address_dict)
def open_time(soup):
    """Return the opening-hours text from the page's 'otherInfo' blocks.

    For every ``<div class="otherInfo">`` the ``.string`` of its second
    nested ``<div>`` is stripped and appended. Newlines inside the collected
    text are removed.

    Returns:
        The concatenated text; if a block is malformed, whatever was
        collected before it (possibly '').
    """
    collected = []
    try:
        for info_block in soup.find_all('div', class_='otherInfo'):
            cells = info_block.find_all('div')
            collected.append(cells[1].string.strip())
    except (AttributeError, IndexError):
        # Malformed block (missing second div or no plain string): stop and
        # keep the text gathered so far.
        pass
    # The original called str.replace() and discarded the result, so embedded
    # newlines were never actually removed.
    return ''.join(collected).replace('\n', '')
def get_rank(soup):
    """Return the ranking list name and the shop's position in it.

    The name is the ``.string`` of ``<div class="rankText">``. The position
    is built by joining every text fragment found between '>' and '<' in the
    markup of ``<div class="rankNum">``.

    Returns:
        Tuple ``(rankname, ranking)``; both are '' when the page has no
        rank text element.
    """
    rankname, ranking = '', ''
    try:
        rankname = soup.find('div', class_="rankText").string
        rank_markup = str(soup.find('div', class_='rankNum'))
        ranking = ''.join(re.findall('>(.*?)<', rank_markup))
    except AttributeError:
        # No rank text element on this page: return what is known so far.
        return rankname, ranking
    return rankname, ranking
def get_recommend(soup):
    """Return recommended dish names and their recommendation counts.

    Dishes are the ``<a class="dishItem">`` links inside
    ``<div class="dishPics">``. For each one the name is the ``.string`` of
    its 'dishName' div, and the count is the first run of digits in the text
    of its 'recommendInfo' div (0 when that div or a number is missing).

    Returns:
        Tuple ``(dish_name, recommender_count)`` of two parallel lists;
        both empty when the page has no dish section.
    """
    dish_name = []
    recommender_count = []
    digits = re.compile(r'\d+')
    try:
        dish_section = soup.find('div', class_='dishPics')
        if dish_section is not None:
            for dish_link in dish_section.find_all('a', class_='dishItem'):
                dish_name.append(dish_link.find('div', class_='dishName').string)
                # The original tested for class 'recommendonfo' (typo), so the
                # count was always 0, and applied [0] to the text instead of
                # to the list of regex matches.
                info = dish_link.find('div', class_='recommendInfo')
                found = digits.search(info.text) if info is not None else None
                recommender_count.append(int(found.group()) if found else 0)
    except AttributeError:
        # A dish without a name element (or an unusable soup): return the
        # entries collected so far.
        return dish_name, recommender_count
    return dish_name, recommender_count
def get_coupon(soup):
    """Return coupon types, prices and sale counts from the 'tuan-list' div.

    * Each 'newtitle' div yields 'CASH' when its string contains '代金券',
      otherwise 'COMBO'.
    * Each 'price' div and each 'o-price' div is parsed as a float and
      multiplied by 100.
    * Each 'soldNumNew' span has its first two characters dropped, is parsed
      as an int and multiplied by 100.

    NOTE(review): the sale count is scaled by 100 just like the prices,
    which looks like a copy-paste slip; kept as is because downstream code
    may rely on it -- confirm. Likewise verify that 'price' is really the
    original price and 'o-price' the discounted one, not the reverse.

    Returns:
        Tuple ``(coupon, original_price, discount_price, sale_count)`` of
        lists. On a parsing failure the items collected so far are returned.
    """
    coupon, original_price, discount_price, sale_count = [], [], [], []
    try:
        tuan_list = soup.find('div', class_='tuan-list')
        for title in tuan_list.find_all('div', class_='newtitle'):
            coupon.append('CASH' if '代金券' in title.string else 'COMBO')
        for price_tag in tuan_list.find_all('div', class_='price'):
            original_price.append(float(price_tag.string) * 100)
        for price_tag in tuan_list.find_all('div', class_='o-price'):
            discount_price.append(float(price_tag.string) * 100)
        for sold_tag in tuan_list.find_all('span', class_='soldNumNew'):
            sale_count.append(int(sold_tag.string[2:]) * 100)
    except (AttributeError, TypeError, ValueError):
        # No coupon section (AttributeError), an element without a plain
        # string (TypeError) or non-numeric text (ValueError): best effort.
        return coupon, original_price, discount_price, sale_count
    return coupon, original_price, discount_price, sale_count