-
-
Notifications
You must be signed in to change notification settings - Fork 73
/
get_reviews.py
103 lines (78 loc) · 3.92 KB
/
get_reviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
#
# https://stackoverflow.com/a/47858268/1832058
#
import requests # to get html from server
from bs4 import BeautifulSoup # to search in html
import re
#import webbrowser # to open html file in web browser
# Global session so cookies (and the language preference below) persist
# across every request the scraper makes.
s = requests.Session()
# Ask TripAdvisor for reviews in ALL languages.  The original set this
# cookie on domain '.google.co.uk', so it was never sent to
# tripadvisor.com and only English reviews came back (the old comment
# even noted it "doesn't work"); scope it to the TripAdvisor domain.
s.cookies.set('TALanguage', 'ALL', domain='.tripadvisor.com', path='/') # other languages ie. 'en', 'es'
def get_soup(url):
    """Fetch `url` with the shared session and return the parsed HTML.

    Returns a BeautifulSoup document on HTTP 200; on any other status
    code it prints the code and returns None so callers can bail out.
    """
    # Spoof a desktop browser UA -- TripAdvisor serves different (or no)
    # markup to unknown clients.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}
    r = s.get(url, headers=headers)
    # Debug aid: dump the raw HTML to a file and open it in a browser.
    #with open('temp.html', 'wb') as f:
    #    f.write(r.content)
    #    webbrowser.open('temp.html')
    if r.status_code != 200:
        print('status code:', r.status_code)
        return None  # explicit; the original fell off the end returning None implicitly
    return BeautifulSoup(r.text, 'html.parser')
def parse(url, response):
    """Read the total review count from a hotel page and walk every
    paginated review page, handing each one to parse_reviews().

    url      -- hotel page URL ending in '.html'
    response -- BeautifulSoup of that page, or None on fetch failure
    """
    if not response:
        print('no response:', url)
        return
    # Total review count, rendered on the page as e.g. "(1,234)".
    # Keep only the digits -- more robust than slicing off the
    # parentheses and then stripping commas, which breaks if the
    # surrounding text ever changes.
    count_text = response.find('span', class_='reviews_header_count').text
    num_reviews = int(re.sub(r'\D', '', count_text))
    print('num_reviews:', num_reviews, type(num_reviews))
    # Build a URL template: TripAdvisor pages reviews with an '-orN'
    # offset segment inserted before the trailing '.html'.
    url = url.replace('.html', '-or{}.html')
    print('template:', url)
    # NOTE(review): step of 5 assumes 5 reviews per page -- confirm
    # against the live site; a wrong step only re-fetches overlapping pages.
    for offset in range(0, num_reviews, 5):
        print('url:', url.format(offset))
        url_ = url.format(offset)
        parse_reviews(url_, get_soup(url_))
        #return # for test only - to stop after first page
def parse_reviews(url, response):
    """Print every review found on one page of reviews.

    url      -- page URL (used only for logging)
    response -- BeautifulSoup of the page, or None on fetch failure
    """
    print('review:', url)
    if not response:
        print('no response:', url)
        return
    for review in response.find_all('div', class_='review-container'):
        # Any of these elements may be absent from a review card, and
        # find() returns None in that case.  The original only guarded
        # 'badgetext'; guard every lookup so one incomplete review does
        # not kill the whole scrape with an AttributeError.
        def _text(tag, cls):
            # Text of the first matching element, or '' when missing.
            el = review.find(tag, class_=cls)
            return el.text if el else ''

        date_el = review.find('span', class_='ratingDate')
        item = {
            'review_title': _text('span', 'noQuotes'),
            'review_body': _text('p', 'partial_entry'),
            'review_date': date_el['title'] if date_el else '',
            'num_reviews_reviewer': _text('span', 'badgetext'),
        }
        #~ yield item
        for key, val in item.items():
            print(key, ':', val)
        print('----')
        #return # for test only - to stop after first review
# Seed URLs for exercising the scraper; uncomment the others to try
# additional hotels.
start_urls = [
    'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

# Fetch each starting page and hand it to the parser.
for start_url in start_urls:
    parse(start_url, get_soup(start_url))