In [3]:
import json
import time
import argparse
from pprint import pprint
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains


def write_results_to_json(results):
    """Dump the scraped results into a JSON file named after the current time.

    The file is created in the current working directory and is called
    ``results_<YYYYmmddTHHMMSS>.json``, where the stamp is the local time at
    which this function runs.

    Args:
        results: Object to serialize with ``json.dump`` (the scraper passes
            a list of per-review dicts).
    """
    stamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    out_path = f"results_{stamp}.json"
    with open(out_path, 'w') as handle:
        json.dump(results, handle)


def load_results_json(file):
    """Read a JSON file and return its decoded contents.

    Args:
        file: Path of the JSON file to open for reading.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(file, 'r') as handle:
        return json.load(handle)


def parse_review_text(text):
    r"""Split the raw text of a scraped review into its individual fields.

    A review without a developer reply arrives one item per line, e.g.::

        riani wahyu prajanti
        December 29, 2019
        <icon>
        1
        <icon>
        The app is really easy to use. Lots of information about wines ...

    Vivino support often replies to negative reviews. In that case a header
    line (the word ``Vivino`` directly followed by the reply date) and the
    reply itself come after the review text::

        'Olivia Andrew\nDecember 4, 2019\n\ue8dc\n8\n\ue5d4\nToo Complicated.\n'
        'VivinoDecember 5, 2019\nWe are sorry to hear that. ...'

    Args:
        text: Raw review text with fields separated by ``'\n'``.

    Returns:
        Tuple ``(name, date, review_text, vivino_response)``. The last item
        is ``''`` when no reply header line is found.

    Raises:
        IndexError: If ``text`` contains fewer than two lines.
    """
    # Local import: `re` is not part of the notebook's import cell.
    import re

    lines = text.split('\n')
    raw_name = lines[0]
    raw_date = lines[1]

    if len(lines) <= 6:
        # Short layout: no room for a reply, the last line is the review.
        return (raw_name, raw_date, lines[-1], '')

    # Longer layouts either carry a Vivino reply or a review that itself
    # spans several lines. Counting lines cannot tell these apart, so look
    # for the reply header explicitly.
    # NOTE(review): the pattern assumes English "Month D, YYYY" dates, as in
    # the examples above - confirm if the page is scraped in another locale.
    reply_header = re.compile(r'^Vivino\s*[A-Z][a-z]+ \d{1,2}, \d{4}\s*$')
    for idx in range(6, len(lines)):
        if reply_header.match(lines[idx]):
            review_text = '\n'.join(lines[5:idx])
            vivino_response_text = '\n'.join(lines[idx + 1:])
            return (raw_name, raw_date, review_text, vivino_response_text)

    # No reply header: everything from the sixth line on is review text.
    return (raw_name, raw_date, '\n'.join(lines[5:]), '')


class ShowMoreButton:
    """Wrapper around the 'SHOW MORE' button that loads additional reviews.

    Args:
        driver: Selenium WebDriver whose current page is searched.
    """

    def __init__(self, driver):
        self.driver = driver
        # Absolute CSS path to the button's label element.
        self._css_selector = '#fcxH9b > div.WpDbMd > c-wiz > div > div.ZfcPIb > div > div.JNury.Ekdcne > div > div > div.W4P4ne > div:nth-child(2) > div.PFAhAf > div > span > span'

    def _find(self):
        """Locate the button element; raises NoSuchElementException if absent."""
        # `find_element(By..., ...)` replaces `find_element_by_css_selector`,
        # which no longer exists in Selenium >= 4.3.
        return self.driver.find_element(By.CSS_SELECTOR, self._css_selector)

    def is_present(self):
        """ Determine if 'SHOW MORE' reviews button is currently present on page.

        Returns:
            True only when the element exists and its text is exactly
            'SHOW MORE'; False otherwise.
        """
        try:
            label = self._find().text
        except NoSuchElementException:
            return False
        return label == 'SHOW MORE'

    def move_to(self):
        """ Move to the button location on the page. """
        ActionChains(self.driver).move_to_element(self._find()).perform()

    def click(self):
        """ Click the show more button, waiting up to 5 seconds for it to be clickable. """
        clickable = EC.element_to_be_clickable((By.CSS_SELECTOR, self._css_selector))
        WebDriverWait(self.driver, 5).until(clickable).click()

    def try_to_show_more_reviews(self):
        """Scroll to and click the button if present; otherwise do nothing."""
        if not self.is_present():
            return
        self.move_to()
        self.click()


class ReviewStars:
    """Reads the star rating of the review at a given position on the page.

    Args:
        driver: Selenium WebDriver whose current page is searched.
        review_num: Position of the review, interpolated into the CSS
            ``nth-child(...)`` selector.
    """

    def __init__(self, driver, review_num):
        self.driver = driver
        self.review_num = review_num
        # Absolute CSS path to the element whose 'aria-label' holds the rating.
        self._css_selector = f"#fcxH9b > div.WpDbMd > c-wiz > div > div.ZfcPIb > div > div.JNury.Ekdcne > div > div > div.W4P4ne > div:nth-child(2) > div > div:nth-child({review_num}) > div > div.d15Mdf.bAhLNe > div.xKpxId.zc7KVe > div.bAhLNe.kx8XBd > div > span.nt2C1d > div > div"

        # Last rating read through `rank`; empty until first access.
        self._rank = ()

    @property
    def rank(self):
        """Rating as a ``(stars, 5)`` tuple, e.g. ``(4, 5)`` for 4/5 stars.

        The element is looked up on every access. The star count is taken
        from the second space-separated token of the 'aria-label' attribute.
        NOTE(review): presumably the label reads like "Rated 4 stars ..." -
        confirm against the live page.

        Raises:
            NoSuchElementException: If the rating element is not on the page.
        """
        # `find_element(By..., ...)` replaces `find_element_by_css_selector`,
        # which no longer exists in Selenium >= 4.3.
        element = self.driver.find_element(By.CSS_SELECTOR, self._css_selector)
        tokens = element.get_attribute('aria-label').split(" ")
        # Review star ranks are on a x/5 scale.
        self._rank = (int(tokens[1]), 5)
        return self._rank


class Review:
    """A single review on the page, addressed by its position.

    The review element is looked up once when the object is created, so
    construction raises ``NoSuchElementException`` if the review is not on
    the page.

    Args:
        driver: Selenium WebDriver whose current page is searched.
        review_num: Position of the review, interpolated into the CSS
            ``nth-child(...)`` selector.
    """

    def __init__(self, driver, review_num):
        self._driver = driver
        self.review_num = review_num
        # Absolute CSS path to the review container.
        self._css_selector = f'#fcxH9b > div.WpDbMd > c-wiz > div > div.ZfcPIb > div > div.JNury.Ekdcne > div > div > div.W4P4ne > div:nth-child(2) > div > div:nth-child({review_num})'
        # `find_element(By..., ...)` replaces `find_element_by_css_selector`,
        # which no longer exists in Selenium >= 4.3.
        self._element = self._driver.find_element(By.CSS_SELECTOR, self._css_selector)

        # Absolute CSS path to the 'Full Review' button inside this review.
        self._full_review_css_selector = f"#fcxH9b > div.WpDbMd > c-wiz > div > div.ZfcPIb > div > div.JNury.Ekdcne > div > div > div.W4P4ne > div:nth-child(2) > div > div:nth-child({self.review_num}) > div > div.d15Mdf.bAhLNe > div.UD7Dzf > span:nth-child(1) > div > button"

        self._raw_text = ''

        # Parsed values: reviewer name, date, review text, Vivino reply
        self._name = ''
        self._date = ''
        self._text = ''
        self._vivino_response = ''

    @property
    def raw_text(self):
        """Current text of the review element (re-read on every access)."""
        self._raw_text = self._element.text
        return self._raw_text

    @property
    def name(self):
        """Reviewer name: first field returned by ``parse_review_text``."""
        self._name = parse_review_text(self.raw_text)[0]
        return self._name

    @property
    def date(self):
        """Review date string: second field returned by ``parse_review_text``."""
        self._date = parse_review_text(self.raw_text)[1]
        return self._date

    @property
    def text(self):
        """Review body: third field returned by ``parse_review_text``."""
        self._text = parse_review_text(self.raw_text)[2]
        return self._text

    @property
    def vivino_response(self):
        """Vivino's reply: fourth field returned by ``parse_review_text``."""
        self._vivino_response = parse_review_text(self.raw_text)[3]
        return self._vivino_response

    def move_to(self):
        """ Move to the review location on the page. """
        # The element is looked up again here rather than reusing the one
        # cached in __init__.
        target = self._driver.find_element(By.CSS_SELECTOR, self._css_selector)
        ActionChains(self._driver).move_to_element(target).perform()

    def is_expandable(self):
        """ Check if 'Full Review' button present in review to show more text.

        Returns:
            True only when the button exists and its text is exactly
            'Full Review'; False otherwise.
        """
        try:
            label = self._element.find_element(
                By.CSS_SELECTOR, self._full_review_css_selector).text
        except NoSuchElementException:
            return False
        return label == "Full Review"

    def expand(self):
        """Click 'Full Review' if available, waiting up to 5 seconds for it to be clickable."""
        if not self.is_expandable():
            return
        clickable = EC.element_to_be_clickable(
            (By.CSS_SELECTOR, self._full_review_css_selector))
        WebDriverWait(self._driver, 5).until(clickable).click()


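# NOTE: `main` below is commented out, yet a later cell still calls
# `main(myargs)`; that call only works in a kernel where `main` was
# defined before being commented out.
# def main(args):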
#     """ Main entry point for application. """

#     max_reviews = args.max_reviews
#     print(f"***** STARTING SCRAPER *****")

#     # Google app store Vivino page URL
#     vivino_url = "https://play.google.com/store/apps/details?id=vivino.web.app&showAllReviews=true"

#     # Setup the driver.
#     driver = webdriver.Chrome()
#     driver.get(vivino_url)

#     show_more_btn = ShowMoreButton(driver)

#     # max_reviews = 10
#     review_num = 1

#     results = []
#     while True:

#         if review_num > max_reviews:
#             break

#         print(f"Getting Review #{review_num}/{max_reviews}")

#         try:

#             # Try to click the 'Show More' reviews button if available on page
#             show_more_btn.try_to_show_more_reviews()

#             # Parse the review
#             review = Review(driver, review_num)
#             review.move_to()
#             if review.is_expandable():
#                 review.expand()

#             stars = ReviewStars(driver, review_num)
#             # print(review.name, review.date, review.text, stars.rank)

#             results.append({
#                 "name": review.name,
#                 "date": review.date,
#                 "text": review.text,
#                 "stars": stars.rank,
#                 "vivino_response": review.vivino_response,
#             })
#         except Exception as e:
#             print(f"Exception occurred on review #{review_num}. Skipping review. Exception: \n {e}")

#         review_num += 1
#         time.sleep(1)

#     # Shutdown the Chrome driver
#     # driver.close()

#     # pprint(results)
#     print(f"**** FINISHED! SUCCESSFULLY SCRAPED {len(results)} TOTAL REVIEWS! ****")

#     # Save the scraped results to a JSON file
#     write_results_to_json(results)


# if __name__ == "__main__":

#     # Parse command line args
#     parser = argparse.ArgumentParser()
#     parser.add_argument("--max_reviews", type=int, default=10, help="Maximum number of reviews to scrape.")
#     args = parser.parse_args()

#     # Run the scraper
#     main(args)


In [4]:

# Main entry point for the scraping run.
MAX_REVIEWS = 30000
# MAX_REVIEWS = 500

max_reviews = MAX_REVIEWS  # args.max_reviews
print("***** STARTING SCRAPER *****")

# Google app store Vivino page URL
vivino_url = "https://play.google.com/store/apps/details?id=vivino.web.app&showAllReviews=true"

# Setup the driver.
driver = webdriver.Chrome()
driver.get(vivino_url)

# The button wrapper only stores the driver and a selector, so a single
# instance is reused for every iteration.
show_more_btn = ShowMoreButton(driver)

# Initialize the review counter
review_num = 1

results = []
try:
    while review_num <= max_reviews:

        print(f"Getting Review #{review_num}/{max_reviews}")

        try:
            # Try to click the 'Show More' reviews button if available on page
            show_more_btn.try_to_show_more_reviews()

            # Parse the review
            review = Review(driver, review_num)
            review.move_to()
            if review.is_expandable():
                review.expand()

            stars = ReviewStars(driver, review_num)

            # Check for the 'Show More' button again after moving to the review.
            show_more_btn.try_to_show_more_reviews()

            results.append({
                "name": review.name,
                "date": review.date,
                "text": review.text,
                "stars": stars.rank,
                "vivino_response": review.vivino_response,
            })
        except Exception as e:
            # Best effort: report the failure, pause briefly, move on.
            print(f"Exception occurred on review #{review_num}. Skipping review. Exception: \n {e}")
            time.sleep(3)

        review_num += 1

    print(f"**** FINISHED! SUCCESSFULLY SCRAPED {len(results)} TOTAL REVIEWS! ****")
finally:
    # Save whatever was collected even when the run is cut short (e.g. by a
    # KeyboardInterrupt), so a long scrape does not depend on a manual save.
    write_results_to_json(results)

# Shutdown the Chrome driver (left disabled, so the browser window stays open)
# driver.close()

***** STARTING SCRAPER *****
Getting Review #1/30000
Getting Review #2/30000
Getting Review #3/30000
Getting Review #4/30000
Getting Review #5/30000
Getting Review #6/30000
Getting Review #7/30000
Getting Review #8/30000
Getting Review #9/30000
Getting Review #10/30000
Getting Review #11/30000
Getting Review #12/30000
Getting Review #13/30000
Getting Review #14/30000
Getting Review #15/30000
Getting Review #16/30000
Getting Review #17/30000
Getting Review #18/30000
Getting Review #19/30000
Getting Review #20/30000
Getting Review #21/30000
Getting Review #22/30000
Getting Review #23/30000
Getting Review #24/30000
Getting Review #25/30000
Getting Review #26/30000
Getting Review #27/30000
Getting Review #28/30000
Getting Review #29/30000
Getting Review #30/30000
Getting Review #31/30000
Getting Review #32/30000
Getting Review #33/30000
Getting Review #34/30000
Getting Review #35/30000
Getting Review #36/30000
Getting Review #37/30000
Getting Review #38/30000
Getting Review #39/30000
Getti

KeyboardInterrupt: 

In [5]:
# Number of reviews collected in `results` so far.
len(results)

5716

In [6]:
def write_results_to_json(results):
    """Dump the scraped results into a JSON file named after the current time.

    The file is created in the current working directory and is called
    ``results_<YYYYmmddTHHMMSS>.json``, where the stamp is the local time at
    which this function runs.

    Args:
        results: Object to serialize with ``json.dump`` (the scraper passes
            a list of per-review dicts).
    """
    stamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    out_path = f"results_{stamp}.json"
    with open(out_path, 'w') as handle:
        json.dump(results, handle)

In [7]:
# Manual save: in the recorded run the scraping cell was interrupted
# (KeyboardInterrupt) before it reached its own save step.
write_results_to_json(results)

In [6]:
import json


def load_results_json(file):
    """Read a JSON file and return its decoded contents.

    Args:
        file: Path of the JSON file to open for reading.

    Returns:
        The Python object produced by ``json.load`` for that file.
    """
    with open(file, 'r') as handle:
        return json.load(handle)

In [8]:
# Load a previously saved results file.
# NOTE(review): this name does not follow the timestamped pattern produced by
# write_results_to_json - presumably the file was renamed by hand; confirm it
# exists before re-running.
data = load_results_json("results_500reviews.json")

In [9]:
# Sanity check: number of records loaded from the file.
len(data)

500

In [10]:
# Inspect the last loaded record to check the stored fields.
data[-1]

{'name': 'Matt S',
 'date': 'January 3, 2019',
 'text': 'This app is very handy when looking to select something new to try, the user ratings on the wines and the individual wine profiles are both very helpful and interesting.',
 'stars': [5, 5],
 'vivino_response': ''}

In [10]:
class MyArgs:
    """Minimal stand-in for the parsed command-line arguments.

    Exposes the single attribute ``max_reviews`` so an instance can be passed
    where an ``args`` object with that attribute is expected.
    """

    def __init__(self, max_reviews):
        # Upper bound on the number of reviews to scrape.
        self.max_reviews = max_reviews
        

In [11]:
# Stand-in for parsed CLI arguments: limit the run to 10 reviews.
myargs = MyArgs(max_reviews=10)

In [12]:
# NOTE(review): `main` appears in this notebook only as commented-out code,
# so on a fresh kernel this call raises NameError; the recorded output below
# comes from a session in which `main` was still defined.
main(myargs)

***** STARTING SCRAPER *****
Getting Review #1/10
Getting Review #2/10
Getting Review #3/10
Getting Review #4/10
Getting Review #5/10
Getting Review #6/10
Getting Review #7/10
Getting Review #8/10
Getting Review #9/10
Getting Review #10/10
**** FINISHED! SUCCESSFULLY SCRAPED 10 TOTAL REVIEWS! ****
