In [4]:
import mongo_db
import TripAdvisor_scraper
from bs4 import BeautifulSoup
from selenium import webdriver

##### Connect with Selenium driver

In [5]:
driver = webdriver.Chrome()

##### Extract the urls from all the restaurants with Beautiful Soup

In [6]:
# It stores in the list urls_restaurants all the clickable urls of the restaurants
urls_restaurants = []
url = 'https://www.tripadvisor.com/Restaurants-g189473-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html'
while True: # while there are pages to read
    try:
        # extract the data from each page
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # get the clickable url form each title
        urls_restaurants = TripAdvisor_scraper.restaurants_urls(soup, urls_restaurants)

        # go to the next page
        url = TripAdvisor_scraper.next_page(soup, "nav next rndBtn ui_button primary taLnk")

    except Exception: # when there are no other pages to read
        break
# keep only the unique urls - duplicates because of sponsorships
urls_restaurants = list(set(urls_restaurants))

##### Create a local database and a collection

In [7]:
collection = mongo_db.create_mongo()

##### Extract the all the reviews for each restaurant url with Beautiful Soup

In [8]:
# for all the restaurants
for restaurant_url in urls_restaurants:
    print(restaurant_url)

    # for all the reviews
    url = restaurant_url
    while True: # while there are review pages to read
        try:
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # get the name of all the restaurant
            restaurant_name = TripAdvisor_scraper.restaurant_name(soup)

            # get the username of all the reviewers
            usernames = TripAdvisor_scraper.username(soup)

            # get the review date of all the reviews
            review_dates = TripAdvisor_scraper.review_date(soup)

            # get the visit date of all the reviews
            visit_dates = TripAdvisor_scraper.visit_date(soup)

            # get the title of all the reviews
            titles = TripAdvisor_scraper.title(soup)

            # get the text of all the reviews
            # get the first text that appears
            partial_texts = TripAdvisor_scraper.partial_text(soup)
            # get the text inside the more button
            more_texts = TripAdvisor_scraper.more_text(soup)

            if more_texts: # if there is more section
                # combine the two texts to create the full review
                full_review_texts = []
                iterations = len(partial_texts)
                for it in range(iterations):
                    full_review_text = partial_texts[it] + ' ' + more_texts[0]
                    full_review_texts.append(full_review_text)
            else: # if there is no more section
                full_review_texts = partial_texts

            # get the rating of all the reviews
            ratings = TripAdvisor_scraper.rating(soup)

            # insert the review to the MongoDb
            for item in range(len(usernames)):
                item_dict = {
                    "restaurant": restaurant_name,
                    "username": usernames[item],
                    "dateOfReview": review_dates[item],
                    "dateOfVisit": visit_dates[item],
                    "title": titles[item],
                    "text": full_review_texts[item],
                    "rating": ratings[item]
                }
                mongo_db.insert_to_mongo(collection, item_dict)

            # go to the next page
            url = TripAdvisor_scraper.next_page(soup, "nav next ui_button primary")

        except Exception as exp: # when there are no other review pages to read
            break

https://www.tripadvisor.com/Restaurant_Review-g189473-d12395643-Reviews-Methi_Bistro_Wine_Bar-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d23464816-Reviews-KIFI-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d17724125-Reviews-Cup_Flavors_Experience-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d23649169-Reviews-Surfer_Maya-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d23946324-Reviews-Nutribowl-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d17570992-Reviews-Falafel_Taste_Middle_East-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html
https://www.tripadvisor.com/Restaurant_Review-g189473-d17556923-Reviews-Simon_s-Thessaloniki_Thessaloniki_Region_Central_Macedonia.htm

In [10]:
driver.quit()

#### TODOS:
1. Only English reviews - in Greek reviews it can not take the text and title