In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager #Automatically manages ChromeDriver
from bs4 import BeautifulSoup
import pandas as pd
import time

# ChromeDriver set up with Service
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

url = 'https://www.metacritic.com/game/animal-crossing-new-horizons/user-reviews/'
driver.get(url)

#Create a dictionary so that all the parsed data can be stored
review_dict = {'name':[], 'date':[], 'rating':[], 'review':[]}

SCROLL_PAUSE_TIME = 5

def parse_reviews(soup):
    for review in soup.find_all('div', class_='c-siteReview_main'):
        try:
            review_dict['name'].append(review.find('a', class_='c-siteReviewHeader_username').text.strip())
        except AttributeError:
            review_dict['name'].append(None)

        try:
            review_dict['date'].append(review.find('div', class_='c-siteReviewHeader_reviewDate').text.strip())
        except AttributeError:
            review_dict['date'].append(None)

        try:
            review_dict['rating'].append(review.find('div', class_='c-siteReviewHeader_reviewScore').find_all('div')[0].text)
        except AttributeError:
            review_dict['rating'].append(None)
        
        #Check if there's a read more button for spoilers
        quote_div = review.find('div', class_='c-siteReview_quote')
        if quote_div and quote_div.find('span') and '[SPOILER ALERT' in quote_div.find('span').text:
            # Click the "Read More" button to open the full review
            try:
                link = review.find('a', string='Read More')
                if link:
                    link_element = driver.find_element(By.XPATH, '//*[@id="__layout"]/div/div[2]/div[1]/div[1]/section/div[6]/div[8]/div/div[1]/div[2]/div[2]/button')
                    link_element.click()   

                    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "/html/body/div[5]"))
                    )

                    # Get the full review from the modal
                    expanded_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    for popReview in expanded_soup.find_all('div', class_='c-globalModal'):
                        review_dict['name'].append(popReview.find('a', class_='c-siteReviewHeader_username').text.strip())

                        review_dict['date'].append(popReview.find('div', class_='c-siteReviewHeader_reviewDate').text)

                        review_dict['rating'].append(popReview.find('div', class_='c-siteReviewScore').find('span').text)

                        review_dict['review'].append(popReview.find('div', class_='c-siteReviewReadMore_wrapper').text.strip())
                else:
                    review_dict['review'].append(None)
            except Exception as e:
                print(f"Error with 'Read More' button: {e}")
                review_dict['review'].append(None)

        else:
            try:
                review_text = review.find('div', class_='c-siteReview_quote').find('span').text.strip()
                review_dict['review'].append(review_text)
            except AttributeError:
                review_dict['review'].append(None)


In [12]:
#Get initial height of the page
last_height = driver.execute_script("return document.body.scrollHeight")

#Scroll and load dynamic content, allowing it to pause and load after each scroll
while True:
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    parse_reviews(soup)

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    
    #Wait to load more content
    time.sleep(SCROLL_PAUSE_TIME)
    
    #Calculate new scroll height and compare it with the last scroll height
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break  #Break the loop if no more new content is loaded (end of page)
    last_height = new_height

In [13]:
NHorizonReviews = pd.DataFrame(review_dict)

NHorizonReviews.shape

(166020, 4)

In [14]:
NHorizonReviews.to_csv('NHReviews.csv', index=False)

In [15]:
driver.quit()