# Overview
- The script will get data (name, score, genre, etc.) of top 100 video-games of the year from metacritic.com
  - The user specifies a year; the script then exports the result to a .csv file
  - If the script encounters an exception, it exports its progress so far to a .csv file
- Rough overview of execution
  - urls_list = get_game_links(year=2010, driver=driver)
  - game_data_dataframe = scrape_list(urls_list, driver)
  - game_data_dataframe.to_csv('data_2010.csv')

In [None]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.webdriver.common.keys import Keys

In [None]:
def get_game_links(year, driver, page=0, save_dir = False):
    """
    Return the metacritic.com game-page urls listed for ``year``.

    Checks the working directory for "links_{year}.csv", a text file holding
    one url per line:
        if it exists, the urls are read from it and the browser is not used
        (``page`` is ignored in that case, the cached list is returned as is)
        if it doesn't exist, the urls are collected from the browse page
        with ``driver`` and, optionally, written to that file

    Parameters
    ----------
    year : INT
        the year of the video games that the user wants to scrape
    driver : selenium.webdriver object
        the selenium webdriver that will load the webpage to be scraped
    page : INT, OPTIONAL
        if 0, it will get top 1-100 games. if 1 it will get top 101-200 games. and so forth.
    save_dir: Boolean, OPTIONAL
        if True, function will write "links_{year}.csv" in the working
        directory, containing the list of urls

    Returns
    -------
    links : LIST
        a list of url strings, each is the address to the metacritic.com page of an individual game

    """
    page_url = f'https://www.metacritic.com/browse/games/score/metascore/year/pc/filtered?year_selected={year}&page={page}'
    cache_path = f'links_{year}.csv'

    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            # strip() rather than line[:-1]: the last line may lack a trailing
            # newline, and blank lines must not turn into empty "urls".
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        # the .csv for links has not been made yet
        print(f'=== links_{year}.csv not yet created, proceed to get')

    driver.switch_to.window(driver.window_handles[0])
    driver.get(page_url)

    links = [
        entry.find_element(By.CSS_SELECTOR, "a:nth-child(4)").get_attribute('href')
        for entry in driver.find_elements(By.CLASS_NAME, 'clamp-summary-wrap')
    ]

    # Open a blank second window and close it again; kept exactly as the
    # original did it. NOTE(review): the purpose is not visible in this file.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.close()

    if save_dir:
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.writelines(f"{link}\n" for link in links)

    return links

In [None]:
def get_game_data(page_url, driver):
    """
    Load one game's page and collect its details into a flat list.

    Parameters
    ----------
    page_url : string
        the metacritic.com url of an individual game.
    driver : selenium.webdriver
        the selenium webdriver that will load the webpage to be scraped

    Returns
    -------
    list
        [title, publisher, release_date, platform, developer, rating,
        multiplayer, genre, meta_score, meta_count, user_score,
        user_reviews_count, must_play]
        platform and genre are lists of strings (empty when the section is
        missing); developer, rating and multiplayer are None when missing;
        user_score and user_reviews_count are '' when missing;
        must_play is 1 or 0.

    Raises
    ------
    NoSuchElementException
        if the title, publisher, release date or metascore elements
        cannot be found; these lookups are not guarded.

    """
    def optional_text(css_selector):
        # Text of the first match, or None when the page has no such element.
        try:
            return driver.find_element(By.CSS_SELECTOR, css_selector).text
        except NoSuchElementException:
            return None

    def optional_texts(container_selector, item_class):
        # Texts of every `item_class` element inside the container,
        # or an empty list when the container itself is absent.
        try:
            container = driver.find_element(By.CSS_SELECTOR, container_selector)
            return [item.text for item in container.find_elements(By.CLASS_NAME, item_class)]
        except NoSuchElementException:
            return []

    driver.switch_to.window(driver.window_handles[0])
    driver.get(page_url)

    ## Title Details (mandatory: a missing element raises)
    title = driver.find_element(
        By.CSS_SELECTOR, 'div.product_title > a:nth-child(1) > h1:nth-child(1)'
    ).text
    publisher = driver.find_element(
        By.CSS_SELECTOR, ".publisher > span:nth-child(2) > a:nth-child(1)"
    ).text
    release_date = driver.find_element(
        By.CSS_SELECTOR, ".release_data > span:nth-child(2)"
    ).text

    ## Platforms
    platform = optional_texts('.product_platforms > span:nth-child(2)', "hover_none")

    ## Side Details
    developer = optional_text('.developer > span:nth-child(2)')
    rating = optional_text('.product_rating > span:nth-child(2)')
    multiplayer = optional_text('.product_players > span:nth-child(2)')
    genre = optional_texts('.product_genre', "data")

    ## Scores (metascore is mandatory: a missing element raises)
    meta_summary = driver.find_element(By.CLASS_NAME, "score_summary.metascore_summary")
    meta_score = meta_summary.find_element(By.TAG_NAME, "span").text
    count_spans = meta_summary.find_element(By.CLASS_NAME, "count").find_elements(By.TAG_NAME, "span")
    meta_count = count_spans[-1].text

    # User score and review count are treated as a pair: if either lookup
    # fails, both fall back to ''.
    try:
        side_details = driver.find_element(By.CSS_SELECTOR, ".side_details")
        score_anchor = side_details.find_element(By.CLASS_NAME, "metascore_anchor")
        user_score = score_anchor.find_element(By.TAG_NAME, "div").text
        user_reviews_count = side_details.find_element(
            By.CSS_SELECTOR,
            "div:nth-child(1) > div:nth-child(3) > p:nth-child(1) > span:nth-child(2) > a:nth-child(3)",
        ).text
    except NoSuchElementException:
        user_score = user_reviews_count = ''

    ## Must Play Label: flagged only when both marker elements are present
    try:
        driver.find_element(By.CLASS_NAME, "product_image.large_image.must_play")
        driver.find_element(By.CSS_SELECTOR, "span.must_play")
    except NoSuchElementException:
        must_play = 0
    else:
        must_play = 1

    # Open a blank second window and immediately close it, as the original does.
    driver.execute_script("window.open('');")
    driver.switch_to.window(driver.window_handles[1])
    driver.close()

    return [
        title,
        publisher,
        release_date,
        platform,
        developer,
        rating,
        multiplayer,
        genre,
        meta_score,
        meta_count,
        user_score,
        user_reviews_count,
        must_play,
    ]

In [None]:
def scrape_list(links, driver, prior_data=None, delay=15):
    """
    scrapes the data of games, whose urls are listed in 'links'

    If any exception is raised while scraping, it is printed and the rows
    collected so far are still returned, so the caller can save progress.

    Parameters
    ----------
    links : list
        list containing a game's url to scrape data from.
    driver : selenium.webdriver
        the selenium webdriver that will load the webpage to be scraped
    prior_data : pandas.DataFrame, optional
        a backup dataframe to continue progress from a previous failed attempt
        if provided a dataframe with length i, function will start scraping from the i+1-th url
        if omitted (None), scraping starts from the first url
    delay : int or float, optional
        seconds to wait between two consecutive page loads (default 15).
        No wait is done after the last url.

    Returns
    -------
    df_all : pandas.DataFrame
        dataframe combining data from prior_data and currently scraped data,
        with a fresh 0..n-1 index

    """
    # Build the default per call instead of sharing one DataFrame object
    # created at function-definition time.
    if prior_data is None:
        prior_data = pd.DataFrame([])

    column_names = ["title", "publisher", "release_date", "platform", "developer", "rating", "multiplayer", "genre", "meta_score", "meta_count", "user_score", "user_reviews_count", "must_play"]
    data = []
    last_index = len(links) - 1

    try:
        # Resume right after the rows already present in prior_data.
        for count in range(len(prior_data), len(links)):
            data.append(get_game_data(links[count], driver))
            print("===", count, data[-1][0], "succesfully scraped")
            if delay and count < last_index:
                time.sleep(delay)
    except Exception as e:
        # Deliberately broad: whatever went wrong, hand back the progress.
        print("=== ecountered exception, returning progress dataframe")
        print(e)

    df = pd.DataFrame(data, columns=column_names)
    # ignore_index avoids duplicated index labels between old and new rows.
    df_all = pd.concat([prior_data, df], ignore_index=True)
    return df_all

# Execution

In [None]:
## Initialize selenium.webdriver object
# Builds the single Firefox driver that every scraping function in this
# notebook receives as its `driver` argument. The paths below are specific
# to the author's Windows machine and must be adapted before running elsewhere.
options = webdriver.firefox.options.Options()
options.binary_location = r'C:\Program Files\Mozilla Firefox\firefox.exe'
# NOTE(review): this stores the profile directory under a preference key named
# 'profile'; it is unclear from here whether that makes Firefox load the
# profile -- confirm against the Selenium Firefox options documentation.
options.set_preference('profile', 'C:\\Users\\ACER\\b29bh87j.gecko_novideoplay')
options.set_preference('media.mp4.enabled', False)
# Launches the browser as a side effect of running this cell.
driver = webdriver.Firefox(options=options)
#driver.install_addon(r'C:\adblock_plus-3.17.xpi')

In [None]:
## Run Script
# For each year: collect the game urls, resume from data_<year>.csv when a
# previous run left one behind, then (over)write that file with the result.
for year in range(2007, 2008):
    links = get_game_links(year, driver=driver, page=0)

    output_path = f'data_{year}.csv'
    try:
        prior_data = pd.read_csv(output_path)
    except FileNotFoundError:
        # First run for this year: start from an empty frame.
        print(f"=== No Prior_data: data_{year}.csv found")
        prior_data = pd.DataFrame([])

    scrape_result = scrape_list(links, driver, prior_data)
    scrape_result.to_csv(output_path, mode='w+', index=False)