In [1]:
import numpy as np
import pandas as pd
import pickle as pkl

import random
import requests
import re
import os
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException

## Set up ChromeDriver for Selenium

In [2]:
# Location of the ChromeDriver executable. Set the CHROMEDRIVER_PATH environment
# variable to override it; otherwise the original default location is used.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

We want to get the links for games directly to save the trouble of having Selenium click on links later on.

In [3]:
# Accumulator for the game-page links found on the listing pages.
game_links = list()

In [None]:
# Number of listing pages shown on Metacritic when this notebook was written.
N_LISTING_PAGES = 157

# Use a single browser for every listing page and always close it afterwards;
# starting a new Chrome per page without quitting it leaves one browser
# process behind for each page visited.
driver = webdriver.Chrome(chromedriver)
try:
    for i in range(N_LISTING_PAGES):
        url = f'http://www.metacritic.com/browse/games/score/metascore/all/all/filtered?sort=desc&page={i}'
        driver.get(url)

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Each game on the listing page has its title (and link) in one of these divs.
        tags = soup.find_all('div', attrs={'class':'product_item product_title'})

        for tag in tags:
            anchor = tag.find('a')
            # Skip a title block without a usable link instead of aborting the run.
            if anchor is None or not anchor.get('href'):
                continue
            game_links.append(anchor['href'])

        print(f'Finished retrieving links on page number {i}')
finally:
    driver.quit()

## For each game (using the game links we gathered), gather the following information:

* Game Title
* Release Year
* Publisher
* Genre
* Platform
* Metascore (i.e., the review score for the game)
* Average User Score
* Number of Players (that can play the game)

In [5]:
# The scraped links are only the path part of a URL, so prefix each one with
# the site address to obtain a full link.

full_game_links = []
for relative_link in game_links:
    full_game_links.append('https://www.metacritic.com' + relative_link)

In [6]:
# One independent accumulator list per scraped field; the scraper appends one
# value to each list for every game page it visits.
(title, release_year, publisher, genre,
 platform, metascore, avg_userscore, no_players) = ([] for _ in range(8))

In [None]:
# Set up scraper

def _parse_game_page(soup):
    """Pull the eight tracked fields out of one parsed game page.

    A field whose element is missing (for example on an error page) falls back
    to the sentinel string used throughout this notebook: 'page not found',
    'not specified' or 'no genre'.

    Returns a tuple ordered as (title, release_year, publisher, genre,
    platform, metascore, avg_userscore, no_players).
    """
    # game title
    heading = soup.find('h1')
    title_value = 'page not found' if heading is None else heading.text

    # release year: the last four characters of the release-date text
    date_tag = soup.find('span', attrs={'class':'data', 'itemprop':'datePublished'})
    if date_tag is None:
        year_value = 'not specified'
    else:
        try:
            year_value = int(date_tag.text[-4:])
        except ValueError:
            # The date text does not end in a number; treat it as unknown
            # rather than aborting the whole scrape.
            year_value = 'not specified'

    # publisher/developer
    developer_tag = soup.find('li', attrs={'class':'summary_detail developer'})
    if developer_tag is None:
        publisher_value = 'not specified'
    else:
        publisher_value = developer_tag.text.replace('Developer:','').replace('\n','').replace(' ','')

    # genre(s): find_all() returns a list-like of tags, so the texts must be
    # read tag by tag. join() also covers the single-genre case, where calling
    # `.text` directly on the find_all() result raised AttributeError.
    genre_tags = soup.find_all('span', attrs={'class':'data', 'itemprop':'genre'})
    if genre_tags:
        genre_value = ';'.join([genre_tag.text for genre_tag in genre_tags])
    else:
        genre_value = 'no genre'

    # platform
    platform_tag = soup.find('span', attrs={'itemprop':'device'})
    if platform_tag is None:
        platform_value = 'not specified'
    else:
        platform_value = platform_tag.text.replace('\n','').replace(' ','')

    # metascore
    score_tag = soup.find('span', attrs={'itemprop':'ratingValue'})
    metascore_value = 'not specified' if score_tag is None else score_tag.text

    # average userscore: strip the label, then keep the first three characters
    userscore_tag = soup.find('div', attrs={'class':'userscore_wrap feature_userscore'})
    if userscore_tag is None:
        userscore_value = 'not specified'
    else:
        userscore_value = userscore_tag.text.replace('\nUser Score\n\n','')[0:3]

    # number of players that can play the game
    players_tag = soup.find('li', attrs={'class':'summary_detail product_players'})
    if players_tag is None:
        players_value = 'not specified'
    else:
        players_value = players_tag.text.replace('\n# of players:\n','').replace('\n','')

    return (title_value, year_value, publisher_value, genre_value,
            platform_value, metascore_value, userscore_value, players_value)


link_count = 0 #to keep track of how many links the scraper has gone through

for link_count, link in enumerate(full_game_links, start=1):
    driver = webdriver.Chrome(chromedriver)
    try:
        # Implicit wait applies to Selenium element lookups; kept from the original setup.
        driver.implicitly_wait(3)
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Close the browser even when loading the page fails.
        driver.quit()

    # Parse the whole page first and append afterwards, so the eight lists
    # always grow together and stay aligned row by row.
    record = _parse_game_page(soup)
    field_lists = (title, release_year, publisher, genre,
                   platform, metascore, avg_userscore, no_players)
    for field_list, value in zip(field_lists, record):
        field_list.append(value)

    print(f'Finished gathering Link # {link_count}, title: {len(title)}, year: {len(release_year)}, publisher: {len(publisher)}')
    print(f'genre: {len(genre)}, platform: {len(platform)}, metascore: {len(metascore)}, userscore: {len(avg_userscore)}, no_players: {len(no_players)}')

In [8]:
# Assemble the scraped lists into one DataFrame, one column per field.

scraped_columns = {
    'Title': title,
    'Year': release_year,
    'Publisher': publisher,
    'Genre': genre,
    'Platform': platform,
    'Metascore': metascore,
    'Avg_Userscore': avg_userscore,
    'No_Players': no_players,
}

# Start from an empty frame that fixes the column order, then fill each column.
df = pd.DataFrame(columns=list(scraped_columns))
for column_name, column_values in scraped_columns.items():
    df[column_name] = column_values

In [38]:
# A quick look at what the DataFrame should look like (first five rows).
df.head(n=5)

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
0,The Legend of Zelda: Ocarina of Time,1998,Nintendo,Action Adventure;Fantasy,Nintendo64,99,9.1,1 Player
1,Tony Hawk's Pro Skater 2,2000,NeversoftEntertainment,Sports;Alternative;Skateboarding,PlayStation,98,7.4,1-2
2,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,PlayStation3,98,7.5,1 Player
3,SoulCalibur,1999,Namco,Action;Fighting;3D,Dreamcast,98,8.6,1-2
4,Grand Theft Auto IV,2008,RockstarNorth,Action Adventure;Modern;Modern;Open-World,Xbox360,98,7.9,1 Player


In [None]:
# Persist the assembled DataFrame to disk with pickle.

with open('final_game_general_data.pkl', mode='wb') as out_file:
    pkl.dump(df, file=out_file)

# The code below is another scraper for gathering user comments for each game scraped.

In [11]:
# Rows for game links that could not be accessed or found; these are not needed.

df.loc[df['Year'] == 'not specified']

Unnamed: 0,Title,Year,Publisher,Genre,Platform,Metascore,Avg_Userscore,No_Players
808,Error 503 Service Unavailable,not specified,not specified,no genre,not specified,not specified,not specified,not specified
1360,Bad Request,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2122,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2123,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2124,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2125,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2126,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2127,This page isn’t working,not specified,not specified,no genre,not specified,not specified,not specified,not specified
2977,page not found,not specified,not specified,no genre,not specified,not specified,not specified,not specified
4408,Bad Request,not specified,not specified,no genre,not specified,not specified,not specified,not specified


Set up scraper to collect user comments

In [12]:
# Metacritic is a well-structured website, so every game link can be extended
# to point straight at its user-reviews page; the scraper then does not have
# to click through any links to get there.

full_game_user_links = []
for game_link in full_game_links:
    full_game_user_links.append(game_link + '/user-reviews')

In [13]:
# One independent accumulator list per review field collected by the
# user-review scraper.
(game_title, game_platform, usernames,
 userscores, comments, no_helpfulness) = ([] for _ in range(6))

In [None]:
# Number of games to scrape; the loop never runs past the rows actually available.
N_GAMES_TO_SCRAPE = 4000
# Each review page shows a maximum of 100 user reviews, so at most 100 "Expand" clicks.
MAX_EXPANDS_PER_PAGE = 100
# Upper bound on how many times "next" is followed for one game.
MAX_NEXT_PAGES = 100


def _expand_all_reviews(driver):
    """Click 'Expand' links on the current page until none can be found or clicked."""
    for _ in range(MAX_EXPANDS_PER_PAGE):
        try:
            driver.find_element_by_link_text('Expand').click()
        # `except A or B` only ever names A; a tuple lists both classes explicitly.
        except (WebDriverException, NoSuchElementException):
            break


def _collect_reviews(page_source, title_value, platform_value):
    """Append every review found in `page_source` to the result lists.

    `title_value` and `platform_value` are recorded once per review body so
    each comment carries the game it belongs to.
    """
    soup = BeautifulSoup(page_source, 'html.parser')

    # get usernames
    for user in soup.find_all('div', attrs={'class':'name'}):
        usernames.append(user.text)

    # get user scores
    for grade in soup.find_all('div', attrs={'class':'review_grade'}):
        userscores.append(int(grade.text))

    # get comments (expanded beforehand), tagged with game title and platform
    for review in soup.find_all('div', attrs={'class':'review_body'}):
        comments.append(review.text.replace('… Collapse', ''))
        game_title.append(title_value)
        game_platform.append(platform_value)

    # get how helpful a review is
    for vote in soup.find_all('span', attrs={'class':'total_ups'}):
        no_helpfulness.append(vote.text)


# Use the 'user_reviews_link' column when df carries one; otherwise fall back
# to the list of review links built earlier in this notebook (same row order
# as df when df was built from the scraped lists).
if 'user_reviews_link' in df.columns:
    review_links = df['user_reviews_link']
else:
    review_links = full_game_user_links

game_count = 0

for index in range(min(N_GAMES_TO_SCRAPE, len(df), len(review_links))):

    game_count += 1
    page_number = 1
    game_name = df['Title'][index]
    game_system = df['Platform'][index]

    driver = webdriver.Chrome(chromedriver)
    try:
        driver.implicitly_wait(3)
        driver.get(review_links[index])

        print(f"Starting to gather comments for {game_name}... page {page_number}")

        _expand_all_reviews(driver)
        _collect_reviews(driver.page_source, game_name, game_system)

        for _ in range(MAX_NEXT_PAGES):
            # Look for the link first so a page is only announced when it exists.
            try:
                next_page = driver.find_element_by_link_text('next')
            except NoSuchElementException:
                break

            page_number += 1
            print(f'starting page {page_number}...')
            next_page.click()

            _expand_all_reviews(driver)
            _collect_reviews(driver.page_source, game_name, game_system)

        # Printed for every game, including one that hits the page cap.
        print(f"Finished **{game_name}** game # {game_count} - titles: {len(game_title)}; platforms: {len(game_platform)} users: {len(usernames)}; scores: {len(userscores)}")
        print(f"comments: {len(comments)}; helpful rec'd: {len(no_helpfulness)}")
    finally:
        # Close the browser even if scraping this game raised.
        driver.quit()

In [None]:
# Write each review list to its own pickle file, in a fixed order.

pickle_targets = (
    ('actual_game_title_3420.pkl', game_title),
    ('actual_game_platform_3420.pkl', game_platform),
    ('actual_usernames_3420.pkl', usernames),
    ('actual_userscores_3420.pkl', userscores),
    ('actual_usercomments_3420.pkl', comments),
    ('actual_helpfulness_3420.pkl', no_helpfulness),
)

for file_name, payload in pickle_targets:
    with open(file_name, 'wb') as handle:
        pkl.dump(payload, handle)

