In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import time
import random
import requests

In [2]:
# Search-results URL template; {YEAR} and {PAGE} are filled in with str.format.
url = (
    "https://www.metacritic.com/browse/games/score/metascore/year/all/filtered"
    "?year_selected={YEAR}&distribution=&sort=desc&view=detailed&page={PAGE}"
)

In [3]:
# Sample search page used to try the parser: year 2022, page 0.
curr_url = url.format(YEAR=2022, PAGE=0)

In [4]:
# Fetch and parse the sample search page. `user_agent` is sent as the request
# headers; the timeout keeps a stalled connection from hanging the kernel.
user_agent = {'User-agent': 'Mozilla/5.0'}
response = requests.get(curr_url, headers = user_agent, timeout=30)

soup = BeautifulSoup(response.text, 'html.parser')

In [5]:
def get_list_game_urls(soup):
    '''
    returns a list containing all urls from games in a specific search page

    Parameters:
        soup (BsObj): Beautiful soup object containing the search page 

    Returns:
        list_games_url (list): list of absolute game urls, in page order;
            empty when the page has no game links
    '''
    # Game links are <a class="title"> anchors whose href starts with /game/
    game_links = soup.find_all('a', {'class': 'title', 'href': re.compile("^/game/")})

    # hrefs are site-relative, so prefix the host; no links -> empty list
    list_games_url = ['https://metacritic.com' + link.attrs['href'] for link in game_links]
    return list_games_url

In [6]:
def scrape_page(url):
    '''
    returns a dictionary containing info about a specific game page

    A field is None when its element is missing from the page or does not have
    the expected layout (except 'platform' and 'url', which come from the url).

    Parameters:
        url (string): url of game page to be scraped

    Returns:
        dict_page (dict): dictionary containing variables from the page
    '''
    # Built locally, like in the other scraping functions, so this function does
    # not depend on a notebook-global `user_agent` defined in another cell.
    headers = {'User-agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the scraping loop forever.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    def nth(items, position):
        # items[position], or None when the sequence is shorter than expected
        return items[position] if len(items) > position else None

    def summary_detail(kind):
        # The <li> whose class is 'summary_detail <kind>', or None
        return soup.find('li', {'class': 'summary_detail ' + kind})

    def child_texts(node):
        # Text of every direct child of the node, in document order
        return [child.get_text() for child in node.children]

    title_node = soup.find('a', {'class': 'hover_none'})
    title = nth(title_node.get_text().split('\n'), 1) if title_node is not None else None

    # Fifth '/'-separated piece of the url, e.g. https://metacritic.com/game/<platform>/<slug>
    platform = url.split('/')[4].strip()

    release_node = summary_detail('release_data')
    release_date = nth(release_node.get_text().split('\n'), 2) if release_node is not None else None

    # Second non-empty line of the element holds the comma-separated platform names
    other_platforms = None
    platforms_node = summary_detail('product_platforms')
    if platforms_node is not None:
        platform_lines = [line for line in platforms_node.get_text().split('\n') if line != '']
        names = nth(platform_lines, 1)
        if names is not None:
            other_platforms = [name.strip() for name in names.split(',')]

    metascore_node = soup.find('span', {'itemprop': 'ratingValue'})
    metascore = metascore_node.get_text().strip() if metascore_node is not None else None

    user_score = None
    user_score_node = soup.find('a', {'class': 'metascore_anchor', 'href': re.compile(r"user-reviews$")})
    if user_score_node is not None:
        score_line = nth(user_score_node.get_text().split('\n'), 1)
        user_score = score_line.strip() if score_line is not None else None

    developer_node = soup.find('a', {'href': re.compile("^/company/"), 'class': 'button'})
    developer = developer_node.get_text() if developer_node is not None else None

    # First company link on the page; multi-line text keeps only its second line
    publisher_node = soup.find('a', {'href': re.compile("^/company/")})
    publisher = publisher_node.get_text() if publisher_node is not None else None
    if publisher is not None:
        publisher_lines = publisher.split('\n')
        if len(publisher_lines) > 1:
            publisher = publisher_lines[1].lstrip()

    genre_node = summary_detail('product_genre')
    genre = child_texts(genre_node)[1:] if genre_node is not None else None

    players_node = summary_detail('product_players')
    players = nth(child_texts(players_node), 3) if players_node is not None else None

    rating_node = summary_detail('product_rating')
    rating = nth(child_texts(rating_node), 3) if rating_node is not None else None

    summary_node = soup.find('span', {'class': 'blurb blurb_expanded'})
    summary = summary_node.get_text() if summary_node is not None else None

    dict_page = {'title':title, 'platform':platform,'release_date':release_date,'other_platforms':other_platforms,'metascore':metascore, 'user_score':user_score,
                 'developer':developer, 'publisher':publisher, 'genre':genre,'players':players,'rating':rating,
                 'summary':summary, 'url':url
                }
    return dict_page



In [339]:
def scrape_all_page(url, year, page):
    '''
    returns a df with all the game data from specific page, on specific year

    Parameters:
        url (string): url template of the search page, with {YEAR} and {PAGE} placeholders
        year (int): year of the page
        page (int): number of the page
    Returns:
        df (DataFrame): df containing all the data from the games of a specific search page,
            or None when the search page lists no games
    '''
    # str.format returns a new string, so the template itself is never modified
    curr_url = url.format(YEAR=year, PAGE=page)
    user_agent = {'User-agent': 'Mozilla/5.0'}
    # Without a timeout a stalled connection would block the scraping loop forever.
    response = requests.get(curr_url, headers=user_agent, timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')
    list_games_page = get_list_game_urls(soup)
    if not list_games_page:
        return None

    list_games_dicts = []
    for game_url in list_games_page:
        list_games_dicts.append(scrape_page(game_url))
        # Random pause of up to half a second between game pages
        time.sleep(random.random()/2)
    return pd.DataFrame(list_games_dicts)

In [None]:
# Scraping the search pages, newest year first, and saving one CSV per year
year = 2021
page = 0
dfs_list = []
year_dfs_list = []
url = "https://www.metacritic.com/browse/games/score/metascore/year/all/filtered?year_selected={YEAR}&distribution=&sort=desc&view=detailed&page={PAGE}"
while year >= 2000:
    print(year,', ',page)
    df = scrape_all_page(url, year, page)
    if df is not None:
        year_dfs_list.append(df)
        dfs_list.append(df)
        page = page+1
        continue

    # A search page without games marks the end of the year: save what was
    # gathered and move on to the first page of the previous year.
    if year_dfs_list:
        # pd.concat raises on an empty list, so years without any page are skipped
        concatenated = pd.concat(year_dfs_list)
        # Relative path, matching the location the later cells read from
        concatenated.to_csv('Metacritic_data/metacritic_games_{YEAR}.csv'.format(YEAR=year))
    year_dfs_list = []
    year = year-1
    page = 0

        

2021 ,  0
2021 ,  1
2021 ,  2
2021 ,  3
2021 ,  4
2021 ,  5
2021 ,  6
2021 ,  7
2021 ,  8
2021 ,  9
2021 ,  10
2020 ,  0
2020 ,  1
2020 ,  2
2020 ,  3
2020 ,  4
2020 ,  5
2020 ,  6
2020 ,  7
2020 ,  8
2020 ,  9
2020 ,  10
2020 ,  11
2019 ,  0
2019 ,  1
2019 ,  2
2019 ,  3
2019 ,  4
2019 ,  5
2019 ,  6
2019 ,  7
2019 ,  8
2019 ,  9
2019 ,  10
2019 ,  11
2018 ,  0
2018 ,  1
2018 ,  2
2018 ,  3
2018 ,  4
2018 ,  5
2018 ,  6
2018 ,  7
2018 ,  8
2018 ,  9
2018 ,  10
2018 ,  11
2018 ,  12
2017 ,  0
2017 ,  1
2017 ,  2
2017 ,  3
2017 ,  4
2017 ,  5
2017 ,  6
2017 ,  7
2017 ,  8
2017 ,  9
2017 ,  10
2017 ,  11
2016 ,  0
2016 ,  1
2016 ,  2
2016 ,  3
2016 ,  4
2016 ,  5
2016 ,  6
2016 ,  7
2016 ,  8


In [15]:
# Load every per-year CSV, newest year first, into a list of DataFrames
yearly_csv = 'Metacritic_data/metacritic_games_{YEAR}.csv'
data = []
for year in np.arange(2022, 2000-1, -1):
    year_df = pd.read_csv(yearly_csv.format(YEAR=year), index_col=0)
    df = year_df.reset_index(drop=True)
    data.append(df)
    

In [18]:
# Stack the per-year frames into one table with a fresh 0..n-1 index
full_game_data = pd.concat(data, ignore_index=True)

In [20]:
# Save the combined table; no index argument is passed, so to_csv's default applies
# (the index is written as the first, unnamed column).
full_game_data.to_csv('Metacritic_data/metacritic_games_full.csv')

In [7]:
# Reading the combined data again. The file was saved together with its index,
# so the first column is read back as the index (as done for the per-year files)
# instead of becoming a stray 'Unnamed: 0' column.
df = pd.read_csv('Metacritic_data/metacritic_games_full.csv', index_col=0)

In [262]:
def get_critic_reviews(df, idx, chunk_size):
    
    '''
    returns df containing all critic reviews from the n games where n=chunk_size,
    starting from the game placed on index=idx from the provided df

    Scraping stops early when idx is not a label of df's index (e.g. the end of
    the frame was reached), so the reviews gathered so far are still returned.

    Parameters:
        df (DataFrame): dataframe containin game titles, urls and platforms
            (columns 'url', 'title' and 'platform'); it is not modified
        idx (int): number corresponding to the desired starting index of the df
        chunk_size (int): number of games to scrape reviews of

    Returns:
        full_df (DataFrame): df containing all the desired reviews, one row per review
            (empty, with the usual columns, when nothing was scraped)
        idx (int): final index where the scraper stopped at (first game not scraped)
    '''
    columns = ['url', 'title', 'platform', 'user_score', 'reviewer', 'review', 'date', 'review_type', 'url_page']
    user_agent = {'User-agent': 'Mozilla/5.0'}

    def drop_tail(items, count):
        # items[:-0] would be empty, so only slice when there is something to drop
        return items[:-count] if count > 0 else items

    def texts_by_class(soup, css_class):
        # Text of every <div> whose class attribute matches css_class, in page order
        return [node.get_text() for node in soup.find_all('div', {'class': css_class})]

    records = []
    counter = 1
    while counter <= chunk_size:
        if idx not in df.index:
            # Nothing left to scrape at this index; return what was gathered
            break

        print(idx)
        url = df.loc[idx, 'url']
        print(url)
        title = df.loc[idx, 'title']
        print(title)
        platform = df.loc[idx, 'platform']

        critic_url = url+'/critic-reviews'
        print(critic_url)
        # Without a timeout a stalled connection would block the whole chunk
        response = requests.get(critic_url, headers=user_agent, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        # The first user-score node is not counted as a review entry; the
        # remaining ones are, together with critic entries that have no score.
        user_score_nodes = soup.find_all('div', {'class': re.compile("^metascore_w user medium")})
        n_user_reviews = max(0, len(user_score_nodes) - 1)
        un_score = texts_by_class(soup, "metascore_w medium game noscore indiv")
        n_not_scored = n_user_reviews + len(un_score)

        # Drop the trailing entries that have no critic score attached
        reviews = drop_tail(
            [review.get_text()[2:].strip() for review in soup.find_all('div', {'class': 'review_body'})],
            n_not_scored,
        )
        dates = drop_tail(texts_by_class(soup, 'date'), n_not_scored)
        reviewers = drop_tail(texts_by_class(soup, 'source'), len(un_score))

        # Scores are gathered per category: positive, then mixed, then negative
        scores = (
            texts_by_class(soup, "metascore_w medium game positive indiv")
            + texts_by_class(soup, "metascore_w medium game mixed indiv")
            + texts_by_class(soup, "metascore_w medium game negative indiv")
        )

        if min(len(reviewers), len(reviews), len(dates)) < len(scores):
            # Previously an IndexError that discarded the whole chunk
            print('warning: fewer reviewers/reviews/dates than scores on', critic_url)
        for score, reviewer, review, date in zip(scores, reviewers, reviews, dates):
            records.append({'url':url,'title':title,'platform':platform,'user_score':score, 'reviewer':reviewer,'review':review, 'date':date, 'review_type':'critic', 'url_page':critic_url})

        idx+=1
        counter+=1

    # Built once from all records; also valid when no review was collected
    full_df = pd.DataFrame(records, columns=columns)
    return full_df, idx
    

In [263]:
# Critic reviews for 10 games starting at index 30; `i` receives the index the scraper stopped at.
new_df, i = get_critic_reviews(df, 30, 10)

30
https://metacritic.com/game/playstation-5/horizon-forbidden-west
Horizon Forbidden West
https://metacritic.com/game/playstation-5/horizon-forbidden-west/critic-reviews
31
https://metacritic.com/game/pc/final-fantasy-vi-pixel-remaster
Final Fantasy VI Pixel Remaster
https://metacritic.com/game/pc/final-fantasy-vi-pixel-remaster/critic-reviews
32
https://metacritic.com/game/switch/cuphead-in-the-delicious-last-course
Cuphead in the Delicious Last Course
https://metacritic.com/game/switch/cuphead-in-the-delicious-last-course/critic-reviews
33
https://metacritic.com/game/pc/immortality
Immortality
https://metacritic.com/game/pc/immortality/critic-reviews
34
https://metacritic.com/game/playstation-5/gran-turismo-7
Gran Turismo 7
https://metacritic.com/game/playstation-5/gran-turismo-7/critic-reviews
35
https://metacritic.com/game/playstation-5/uncharted-legacy-of-thieves-collection
Uncharted: Legacy of Thieves Collection
https://metacritic.com/game/playstation-5/uncharted-legacy-of-thiev

In [None]:
# getting user reviews

In [91]:
def get_user_reviews(df, idx, chunk_size, max_revs=10000, sort_by = 'most-helpful'):
        
    '''
    returns df containing all user reviews from the n games where n=chunk_size,
    starting from the game placed on index=idx from the provided df

    Scraping stops early when idx is not a label of df's index (e.g. the end of
    the frame was reached), so the reviews gathered so far are still returned.

    Parameters:
        df (DataFrame): dataframe containin game titles, urls and platforms
            (columns 'url', 'title' and 'platform'); it is not modified
        idx (int): number corresponding to the desired starting index of the df
        chunk_size (int): number of games to scrape reviews of
        max_revs (int): max number of reviews to gather per game
        sort_by (string): how the review page should be sorted ex: (most-helpful, date)

    Returns:
        full_df (DataFrame): df containing all the desired reviews, one row per review,
            with a fresh 0..n-1 index (empty, with the usual columns, when nothing was scraped)
        idx (int): final index where the scraper stopped at (first game not scraped)
    '''
    columns = ['url', 'title', 'platform', 'user_score', 'reviewer', 'review', 'date', 'review_type', 'url_page']
    user_agent = {'User-agent': 'Mozilla/5.0'}

    def drop_tail(items, count):
        # items[:-0] would be empty, so only slice when there is something to drop
        return items[:-count] if count > 0 else items

    records = []
    counter = 1
    while counter <= chunk_size:
        if idx not in df.index:
            # Nothing left to scrape at this index; return what was gathered
            break

        print(idx)
        url = df.loc[idx, 'url']
        print(url)
        title = df.loc[idx, 'title']
        print(title)
        platform = df.loc[idx, 'platform']

        game_records = []
        rev_count = 0
        page = 0
        # Keep requesting pages until the cap is reached or a page has no reviews
        while rev_count < max_revs:
            print(page)
            user_url_page = url + '/user-reviews?sort-by={TYPE}&num_items=100&page={PAGE}'.format(TYPE=sort_by, PAGE=page)
            print(user_url_page)
            # Without a timeout a stalled connection would block the whole chunk
            response = requests.get(user_url_page, headers=user_agent, timeout=30)
            soup = BeautifulSoup(response.text, 'html.parser')

            user_scores = [int(score.get_text()) for score in soup.find_all('div',{'class':re.compile("^metascore_w user medium")})]
            if not user_scores:
                # A page without user scores means there are no more reviews
                break

            # Only every other /user/ link is kept, starting with the first one
            users = [user.get_text() for user in soup.find_all('a',{'href':re.compile("^/user/")})][::2]
            # Entries carrying a critic score sit at the end of the date/review
            # lists and are dropped; with none of them the lists are kept whole.
            n_critic = len(soup.find_all('div',{'class':re.compile(r"^metascore_w medium game (.*?)\ indiv")}))
            dates = drop_tail([date.get_text() for date in soup.find_all('div',{'class':'date'})], n_critic)
            reviews = drop_tail([review.get_text()[1:].strip() for review in soup.find_all('div',{'class':'review_body'})], n_critic)

            if min(len(users), len(reviews), len(dates)) < len(user_scores):
                # Previously an IndexError that discarded the whole chunk
                print('warning: fewer users/reviews/dates than scores on', user_url_page)
            for score, user, review, date in zip(user_scores, users, reviews, dates):
                # url_page records the page that was actually requested
                game_records.append({'url':url,'title':title,'platform':platform,'user_score':score, 'reviewer':user,'review':review, 'date':date, 'review_type':'user', 'url_page':user_url_page})

            rev_count += len(user_scores)
            page+=1

        # The last page may overshoot the cap; keep at most max_revs reviews per game
        records.extend(game_records[:max(max_revs, 0)])
        counter +=1
        idx+=1

    # Built once from all records; also valid when no review was collected
    full_df = pd.DataFrame(records, columns=columns)
    return full_df, idx


In [92]:
# User reviews for 10 games starting at index 0, with max_revs=200.
# NOTE(review): this rebinds `df` (the games table passed in) to the returned reviews
# frame, so re-running the cell would pass the reviews back in as the games input —
# consider a separate name for the result.
df, idx = get_user_reviews(df,0,10, 200)

0
https://metacritic.com/game/pc/persona-5-royal
Persona 5 Royal
0
https://metacritic.com/game/pc/persona-5-royal/user-reviews?sort-by=most-helpful&num_items=100&page=0
1
https://metacritic.com/game/pc/persona-5-royal/user-reviews?sort-by=most-helpful&num_items=100&page=1
1
https://metacritic.com/game/xbox-series-x/elden-ring
Elden Ring
0
https://metacritic.com/game/xbox-series-x/elden-ring/user-reviews?sort-by=most-helpful&num_items=100&page=0
1
https://metacritic.com/game/xbox-series-x/elden-ring/user-reviews?sort-by=most-helpful&num_items=100&page=1
2
https://metacritic.com/game/xbox-series-x/elden-ring/user-reviews?sort-by=most-helpful&num_items=100&page=2
2
https://metacritic.com/game/playstation-5/elden-ring
Elden Ring
0
https://metacritic.com/game/playstation-5/elden-ring/user-reviews?sort-by=most-helpful&num_items=100&page=0
1
https://metacritic.com/game/playstation-5/elden-ring/user-reviews?sort-by=most-helpful&num_items=100&page=1
2
https://metacritic.com/game/playstation-5/e