##### Goal: Find out which game company has a higher overall rating (Valve vs Blizzard)

In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [2]:
valve_html = requests.get('https://en.wikipedia.org/wiki/List_of_Valve_games')
blizzard_html = requests.get('https://en.wikipedia.org/wiki/List_of_Blizzard_Entertainment_games')

In [3]:
soup = bs(valve_html.content)

contents = soup.prettify()

#print(contents)

In [4]:
games = soup.select('.wikitable i a')
game_path = games[0]['href']

game_path

'/wiki/Half-Life_(video_game)'

In [5]:
def get_game_links(url):
    html = requests.get(url)
    soup = bs(html.content)
    games = soup.select('.wikitable i a')
    games = soup.select('.wikitable .fn i a')
    game_links = []
    base_path = 'https://en.wikipedia.org'

    for game in games:
        # subdirectory example: "/wiki/Half-Life_(video_game)"
        game_subdirectory = game['href']
        path = base_path + game_subdirectory
        game_links.append(path)
    
    return game_links

get_game_links('https://en.wikipedia.org/wiki/List_of_Valve_games')

['https://en.wikipedia.org/wiki/Half-Life_(video_game)',
 'https://en.wikipedia.org/wiki/Team_Fortress_Classic',
 'https://en.wikipedia.org/wiki/Counter-Strike_(video_game)',
 'https://en.wikipedia.org/wiki/Day_of_Defeat',
 'https://en.wikipedia.org/wiki/Counter-Strike:_Condition_Zero',
 'https://en.wikipedia.org/wiki/Counter-Strike:_Source',
 'https://en.wikipedia.org/wiki/Half-Life_2',
 'https://en.wikipedia.org/wiki/Half-Life_2:_Deathmatch',
 'https://en.wikipedia.org/wiki/Day_of_Defeat:_Source',
 'https://en.wikipedia.org/wiki/Half-Life_2:_Lost_Coast',
 'https://en.wikipedia.org/wiki/Half-Life_2:_Episode_One',
 'https://en.wikipedia.org/wiki/Half-Life_2:_Episode_Two',
 'https://en.wikipedia.org/wiki/Portal_(video_game)',
 'https://en.wikipedia.org/wiki/Team_Fortress_2',
 'https://en.wikipedia.org/wiki/The_Orange_Box',
 'https://en.wikipedia.org/wiki/Left_4_Dead',
 'https://en.wikipedia.org/wiki/Left_4_Dead_2',
 'https://en.wikipedia.org/wiki/Alien_Swarm',
 'https://en.wikipedia.org

In [6]:
import re

def remove_ref(contents):
    # regular expression to find all matches with format [c] where c is any character eg. [a] or [1]
    reference = r'\[.\]'
    
    if type(contents) == str:
        matches = re.findall(reference, contents)
        # for each match, remove it from the string
        for match in matches:
            contents = contents.replace(match, '')
        return contents
    
    elif type(contents) == list:
        contents_list = []
        for my_string in contents:
            matches = re.findall(reference, my_string)
            for match in matches:
                my_string = my_string.replace(match, '')
            contents_list.append(my_string)
        return contents_list

In [40]:
def get_row_contents(row):
    # to handle releases
    # strip=True to remove whitespace chars (\n, xa0, etc)
    if row.select('li ul li'):
        return [li.get_text('', strip=True) for li in row.select('li ul li')]
    elif row.find('li'):
        return [li.get_text('', strip=True) for li in row.find_all('li')]
    #elif row.find('br'):
        #return [text for text in row.stripped_strings]
    #elif "," in row.get_text():
        #return [text.strip() for text in row.find('td', class_='infobox-data').get_text().split(',')]
    else:  
        return row.find('td', class_='infobox-data').get_text()

In [41]:
my_string = 'OS X, Linux'
my_string.split(',')

['OS X', ' Linux']

In [42]:
my_string = 'Single-player, multiplayer'
my_string.split(',')

['Single-player', ' multiplayer']

In [68]:
import re

def get_release_date(contents):
    # June 15, 2005 or 15 June 2005
    date_re = r'([A-Za-z]+[ ]\d+,[ ]\d+)|(\d+[ ][A-Za-z]+[ ]\d+)'
    
    if type(contents) == str:
        match = re.search(date_re, contents)
        return match.group()
    elif type(contents) == list:
        for item in contents:
            match = re.search(date_re, item)
            if (match != None):
                return match.group()

get_release_date(['Microsoft Windows', '29 November 2006', 'Mac OS X', '23 September 2010', 'Linux', '5 June 2013'])

'29 November 2006'

In [69]:
import re

def name_to_link(name):
    # matches when a char is not a-zA-Z or 0-9 or "_"
    alphanumeric_re = r'\W'
    
    name = name.lower()
    name = name.replace(' ', '_')
    name = name.replace('-', '_')
    # remove matches of non alphanumeric or "_"
    name = re.sub(alphanumeric_re, '', name)
    name = name.replace('_', '-')
    
    return name

print(name_to_link('Garry\'s Mod'))
print(name_to_link('Half life 2: Lost Coast'))

garrys-mod
half-life-2-lost-coast


In [70]:
names = ['Portal 2', 'Half life 2: Lost Coast', 'Garry\'s Mod']

def get_game_rating(name):
    # change name to link format
    # eg. https://www.metacritic.com/game/pc/garrys-mod, https://www.metacritic.com/game/pc/half-life
    name = name_to_link(name)
    url = f'https://www.metacritic.com/game/pc/{name}'

    user_agent = {'User-agent': 'Mozilla/5.0'}
    response = requests.get(url, headers = user_agent)
    soup = bs(response.content)
    
    metascore = soup.find('span', itemprop='ratingValue')
    if metascore is not None:
        metascore_str = metascore.get_text()
    else:
        metascore_str = 'N/A'
        
    userscore = soup.select('.score_summary .metascore_anchor .metascore_w.user')
    if userscore is not None:
        userscore_str = userscore[0].get_text()
    else:
        userscore_str = 'N/A'
    
    return metascore_str, userscore_str

for name in names:
    print(get_game_rating(name))

('95', '9.1')
('N/A', '7.9')
('N/A', '8.9')


In [71]:
def get_info_box(url):
    game_html = requests.get(url)
    game_soup = bs(game_html.content)
    base_path = 'https://en.wikipedia.org'
    
    info_box = game_soup.find('table', class_='infobox')
    info_rows = info_box.find_all('tr')
    
    game_info = {}
    
    for i, row in enumerate(info_rows):
        # get title from infobox
        if row.select('.infobox-above'):
            game_info['Name'] = row.find('th').get_text()
            # handling really rare edge case with different naming
            if game_info['Name'] == 'Hearthstone':
                game_info['Name'] = 'Hearthstone: Heroes of Warcraft'
        # get image link from infobox
        elif row.select('.infobox-image'):
            image_link = base_path + row.find('a')['href']
            game_info['Image_link'] = image_link
        else:
            label = row.find('th', class_='infobox-label').get_text()
            contents = get_row_contents(row)
            if (label == 'Release'):
                contents = get_release_date(contents)
                
            contents = remove_ref(contents)
            game_info[label] = contents
    
    game_info['Source'] = url
    
    metascore, userscore = get_game_rating(game_info['Name'])
    game_info['Metascore'] = metascore
    game_info['User Score'] = userscore
    
    return game_info
    
    
get_info_box('https://en.wikipedia.org/wiki/Garry%27s_Mod')

{'Name': "Garry's Mod",
 'Image_link': 'https://en.wikipedia.org/wiki/File:Garry%27s_Mod_logo.svg',
 'Developer(s)': 'Facepunch Studios',
 'Publisher(s)': 'Valve',
 'Designer(s)': 'Garry Newman',
 'Programmer(s)': 'Garry Newman',
 'Engine': 'Source',
 'Platform(s)': ['Microsoft Windows', 'Mac OS X', 'Linux'],
 'Release': '29 November 2006',
 'Genre(s)': 'Sandbox',
 'Mode(s)': 'Single-player, multiplayer',
 'Source': 'https://en.wikipedia.org/wiki/Garry%27s_Mod',
 'Metascore': 'N/A',
 'User Score': '8.9'}

In [72]:
def scrape_game_company(url):
    game_links = get_game_links(url)
    games_info = []
    for i, link in enumerate(game_links):
        try:
            games_info.append(get_info_box(link))
            if (i % 10 == 0):
                print(f'--Scraped {i} games--')

        except Exception as e:
            print(link)
            print(e)

    print(f'Scraping finished with {i} games')
    return games_info
    
games_info = scrape_game_company('https://en.wikipedia.org/wiki/List_of_Blizzard_Entertainment_games')

--Scraped 0 games--
--Scraped 10 games--
--Scraped 20 games--
Scraping finished with 26 games


In [73]:
games_info

[{'Name': 'Half-Life',
  'Image_link': 'https://en.wikipedia.org/wiki/File:Half-Life_Cover_Art.jpg',
  'Developer(s)': 'Valve',
  'Publisher(s)': 'Sierra Studios',
  'Writer(s)': 'Marc Laidlaw',
  'Composer(s)': 'Kelly Bailey',
  'Series': 'Half-Life',
  'Engine': 'GoldSrc',
  'Platform(s)': ['Windows', 'PlayStation 2', 'OS X', 'Linux'],
  'Release': 'November 19, 1998',
  'Genre(s)': 'First-person shooter',
  'Mode(s)': 'Single-player, multiplayer',
  'Source': 'https://en.wikipedia.org/wiki/Half-Life_(video_game)',
  'Metascore': '96',
  'User Score': '9.1'},
 {'Name': 'Team Fortress Classic',
  'Image_link': 'https://en.wikipedia.org/wiki/File:Team_Fortress_Classic_box.jpg',
  'Developer(s)': 'Valve',
  'Publisher(s)': ['Sierra Studios', 'Valve (digital)'],
  'Designer(s)': ['John Cook', 'Robin Walker'],
  'Engine': 'GoldSrc',
  'Platform(s)': 'Microsoft Windows, OS X, Linux',
  'Release': 'April 7, 1999',
  'Genre(s)': 'First-person shooter',
  'Mode(s)': 'Multiplayer',
  'Source':

Things to clean:
- Remove references [a] [1] etc
- Convert from all li in one string to list of strings
- In release, format from '\nNovember 19, 1998\n Windows' to 'November 19, 1998 (Windows)'

Tasks:
- Scrape both websites
- Convert to dictionary
- Convert to pandas
- Do some EDA
- Make insights