##### Goal: Find out which game company has a higher overall rating (Valve vs Blizzard)

In [12]:
from bs4 import BeautifulSoup as bs
import requests

In [13]:
valve_url = 'https://en.wikipedia.org/wiki/List_of_Valve_games'
blizzard_url = 'https://en.wikipedia.org/wiki/List_of_Blizzard_Entertainment_games'

valve_html = requests.get(valve_url)
blizzard_html = requests.get(blizzard_url)

In [14]:
soup = bs(valve_html.content)

contents = soup.prettify()

#print(contents)

In [15]:
games = soup.select('.wikitable i a')
game_path = games[0]['href']

#game_path

In [16]:
def get_game_links(url):
    html = requests.get(url)
    soup = bs(html.content)
    games = soup.select('.wikitable i a')
    games = soup.select('.wikitable .fn i a')
    game_links = []
    base_path = 'https://en.wikipedia.org'

    for game in games:
        # subdirectory example: "/wiki/Half-Life_(video_game)"
        game_subdirectory = game['href']
        path = base_path + game_subdirectory
        game_links.append(path)
    
    return game_links

#get_game_links('https://en.wikipedia.org/wiki/List_of_Valve_games')

In [17]:
import re

def remove_ref(contents):
    # regular expression to find all matches with format [c] where c is any character eg. [a] or [1]
    reference = r'\[.\]'
    
    if type(contents) == str:
        matches = re.findall(reference, contents)
        # for each match, remove it from the string
        for match in matches:
            contents = contents.replace(match, '')
        # final remove whitespace that might've been between ref n text
        contents = contents.strip()
        return contents
    
    elif type(contents) == list:
        contents_list = []
        for my_string in contents:
            matches = re.findall(reference, my_string)
            for match in matches:
                my_string = my_string.replace(match, '')
                # final remove whitespace that might've been between ref n text
                my_string = my_string.strip()
            contents_list.append(my_string)
        return contents_list

In [18]:
def get_row_contents(row):
    # to handle releases
    # strip=True to remove whitespace chars (\n, xa0, etc)
    if row.select('li ul li'):
        return [li.get_text('', strip=True) for li in row.select('li ul li')]
    elif row.find('li'):
        return [li.get_text('', strip=True) for li in row.find_all('li')]
    elif row.find('br'):
        # stripped strings strips out whitespace chars
        return [text for text in row.find('td').stripped_strings]
    elif "," in row.get_text():
        return [text.strip() for text in row.find('td', class_='infobox-data').get_text().split(',')]
    else:  
        #print(row)
        return row.find('td', class_='infobox-data').get_text()

In [19]:
import re

def standardise_date(date):
    # other date format
    uncommon_date_re = r'\d+[ ][A-Za-z]+[ ]\d+'
    if re.search(uncommon_date_re, date):
        # first split string to list
        # eg. 15 June 2005 to [15, 'June', 2005]
        split_date = date.split()
        # make date string like this: June 15, 2005
        new_date = f'{split_date[1]} {split_date[0]}, {split_date[2]}'
        return new_date
    else:
        return date

standardise_date('15 June 2005')

'June 15, 2005'

In [20]:
from datetime import datetime

def convert_to_datetime(date_str):
    date_format = '%B %d, %Y'
    date = datetime.strptime(date_str, date_format)
    return date

convert_to_datetime('June 15, 2005')

datetime.datetime(2005, 6, 15, 0, 0)

In [22]:
import re

def get_release_date(row):
    if row.select('ul li'):
        contents = [li.get_text('', strip=True) for li in row.select('ul li')]
        #print('in ul li if statement')
    else:
        contents = row.find('td', class_='infobox-data').get_text()
        #print('in everything else')
    #print(contents)
    # June 15, 2005 or 15 June 2005 
    date_re = r'([A-Za-z]+[ ]\d+,[ ]\d+)|(\d+[ ][A-Za-z]+[ ]\d+)'
    
    if type(contents) == str:
        match = re.search(date_re, contents)
        date = match.group()
        standardised_date = standardise_date(date)
        datetime_obj = convert_to_datetime(standardised_date)
        return datetime_obj
    elif type(contents) == list:
        for i, item in enumerate(contents):
            match = re.search(date_re, item)
            #print(i, match)
            if (match != None):
                date = match.group()
                standardised_date = standardise_date(date)
                datetime_obj = convert_to_datetime(standardised_date)
                return datetime_obj
        return None

#get_release_date(['Microsoft Windows', '29 November 2006', 'Mac OS X', '23 September 2010', 'Linux', '5 June 2013'])

In [23]:
import re

def name_to_link(name):
    # matches when a char is not a-zA-Z or 0-9 or "_"
    alphanumeric_re = r'\W'
    
    name = name.lower()
    name = name.replace(' ', '_')
    name = name.replace('-', '_')
    # remove matches of non alphanumeric or "_"
    name = re.sub(alphanumeric_re, '', name)
    name = name.replace('_', '-')
    
    return name

#print(name_to_link('Garry\'s Mod'))
#print(name_to_link('Half life 2: Lost Coast'))

In [24]:
def get_game_rating(name):
    # change name to link format
    # eg. https://www.metacritic.com/game/pc/garrys-mod, https://www.metacritic.com/game/pc/half-life
    name = name_to_link(name)
    url = f'https://www.metacritic.com/game/pc/{name}'
    #print(url)

    user_agent = {'User-agent': 'Mozilla/5.0'}
    response = requests.get(url, headers = user_agent)
    soup = bs(response.content)
    
    metascore = soup.find('span', itemprop='ratingValue')
    if metascore is not None:
        metascore_str = metascore.get_text()
    else:
        metascore_str = 'N/A'
        
        
    userscore = soup.select('.score_summary .metascore_anchor .metascore_w.user')
    # by default returns empty list
    if len(userscore) > 0:
        userscore_str = userscore[0].get_text()
    else:
        userscore_str = 'N/A'
    
    return metascore_str, userscore_str

#for name in names:
    #print(get_game_rating(name))

In [25]:
# deal with random (text) and '' whitespaces at the end
def remove_random_contents(contents):
    # to match format (Japan something) or ':'
    random_re = r'(\(*\))|[:]'
    #print(contents)
    if type(contents) == list:
        for item in contents:
            # if find item in format (Japan something) or ':' or empty string ''
            if re.search(random_re, item) or len(item) == 0:
                # remove that item from contents
                contents.remove(item)
    return contents

In [21]:
def get_info_box(url):
    game_html = requests.get(url)
    game_soup = bs(game_html.content)
    base_path = 'https://en.wikipedia.org'
    
    info_box = game_soup.find('table', class_='infobox')
    info_rows = info_box.find_all('tr')
    
    game_info = {}
    
    for i, row in enumerate(info_rows):
        # get title from infobox
        if row.select('.infobox-above'):
            game_info['Name'] = row.find('th').get_text()
            # handling really rare edge case with different naming
            if game_info['Name'] == 'Hearthstone':
                game_info['Name'] = 'Hearthstone: Heroes of Warcraft'
        # get image link from infobox
        elif row.select('.infobox-image'):
            image_link = base_path + row.find('a')['href']
            game_info['Image_link'] = image_link
        else:
            label = row.find('th', class_='infobox-label').get_text()
            contents = get_row_contents(row)
            contents = remove_ref(contents)
            if (label == 'Release'):
                #print(contents)
                contents = get_release_date(row)
                #print(contents)
            #print(contents)
            contents = remove_random_contents(contents)
            #print(contents)
            #print(f'Label: {label}, type: {type(contents)}')
                
            #print(contents)
            game_info[label] = contents
    
    game_info['Source'] = url
    
    metascore, userscore = get_game_rating(game_info['Name'])
    game_info['Metascore'] = metascore
    game_info['User Score'] = userscore
    
    return game_info
    
    
get_info_box('https://en.wikipedia.org/wiki/Justice_League_Task_Force_(video_game)')

{'Name': 'Justice League Task Force',
 'Image_link': 'https://en.wikipedia.org/wiki/File:Justice_League_Task_Force_game_cover.jpg',
 'Developer(s)': ['Sunsoft',
  'Super NES',
  'Blizzard Entertainment',
  'Mega Drive/Genesis',
  'Condor, Inc.'],
 'Publisher(s)': 'Acclaim Entertainment',
 'Composer(s)': ['Matt Uelmen', 'Glenn Stafford'],
 'Platform(s)': ['Genesis', 'Super NES'],
 'Release': datetime.datetime(1995, 9, 1, 0, 0),
 'Genre(s)': 'Fighting',
 'Mode(s)': ['Single-player', 'multiplayer'],
 'Source': 'https://en.wikipedia.org/wiki/Justice_League_Task_Force_(video_game)',
 'Metascore': 'N/A',
 'User Score': 'N/A'}

In [22]:
def scrape_game_company(url):
    game_links = get_game_links(url)
    games_info = []
    for i, link in enumerate(game_links):
        try:
            games_info.append(get_info_box(link))
            if (i % 10 == 0):
                print(f'--Scraped {i} games--')

        except Exception as e:
            print(link)
            print(e)

    print(f'Scraping finished with {i} games')
    return games_info

In [23]:
valve_games = scrape_game_company(valve_url)

--Scraped 0 games--
--Scraped 10 games--
--Scraped 20 games--
Scraping finished with 26 games


In [24]:
# ignore exceptions (they're games that haven't been released)
blizzard_games = scrape_game_company(blizzard_url)

--Scraped 0 games--
--Scraped 10 games--
https://en.wikipedia.org/wiki/Overwatch_2
'NoneType' object has no attribute 'group'
https://en.wikipedia.org/wiki/Diablo_IV
'NoneType' object has no attribute 'group'
Scraping finished with 21 games


In [25]:
valve_games

[{'Name': 'Half-Life',
  'Image_link': 'https://en.wikipedia.org/wiki/File:Half-Life_Cover_Art.jpg',
  'Developer(s)': 'Valve',
  'Publisher(s)': 'Sierra Studios',
  'Writer(s)': 'Marc Laidlaw',
  'Composer(s)': 'Kelly Bailey',
  'Series': 'Half-Life',
  'Engine': 'GoldSrc',
  'Platform(s)': ['Windows', 'PlayStation 2', 'OS X', 'Linux'],
  'Release': datetime.datetime(1998, 11, 19, 0, 0),
  'Genre(s)': 'First-person shooter',
  'Mode(s)': ['Single-player', 'multiplayer'],
  'Source': 'https://en.wikipedia.org/wiki/Half-Life_(video_game)',
  'Metascore': '96',
  'User Score': '9.1'},
 {'Name': 'Team Fortress Classic',
  'Image_link': 'https://en.wikipedia.org/wiki/File:Team_Fortress_Classic_box.jpg',
  'Developer(s)': 'Valve',
  'Publisher(s)': ['Sierra Studios'],
  'Designer(s)': ['John Cook', 'Robin Walker'],
  'Engine': 'GoldSrc',
  'Platform(s)': ['Microsoft Windows', 'OS X', 'Linux'],
  'Release': datetime.datetime(1999, 4, 7, 0, 0),
  'Genre(s)': 'First-person shooter',
  'Mode(s)

In [26]:
# to be solved: https://en.wikipedia.org/wiki/Justice_League_Task_Force_(video_game) see developers
blizzard_games

[{'Name': 'RPM Racing',
  'Image_link': 'https://en.wikipedia.org/wiki/File:RPM_Racing_cover.jpg',
  'Developer(s)': 'Silicon & Synapse',
  'Publisher(s)': ['Interplay Productions', 'Victor Musical Industries'],
  'Producer(s)': 'Michael Quarles',
  'Programmer(s)': 'Allen Adham',
  'Artist(s)': 'Rob Nesler',
  'Composer(s)': 'George Alistair Sanger',
  'Platform(s)': ['Super NES',
   'Windows',
   'Switch',
   'PlayStation 4',
   'Xbox One'],
  'Release': datetime.datetime(1992, 3, 19, 0, 0),
  'Genre(s)': 'Racing',
  'Mode(s)': ['Single-player', 'multiplayer'],
  'Source': 'https://en.wikipedia.org/wiki/RPM_Racing',
  'Metascore': 'N/A',
  'User Score': 'N/A'},
 {'Name': 'The Lost Vikings',
  'Image_link': 'https://en.wikipedia.org/wiki/File:The_Lost_Vikings_SNES_cover.jpg',
  'Developer(s)': 'Silicon & Synapse',
  'Publisher(s)': ['Interplay Productions',
   'T&E Soft',
   'Blizzard Entertainment'],
  'Designer(s)': 'Ron Millar',
  'Composer(s)': 'Charles Deenen',
  'Platform(s)': [

In [11]:
blizzard_games[10]

{'Name': 'StarCraft',
 'Image_link': 'https://en.wikipedia.org/wiki/File:StarCraft_box_art.jpg',
 'Developer(s)': 'Blizzard Entertainment',
 'Publisher(s)': 'Blizzard Entertainment',
 'Designer(s)': ['Chris Metzen', 'James Phinney'],
 'Composer(s)': ['Derek Duke',
  'Jason Hayes',
  'Glenn Stafford',
  'Tracy W. Bush'],
 'Series': 'StarCraft',
 'Platform(s)': ['Microsoft Windows', 'Classic Mac OS', 'Nintendo 64'],
 'Release': datetime.datetime(1998, 3, 31, 0, 0),
 'Genre(s)': 'Real-time strategy',
 'Mode(s)': ['Single-player', 'multiplayer'],
 'Source': 'https://en.wikipedia.org/wiki/StarCraft_(video_game)',
 'Metascore': '88',
 'User Score': '9.1'}

In [52]:
import json

def save_json(filename, data):
    with open(filename, 'w') as f:
        json.dump(data, f)
        
def load_json(filename):
    with open(filename, 'r') as f:
        return json.load(f)

In [53]:
import pickle

def save_pickle(filename, data):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)

def load_pickle(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [54]:
#save_pickle('valve_cleaned_pickle.pickle', valve_games)
#save_pickle('blizzard_cleaned_pickle.pickle', blizzard_games)

In [55]:
valve_games = load_pickle('valve_cleaned_pickle.pickle')
blizzard_games = load_pickle('blizzard_cleaned_pickle.pickle')

In [56]:
# To fix: make sure all str get converted to datetime objects
    # Nones get removed (?)
blizzard_games

[{'Name': 'RPM Racing',
  'Image_link': 'https://en.wikipedia.org/wiki/File:RPM_Racing_cover.jpg',
  'Developer(s)': 'Silicon & Synapse',
  'Publisher(s)': ['Interplay Productions', 'Victor Musical Industries'],
  'Producer(s)': 'Michael Quarles',
  'Programmer(s)': 'Allen Adham',
  'Artist(s)': 'Rob Nesler',
  'Composer(s)': 'George Alistair Sanger',
  'Platform(s)': ['Super NES',
   'Windows',
   'Switch',
   'PlayStation 4',
   'Xbox One'],
  'Release': datetime.datetime(1992, 3, 19, 0, 0),
  'Genre(s)': 'Racing',
  'Mode(s)': ['Single-player', 'multiplayer'],
  'Source': 'https://en.wikipedia.org/wiki/RPM_Racing',
  'Metascore': 'N/A',
  'User Score': 'N/A'},
 {'Name': 'The Lost Vikings',
  'Image_link': 'https://en.wikipedia.org/wiki/File:The_Lost_Vikings_SNES_cover.jpg',
  'Developer(s)': 'Silicon & Synapse',
  'Publisher(s)': ['Interplay Productions',
   'T&E Soft',
   'Blizzard Entertainment'],
  'Designer(s)': 'Ron Millar',
  'Composer(s)': 'Charles Deenen',
  'Platform(s)': [

In [57]:
from datetime import datetime

# converts datetime objects in game dictionaries to strings
def json_friendly(game_dict):
    for i, game in enumerate(game_dict):
        #print(f'Game number {i}')
        datetime_obj = game['Release']
        if datetime_obj == None:
            continue
        date_str = datetime_obj.strftime('%B %d, %Y')
        game['Release'] = date_str
        #print(game['Release'])
    return game_dict
    
#a = json_friendly(blizzard_games)

In [58]:
valve_json = json_friendly(valve_games)
blizzard_json = json_friendly(blizzard_games)

In [62]:
save_json('valve_cleaned_json.json', valve_json)
save_json('blizzard_cleaned_json.json', blizzard_json)

In [64]:
#load_json('valve_cleaned_json.json')

In [67]:
valve_games

[{'Name': 'Half-Life',
  'Image_link': 'https://en.wikipedia.org/wiki/File:Half-Life_Cover_Art.jpg',
  'Developer(s)': 'Valve',
  'Publisher(s)': 'Sierra Studios',
  'Writer(s)': 'Marc Laidlaw',
  'Composer(s)': 'Kelly Bailey',
  'Series': 'Half-Life',
  'Engine': 'GoldSrc',
  'Platform(s)': ['Windows', 'PlayStation 2', 'OS X', 'Linux'],
  'Release': 'November 19, 1998',
  'Genre(s)': 'First-person shooter',
  'Mode(s)': ['Single-player', 'multiplayer'],
  'Source': 'https://en.wikipedia.org/wiki/Half-Life_(video_game)',
  'Metascore': '96',
  'User Score': '9.1'},
 {'Name': 'Team Fortress Classic',
  'Image_link': 'https://en.wikipedia.org/wiki/File:Team_Fortress_Classic_box.jpg',
  'Developer(s)': 'Valve',
  'Publisher(s)': ['Sierra Studios'],
  'Designer(s)': ['John Cook', 'Robin Walker'],
  'Engine': 'GoldSrc',
  'Platform(s)': ['Microsoft Windows', 'OS X', 'Linux'],
  'Release': 'April 07, 1999',
  'Genre(s)': 'First-person shooter',
  'Mode(s)': 'Multiplayer',
  'Source': 'https:

In [66]:
import pandas as pd

valve_df = pd.DataFrame.from_dict(valve_games)
valve_df

Unnamed: 0,Name,Image_link,Developer(s),Publisher(s),Writer(s),Composer(s),Series,Engine,Platform(s),Release,Genre(s),Mode(s),Source,Metascore,User Score,Designer(s),Artist(s),Director(s),Producer(s),Programmer(s)
0,Half-Life,https://en.wikipedia.org/wiki/File:Half-Life_C...,Valve,Sierra Studios,Marc Laidlaw,Kelly Bailey,Half-Life,GoldSrc,"[Windows, PlayStation 2, OS X, Linux]","November 19, 1998",First-person shooter,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Half-Life_(video...,96.0,9.1,,,,,
1,Team Fortress Classic,https://en.wikipedia.org/wiki/File:Team_Fortre...,Valve,[Sierra Studios],,,,GoldSrc,"[Microsoft Windows, OS X, Linux]","April 07, 1999",First-person shooter,Multiplayer,https://en.wikipedia.org/wiki/Team_Fortress_Cl...,,7.2,"[John Cook, Robin Walker]",,,,
2,Counter-Strike,https://en.wikipedia.org/wiki/File:Counter-Str...,Valve,[Sierra Studios],,,Counter-Strike,GoldSrc,"[Windows, Xbox, OS X, Linux]","November 09, 2000",First-person shooter,Multiplayer,https://en.wikipedia.org/wiki/Counter-Strike_(...,88.0,9.2,"[Minh Le, Jess Cliffe]",,,,
3,Day of Defeat,https://en.wikipedia.org/wiki/File:Day_of_Defe...,Valve,[Activision],,Michael Gordon Shapiro,,GoldSrc,"[Microsoft Windows, OS X, Linux]","May 06, 2003",First-person shooter,Multiplayer,https://en.wikipedia.org/wiki/Day_of_Defeat,79.0,9.0,,,,,
4,Counter-Strike: Condition Zero,https://en.wikipedia.org/wiki/File:CZbox.jpg,"[Ritual Entertainment, Turtle Rock Studios, Va...",[Sierra Entertainment],,,Counter-Strike,GoldSrc,"[Windows, OS X, Linux]","March 23, 2004",First-person shooter,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Counter-Strike:_...,65.0,8.7,,,,,
5,Counter-Strike: Source,https://en.wikipedia.org/wiki/File:Counter-Str...,"[Valve, Turtle Rock Studios]",Valve,,,Counter-Strike,Source,"[Windows, Mac OS X, Linux]","October 07, 2004",First-person shooter,Multiplayer,https://en.wikipedia.org/wiki/Counter-Strike:_...,88.0,8.9,,,,,
6,Half-Life 2,https://en.wikipedia.org/wiki/File:Half-Life_2...,Valve,[Valve],Marc Laidlaw,Kelly Bailey,Half-Life,Source,"[Windows, Xbox, Xbox 360, PlayStation 3, Mac O...","November 16, 2004",First-person shooter,Single-player,https://en.wikipedia.org/wiki/Half-Life_2,96.0,9.2,,Viktor Antonov,,,
7,Half-Life 2: Deathmatch,https://en.wikipedia.org/wiki/File:Half-Life_2...,Valve,Valve,,,Half-Life,Source,"[Microsoft Windows, Mac OS X, Linux]","November 30, 2004",First-person shooter,Multiplayer,https://en.wikipedia.org/wiki/Half-Life_2:_Dea...,,7.3,,,,,
8,Day of Defeat: Source,https://en.wikipedia.org/wiki/File:DODSourceCo...,Valve,Valve,,Dan Haigh,,Source,"[Microsoft Windows, Mac OS X, Linux]","September 26, 2005",First-person shooter,Multiplayer,https://en.wikipedia.org/wiki/Day_of_Defeat:_S...,80.0,9.0,,,,,
9,Half-Life 2: Lost Coast,https://en.wikipedia.org/wiki/File:Half-Life_2...,Valve,Valve,,,Half-Life,Source,"[Microsoft Windows, OS X, Linux]","October 27, 2005",First-person shooter,Single-player,https://en.wikipedia.org/wiki/Half-Life_2:_Los...,,7.9,,,,,


In [68]:
blizzard_df = pd.DataFrame.from_dict(blizzard_games)
blizzard_df

Unnamed: 0,Name,Image_link,Developer(s),Publisher(s),Producer(s),Programmer(s),Artist(s),Composer(s),Platform(s),Release,Genre(s),Mode(s),Source,Metascore,User Score,Designer(s),Writer(s),Director(s),Series,Engine
0,RPM Racing,https://en.wikipedia.org/wiki/File:RPM_Racing_...,Silicon & Synapse,"[Interplay Productions, Victor Musical Industr...",Michael Quarles,Allen Adham,Rob Nesler,George Alistair Sanger,"[Super NES, Windows, Switch, PlayStation 4, Xb...","March 19, 1992",Racing,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/RPM_Racing,,,,,,,
1,The Lost Vikings,https://en.wikipedia.org/wiki/File:The_Lost_Vi...,Silicon & Synapse,"[Interplay Productions, T&E Soft, Blizzard Ent...",,,,Charles Deenen,"[Super Nintendo, Genesis, Amiga, MS-DOS, Amiga...","April 29, 1993",Puzzle-platform,"[Single-player, cooperative]",https://en.wikipedia.org/wiki/The_Lost_Vikings,,,Ron Millar,,,,
2,Rock n' Roll Racing,https://en.wikipedia.org/wiki/File:Rock_N%27_R...,[Software Creations],"[Interplay Productions, Namco, Blizzard Entert...",,"[Bob Fitch, Patrick Wyatt, Ayman Adham]","[Samwise Didier, Ronald Millar Sr., Joeyray Hall]",Tim Follin and Geoff Follin,"[Super NES, Mega Drive/Genesis, Game Boy Advan...","June 04, 1993",Racing,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Rock_n%27_Roll_R...,,,Alan Pavlish,,,,
3,The Death and Return of Superman,https://en.wikipedia.org/wiki/File:The_Death_a...,"[Blizzard Entertainment, Sunsoft]",Sunsoft,,"[James Edward Anhalt III, Allen Adham, Bob Fitch]","[Samwise Didier, David Berggren, Roman Kenney,...","[Michael Morhaime, Glenn Stafford]","[Super NES, Sega Genesis]",,Beat 'em up,Single player,https://en.wikipedia.org/wiki/The_Death_and_Re...,,,Dan MacArthur,,,,
4,Blackthorne,https://en.wikipedia.org/wiki/File:Blackthorne...,Blizzard Entertainment,Interplay Productions,"[Ronald Millar Sr., Matthew Findley]","[Frank Pearce Jr., Patrick Wyatt]","[Roman Kenney, Stuart Rose, Jason Magness, Ron...",Glenn Stafford,"[SNES, MS-DOS, Sega 32X, Classic Mac OS, Game ...","February 20, 2021",Platform,Single-player,https://en.wikipedia.org/wiki/Blackthorne,,,Ronald Millar Sr.,"[Micky Neilson, Ronald Millar Sr., Frank Pearc...",,,
5,Warcraft: Orcs & Humans,https://en.wikipedia.org/wiki/File:Warcraft_-_...,Blizzard Entertainment,[EU:Interplay Productions],"[Bill Roper, Patrick Wyatt]","[Bob Fitch, Jesse McReynolds, Michael Morhaime...",,Gregory Alper,"[MS-DOS, Classic Mac OS]","November 15, 1994",Real-time strategy,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Warcraft:_Orcs_%...,,,,,Patrick Wyatt,Warcraft,
6,Justice League Task Force,https://en.wikipedia.org/wiki/File:Justice_Lea...,"[Sunsoft, Super NES, Blizzard Entertainment, M...",Acclaim Entertainment,,,,"[Matt Uelmen, Glenn Stafford]","[Genesis, Super NES]","September 01, 1995",Fighting,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Justice_League_T...,,,,,,,
7,Warcraft II: Tides of Darkness,https://en.wikipedia.org/wiki/File:Warcraft-2-...,Blizzard Entertainment,Davidson & Associates,"[Sam Didier, Michael Morhaime, Patrick Wyatt]","[Bob Fitch, Jesse McReynolds, Michael Morhaime]",,Glenn Stafford,"[MS-DOS, Classic Mac OS, Saturn, PlayStation, ...","December 09, 1995",Real-time strategy,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Warcraft_II:_Tid...,,9.0,Ron Millar,Chris Metzen,,Warcraft,
8,Diablo,https://en.wikipedia.org/wiki/File:Diablo_Cove...,Blizzard North,"[Blizzard Entertainment,Davidson & Associates]",Bill Roper,David Brevik,Michio Okamura,Matt Uelmen,"[Microsoft Windows, PlayStation, Mac OS]","January 03, 1997",Action role-playing,"[Single-player, multiplayer]",https://en.wikipedia.org/wiki/Diablo_(video_game),94.0,8.6,"[David Brevik, Erich Schaefer, Max Schaefer, E...","[Chris Metzen, Bill Roper, Eric Sexton, Erich ...",,Diablo,
9,Lost Vikings 2,https://en.wikipedia.org/wiki/File:Lost_Viking...,"[Blizzard Entertainment, Beam Software]",Interplay Productions,Feargus Urquhart,James Phinney,Samwise Didier,Glenn Stafford,"[Super NES, MS-DOS, Windows, Saturn, PlayStati...","February 27, 1997",Puzzle-platform,"[Single-player, 2 player cooperative]",https://en.wikipedia.org/wiki/The_Lost_Vikings_2,,,Ron Millar,,,,


In [69]:
valve_df.to_csv('valve_cleaned_csv.csv')
blizzard_df.to_csv('blizzard_cleaned_csv.csv')

Things to clean:
- Remove references [a] [1] etc
- Convert from all li in one string to list of strings
- In release, format from '\nNovember 19, 1998\n Windows' to 'November 19, 1998 (Windows)'

Tasks:
- Scrape both websites
- Convert to dictionary
- Convert to pandas
- Do some EDA
- Make insights