In [1]:
import hashlib
import os
import re
from urllib.parse import urljoin

import dateparser
import pandas as pd
import requests
from bs4 import BeautifulSoup

BASE_PATH = os.getcwd()
GAME_PATH = os.path.join(BASE_PATH,'GameData')
GAME_DATA_PATH = os.path.join(GAME_PATH, 'data')

# Page URL
url_wikipedia_ps5 = 'https://en.wikipedia.org/wiki/List_of_PlayStation_5_games'
url_wikipedia_ps4_AL = 'https://en.wikipedia.org/wiki/List_of_PlayStation_4_games_(A%E2%80%93L)'
url_wikipedia_ps4_MZ = 'https://en.wikipedia.org/wiki/List_of_PlayStation_4_games_(M%E2%80%93Z)'

column_names = [
        'Title',
         'Genre',
         'Developer',
         'Publisher',
         'Release Date JP',
         'Release Date NA',
         'Release Date PAL',
         'Unreleased JP',
         'Unreleased NA',
         'Unreleased PAL',
         'TBA JP',
         'TBA NA',
         'TBA PAL',
         'Crossbuy',
         'Crossplay',
         '3DTV',
         'PS Camera',
         'PS4 Pro Enhanced',
         'Play Link',
         'PSVR',
         'PSVR2',
         'Wikipedia Link',
    ]

In [2]:
############################################################
#
# Functions
#
############################################################
def extract_list(bs):
  # Search for multiple items within tags
  items = bs.find_all(['a','li','p'])

  # If we didn't find anything return the plain text
  if items is None or len(items) == 0:
    return (bs.text.strip(), )
  else:
    return tuple(i.text.strip() for i in items)

def extract_link(c):
    linked = c.find_all('a', href=True)
    if len(linked) > 0:
        return f'https://en.wikipedia.org{linked[0]["href"]}'
    else:
        return None

def get_and_clean_wikipedia_list(url, platform):
    # Request the page
    response = requests.get(url)

    # Parse HTML content using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Get the table of games
    table = soup.find('table', id='softwarelist')

    # Extract all the table rows
    tr = table.find_all('tr')[2:]

    # For each Row extract data cells
    raw_cells = list(map(lambda x: x.find_all(['th','td']), tr))

    # Create a data Frame
    df = pd.DataFrame(raw_cells)
    df.columns =['Title',
                 'Genre',
                 'Developer',
                 'Publisher',
                 'Release Date JP',
                 'Release Date NA',
                 'Release Date PAL',
                 'Addons',
                 'Ref']

    # Clean Data
    df['Wikipedia Link'] = df['Title'].map(extract_link)
    df['Title'] = df['Title'].map(lambda a: a.text.strip())
    df['Genre'] = df['Genre'].map(extract_list) # TODO: Normalize genres
    df['Developer'] = df['Developer'].map(lambda a: a.text.strip())
    df['Publisher'] = df['Publisher'].map(lambda a: a.text.strip())
    df['Unreleased JP'] = df['Release Date JP'].map(lambda a: a.text.strip() == 'Unreleased')
    df['Unreleased NA'] = df['Release Date NA'].map(lambda a: a.text.strip() == 'Unreleased')
    df['Unreleased PAL'] = df['Release Date PAL'].map(lambda a: a.text.strip() == 'Unreleased')
    df['TBA JP'] = df['Release Date JP'].map(lambda a: a.text.strip() == 'TBA')
    df['TBA NA'] = df['Release Date NA'].map(lambda a: a.text.strip() == 'TBA')
    df['TBA PAL'] = df['Release Date PAL'].map(lambda a: a.text.strip() == 'TBA')
    df['Release Date JP'] = df['Release Date JP'].map(lambda a: dateparser.parse(a.text.strip()))
    df['Release Date NA'] = df['Release Date NA'].map(lambda a: dateparser.parse(a.text.strip()))
    df['Release Date PAL'] = df['Release Date PAL'].map(lambda a: dateparser.parse(a.text.strip()))
    df['Crossbuy'] = df['Addons'].map(lambda a: False if a is None else 'CB' in a.text)
    df['Crossplay'] = df['Addons'].map(lambda a: False if a is None else 'CP' in a.text)
    df['3DTV'] = df['Addons'].map(lambda a: False if a is None else '3D' in a.text)
    df['PS Camera'] = df['Addons'].map(lambda a: False if a is None else 'C' in a.text)
    df['PS4 Pro Enhanced'] = df['Addons'].map(lambda a: False if a is None else 'P' in a.text)
    df['Play Link'] = df['Addons'].map(lambda a: False if a is None else 'PL' in a.text)
    df['PSVR'] = df['Addons'].map(lambda a: False if a is None else 'VR' in a.text)
    df['PSVR2'] = df['Addons'].map(lambda a: False if a is None else 'VR2' in a.text)
    df['Platform'] = platform

    # Create a hash to uniquely identify the entry
    df['Hash'] = (df['Title'] + df['Developer'] + df['Publisher'] + df['Platform'])
    df['Hash'] = df['Hash'].map(lambda a: re.sub(r'\W+', '', a).lower())
    df['Hash'] = df['Hash'].map(lambda a: hashlib.sha256(a.encode('utf-8')).hexdigest())

    # Drop unnecessary columns
    df = df.drop(columns=['Addons', 'Ref'])

    return df

In [3]:
# Get Wikipedia PS5 games
df_ps5 = get_and_clean_wikipedia_list(url_wikipedia_ps5, 'PS5')

# Get Wikipedia PS4 games A-L
df_ps4_a = get_and_clean_wikipedia_list(url_wikipedia_ps4_AL, 'PS4')

# Get Wikipedia PS4 games M-Z
df_ps4_m = get_and_clean_wikipedia_list(url_wikipedia_ps4_MZ, 'PS4')

# Merge lists
df = df_ps4_a.merge(df_ps4_m, how='outer')
df = df.merge(df_ps5, how='outer')
df = df.sort_values(by=['Title'])

df.to_pickle(os.path.join(GAME_DATA_PATH, 'wikipedia.pkl'))


OSError: Cannot save file into a non-existent directory: '/Users/crempp/projects/data-experiments/GameData/GameData/data'

In [11]:
response = requests.get(url_wikipedia_ps4_AL)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', id='softwarelist')
tr = table.find_all('tr')[2:]
raw_cells = list(map(lambda x: x.find_all(['th','td']), tr))
df = pd.DataFrame(raw_cells)
df.columns =['Title',
             'Genre',
             'Developer',
             'Publisher',
             'Release Date JP',
             'Release Date NA',
             'Release Date PAL',
             'Addons',
             'Ref']

In [12]:
# df_ps4_a.info()
type(df.loc[0, "Genre"])


bs4.element.Tag