In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

from tdi_capstone_common_functions import *
# the two functions imported are: standardize_string and pseudo_list_parser

In [2]:
# A distinct game entry is defined by the unique combination of the following parameters

unique_game = ['name', 'platform', 'release_year']

The project merges two data sets for training the model and predicting sales. The first is a sales data set, retreived from Kaggle (https://www.kaggle.com/datasets/thedevastator/global-video-game-sales-and-reviews). It contains a few other features, such as genre, critic scores, etc. The other data set contains information about games and was generated by accessing IGDB, an online games database (www.igdb.com) via its API. Before merging, the data sets need to be processed and aligned in such a way that they can be matched.

## Sales data set

The sales data set is dealt with first as it does not require much in terms of processing.

In [3]:
# import the sales data csv into a pandas dataframe

# names of the dataframe columns
sales_columns = ['index', 'name', 'platform', 'release_year', 'genre', 'publisher',
                 'sales_na', 'sales_eu', 'sales_jp', 'sales_other', 'sales_global',
                 'critic_score', 'critic_count', 'user_score', 'user_count', 'developer', 'rating']

# names of the columns to be dropped
sales_drop_columns = ['critic_score', 'critic_count', 'user_score', 'user_count']

# reading csv file into dataframe
df_sales = (pd.read_csv('sales_data.csv', skiprows=1, names=sales_columns, index_col='index')
            .drop(sales_drop_columns, axis=1)
            .drop_duplicates() # There are 209 duplicated rows, which are removed here
            .dropna(subset=['name'])) # There are 2 rows that have NaN as 'name' and therefore cannot be used

The release year column in this data set contains floats, all of which are either a year number followed by .0 or a NaN. We want years to be ints as well as handle the NaN values. Release year is one of the features that are used to distinguish one game from another, so normally, we would not be able to use rows containing NaN values in these features. However, it is possible to replace the NaN values with the correct values manually, so these rows can still be useful. It is possible to designate data types for specific columns with read_csv, but pandas cannot do this conversion because of the NaN values. Instead, the NaN values are filled in with -1, an int itself which doubles as a flag, so these rows can be corrected manually.

In [4]:
# NOTE: immediately after this I would put in the code that manually corrected the missing values.
# I want to keep the fillna(-1) just in case I missed or forgot to correct one of these rows.

df_sales['release_year'] = df_sales['release_year'].fillna(-1).astype('int')

Now it is possible to filter the data set to focus on a specific range of years.

In [5]:
# filter the data set to retain certain years, here 2010-2020 (inclusive)

time_boundaries = {'start': 2010, 'end': 2020}

def filter_by_time_boundaries(df, column, time_limits=time_boundaries, include_start=True, include_end=True, flag=False):
    """
    Filters a dataframe to retain only rows that fall within a specified range of years.
    
    Parameters
    ----------
    df : pandas.DataFrame()
        The dataframe object to be filtered.
    column : str
        The name of the column containing the years, according to which the function will evaluate which rows to keep.
    time_limits : dict, default time_boundaries
        A dictionary with two keys, 'start' and 'end', denoting the two end points of the range to retain. Values of this
        dictionary are integers. The project's default values are 2010 and 2020, set in a variable called time_boundaries.
    include_start : bool, default True
        Whether the function is inclusive of the start year.
    include_end : bool, default True
        Whether the function is inclusive of the end year.
    flag : object, default False
        Whether the function is inclusive of years with values equal to the value of flag.
        Note that these must be truthy values in order for the function to include these rows.
        Therefore, the default of False would not include flagged rows.
    
    Returns
    -------
    pandas.DataFrame
        The filtered dataframe.
    """

    # sets up the filter based on the start of the range
    if include_start:
        start_filter = (df[column] >= time_limits['start'])
    else:
        start_filter = (df[column] > time_limits['start'])
    
    # sets up the filter based on the end of the range
    if include_end:
        end_filter = (df[column] <= time_limits['end'])
    else:
        end_filter = (df[column] < time_limits['end'])
    
    # sets up the filter based on whether to include rows flagged for missing values
    if flag:
        flagged_filter = (df[column] == flag)
        
        # return rows if they fall between start and end (inclusive or exclusive) or if rows were flagged for missing values
        return df[(start_filter & end_filter) | flagged_filter]
    
    # return only rows that fall between start and end (inclusive or exclusive), excluding flagged rows
    return df[start_filter & end_filter]

# Note: currently the project filters out games that do not have a release year (marked with a flag of -1)

In [6]:
# filtering the sales data set by years
df_sales = filter_by_time_boundaries(df_sales, 'release_year')

Some columns contain strings, which need to be standardized. Most of them are a simple case of lowercase.

In [7]:
# columns with strings to lowercase
regular_string_columns = ['platform', 'genre', 'publisher', 'developer', 'rating']

# standardizes string columns with lowercase
df_sales[regular_string_columns] = df_sales[regular_string_columns].applymap(lambda x: x.lower() if isinstance(x, str) else x)

The names of games are more complex. Names can include diacritics, non-alphabet characters, East Asian characters, symbols, etc. There are also not confined to a limited vocabulary, such as genre or platform. It is important to remember that the names of games are only important insofar as they helps identify unique entries.

In [8]:
# import re
# from unidecode import unidecode

# def standardize_string(string):
#     """
#     Standardize strings by converting diacritics and East Asian characters
#     as well as removing symbols such as parentheses (and their inclused
#     content), retaining only alphanumeric characters.
    
#     Parameters
#     ----------
#     string : str
#         The string to be standardized.
    
#     Returns
#     -------
#     str
#         The standardized string in lowercase and stripped of
#         preceding/following whitespace.
#     """

#     # if string is not a str, return an empty string
#     if not isinstance(string, str):
#         return ''
   
#     # convert everything to unicode, addressing diacritics as well as chinese characters
#     string = unidecode(string)
    
#     # remove any non-alphanumeric character or non-space as well as parenthesis (and their enclosed content)
#     regex = r'\([^)]*\)|[^a-zA-Z0-9\s]'
#     string = re.sub(regex, '', string)
    
#     # standardize spacing to retain a single space between words
#     string = re.sub(r'\s+', ' ', string)
    
#     # change to lowercase and strips whitespaces
#     return string.lower().strip()

In [9]:
# standardizes strings in the name column, handling non-alphanumeric characters as well as removing parentheses
df_sales['name'] = df_sales['name'].map(lambda x: standardize_string(x) if isinstance(x, str) else x)

By grouping rows according to unique_game, I eliminate duplicates while saving the max value for other columns, which can be assumed to be more up-to-date (since sales can only increase, not decrease). For this data set, this affects only a single game (name = 'madden nfl 13'; platform = 'ps3'; release_year = 2012), which has two different sales values and therefore is not removed when duplicates are removed.

In [10]:
df_sales = df_sales.groupby(unique_game).agg('max').reset_index()

With the sales data set ready, we now move to importing and preparing the games data set.

## Preparing the games data set

The games data set contains a lot of columns, about half of which are useful or could potentially be useful. Most are self-explanatory.
It's worth noting that the majority of columns do not contain text, but rather the ID representing this data. For example, an entry of an action game would have XXXXX under genres rather than 'action'. To decode these numbers, one has to access the respective endpoint via IGDB's API. Since the actual name of the genre is not important (and would simply be one-hot-encoded probably), there is no need to transform those, and we can simply work with those IDs.

* name
* alternative_names
* first_release_date
* release_dates: in-depth information on release dates based on region, platform, etc.
* platforms
* genres
* themes
* franchise
* franchises: other franchises this entry belongs to
* keywords: e.g., 'world war 2', 'steampunk'
* game mode: e.g., single player, multiplayer
* player_perspectives
* multiplayer_modes
* summary: text description of the game
* storyline
* parent_game
* bundles: bundles containing this game
* collections: collections featuring this entry
* collection: the specific series the entry belongs to
* language_supports
* game_localizations
* similar_games
* involved_companies: note that this is under development at IGDB
* game_engines: the type of engine the game uses
* age_rating
* category: 0 = main game, 1 = DLC, 2 = expansion; the majority, over 226k entries, are category 0
* external_games: game platforms featuring this entry, e.g., Steam, GOG, Twitch, Epic

The rest of the columns in the games data set can be dropped with impunity.

In [11]:
# columns to drop in the games data set

games_drop_columns = ['artworks', # images of the game
                      'cover', # cover art
                      'created_at', # date when game entry was created at IGDB
                      'screenshots', # screenshots of the game
                      'slug', # unique url name string
                      'tags', # auto-generated numbers for complex API filtering
                      'updated_at', # last time the entry was updated at IGDB
                      'url', # link to the game's entry in IGDB
                      'version_parent', # if the entry is a version of another entry, this is the ID of the parent entry
                      'version_title', # title of this version, e.g. gold edition
                      'checksum', # hash of the game entry
                      'websites', # websites associated with entry, e.g., developer's game page
                      'follows', # no. people following the game on IGDB (depricated)
                      'videos', # videos of gameplay
                      'hypes', # no. people following the game on IGDB before its release
                      'dlcs', # ID of DLCs of the entry
                      'expansions', # ID of expansions of the entry
                      'remakes', # ID of remakes of the entry
                      'expanded_games', # ID of expanded games related to this entry
                      'remasters', #ID of games that are remastered versions of this entry
                      'standalone_expansions', # ID of stand-alone expansions of this entry
                      'aggregated_rating', # aggregated rating based on external critic scores
                      'aggregated_rating_count', # no. external critic scores
                      'rating', # rating based on public reviews on IGDB
                      'rating_count', # no. public reviews on IGDB
                      'total_rating', # average rating based on critic and public review scores
                      'total_rating_count', # no. reviews overall
                      'forks', #
                      'ports' # the ports (other platforms) the entry has other than current
                     ]

Due to size restrictions on GitHub, the games data set is split into 10 files, so they need to be concatenated into a singular dataframe.

In [12]:
df_games = pd.DataFrame()

for i in range(0, 10):
    df_partial = pd.read_csv(f'games_data_{i}.csv', low_memory=False, index_col='Unnamed: 0')
    df_games = pd.concat([df_games, df_partial], axis=0)

In [13]:
# Drop the irrelevant columns, rename platforms to platform for consistency, and drop duplicates

df_games = df_games.drop(games_drop_columns, axis=1).rename(columns={'platforms': 'platform'}).drop_duplicates()

In [14]:
# if the games dataset was stored as one file, this command would have done all of the above
# df_games = (pd.read_csv('igdb_raw.csv', low_memory=False, index_col='Unnamed: 0')
#             .drop(games_drop_columns, axis=1)
#             .rename(columns={'platforms': 'platform'})
#             .drop_duplicates())

Since distinct games are identified by name, platform, and release year, we use the first_release_date to extract the release year. However, first_release_date is a floating point number that needs to be converted into a more manageable (and useful for us) format, namely a date format, from which we can then extract the release year itself (which is the same kind of data the sales data set has).
Just as in the case with the sales data set, NaN values are replaced by -1 (as a flag value) and type cast into int.

In [15]:
df_games['release_year'] = pd.to_datetime(df_games['first_release_date'], unit='s').dt.year.fillna(-1).astype('int')

Then the data set is filtered to retain only rows with games whose (initial) release year falls within the time boundaries set for the project.

In [16]:
df_games = filter_by_time_boundaries(df_games, 'release_year')

Now that the games data set has all three features that make up a distinct entry properly formatted, any entry that does not have one of these three key features is removed. Unlike the case with the sales data set, which has slightly over 200 such enties and therefore can be handled manually, the games data set has over 95,000 such entries.

In [17]:
# remove all rows that have missing values in any of the fields that define a unique game (name, platform, year).
# this can be done with the unique_game variable only after the creation of the 'release_year' column above.

df_games = df_games.dropna(how='any', subset=unique_game)

In [18]:
# standardizes the name column and drops any row that returned an empty string (i.e. names with only special characters)

df_games['name'] = df_games['name'].map(lambda x: standardize_string(x))
df_games = df_games[df_games['name'] != '']

Quite a few of the columns of the games data set contain lists of values. These, however, are imported as strings, e.g. '[12623, 6231, 96023]'. Therefore, these strings need to be parsed into proper lists.

In [19]:
# def pseudo_list_parser(item, dtype=int, ignore_space=True):
#     """
#     Parse a string that appears as a list into a list of elements of a specified data type.
#     It assumes elements are separated by a comma.
#     These kind of strings are a common outcome of reading lists from csv files into a dataframe.
    
#     Parameters
#     ----------
#     item : str
#         The string to be parsed.
#     dtype : data type, default int
#         The data type that would comprise the elements of the parsed list.
#     ignore_space : bool, default True
#         Whether to remove spaces (appearing most often after a comma in a list string) from the string before parsing.
    
#     Returns
#     -------
#     list or object
#         Returns the parsed string as a list of elements of the specified data type.
#         If the first argument passed to the function is not a string (e.g. NaN),
#         returns the item without performing any operations.
#     """

#     if isinstance(item, str):
#         if ignore_space:
#             item = item.replace(' ', '')
#         return [dtype(x) for x in item.replace('[','').replace(']', '').split(',')]
    
#     return item

In particular, the platform column requires special attention, since it is one of the three elements that define a distinct game entry.

In [20]:
# converts the platform column from a single string that looks like a list to an actual list of values.
# each element of the list is a string itself, since that would be mapped to match the platform values in the
# sales data set.

df_games['platform'] = df_games['platform'].map(lambda x: pseudo_list_parser(x, str))

While both data sets have the platform column, they encode this information differently. For example, the sales data set has 'pc', while the games data set has '6' and '13' for Windows and DOS respectively. In order to identify the distinct games and merge the data sets correctly, the platform values of one data set must be mapped onto those of the other data set. The sales data set is less granular (e.g. it has a single 'pc' value for both Windows and DOS), yet does not impact the ability to identify distinct game entries. For that reason, the platform values of the sales data set will be mapped onto those of the games data set. For example, after the remapping, the games data set will have 'pc' for all cases where originally it had the values '6' and '13' as platforms.
The following dictionary's keys are the platforms (IDs) that appear in the games data set, while its values are the corresponding platforms in the sales data set. This was accomplished by first querying the IGDB API's platforms endpoint and then looking up the names of the platforms.

In [21]:
# key = values found in df_games['platform']; value = values found in df_sales['platform']
games_to_sales_platform_dict = {
    'atari 2600': '2600',
    '37': '3ds',
    '137': '3ds', # new 3ds
    '20': 'ds', # nintendo ds
    '159': 'ds', # nintendo dsi
    '9': 'ps3',
    '7': 'ps2',
    '38': 'psp',
    '6': 'pc', # windows
    '13': 'pc', # DOS
    '5': 'wii',
    '12': 'x360',
    '4': 'n64', # nintendo 64
    '21': 'gc', # game cube
    '11': 'xb', # xbox
    '18': 'nes',
    '24': 'gba', # game boy advance
    '46': 'psv', # ps vita ; note also '165' = playstation vr, and '390' = playstation vr2 (both not included in this dict)
    '48': 'ps4',
    '49': 'xone', # xbox one
    '19': 'snes', # super NES
    '59': '2600', # atari 2600
    '41': 'wiiu',
    '32': 'sat', # sega saturn
    '33': 'gb', # game boy
    '22': 'gb', # game boy color
    '136': 'ng', # neo geo ; there are other neo geo variations in df_platforms, but there is no relevant game between 2010-2020    
    '29': 'gen', # sega genesis
    '274': 'pcfx',
    '23': 'dc', # dream cast
    '50': '3do', # 3do interactive multiplayer
    '57': 'ws', # wonderswan
    '86': 'tg16', # turbografx-16/pc engine cd
    '150': 'tg16', # turbografx-16/pc engine
    '78': 'scd', # sega cd
    '35': 'gg' # game gear
}

To apply this to the entire platform column, we will write a function that will be mapped onto it.

In [22]:
def map_values(search_key, value_mapping):
    """
    Map one set of values to another, thereby matching values between two data sets.
    
    Parameters
    ----------
    search_key : object or list
        The singular value or list to be remapped.
    value_mapping : dict
        A dictionary, where keys are the values passed to the function in search_key and the values are the new values to
        be used in the remapping.
    
    Returns
    -------
    list or object
        If a list was passed as the first argument of the function, a list with the corresponding (remapped) values will
        be returned. If no match for any value in search_key was found, and therefore no remapping could have taken place,
        return an empty list. If search_key was a singular value, returns the corresponding (remapped) value if found, or
        a np.nan otherwise.
        
    Notes
    -----
    The following code accomplishes this as well but provides less control:
        for item in set(l).intersection(value_mapping):
            remapped.values.append(value_mapping[item])
    or alternatively:
        remapped_values = [item for item in set(l).intersection(value_mapping)]
    
    Future development
    ------------------
    The following code would look in value_mapping to see if it finds a match. If it doesn't, it will keep the value as is,
    and when a dataframe merge takes place, rows with these values will be dropped since there will be no match.
    If no element in search_key appears in value_mapping, set the return value to np.nan. If there are elements, make sure
    to remove any duplicate values by transforming the list into a set.
        for item in search_key:
            value = value_mapping.get(item, item)
        remapped_values.append(value)
        remapped_values = list(set(remapped_values))
        if not remapped_values:
            remapped_values = np.nan
        else:
            remapped_values = list(set(remapped_values))
    """
    
    # Handling the case in which search_key is a list
    if isinstance(search_key, list):
        
        # initializes the return value as an empty list
        remapped_values = []
        
        # For every item in the search_key list, the loop attempts to find it as a key in the value_mapping dict.
        # If it cannot, it continues to the next iteration (due to the if statement).
        # If it does, it appends the non-None value to remapped_values, which is the return value of the function
        for item in search_key:
            
            # retrieving the new, remapped value or None if it was not found 
            value = value_mapping.get(item, None)
            
            # corresponding values are appended to the return value of the function
            if value:
                remapped_values.append(value)
    
    # Handling the case in which search_key was not a list (function assumes this means it is a single value)
    else:
        
        # retrieves the new, remapped value or NaN if it was not found
        remapped_values = value_mapping.get(search_key, np.nan)
    
    return remapped_values

The dictionary of corresponding platforms across the two data sets is then fed to a function which maps one set of possible values onto another.

In [23]:
# changes the values in the platforms column to the values used for platform in the df_sales
df_games['platform'] = df_games['platform'].map(lambda x: map_values(x, games_to_sales_platform_dict))

Now the platforms in both data sets draw from the same vocabulary. However, while each entry in the sales data set has a unique name, release year, and platform, the game data sets lists all platforms that a specific name-release year combination have. Since the platform is a feature used by the model, each entry in the games data set must have its own unique name-release year-platform combination.

In [24]:
# Transform rows that have multiple platforms listed into separate rows for each platform, copying all other information

df_games = df_games.explode('platform')

# This results in over 13,000 rows that have a NaN value as their platform. These have not been previously removed because
# before exploding, their platform column contained an empty list, which is not considered a NaN value.
# So now it is possible to remove them based on their NaN value.

df_games = df_games.dropna(subset=['platform'])

With a few exceptions, the rest of the columns in the games data set simply need to be parsed from strings that look like lists to actual lists.

In [25]:
# converts all columns except for specific ones

columns_not_to_parse = ['id', 'name', 'release_year', 'platform', 'summary', 'storyline']

columns_to_parse = list(df_games.columns)

# removes columns_not_to_parse from columns_to_parse
for column in columns_not_to_parse:
    columns_to_parse.remove(column)

# parse all relevant columns
for column in columns_to_parse:
    df_games[column] = df_games[column].map(lambda x: pseudo_list_parser(x))

With all the columns treated, it is almost time to merge the two data sets. However, there are still cases where the same unique game (as determined by the name-release year-platform combination) has more than one entry, likely because they had multiple entries in IGDB. The issue is that each of these different entries for the same game can have different values for the features, e.g., one entry could encode different keywords than another entry for the same game. The data in both entries is valid and valuable, and rather than pick one over the other, these cases of multiple entries for the same game are aggregated to include all the data pertaining to a specific game.
Since the data most often appear as a list of IDs (corresponding to various genres, themes, keywords, etc.), we'll need a function that can aggregate multiple values (i.e. multiple elements of a list).

In [26]:
def mva(group):
    """
    Aggregate multiple values, used for aggregating pandas dataframe groupby objects, into a single flattened list.
    
    Parameters
    ----------
    group : pandas.Series
        The series containing the different values that will be combined into the single flattened list to be returned.
    
    Returns
    -------
    list or pandas.Series or np.nan
        Returns a flattened list containing all the elements of the Series passed as an argument to the function, with
        no duplicates. If this Series contain only a single element, no aggregation is necessary, and the function returns
        the Series as is. If the Series is empty, there were only NaN values, and the function returns np.nan.
    
    Notes
    -----
    Pandas dataframe groupby objects' agg method passes every column of every group as a Series to the aggregate function.
    """
    
    # removes all NaN values
    group = group.dropna()
    
    # if the group is empty, then it means that there were only NaN values in it
    if len(group) == 0:
        return np.nan
    
    # if the group has one element, it is the only one that needs to be returned
    if len(group) == 1:
        return group
    
    # otherwise, there are multiple elements that need to be combined into a list
    aggregated_value = []
    
    for value in group:
        if isinstance(value, list):
            aggregated_value.extend(value)
        else:
            aggregated_value.append(value)
        
    return list(set(aggregated_value))

The agg method for data frames groupby objects can accept a dictionary, where keys are the column names and values are the functions to use for the aggregation operation. These can be actual functions or strings denoting names of specific functions (e.g. 'min'). The name, release year, and platform columns are excluded from this dictionary because the data is grouped by these three columns.

In [27]:
# sets the behavior of how each column would be aggregated by using dictionaries, where key = column name, and
# value = the function (e.g. mva) or the name of the function (e.g. 'min')

# columns that have multiple values that need to be combined into a flattened list
mva_columns = ['age_ratings', 'category', 'external_games', 'game_modes', 'genres', 'release_dates', 'similar_games',
              'summary', 'themes', 'language_supports', 'involved_companies', 'keywords', 'multiplayer_modes', 'status',
              'alternative_names', 'bundles', 'franchises', 'game_engines', 'player_perspectives', 'game_localizations',
              'collections', 'parent_game', 'collection', 'storyline', 'franchise']
mva_dict = {key: mva for key in mva_columns}

# columns that can be aggregated by taking the minimum value
min_columns = ['id', 'first_release_date']
min_dict = {key: 'min' for key in min_columns}

# creates a single dictionary with the above aggregation behavioral dictionaries.
# This dictionary will be passed onto the agg method of the groupby object
column_aggregation_dict = {**mva_dict, **min_dict}

In [28]:
# aggregates the df_games in order to remove duplicate entries, likely due to multiple entries for the same game in the
# dataset itself. Note that this process can take a couple of minutes.

df_games_agg = df_games.groupby(unique_game).agg(column_aggregation_dict).reset_index()

## Merging sales and games data

Since a unique game is defined by the combination of its name, release year, and platform, it is necessary to make sure that all three of these can be matched. Release years and platforms are confined to a certain number of fixed values (e.g., 'pc' or 'ps4' for platforms and 2015 or 2019 for release years), and the code above made sure that both data sets use the same vocabulary of values.

Names of games, however, can wildly vary. Even the same game can have different spellings of its name (or could have been input differently, e.g., with colons and hyphens, or using digits vs. roman numerals). Instead of a one-to-one match like with release year and platform, names will be matched by closest match. This introduces a couple of complexities. First, the same name can be used by several games, often referring to older/newer releases of a title. A straightforward closest match would not work, since this would result in finding the (first) closest match in the data set. This also reveals the second caveat here, which is that multiple names in one data set might match most closely to a single name in the other data set.

These issues can be solved by making the assumption that in a given year and for a given platform, every game will have a distinctive name. Put another way, every game for each year-platform combination will have a unique name. This means it is possible to look for closest name matches within a given year-platform combination.

In [29]:
from collections import defaultdict

def generate_year_platform_dict(df, year_column='release_year', platform_column='platform'):
    """
    Create a dictionary of all combinations of year-platform in the data set, where keys are release year and values are
    a list of platforms for which games were made for that year (insofar as they appear in the data set).
    
    Parameters
    ----------
    df : pandas.DataFrame
        The data frame containing the data set. Must contain columns with the names that appear in the two other arguments.
    year_column : str, default 'release_year'
        The name of the column containing the release year that will be used as the return value dictionary's keys.
    platform_column : str, default 'platform'
        The name of the column containig the platforms that will be used as the return value dictionary's values.
        
    Returns
    -------
    dict
        A dictionary with keys of release year and values of the platforms with games that were released that year.
    """
    
    return_dict = defaultdict(list)
    
    for year in df[year_column].unique():
        for platform in df[df[year_column] == year][platform_column].unique():
            return_dict[year].append(platform)
    
    return return_dict

The first step is to find all possible release year-platform combinations in the data set. Not every year has games in every single platform. Because we need to do so for both data sets, we can use a function that will generate a dictionary, where the keys are the release year and the values are platforms which had games released in that (key) release year.

In [30]:
# all possible combination of year-platform in the sales data

sales_comb = generate_year_platform_dict(df_sales)

In [31]:
# all possible combination of year-platform in the games data

games_comb = generate_year_platform_dict(df_games_agg)

The two data sets don't fully overlap in terms of what games they contain, so some release year-platform combinations are shared, while others are exclusive to one or the other data set. Since we will be using this to match names between the two data sets, only the shared combinations are relevant.

In [32]:
# Creates a dictionary with release year as key and list of platforms as values that are shared between the two data sets

shared_comb = {}

for sales_year, sales_platform in sales_comb.items():
    
    # retrieves the list of platforms from df_game according to the year from df_sales; or None if there were no platforms
    # for that sales year
    games_platform = games_comb.get(sales_year, None)
    
    # add to that sales year the platforms that are shared for both datasets for that particular year
    if games_platform:
        shared_comb[sales_year] = list(set(sales_platform).intersection(games_platform))

In [33]:
from rapidfuzz import process

The following function finds the closest match to a given string out of possible choices and removes the one it picked from the available choices, so that that choice cannot be matched again.

In [34]:
def find_match_and_remove(string, choices):
    """
    Find the closest string from available choices based on edit distance. Once a match is found, that particular choice is
    removed from the available future choices to prevent multiple strings matching to the same choice.
    
    Parameters
    ----------
    string : str
        The string to be matched.
    choices : iterable
        The array of strings (most commonly a list or a Series) from which the closest match will be taken.
    
    Returns
    -------
    tuple
        A tuple of four elements: the string that was the closest match, the match score of the closest match (0 = completely
        different string, 100 = identical string), the index within the iterable choices of the closest match, and the array
        of choices (after the removal of the choice that was matched).    
    """
    
    match, score, index = process.extractOne(string, choices)
    choices = choices.drop(index)
    return match, score, index, choices

Now it is possible to create a new data frame that will contain the unique game entries in the sales data set with their matched name and index in the games data set, alongside their match score.

In [35]:
# Creates an empty data frame that will hold the data of the sales data frame alongside its match in the games data set
df_sales_name_matched = pd.DataFrame()

# Goes over all the shared release year-platform combination
for year, platforms in shared_comb.items():
    # for every platform in a given year
    for platform in platforms:
        # retrieve the unique games for this particular release year-platform combination from the sales data set
        filtered_sales = df_sales[(df_sales['release_year'] == year) & (df_sales['platform'] == platform)][unique_game]
        # retrieve the unique games for this particular release year-platform combination from the games data set
        filtered_games = df_games_agg[(df_games_agg['release_year'] == year) & (df_games_agg['platform'] == platform)][unique_game]

        # for this particular release year-platform combination, names in the filtered_sales data set will have choices from
        # names in the filtered_games data set
        choices = filtered_games['name']
        
        # each entry in the filtered_sales will now have a 'closest match' (the closest name), 'match_score' (how close of a
        # match it is, described above), and 'index_in_df_games_agg' (the index of the matched name in the games data set).
        filtered_sales['closest_match'], filtered_sales['match_score'], filtered_sales['index_in_df_games_agg'], choices = zip(*filtered_sales['name'].apply(lambda x: find_match_and_remove(x, choices)))
        
        # once the loop ran through all platforms for this year, it will slowly build up the df_sales_names_matched data frame,
        # before proceeding to the next platform for that particular year or to the next year (if it exhausted all platforms for
        # that year)
        df_sales_name_matched = pd.concat([df_sales_name_matched, filtered_sales], axis=0)

Some games have a match score of 100, meaning that they have identical names. Others have lower match score, meaning that they have names of varying degrees of similarities. From testing different match score thresholds, it seems that a match score of 90 is a good one, where there are many good matches. Anything lower that 90 results in many mismatches and only additional good matches.

Now df_sales_name_matched holds only the unique game entries that have matches in the games data set, but really only has the name, platform, and release_year columns (i.e., the ones that mark a distinct game) from the sales data frame. The next step is to add all the other columns from the sales data set. There is no need to reduplicate the three aforementioned columns, so they are dropped from the sales data set.

In [36]:
# joins the columns from df_sales (after dropping the shared columns, i.e., unique_game) to the matched up rows
df_sales_name_matched = df_sales_name_matched[df_sales_name_matched['match_score'] >= 90].join(df_sales.drop(unique_game, axis=1))

Finally, it is possible to merge the sales data set (filtered with matches to the games data set) with the games data set. This is done by matching on the index of games data set, which is then dropped, since that index has no value for modelling.

In [37]:
# merges the sales data (also containing the matched indices from the games data) with the aggregated games data

final_columns_to_drop = ['index_in_df_games_agg']

df_final = (pd.merge(df_sales_name_matched,
                     df_games_agg.drop(unique_game, axis=1), left_on='index_in_df_games_agg', right_index=True)
            .drop(final_columns_to_drop, axis=1)
            .reset_index(drop=True))

Lastly, the joined data set is saved to a csv file.

In [38]:
df_final

Unnamed: 0,name,platform,release_year,closest_match,match_score,genre,publisher,sales_na,sales_eu,sales_jp,sales_other,sales_global,developer,rating,age_ratings,category,external_games,game_modes,genres,release_dates,similar_games,summary,themes,language_supports,involved_companies,keywords,multiplayer_modes,status,alternative_names,bundles,franchises,game_engines,player_perspectives,game_localizations,collections,parent_game,collection,storyline,franchise,id,first_release_date
0,achtung panzer kharkov 1943,pc,2010,achtung panzer kharkov 1943,100.0,strategy,mamba games,0.00,0.01,0.00,0.00,0.02,paradox interactive,,[124163],0,"[40103, 127834, 246584, 2519870]",[1],"[11, 15]",[4233],"[3700, 6985, 10243, 16497, 28794, 30229, 31480...",Want to find yourself in the very center of Kh...,"[22, 39]",[479437],"[4124, 4125]","[19, 1107, 4134, 5653]",,,,,,,[3],,[2079],,2079.0,,,2008,1.267056e+09
1,age of conan rise of the godslayer,pc,2010,age of conan rise of the godslayer,100.0,role-playing,funcom,0.00,0.02,0.00,0.00,0.03,funcom,m,"[17157, 59915]",2,"[41662, 131800]","[3, 5]","[12, 31]",[188559],"[27092, 36198, 47823, 55038, 55199, 81249, 962...",Age of Conan: Rise of the Godslayer is the fir...,"[1, 17]",,"[1419, 44892, 44893]","[900, 1158, 1538, 2472, 4468, 7161, 9092, 1306...",,,"[22294, 22295]",,[3],[436],[2],,[123],16402.0,123.0,"Age of Conan: Rise of the Godslayer, will deli...",3.0,588,1.273536e+09
2,aliens vs predator,pc,2010,aliens vs predator,100.0,shooter,sega,0.00,0.17,0.00,0.02,0.19,rebellion,m,"[34071, 34072, 74254, 108863, 108864]",0,"[14771, 62537, 72988, 72989, 72990, 78778, 133...","[1, 2, 3]",[5],"[1259, 1260, 107222, 107223, 409136]","[356, 494, 495, 564, 571, 1006, 2031, 7020, 73...",Bringing the legendary war between two of scie...,"[1, 18, 19, 21]","[74009, 74010, 74011, 74012, 74013, 74014]","[1357, 1358]","[3, 129, 453, 558, 1158, 1186, 1286, 1293, 133...",[1915],,"[68328, 68329]",,"[23, 464, 507]",[36],[1],,[126],,126.0,,,560,1.266278e+09
3,all points bulletin,pc,2010,apb all points bulletin,95.0,role-playing,electronic arts,0.00,0.03,0.00,0.01,0.04,,,"[11102, 75084]",0,[2855915],"[2, 3]","[5, 10]","[65992, 65993]","[32902, 38030, 43367, 76263, 82090, 95340, 103...",APB: All Points Bulletin is a multiplayer onli...,[1],,"[252258, 252259]",,,,[8219],,,[351],[2],,,,,,,23369,1.277770e+09
4,alpha protocol,pc,2010,alpha protocol,100.0,role-playing,sega,0.00,0.03,0.00,0.01,0.03,obsidian entertainment,m,"[4576, 32711, 74260]",0,"[78785, 80919, 137977, 189388, 210034, 213165,...",[1],"[5, 12]","[5396, 5397, 5398, 107229, 107230, 190322]","[1877, 19164, 19564, 28309, 36198, 47823, 5503...",A talented young agent cast out by his governm...,"[1, 23]","[259092, 259093, 259094, 259095, 259096]","[6032, 6033]","[103, 1033, 1219, 1423, 4004, 4245, 5262, 5486...",,,,,,[6],[2],,,,,Loyalty carries a price and no one knows this ...,,2539,1.275005e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4197,wwe 13,ps3,2012,wwe 13,100.0,action,thq,0.51,0.60,0.00,0.25,1.35,thq,t,[2890],0,"[25482, 74337, 84687, 93499, 93500, 122173, 21...","[1, 2, 4]","[4, 8, 14, 15]","[12475, 12476, 12477, 106961, 106962, 144949, ...","[2265, 5296, 5297, 5298, 5299, 5300, 7167, 927...",WWE '13 is an upcoming professional wrestling ...,,,"[12805, 12806, 220224, 220236]","[234, 869, 1293, 1333, 4134, 4245, 4266, 4271,...",[935],,"[65829, 65830]",[148124],[255],,[2],,[464],,464.0,,255.0,5295,1.351555e+09
4198,xcom enemy unknown,ps3,2012,xcom enemy unknown,100.0,strategy,take-two interactive,0.24,0.23,0.00,0.10,0.57,firaxis games,m,"[26, 4399, 108262, 108263, 125694]",8,"[13319, 17666, 84729, 92397, 100169, 123327, 2...","[1, 2]","[12, 13, 15, 16, 24]","[3508, 3509, 3510, 28292, 28293, 106992, 106993]","[43, 495, 533, 1006, 1164, 1377, 3188, 5647, 7...",XCOM: Enemy Unknown will place you in control ...,"[1, 18, 39]","[242226, 242227, 242228, 242229, 242230, 24223...","[3687, 3688, 42284]","[3, 129, 415, 545, 558, 562, 575, 577, 578, 73...",,,"[36173, 47769]","[95365, 243463]",[1064],[6],[3],[25704],[9219],24.0,,,,1318,1.349741e+09
4199,yakuza,ps3,2012,yakuza 5,95.0,action,sega,0.00,0.00,0.11,0.00,0.11,,,"[34680, 34681, 69001]",0,"[53659, 84755, 124102, 246093, 400578, 1933249...",[1],[31],"[107008, 229362]","[1877, 17548, 25300, 26574, 81249, 87622, 9621...",Yakuza 5 is an open-world action-adventure gam...,"[1, 38]",,"[5083, 68675]","[274, 284, 1186, 3234, 17215]",,,"[61545, 102999]",,[1467],,[2],"[42, 19303]",[380],,,,,2063,1.354752e+09
4200,yakuza 5,ps3,2012,yakuza 5,100.0,action,sega,0.00,0.00,0.59,0.00,0.59,ryu ga gotoku studios,m,"[34680, 34681, 69001]",0,"[53659, 84755, 124102, 246093, 400578, 1933249...",[1],[31],"[107008, 229362]","[1877, 17548, 25300, 26574, 81249, 87622, 9621...",Yakuza 5 is an open-world action-adventure gam...,"[1, 38]",,"[5083, 68675]","[274, 284, 1186, 3234, 17215]",,,"[61545, 102999]",,[1467],,[2],"[42, 19303]",[380],,,,,2063,1.354752e+09


In [None]:
# write the complete data set into a CSV file.

df_final.to_csv('data_complete.csv', index_label='index')

# CONTINUE HERE

# END OF ORGANIZING

In [None]:
# NOTE:::: I should import_api on forks endpoint to see what I get in return.
# NOTE:::: I need to find what is the ID for 'action' in genres

## Section dealing with rows flagged with missing years

Platforms left to do: Wii, X360, DS, PS3, XB, 2600

In [None]:
df_sales[(df_sales['release_year'] == -1) & (df_sales['platform'] == 'Wii')]

In [None]:
list(df_sales[(df_sales['release_year'] == -1) & (df_sales['platform'] == 'Wii')]['name'])

In [None]:
#df_sales[df_sales['name'].str.contains('Super Robot Wars OG Saga: Masou Kishin II')]['name'][9739]

In [None]:
# platform = PC

# inversion, 2012
# Homeworld Remastered Collection, 2015
# WRC: FIA World Rally Championship, 2010
# GRID, 2019
# Clockwork Empires, 2016
# Dead Island: Riptide, 2013
# Rocksmith, 2011
# Test Drive Unlimited 2, 2011
# Dead Space 3, 2013
# LEGO Harry Potter: Years 5-7, 2011 | PC, 3DS, PSP
# BioShock 2, 2010
# Tomb Raider, 2013 
# TERA, 2011
# Call of Duty: Black Ops, 2010
    
# Disgaea 3: Absence of Detention, 2011 | PSV
    
# 3DS
# Harvest Moon: The Tale of Two Towns, 2010
# Pet Zombies, 2011
# Face Racers: Photo Finish, 2011
# The Hidden, 2011
# Dream Trigger 3D, 2011
# Beyond the Labyrinth, 2012

# PSP
# Danganronpa: Trigger Happy Havoc, 2010
# Valkyria Chronicles III: Unrecorded Chronicles, 2011
# Super Robot Wars OG Saga: Masou Kishin II - Revelation of Evil God, 2012
# Fullmetal Alchemist: Brotherhood, 2010

# PS2
# Madden NFL 2004, 2003
# FIFA Soccer 2004, 2003
# wwe Smackdown vs. Raw 2006, 2005
# NASCAR Thunder 2003, 2002
# Rock Band, 2007
# Suikoden III, 2002
# Wheel of Fortune, 2003
# MLB SlugFest 20-03, 2002
# Monster Hunter 2, 2006
# 'NASCAR: Dirt to Daytona', 2002
#  'NFL GameDay 2003', 2002
#  'Harvest Moon: Save the Homeland', 2001
#  'Final Fantasy XI', 2002
#  'All-Star Baseball 2005', 2004
#  'Haven: Call of the King', 2002
#  'College Hoops 2K6', 2005
#  'Mega Man X Collection', 2006
#  'Jet X20', 2002
#  'Tribes: Aerial Assault', 2002
#  'Yu Yu Hakusho: Dark Tournament', 2004
#  'NBA Starting Five', 2002
#  "James Cameron's Dark Angel", 2002
#  'Sword of the Samurai', 2002
#  'eJay Clubworld', 2003
#  'Saru! Get You! Million Monkeys', 2006
#  'Star Trek: Conquest', 2007
#  'Demon Chaos', 2005
#  "McFarlane's Evil Prophecy", 2004
#  'Combat Elite: WWII Paratroopers', 2005
#  'Mountain Bike Adrenaline', 2007
#  'Sega Rally 2006', 2006
#  'Samurai Spirits: Tenkaichi Kenkakuden', 2005
#  "Cabela's Alaskan Adventure", 2006
#  'Virtua Quest', 2004

## End of section for flagged rows

### Exploration of data

I used to following code to determine the correlations of the platforms between df_games and df_sales.
I do not need to run any of this.

In [None]:
#df_platforms = pd.read_csv('platforms.csv', index_col='Unnamed: 0')