In [18]:
import numpy as np
import os
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer

In [22]:
# Both folders live under "<current working directory>/data":
#   - data_directory:        input CSVs (entity and relationship tables)
#   - data_upload_directory: destination of the final classification dataset
project_data_root = os.path.join(os.getcwd(), "data")
data_directory = os.path.join(project_data_root, "initial_upload", "transformed_data")
data_upload_directory = os.path.join(project_data_root, "classification_data")

# Transform Data

## Games

In [52]:
# read entity table from "transformed_data" folder
entity_game = pd.read_csv(os.path.join(data_directory, "entity_game.csv"))

# solve corrupted columns: 'achievements_count' is discarded and
# 'parent_achievements_count' takes over its name; 'added' becomes 'added_count'
entity_game = entity_game.drop(columns=['achievements_count'])
entity_game = entity_game.rename(columns={'parent_achievements_count':'achievements_count',
                                          'added':'added_count'}) # remove

# drop irrelevant columns (names left commented out are deliberately retained)
entity_game = entity_game.drop(columns=["slug", "name", "name_original", #"alternative_names",
                                        "updated",
                                        "rating_top", "reviews_count", #"ratings_count", "community_rating",
                                        "metacritic",
                                        "description_raw",
                                        #"saturated_color", "dominant_color",
                                        "background_image","background_image_additional",
                                        #"clip",
                                        "reddit_name","reddit_description","reddit_logo",
                                        "esrb",
                                        "score",
                                        #"user_game",
                                        "creators_count"])

# the per-status "added" counters, used for the tally check and for the percentages below
added_status_cols = ['added_yet', 'added_owned', 'added_beaten',
                     'added_toplay', 'added_dropped', 'added_playing']

# only retain games where added_count and sum of added statuses tally
# (skipna=False keeps the behaviour of a plain "+" chain: a missing status never tallies)
added_sum = entity_game[added_status_cols].sum(axis=1, skipna=False)
entity_game = entity_game[entity_game["added_count"] == added_sum]

# only retain games that are not "to be announced"
# (.copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning on a filtered slice)
entity_game = entity_game[entity_game['tba'] == False].drop(columns=["tba"]).copy()

# separate release date into day of the week (Monday=1 ... Sunday=7), day of the month, month and year
# pd.to_datetime still raises on malformed strings, but a missing date becomes NaT
# (and hence NaN in the four derived columns) instead of crashing the cell;
# such rows are surfaced by the "Columns w NA" report below
released = pd.to_datetime(entity_game['released'], format='%Y-%m-%d')
entity_game['day_of_week'] = released.dt.weekday + 1
entity_game['day_of_month'] = released.dt.day
entity_game['month'] = released.dt.month
entity_game['year'] = released.dt.year
entity_game = entity_game.drop(columns=['released'])

# transform website, description, movies and reddit_url values to boolean (1 = present, 0 = absent)
entity_game['website'] = entity_game['website'].notna().astype(int)
entity_game['description'] = entity_game['description'].notna().astype(int)
entity_game['reddit_url'] = entity_game['reddit_url'].notna().astype(int)
# a game "has movies" when its movie count is positive; the previous "== 0"
# comparison produced the inverse flag of the three columns above
entity_game['movies'] = (entity_game['movies_count'].fillna(0) > 0).astype(int)
entity_game = entity_game.drop(columns=['movies_count'])

# transform added status counts to percentages
# NOTE(review): a game with added_count == 0 yields NaN here (0/0) - confirm whether such rows can occur
for status_col in added_status_cols:
    entity_game[status_col] = (entity_game[status_col].fillna(0)/entity_game['added_count']).mul(100).round(2)

print("Shape:", entity_game.shape)
print("Columns w NA:", entity_game.columns[entity_game.isna().any()].tolist())

# ids of the retained games, used by later cells to filter the relationship tables
game_list = entity_game['id'].to_list()

  exec(code_obj, self.user_global_ns, self.user_ns)


Shape: (59, 28)
Columns w NA: []


## One Hot Encoding

In [53]:
def one_hot_encode(df: pd.DataFrame, rows: str, cols: str) -> pd.DataFrame:
    """
    Helper function to one hot encode categorical entities.

    Parameters
    ----------
    df : pd.DataFrame
        Long-format table with one row per (entity, category) pair.
        It is NOT modified by this function.
    rows : str
        Name of the column identifying the entity (e.g. "game_id"); every
        distinct value becomes one row of the result.
    cols : str
        Name of the string column holding the category label (e.g. "genre");
        every distinct label becomes one indicator column.

    Returns
    -------
    pd.DataFrame
        One row per distinct `rows` value (sorted), with
        - one 0/1 column per label, named "<cols>_<label>" (labels sorted,
          "-" replaced by "_"),
        - "<cols>s_count": number of distinct labels attached to the entity,
        - "id": the `rows` value, for joining purposes.
    """
    # replace "-" with "_" to get neater column names; work on a derived Series
    # so the caller's frame (often a filtered slice) is left untouched
    labels = df[cols].str.replace("-", "_", regex=False)

    # one hot encoding: count (entity, label) pairs, then cap at 1 so that a
    # duplicated pair still yields a 0/1 indicator
    # (plain arrays are passed so crosstab does not try to align on df's index)
    indicator = pd.crosstab(index=df[rows].to_numpy(), columns=labels.to_numpy()).clip(upper=1)

    new_df = indicator.add_prefix(cols + "_")
    new_df.columns.name = None  # drop crosstab's axis label

    # add count as a feature
    new_df[cols + "s_count"] = new_df.sum(axis=1)

    # create id column for joining purposes
    new_df["id"] = new_df.index

    return new_df.reset_index(drop=True)

### Platforms

In [60]:
# load the platform tables from the "transformed_data" folder; id/slug columns
# are renamed so they can be told apart once the tables are joined
entity_parent_platform = (
    pd.read_csv(os.path.join(data_directory, "entity_parent_platform.csv"))
    .rename(columns={"id":"parent_platform_id","slug":"parent_platform"})
)
entity_platform = (
    pd.read_csv(os.path.join(data_directory, "entity_platform.csv"))
    .rename(columns={"id":"platform_id","slug":"platform"})
)
rs_game_platform = pd.read_csv(os.path.join(data_directory, "rs_game_platform.csv"))

# game -> platform -> parent platform (inner joins), restricted to games in game_list
df_platform = rs_game_platform.merge(
    entity_platform.merge(entity_parent_platform, how="inner", on="parent_platform_id"),
    how="inner",
    on="platform_id",
)[["game_id","platform","parent_platform"]]
df_platform = df_platform[df_platform['game_id'].isin(game_list)]

# one hot encode both levels, then combine them into one table keyed by "id"
ohc_parent_platform = one_hot_encode(df_platform,"game_id","parent_platform")
ohc_platform = one_hot_encode(df_platform,"game_id","platform")
ohc_platform = ohc_platform.merge(ohc_parent_platform, how="inner", on=["id"])

ohc_platform

Unnamed: 0,platform_android,platform_ios,platform_linux,platform_macos,platform_nintendo_switch,platform_pc,platform_playstation4,platform_playstation5,platform_ps_vita,platform_xbox_one,...,id,parent_platform_android,parent_platform_ios,parent_platform_linux,parent_platform_mac,parent_platform_nintendo,parent_platform_pc,parent_platform_playstation,parent_platform_xbox,parent_platforms_count
0,0,0,0,0,0,1,1,0,0,1,...,28,0,0,0,0,0,1,1,1,3
1,0,0,0,0,0,1,0,0,0,0,...,9639,0,0,0,0,0,1,0,0,1
2,0,0,0,1,1,1,1,0,0,1,...,17327,0,0,0,1,1,1,1,1,5
3,0,0,1,1,1,1,1,1,0,1,...,22123,0,0,1,1,1,1,1,1,6
4,0,0,0,0,0,1,1,0,0,0,...,28109,0,0,0,0,0,1,1,0,2
5,0,0,0,0,0,1,1,1,0,1,...,41494,0,0,0,0,0,1,1,1,3
6,0,0,0,0,1,1,1,0,0,1,...,44641,0,0,0,0,1,1,1,1,4
7,0,0,0,0,1,1,0,0,0,0,...,44831,0,0,0,0,1,1,0,0,2
8,0,0,0,1,0,1,0,0,0,1,...,46287,0,0,0,1,0,1,0,1,3
9,0,0,0,0,1,1,1,0,0,1,...,46824,0,0,0,0,1,1,1,1,4


### Stores

In [59]:
# load the store entity table (id/slug renamed for joining) and the game-store relationship table
entity_store = (
    pd.read_csv(os.path.join(data_directory, "entity_store.csv"))
    .rename(columns={"id":"store_id","slug":"store"})
)
rs_game_store = pd.read_csv(os.path.join(data_directory, "rs_game_store.csv"))

# attach the store slug to every game-store pair, restricted to games in game_list
df_store = rs_game_store.merge(entity_store, how="inner", on="store_id")[["game_id","store"]]
df_store = df_store[df_store['game_id'].isin(game_list)]

# one indicator column per store, one row per game
ohc_store = one_hot_encode(df_store,"game_id","store")

ohc_store

Unnamed: 0,store_apple_appstore,store_epic_games,store_gog,store_google_play,store_nintendo,store_playstation_store,store_steam,store_xbox_store,stores_count,id
0,0,1,0,0,0,1,1,1,4,28
1,0,1,1,0,0,0,1,0,3,9639
2,0,0,1,0,1,1,1,1,5,17327
3,0,0,1,0,0,1,1,1,4,22123
4,0,0,0,0,0,1,1,0,2,28109
5,0,1,1,0,0,1,1,1,5,41494
6,0,1,1,0,1,1,1,1,6,44641
7,0,0,0,0,1,0,1,0,2,44831
8,0,0,1,0,0,0,1,1,3,46287
9,0,0,0,0,1,1,1,1,4,46824


### Publishers

In [61]:
# load the publisher entity table (id/slug renamed for joining) and the game-publisher relationship table
entity_publisher = (
    pd.read_csv(os.path.join(data_directory, "entity_publisher.csv"))
    .rename(columns={"id":"publisher_id","slug":"publisher"})
)
rs_game_publisher = pd.read_csv(os.path.join(data_directory, "rs_game_publisher.csv"))

# attach the publisher slug to every game-publisher pair, restricted to games in game_list
df_publisher = rs_game_publisher.merge(entity_publisher, how="inner", on="publisher_id")[["game_id","publisher"]]
df_publisher = df_publisher[df_publisher['game_id'].isin(game_list)]

# one indicator column per publisher, one row per game
ohc_publisher = one_hot_encode(df_publisher,"game_id","publisher")

ohc_publisher

Unnamed: 0,publisher_101xp,publisher_505_games,publisher_activision_blizzard,publisher_agm_playism,publisher_annapurna_interactive,publisher_armor_games_studios,publisher_bandai_namco_entertainment,publisher_blue_brain_games,publisher_capcom,publisher_cd_projekt_red,...,publisher_thunderful_publishing,publisher_tinybuild,publisher_toge_productions,publisher_ubisoft_entertainment,publisher_ultimate_games,publisher_unfold_games,publisher_xbox_game_studios,publisher_zhu_shi_hui_she_anipuretsukusu,publishers_count,id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,28
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,9639
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,17327
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,2,22123
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,28109
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,41494
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,44641
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,2,44831
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,46287
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,46824


### Genres

In [62]:
# load the genre entity table (id/slug renamed for joining) and the game-genre relationship table
entity_genre = (
    pd.read_csv(os.path.join(data_directory, "entity_genre.csv"))
    .rename(columns={"id":"genre_id","slug":"genre"})
)
rs_game_genre = pd.read_csv(os.path.join(data_directory, "rs_game_genre.csv"))

# attach the genre slug to every game-genre pair, restricted to games in game_list
df_genre = rs_game_genre.merge(entity_genre, how="inner", on="genre_id")[["game_id","genre"]]
df_genre = df_genre[df_genre['game_id'].isin(game_list)]

# one indicator column per genre, one row per game
ohc_genre = one_hot_encode(df_genre,"game_id","genre")

ohc_genre

Unnamed: 0,genre_action,genre_adventure,genre_card,genre_casual,genre_fighting,genre_indie,genre_massively_multiplayer,genre_platformer,genre_puzzle,genre_racing,genre_role_playing_games_rpg,genre_shooter,genre_simulation,genre_sports,genre_strategy,genres_count,id
0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,28
1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,3,9639
2,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,3,17327
3,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,4,22123
4,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,4,28109
5,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3,41494
6,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,3,44641
7,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,3,44831
8,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,46287
9,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,3,46824


### Tags

In [63]:
# load the tag entity table (id/slug renamed for joining) and the game-tag relationship table
entity_tag = (
    pd.read_csv(os.path.join(data_directory, "entity_tag.csv"))
    .rename(columns={"id":"tag_id","slug":"tag"})
)
rs_game_tag = pd.read_csv(os.path.join(data_directory, "rs_game_tag.csv"))

# attach the tag slug to every game-tag pair, restricted to games in game_list
df_tag = rs_game_tag.merge(entity_tag, how="inner", on="tag_id")[["game_id","tag"]]
df_tag = df_tag[df_tag['game_id'].isin(game_list)]

# one indicator column per tag, one row per game
ohc_tag = one_hot_encode(df_tag,"game_id","tag")

ohc_tag

Unnamed: 0,tag_25d,tag_2d,tag_3d,tag_3d_fighter_2,tag_3d_platformer,tag_3d_platformer_2,tag_3rd_person,tag_3rd_person_perspective,tag_4x,tag_abstract,...,tag_weapons,tag_western,tag_work,tag_world_war_ii,tag_zarubezhnyi,tag_zhenshchina_protagonist,tag_zombi_2,tag_zombies,tags_count,id
0,0,0,0,0,0,0,1,1,0,0,...,0,1,0,0,0,0,0,0,53,28
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,13,9639
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8,17327
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21,22123
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,11,28109
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,52,41494
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21,44641
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21,44831
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,11,46287
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,21,46824


# Merge Entities

In [67]:
# start from the game table and inner-join each one-hot table on "id";
# a game missing from any of the five tables is dropped by the inner join
data = entity_game.copy()
for ohc_table in (ohc_platform, ohc_store, ohc_publisher, ohc_genre, ohc_tag):
    data = data.merge(ohc_table, how="inner", on=["id"])

print("Shape:", data.shape)
print("Columns w NA:", data.columns[data.isna().any()].tolist())

Shape: (55, 575)
Columns w NA: []


In [65]:
# only retain publisher/genre/tag columns that are included in at least 1 game
#
# Two fixes over the previous one-liner:
#   * "data.sum() == 0 & mask" parsed as "data.sum() == (0 & mask)", so the
#     prefix mask was ignored and every zero-sum column was dropped;
#   * the one-hot columns are prefixed "publisher_", "genre_" and "tag_"; the
#     plural prefixes used before only matched the "*s_count" columns.
ohc_mask = data.columns.str.startswith(("publisher_", "genre_", "tag_"))
ohc_totals = data.loc[:, ohc_mask].sum()
unused_ohc_cols = ohc_totals[ohc_totals == 0].index
data = data.drop(columns=unused_ohc_cols)
print(data.shape)

(55, 568)


In [66]:
def bin(y):
    """
    Helper function to transform target variable from continuous to categorical.

    Returns the largest of 4, 3, 2 that `y` strictly exceeds, or 1 when `y`
    exceeds none of them (so 4.0 falls in class 3, and anything <= 2 in class 1).
    """
    # thresholds are checked from highest to lowest; the first one exceeded wins
    for threshold in (4, 3, 2):
        if y > threshold:
            return threshold
    return 1

# replace the continuous rating with its class label (1-4), element by element
data["rating"] = data["rating"].map(bin)

# Load Data

In [16]:
# write the final dataset to "<data_upload_directory>/data.csv" without the index column;
# the folder is created first because to_csv does not create missing directories
os.makedirs(data_upload_directory, exist_ok=True)
data.to_csv(os.path.join(data_upload_directory, "data.csv"), index=False)

In [17]:
# NOTE: everything between the triple single quotes below is a bare string
# literal, not executed code. It holds a disabled draft of a helper
# (extract_categories_as_columns) plus the cell that would have used it on the
# rating tables. Because the string is the cell's last expression, the notebook
# echoes it back as the cell output.
'''
def extract_categories_as_columns(df:pd.DataFrame(),entity:str) -> pd.DataFrame():
    """
    Helper function to extract categories as columns, with game_id as rows and count percentage as values.
    """
    # create dictionary to group data by game_id and category_id
    dic = {}
    for index, row in df.iterrows():
        game_id = row["game_id"]
        category = row[entity]
        if game_id not in dic:
            dic[game_id] = {}
        dic[game_id][entity+"_"+category] = row["count"]

    # convert dictionary to dataframe
    new_df= pd.DataFrame.from_dict(dic, orient='index').fillna(0)

    # add total count
    new_df[entity+"_count"] = new_df.sum(axis=1)

    # transform counts to percentages
    for col in new_df.columns:
        if col != entity+"_count":
            new_df[col] = (new_df[col]/new_df[entity+"_count"]).mul(100).round(2)

    # create id column for joining purposes
    new_df["id"] = new_df.index

    return new_df

# read entity and relationship tables from "transformed_data" folder
entity_rating = pd.read_csv(os.path.join(data_directory, "entity_rating.csv")).rename(columns={"id":"rating_id","title":"rating"})
rs_game_rating = pd.read_csv(os.path.join(data_directory, "rs_game_rating.csv"))

# inner join entity and relationship table
df_rating = rs_game_rating.merge(entity_rating, how="inner", on=["rating_id"])

# extract categories as columns
df_rating= extract_categories_as_columns(df_rating,"rating")

df_rating
'''

'\ndef extract_categories_as_columns(df:pd.DataFrame(),entity:str) -> pd.DataFrame():\n    """\n    Helper function to extract categories as columns, with game_id as rows and count percentage as values.\n    """\n    # create dictionary to group data by game_id and category_id\n    dic = {}\n    for index, row in df.iterrows():\n        game_id = row["game_id"]\n        category = row[entity]\n        if game_id not in dic:\n            dic[game_id] = {}\n        dic[game_id][entity+"_"+category] = row["count"]\n\n    # convert dictionary to dataframe\n    new_df= pd.DataFrame.from_dict(dic, orient=\'index\').fillna(0)\n\n    # add total count\n    new_df[entity+"_count"] = new_df.sum(axis=1)\n\n    # transform counts to percentages\n    for col in new_df.columns:\n        if col != entity+"_count":\n            new_df[col] = (new_df[col]/new_df[entity+"_count"]).mul(100).round(2)\n\n    # create id column for joining purposes\n    new_df["id"] = new_df.index\n\n    return new_df\n\n# 