In [80]:
import os
import pandas as pd

In [82]:
# move all the files in initial_upload_raw_data folder to raw_data

data_directory = os.path.join(os.getcwd(), "data", "initial_upload", "raw_data")

# Transform Data

## Main Entities

### Game Schema

In [13]:
# To put game list and game details' data together
df_game = pd.read_csv(os.path.join(data_directory, "game_data.csv"))
df_game_details_data = pd.read_csv(os.path.join(data_directory, "game_details_data.csv"), low_memory=False)

In [14]:
# Remove columns in df_game from df_game_details_data
df_game_details_data_subset = df_game_details_data[[col for col in df_game_details_data.columns if col not in df_game.columns.tolist()] + ["id"]].copy()
df_game_output = df_game.merge(df_game_details_data_subset, on=["id"], how="left")
df_game_output.drop_duplicates(subset=["id"], inplace=True)

In [15]:
#### Add GAME STATUS data into game table

df_status = pd.read_csv(os.path.join(data_directory, "game_status.csv"))
df_game_output = df_game_output.merge(df_status.rename(columns={"game_id": "id"}), on=["id"], how="left")

In [19]:
#### Add ESRB data into game table

try:
    df_game_esrb = pd.read_csv(os.path.join(data_directory, "game_esrb.csv"))

except pd.errors.EmptyDataError:
    df_game_esrb = pd.DataFrame(columns=["esrb_id", "name", "slug", "name_en", "name_ru", "game_id"])

df_game_output = df_game_output.merge(df_game_esrb[["name", "game_id"]].rename(columns={"name": "esrb", "game_id": "id"}), how="left", on="id")

In [20]:
#### Remove these columns

col_to_delete = [
    "tba",
    "background_image",
    "rating_top",
    "ratings_count",
    "clip",
    "user_game",
    "saturated_color",
    "dominant_color",
    "community_rating",
    "name_original",
    "description",
    "background_image_additional",
    "reddit_logo",
    "reddit_description",
    "metacritic_url"
]

df_game_output.drop(columns=col_to_delete, inplace=True)

In [21]:
df_game_output.shape

(47, 30)

In [22]:
df_game_output.iloc[0:5, 0:10]

Unnamed: 0,slug,name,playtime,released,rating,reviews_text_count,added,metacritic,suggestions_count,updated
0,qi-men-dun-jia,《奇门遁甲》,0,2023-01-01,0.0,0,1,,171,2023-01-02T02:20:10
1,clash-robot-detective-complete-edition,Clash: Robot Detective - Complete Edition,0,2023-01-02,0.0,0,1,,298,2023-01-06T02:42:46
2,eversoul,Eversoul,0,2023-01-03,0.0,0,1,,350,2023-01-16T05:46:14
3,office-mayhem-2,Office Mayhem,0,2023-01-01,0.0,0,0,,0,2023-01-02T02:18:54
4,battle-mage,Battle Mage,0,2023-01-03,0.0,0,0,,0,2023-01-06T02:29:56


In [23]:
df_game_output.iloc[0:5, 11:20]

Unnamed: 0,score,reviews_count,website,screenshots_count,movies_count,creators_count,achievements_count,parent_achievements_count,reddit_url
0,,0,,5,0,0,0,0,
1,,0,https://dronegardenstudios.000webhostapp.com/i...,8,0,0,0,0,
2,,0,,8,0,0,0,0,
3,,0,https://babysnickweeze.wixsite.com/supremegaun...,10,0,0,0,0,
4,,0,https://www.mfgassets.com/,6,0,0,0,0,


In [24]:
df_game_output.iloc[0:5, 21:]

Unnamed: 0,reddit_count,twitch_count,youtube_count,parents_count,additions_count,game_series_count,description_raw,owned,esrb
0,0,0,0,0,0,0,这是一款以中国古老术数《奇门遁甲》帮助顾客算卦的游戏，游戏中可以体验给别人算卦的乐趣，了解中...,1.0,
1,0,0,0,0,0,0,A relaxing time on a high-tech cruise ship tur...,,
2,0,0,0,0,0,0,COLLECT YOUR SOULS!\nCollect beautiful Souls a...,,
3,0,0,0,0,0,0,When a man named Matthew Jarvis realizes that ...,,
4,0,0,0,0,0,0,Battle Mage is a rogue-like/souls-like/Metroid...,,


### Parent Platform Schema

In [25]:
df_parent_platform = pd.read_csv(os.path.join(data_directory, "parent_platform_data.csv"))
df_parent_platform

Unnamed: 0,id,name,slug
0,1,PC,pc
1,2,PlayStation,playstation
2,3,Xbox,xbox
3,4,iOS,ios
4,8,Android,android
5,5,Apple Macintosh,mac
6,6,Linux,linux
7,7,Nintendo,nintendo
8,9,Atari,atari
9,10,Commodore / Amiga,commodore-amiga


### Platform Schema

In [26]:
df_platform = pd.read_csv(os.path.join(data_directory, "platform_data.csv"))
df_platform = df_platform[["id", "name", "slug"]].copy()
df_platform

Unnamed: 0,id,name,slug
0,4,PC,pc
1,187,PlayStation 5,playstation5
2,18,PlayStation 4,playstation4
3,1,Xbox One,xbox-one
4,186,Xbox Series S/X,xbox-series-x
5,7,Nintendo Switch,nintendo-switch
6,3,iOS,ios
7,21,Android,android
8,8,Nintendo 3DS,nintendo-3ds
9,9,Nintendo DS,nintendo-ds


In [27]:
#### Add Parent Platform FK in

df_parent_platform_platform = pd.read_csv(os.path.join(data_directory, "parent_platform_platform.csv"))
df_platform_output = df_platform.merge(df_parent_platform_platform[["platform_id", "parent_platform_id"]].rename(columns={"platform_id": "id"}), how="left", on="id")
df_platform_output

Unnamed: 0,id,name,slug,parent_platform_id
0,4,PC,pc,1
1,187,PlayStation 5,playstation5,2
2,18,PlayStation 4,playstation4,2
3,1,Xbox One,xbox-one,3
4,186,Xbox Series S/X,xbox-series-x,3
5,7,Nintendo Switch,nintendo-switch,7
6,3,iOS,ios,4
7,21,Android,android,8
8,8,Nintendo 3DS,nintendo-3ds,7
9,9,Nintendo DS,nintendo-ds,7


In [28]:
# CHECK FOR PARENT PLATFORM NOT IN PARENT_PLATFORM_SCHEMA

df_platform_output[~df_platform_output["parent_platform_id"].isin(df_parent_platform["id"])]

Unnamed: 0,id,name,slug,parent_platform_id


### Publisher Schema

In [29]:
df_publisher = pd.read_csv(os.path.join(data_directory, "publisher_data.csv"))
df_publisher.columns

Index(['id', 'name', 'slug', 'games_count', 'image_background', 'description'], dtype='object')

In [30]:
# Exclude these columns first

df_publisher.drop(columns=["games_count", "image_background"], inplace=True)

In [31]:
df_publisher.sample(5)

Unnamed: 0,id,name,slug,description
28214,33686,Flyover Games,flyover-games-llc,
21956,18193,Aghayeva and Ko,aghayeva-and-ko-sro,
20951,27481,eescape Room,eescape-room,
27563,7420,Simon Says: Play!,simon-says-play,
6292,22050,iQuantile,iquantile,


### Tag Schema

In [33]:
df_tag = pd.read_csv(os.path.join(data_directory, "tag_data.csv"))
df_tag.columns

Index(['id', 'name', 'slug'], dtype='object')

In [34]:
# tag/list API return duplicates

df_tag.drop_duplicates(subset=["id"], inplace=True)

###  Genre Schema

In [36]:
df_genre = pd.read_csv(os.path.join(data_directory, "genre_data.csv"))
df_genre.columns

Index(['id', 'name', 'slug'], dtype='object')

In [37]:
df_genre

Unnamed: 0,id,name,slug
0,4,Action,action
1,51,Indie,indie
2,3,Adventure,adventure
3,5,RPG,role-playing-games-rpg
4,10,Strategy,strategy
5,2,Shooter,shooter
6,40,Casual,casual
7,14,Simulation,simulation
8,7,Puzzle,puzzle
9,11,Arcade,arcade


### Store Schema

In [38]:
df_store = pd.read_csv(os.path.join(data_directory, "store_data.csv"))
df_store.columns

Index(['id', 'name', 'domain', 'slug'], dtype='object')

In [39]:
df_store

Unnamed: 0,id,name,domain,slug
0,1,Steam,store.steampowered.com,steam
1,3,PlayStation Store,store.playstation.com,playstation-store
2,2,Xbox Store,microsoft.com,xbox-store
3,4,App Store,apps.apple.com,apple-appstore
4,5,GOG,gog.com,gog
5,6,Nintendo Store,nintendo.com,nintendo
6,7,Xbox 360 Store,marketplace.xbox.com,xbox360
7,8,Google Play,play.google.com,google-play
8,9,itch.io,itch.io,itch
9,11,Epic Games,epicgames.com,epic-games


## Weak Schema

###  Ratings Schema

In [72]:
# Problem --> No rating at all [dataset 0 rows]

df_game_ratings = pd.read_csv(os.path.join(data_directory, "game_rating.csv"))

if len(df_game_ratings) == 0:
    df_game_ratings = pd.DataFrame(columns=["id", "title", "count", "percent", "game_id"])

In [73]:
df_ratings = df_game_ratings[["id", "title"]].drop_duplicates()
df_ratings

Unnamed: 0,id,title


## Relationship Schema

###  Game-Platform Schema

In [43]:
df_game_platforms = pd.read_csv(os.path.join(data_directory, "game_platform.csv"))

In [46]:
# no idea why this has so many duplicates

try:
    df_game_metacritic = pd.read_csv(os.path.join(data_directory, "game_details_metacritic.csv"))

except pd.errors.EmptyDataError:
    df_game_metacritic = pd.DataFrame(columns=["metascore", "url", "platform", "platform_id", "game_id"])

In [47]:
# Add metacritic score info into this relationship table

df_game_platform = pd.merge(df_game_platforms, df_game_metacritic[["metascore", "url", "platform_id", "game_id"]], how="left", on=["platform_id", "game_id"])
df_game_platform.rename(columns={"url": "metacritic_url"}, inplace=True)
df_game_platform.drop_duplicates(inplace=True)
df_game_platform

Unnamed: 0,game_id,platform_id,metascore,metacritic_url
0,902909,4,,
1,904202,4,,


In [48]:
# CHECK FOR PLATFORMS NOT IN PLATFORM_SCHEMA

df_game_platform[~df_game_platform["platform_id"].isin(df_platform["id"])]

Unnamed: 0,game_id,platform_id,metascore,metacritic_url


###  Game-Ratings Schema

In [75]:
df_game_ratings = pd.read_csv(os.path.join(data_directory, "game_rating.csv"))

In [76]:
df_game_ratings

Unnamed: 0,game_id


In [71]:
df_game_ratings = df_game_ratings[["id", "count", "game_id"]]
df_game_rating_output = df_game_ratings.rename(columns={"id": "rating_id"})
df_game_rating_output["rating_id"] = df_game_rating_output["rating_id"].astype(int)
df_game_rating_output

Unnamed: 0,rating_id,count,game_id


### Game-Genre Schema

In [50]:
df_game_genre = pd.read_csv(os.path.join(data_directory, "game_genre.csv"))

In [51]:
df_game_genre["genre_id"] = df_game_genre["genre_id"].astype(int)
df_game_genre_output = df_game_genre[["genre_id", "game_id"]]
df_game_genre_output

Unnamed: 0,genre_id,game_id
0,40,902909
1,10,902909
2,51,902909
3,51,904202
4,3,904202


In [52]:
# No of duplicated row
sum(df_game_genre_output.duplicated())

0

In [53]:
# CHECK FOR GENRE NOT IN GENRE_SCHEMA

df_game_genre_output[~df_game_genre_output["genre_id"].isin(df_genre["id"])]

Unnamed: 0,genre_id,game_id


### Game-Store Schema

In [54]:
df_game_store = pd.read_csv(os.path.join(data_directory, "game_store.csv"))

In [55]:
df_game_store

Unnamed: 0,game_id,store_id
0,902909,1
1,904202,1


In [56]:
# No of duplicated row
sum(df_game_store.duplicated())

0

In [57]:
# CHECK FOR STORES NOT IN STORE_SCHEMA

df_game_store[~df_game_store["store_id"].isin(df_store["id"])]

Unnamed: 0,game_id,store_id


### Game-Tag Schema

- Need an additional DAG to fetch Tag data not captured in Tag/List API

In [58]:
df_game_tag = pd.read_csv(os.path.join(data_directory, "game_tag.csv"))

In [59]:
df_game_tag

Unnamed: 0,game_id,tag_id
0,902909,31
1,902909,42396
2,902909,42398
3,902909,42421
4,902909,42399
5,902909,42413
6,902909,571
7,902909,142
8,902909,42526
9,902909,42582


In [60]:
# No of duplicated row
sum(df_game_tag.duplicated())

0

In [62]:
# CHECK FOR TAGS NOT IN TAG_SCHEMA

df_game_tag[~df_game_tag["tag_id"].isin(df_tag["id"])]

Unnamed: 0,game_id,tag_id
1,902909,42396
2,902909,42398
3,902909,42421
4,902909,42399
5,902909,42413
8,902909,42526
9,902909,42582
12,902909,42590
13,902909,42595
16,902909,58267


### Game-Publisher Schema

In [63]:
df_game_publisher = pd.read_csv(os.path.join(data_directory, "game_details_publisher.csv"))

In [64]:
df_game_publisher["publisher_id"] = df_game_publisher["publisher_id"].astype(int)

In [65]:
df_game_publisher

Unnamed: 0,publisher_id,game_id
0,62696,902909
1,62736,904182
2,4175,909879
3,54389,902908
4,62731,904171
5,36218,904172
6,62737,904185
7,55950,904195
8,22396,904186
9,50475,904198


In [66]:
# Check for duplicates
df_game_publisher[df_game_publisher.duplicated()]

Unnamed: 0,publisher_id,game_id


In [67]:
# Drop duplicates
df_game_publisher.drop_duplicates(inplace=True)

In [68]:
# CHECK FOR PUBLISHERS NOT IN PUBLISHER_SCHEMA
df_game_publisher[~df_game_publisher["publisher_id"].isin(df_publisher["id"])]

Unnamed: 0,publisher_id,game_id


# Export Schema Data

In [183]:
data_upload_directory = os.path.join(os.getcwd(), "initial_upload", "data", "transformed_data")

In [184]:
# Entity Tables

df_game_output.to_csv(os.path.join(data_upload_directory, "entity_game.csv"), index=False)
df_parent_platform.to_csv(os.path.join(data_upload_directory, "entity_parent_platform.csv"), index=False)
df_platform_output.to_csv(os.path.join(data_upload_directory, "entity_platform.csv"), index=False)
df_publisher.to_csv(os.path.join(data_upload_directory, "entity_publisher.csv"), index=False)
df_tag.to_csv(os.path.join(data_upload_directory, "entity_tag.csv"), index=False)
df_genre.to_csv(os.path.join(data_upload_directory, "entity_genre.csv"), index=False)
df_store.to_csv(os.path.join(data_upload_directory, "entity_store.csv"), index=False)
df_ratings.to_csv(os.path.join(data_upload_directory, "entity_rating.csv"), index=False)

# Relationship Tables
df_game_platform.to_csv(os.path.join(data_upload_directory, "rs_game_platform.csv"), index=False)
df_game_rating_output.to_csv(os.path.join(data_upload_directory, "rs_game_rating.csv"), index=False)
df_game_genre_output.to_csv(os.path.join(data_upload_directory, "rs_game_genre.csv"), index=False)
df_game_store.to_csv(os.path.join(data_upload_directory, "rs_game_store.csv"), index=False)
df_game_tag.to_csv(os.path.join(data_upload_directory, "rs_game_tag.csv"), index=False)

In [185]:
df_game_publisher.to_csv(os.path.join(data_upload_directory, "rs_game_publisher.csv"), index=False)