In [1]:
import os
import pandas as pd

In [2]:
data_directory = os.path.join(os.getcwd(), "raw_data")

# Transform Data

## Main Entities

### Game Schema

In [3]:
# To put game list and game details' data together
df_game = pd.read_csv(os.path.join(data_directory, "game_data.csv"))
df_game_details_data = pd.read_csv(os.path.join(data_directory, "game_details_data.csv"))

In [4]:
df_game_details_data_subset = df_game_details_data[[col for col in df_game_details_data.columns if col not in df_game.columns.tolist()] + ["id"]].copy()
df_game_output = df_game.merge(df_game_details_data_subset, on=["id"])

In [5]:
#### Add GAME STATUS data into game table

df_status = pd.read_csv(os.path.join(data_directory, "game_status.csv"))
df_game_output = df_game_output.merge(df_status.rename(columns={"game_id": "id"}), on=["id"])

In [7]:
#### Add ESRB data into game table

df_game_esrb = pd.read_csv(os.path.join(data_directory, "game_esrb.csv"))
df_game_output = df_game_output.merge(df_game_esrb[["name", "game_id"]].rename(columns={"name": "esrb", "game_id": "id"}), how="left", on="id")

In [8]:
#### Remove these columns

col_to_delete = [
    "tba",
    "background_image",
    "rating_top",
    "ratings_count",
    "clip",
    "user_game",
    "saturated_color",
    "dominant_color",
    "community_rating",
    "name_original",
    "description",
    "background_image_additional",
    "reddit_logo",
    "reddit_description",
    "metacritic_url"
]

df_game_output.drop(columns=col_to_delete, inplace=True)

In [9]:
df_game_output.iloc[0:5, 0:10]

Unnamed: 0,slug,name,playtime,released,rating,reviews_text_count,added,metacritic,suggestions_count,updated
0,red-dead-redemption-2,Red Dead Redemption 2,18,2018-10-26,4.59,65,13682,96.0,584,2023-02-23T12:16:08
1,hitman-2,Hitman 2,5,2018-11-13,4.04,5,6860,83.0,288,2023-02-22T17:06:36
2,dark-souls-ii-scholar-of-the-first-sin,Dark Souls II: Scholar of the First Sin,24,2015-04-01,4.14,3,5132,79.0,684,2023-02-23T12:30:29
3,street-fighter-v,Street Fighter V,6,2016-02-15,3.5,5,3881,74.0,600,2023-02-20T23:17:57
4,the-last-guardian,The Last Guardian,7,2016-10-25,4.08,4,2975,82.0,524,2023-02-22T12:35:52


In [10]:
df_game_output.iloc[0:5, 11:20]

Unnamed: 0,score,reviews_count,website,screenshots_count,movies_count,creators_count,achievements_count,parent_achievements_count,reddit_url
0,,4478,https://www.rockstargames.com/reddeadredemption2/,47,0,8,238,40,https://www.reddit.com/r/Red_Dead_Redemption_2/
1,,1101,https://hitman.com,5,0,3,667,103,
2,,1025,http://www.darksoulsii.com,34,0,8,106,10,
3,,469,http://www.streetfighter.com/,66,6,9,225,45,
4,,691,http://www.gendesign.co.jp/E_index.html,10,0,6,144,24,https://www.reddit.com/r/thelastguardian/


In [11]:
df_game_output.iloc[0:5, 21:]

Unnamed: 0,reddit_count,twitch_count,youtube_count,parents_count,additions_count,game_series_count,description_raw,yet,owned,beaten,toplay,dropped,playing,esrb
0,1566,196,1000000,0,1,3,"America, 1899. The end of the wild west era ha...",837.0,7203.0,2712.0,1504.0,567.0,859.0,Mature
1,0,134,1000000,0,13,9,Hitman 2 is the seventh game in its series. It...,565.0,4954.0,553.0,386.0,277.0,125.0,Mature
2,0,0,391215,1,0,10,New take in the Souls series. This time player...,301.0,3541.0,898.0,111.0,204.0,77.0,Mature
3,0,121,1000000,0,2,13,"Street Fighter V is a fighting game, the fifth...",126.0,3223.0,152.0,41.0,286.0,53.0,Teen
4,526,119,1000000,1,0,6,The Last Guardian is a brainchild of Fumito Ue...,237.0,1691.0,513.0,307.0,156.0,71.0,Teen


### Parent Platform Schema

In [12]:
df_parent_platform = pd.read_csv(os.path.join(data_directory, "parent_platform_data.csv"))
df_parent_platform

Unnamed: 0,id,name,slug
0,1,PC,pc
1,2,PlayStation,playstation
2,3,Xbox,xbox
3,4,iOS,ios
4,8,Android,android
5,5,Apple Macintosh,mac
6,6,Linux,linux
7,7,Nintendo,nintendo
8,9,Atari,atari
9,10,Commodore / Amiga,commodore-amiga


### Platform Schema

In [14]:
df_platform = pd.read_csv(os.path.join(data_directory, "platform_data.csv"))
df_platform = df_platform[["id", "name", "slug"]].copy()
df_platform

Unnamed: 0,id,name,slug
0,4,PC,pc
1,187,PlayStation 5,playstation5
2,18,PlayStation 4,playstation4
3,1,Xbox One,xbox-one
4,186,Xbox Series S/X,xbox-series-x
5,7,Nintendo Switch,nintendo-switch
6,3,iOS,ios
7,21,Android,android
8,8,Nintendo 3DS,nintendo-3ds
9,9,Nintendo DS,nintendo-ds


In [15]:
#### Add Parent Platform FK in

df_parent_platform_platform = pd.read_csv(os.path.join(data_directory, "parent_platform_platform.csv"))
df_platform_output = df_platform.merge(df_parent_platform_platform[["platform_id", "parent_platform_id"]].rename(columns={"platform_id": "id"}), how="left", on="id")
df_platform_output

Unnamed: 0,id,name,slug,parent_platform_id
0,4,PC,pc,1
1,187,PlayStation 5,playstation5,2
2,18,PlayStation 4,playstation4,2
3,1,Xbox One,xbox-one,3
4,186,Xbox Series S/X,xbox-series-x,3
5,7,Nintendo Switch,nintendo-switch,7
6,3,iOS,ios,4
7,21,Android,android,8
8,8,Nintendo 3DS,nintendo-3ds,7
9,9,Nintendo DS,nintendo-ds,7


### Publisher Schema

In [16]:
df_publisher = pd.read_csv(os.path.join(data_directory, "publisher_data.csv"))
df_publisher.columns

Index(['id', 'name', 'slug', 'games_count', 'image_background', 'description'], dtype='object')

In [17]:
# Exclude these columns first

df_publisher.drop(columns=["games_count", "image_background"], inplace=True)

In [18]:
df_publisher.sample(5)

Unnamed: 0,id,name,slug,description
49,311,Deep Silver,deep-silver,<p>Deep Silver is a German-based video game pu...
42,358,2K Games,2k-games,<p>2K Games is an American-based video game pu...
33,3588,Nexon,nexon,
52,10691,Konami,konami,<p>Konami Digital Entertainment is a Japanese-...
16,34843,Xbox Game Studios,xbox-game-studios,


### Tag Schema

In [19]:
df_tag = pd.read_csv(os.path.join(data_directory, "tag_data.csv"))
df_tag.columns

Index(['id', 'name', 'slug'], dtype='object')

In [20]:
df_tag

Unnamed: 0,id,name,slug
0,31,Singleplayer,singleplayer
1,40847,Steam Achievements,steam-achievements
2,7,Multiplayer,multiplayer
3,40836,Full controller support,full-controller-support
4,13,Atmospheric,atmospheric
...,...,...,...
9476,17670,healer,healer
9477,3718,effects,effects
9478,63875,dreamcore,dreamcore
9479,60495,weirdcore,weirdcore


###  Genre Schema

In [21]:
df_genre = pd.read_csv(os.path.join(data_directory, "genre_data.csv"))
df_genre.columns

Index(['id', 'name', 'slug'], dtype='object')

In [22]:
df_genre

Unnamed: 0,id,name,slug
0,4,Action,action
1,51,Indie,indie
2,3,Adventure,adventure
3,5,RPG,role-playing-games-rpg
4,10,Strategy,strategy
5,2,Shooter,shooter
6,40,Casual,casual
7,14,Simulation,simulation
8,7,Puzzle,puzzle
9,11,Arcade,arcade


### Store Schema

In [23]:
df_store = pd.read_csv(os.path.join(data_directory, "store_data.csv"))
df_store.columns

Index(['id', 'name', 'domain', 'slug'], dtype='object')

In [24]:
df_store

Unnamed: 0,id,name,domain,slug
0,1,Steam,store.steampowered.com,steam
1,3,PlayStation Store,store.playstation.com,playstation-store
2,2,Xbox Store,microsoft.com,xbox-store
3,4,App Store,apps.apple.com,apple-appstore
4,5,GOG,gog.com,gog
5,6,Nintendo Store,nintendo.com,nintendo
6,7,Xbox 360 Store,marketplace.xbox.com,xbox360
7,8,Google Play,play.google.com,google-play
8,9,itch.io,itch.io,itch
9,11,Epic Games,epicgames.com,epic-games


## Weak Schema

###  Ratings Schema

In [27]:
df_game_ratings = pd.read_csv(os.path.join(data_directory, "game_rating.csv"))
df_game_ratings

Unnamed: 0,id,title,count,percent,game_id
0,5.0,exceptional,3278.0,73.20,28
1,4.0,recommended,819.0,18.29,28
2,3.0,meh,250.0,5.58,28
3,1.0,skip,131.0,2.93,28
4,4.0,recommended,579.0,52.59,58654
...,...,...,...,...,...
161,1.0,skip,1.0,50.00,622500
162,4.0,recommended,1.0,50.00,795357
163,1.0,skip,1.0,50.00,795357
164,4.0,recommended,1.0,100.00,846569


In [28]:
df_ratings = df_game_ratings[["id", "title"]].drop_duplicates()
df_ratings

Unnamed: 0,id,title
0,5.0,exceptional
1,4.0,recommended
2,3.0,meh
3,1.0,skip


###  ESRB Schema
- Added to Game for now

In [64]:
df_game_esrb = pd.read_csv(os.path.join(data_directory, "games_esrb.csv"))
df_esrb = df_game_esrb[["esrb_id", "name", "slug"]].drop_duplicates()
df_esrb

Unnamed: 0,esrb_id,name,slug
0,4,Mature,mature
3,3,Teen,teen
5,2,Everyone 10+,everyone-10-plus
19,1,Everyone,everyone


### Reactions
- No idea what it is

## Relationship Schema

###  Game-Platform Schema

In [30]:
df_game_platforms = pd.read_csv(os.path.join(data_directory, "game_platform.csv"))

In [31]:
# Add metacritic score info into this relationship table

df_game_metacritic = pd.read_csv(os.path.join(data_directory, "game_details_metacritic.csv"))
df_game_platform = pd.merge(df_game_platforms, df_game_metacritic[["metascore", "url", "platform_id", "game_id"]], how="left", on=["platform_id", "game_id"])
df_game_platform.rename(columns={"url": "metacritic_url"}, inplace=True)
df_game_platform

Unnamed: 0,game_id,platform_id,metascore,metacritic_url
0,28,4,93.0,https://www.metacritic.com/game/pc/red-dead-re...
1,28,1,97.0,https://www.metacritic.com/game/xbox-one/red-d...
2,28,18,97.0,https://www.metacritic.com/game/playstation-4/...
3,58654,4,82.0,https://www.metacritic.com/game/pc/hitman-2
4,58654,1,84.0,https://www.metacritic.com/game/xbox-one/hitman-2
...,...,...,...,...
155,795349,1,,
156,795349,18,,
157,795349,186,,
158,795349,7,,


###  Game-Ratings Schema

In [32]:
df_game_ratings = df_game_ratings[["id", "count", "game_id"]]
df_game_rating_output = df_game_ratings.rename(columns={"id": "rating_id"})
df_game_rating_output

Unnamed: 0,rating_id,count,game_id
0,5.0,3278.0,28
1,4.0,819.0,28
2,3.0,250.0,28
3,1.0,131.0,28
4,4.0,579.0,58654
...,...,...,...
161,1.0,1.0,622500
162,4.0,1.0,795357
163,1.0,1.0,795357
164,4.0,1.0,846569


###  Game-ESRB Schema
- Added into Game Schema

In [33]:
df_game_esrb[["esrb_id", "game_id"]]

Unnamed: 0,esrb_id,game_id
0,4,28
1,4,58654
2,4,3371
3,3,2805
4,3,693
5,2,10108
6,4,304953
7,3,10638
8,4,423944
9,3,388308


###  Game-Developer Schema
- TBD

### Game-Genre Schema

In [35]:
df_game_genre = pd.read_csv(os.path.join(data_directory, "game_genre.csv"))

In [36]:
df_game_genre_output =df_game_genre[["genre_id", "game_id"]]
df_game_genre_output

Unnamed: 0,genre_id,game_id
0,3.0,28
1,4.0,28
2,2.0,58654
3,4.0,58654
4,4.0,3371
...,...,...
134,2.0,795349
135,40.0,892942
136,51.0,892942
137,4.0,892942


### Game-Store Schema

In [37]:
df_game_store = pd.read_csv(os.path.join(data_directory, "game_store.csv"))

In [38]:
df_game_store

Unnamed: 0,game_id,store_id
0,28,1
1,28,3
2,28,2
3,28,11
4,58654,1
...,...,...
117,404524,4
118,290031,1
119,65106,1
120,795349,1


### Game-Tag Schema

In [39]:
df_game_tag = pd.read_csv(os.path.join(data_directory, "game_tag.csv"))

In [40]:
df_game_tag

Unnamed: 0,game_id,tag_id
0,28,31
1,28,42396
2,28,42417
3,28,42392
4,28,7
...,...,...
1049,892942,58124
1050,892942,55320
1051,892942,58419
1052,892942,73243


### Game-Publisher Schema

In [41]:
df_game_publisher = pd.read_csv(os.path.join(data_directory, "game_details_publisher.csv"))

In [42]:
df_game_publisher["publisher_id"] = df_game_publisher["publisher_id"].astype(int)

In [43]:
df_game_publisher

Unnamed: 0,publisher_id,game_id
0,2155,28
1,308,3439
2,19651,3439
3,339,2454
4,11687,58175
...,...,...
3305,8352,336333
3306,308,887670
3307,10392,892942
3308,48076,892942


# Export Schema Data

In [44]:
data_upload_directory = os.path.join(os.getcwd(), "transformed_data")

In [157]:
# Entity Tables

df_game_output.to_csv(os.path.join(data_upload_directory, "entity_game.csv"), index=False)
df_parent_platform.to_csv(os.path.join(data_upload_directory, "entity_parent_platform.csv"), index=False)
df_platform_output.to_csv(os.path.join(data_upload_directory, "entity_platform.csv"), index=False)
df_publisher.to_csv(os.path.join(data_upload_directory, "entity_publisher.csv"), index=False)
df_tag.to_csv(os.path.join(data_upload_directory, "entity_tag.csv"), index=False)
df_genre.to_csv(os.path.join(data_upload_directory, "entity_genre.csv"), index=False)
df_store.to_csv(os.path.join(data_upload_directory, "entity_store.csv"), index=False)
df_ratings.to_csv(os.path.join(data_upload_directory, "entity_rating.csv"), index=False)

# Relationship Tables
df_game_platform.to_csv(os.path.join(data_upload_directory, "rs_game_platform.csv"), index=False)
df_game_rating_output.to_csv(os.path.join(data_upload_directory, "rs_game_rating.csv"), index=False)
df_game_genre_output.to_csv(os.path.join(data_upload_directory, "rs_game_genre.csv"), index=False)
df_game_store.to_csv(os.path.join(data_upload_directory, "rs_game_store.csv"), index=False)
df_game_tag.to_csv(os.path.join(data_upload_directory, "rs_game_tag.csv"), index=False)

In [167]:
df_game_publisher.to_csv(os.path.join(data_upload_directory, "rs_game_publisher.csv"), index=False)