# Exploratory Data Analysis Notebook

## 1. Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Load Data and Preprocessing

In [2]:
# Locations of the input data, relative to the notebook's working directory
DATA_PATH = "./data"
GAME_DATA = f"{DATA_PATH}/games.csv"

In [3]:
# Read the games CSV into a DataFrame and preview its first three rows
df = pd.read_csv(filepath_or_buffer=GAME_DATA)
df.head(n=3)

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


### 2.1. Rename columns

In [4]:
# List all column names of the loaded frame
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [4]:
# Normalise every column name to snake_case: lower-case it and join its
# whitespace-separated words with underscores ("Release date" -> "release_date").
og_cols = df.columns
col_mapping = {col: "_".join(col.lower().split()) for col in og_cols}
df = df.rename(columns=col_mapping)
df.head(3)

Unnamed: 0,appid,name,release_date,estimated_owners,peak_ccu,required_age,price,dlc_count,about_the_game,supported_languages,...,average_playtime_two_weeks,median_playtime_forever,median_playtime_two_weeks,developers,publishers,categories,genres,tags,screenshots,movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [5]:
# Check the column names after the rename
df.columns

Index(['appid', 'name', 'release_date', 'estimated_owners', 'peak_ccu',
       'required_age', 'price', 'dlc_count', 'about_the_game',
       'supported_languages', 'full_audio_languages', 'reviews',
       'header_image', 'website', 'support_url', 'support_email', 'windows',
       'mac', 'linux', 'metacritic_score', 'metacritic_url', 'user_score',
       'positive', 'negative', 'score_rank', 'achievements', 'recommendations',
       'notes', 'average_playtime_forever', 'average_playtime_two_weeks',
       'median_playtime_forever', 'median_playtime_two_weeks', 'developers',
       'publishers', 'categories', 'genres', 'tags', 'screenshots', 'movies'],
      dtype='object')

In [6]:
# Columns retained for the genre analysis
col_to_keep = [
    "name",
    "peak_ccu",
    "price",
    "average_playtime_two_weeks",
    "genres",
]

In [7]:
# Narrow the frame to the columns of interest and preview three rows
filtered_df = df.loc[:, col_to_keep]
filtered_df.head(n=3)

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres
0,Galactic Bowling,0,19.99,0,"Casual,Indie,Sports"
1,Train Bandit,0,0.99,0,"Action,Indie"
2,Jolt Project,0,4.99,0,"Action,Adventure,Indie,Strategy"


In [8]:
# Keep only the games with a non-zero average playtime over the last two weeks
# (i.e. drop the rows where average_playtime_two_weeks is 0).
# .copy() makes the result an independent frame, so the in-place sort and the
# column assignment applied to it afterwards operate on real data rather than
# on a slice of another frame (the pattern behind SettingWithCopyWarning).
has_recent_playtime = filtered_df['average_playtime_two_weeks'] != 0
filtered_df = filtered_df[has_recent_playtime].copy()
filtered_df.head(3)

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres
47,Far Cry® 5,2164,59.99,223,"Action,Adventure"
57,Forza Horizon 4,7571,59.99,230,Racing
66,Max Payne,49,3.49,79,Action


In [9]:
# Order games from highest to lowest two-week average playtime.
# The sorted result is rebound to the name instead of using inplace=True, so a
# frame that was derived by slicing is never mutated in place.
filtered_df = filtered_df.sort_values(by='average_playtime_two_weeks', ascending=False)

In [10]:
# Preview the first rows of the sorted frame (highest two-week playtime first)
filtered_df.head()

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres
58237,America's Army: Proving Grounds,233,0.0,19159,"Action,Free to Play"
33019,GRID,69,19.99,10996,"Action,Casual,Racing,Simulation,Sports"
53432,Fe,7,19.99,10995,"Action,Adventure"
2037,Need for Speed™ Payback,342,29.99,10994,"Action,Adventure,Racing,Sports,Strategy"
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,10993,"Action,Casual,Strategy"


In [11]:
# Convert the two-week average playtime into average hours played per day.
# NOTE(review): dividing by 14 * 60 only yields hours/day if the source column
# is recorded in minutes — confirm the unit against the dataset documentation.
DAYS_IN_WINDOW = 14
MINUTES_PER_HOUR = 60

# assign() returns a new frame with the extra column, which avoids writing a
# column into a frame that may be a slice of another one.
filtered_df = filtered_df.assign(
    average_play_hours_per_day=filtered_df['average_playtime_two_weeks']
    / (DAYS_IN_WINDOW * MINUTES_PER_HOUR)
)
filtered_df.head()

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres,average_play_hours_per_day
58237,America's Army: Proving Grounds,233,0.0,19159,"Action,Free to Play",22.808333
33019,GRID,69,19.99,10996,"Action,Casual,Racing,Simulation,Sports",13.090476
53432,Fe,7,19.99,10995,"Action,Adventure",13.089286
2037,Need for Speed™ Payback,342,29.99,10994,"Action,Adventure,Racing,Sports,Strategy",13.088095
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,10993,"Action,Casual,Strategy",13.086905


In [12]:
# Number of rows currently in filtered_df
len(filtered_df)

2055

In [13]:
# Drop every row that has a missing value in any of the remaining columns.
# The result is rebound to the name instead of using inplace=True, so the step
# never mutates a frame that was derived by slicing.
filtered_df = filtered_df.dropna()

In [14]:
# Number of rows left once rows with missing values are gone
len(filtered_df)

2053

In [15]:
# Columns to remove from filtered_df
cols_to_drop = ["average_playtime_two_weeks"]

In [16]:
# Remove the columns listed in cols_to_drop.
# The result is rebound to the name instead of using inplace=True, so the step
# never mutates a frame that was derived by slicing.
filtered_df = filtered_df.drop(columns=cols_to_drop)
filtered_df.head()

Unnamed: 0,name,peak_ccu,price,genres,average_play_hours_per_day
58237,America's Army: Proving Grounds,233,0.0,"Action,Free to Play",22.808333
33019,GRID,69,19.99,"Action,Casual,Racing,Simulation,Sports",13.090476
53432,Fe,7,19.99,"Action,Adventure",13.089286
2037,Need for Speed™ Payback,342,29.99,"Action,Adventure,Racing,Sports,Strategy",13.088095
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,"Action,Casual,Strategy",13.086905


In [17]:
# Split each comma-separated genre string into a list, then give every genre
# its own row (all other columns are repeated for each genre of a game).
# The split reads filtered_df's own column rather than the unfiltered df, so
# the step does not depend on index alignment between the two frames, and
# re-running the cell on an already-exploded frame leaves it unchanged.
filtered_df = filtered_df.assign(
    genres=filtered_df["genres"].str.split(",")
).explode("genres")

In [18]:
# Number of rows after expanding to one row per game/genre combination
len(filtered_df)

5654

In [19]:
# Distinct values found in the genres column
filtered_df['genres'].unique()

array(['Action', 'Free to Play', 'Casual', 'Racing', 'Simulation',
       'Sports', 'Adventure', 'Strategy', 'Indie',
       'Massively Multiplayer', 'RPG', 'Early Access', 'Education',
       'Software Training', 'Utilities', 'Video Production',
       'Design & Illustration', 'Violent', 'Animation & Modeling',
       'Game Development', 'Audio Production', 'Web Publishing',
       'Photo Editing', 'Sexual Content', 'Nudity', 'Gore', 'Accounting'],
      dtype=object)

In [20]:
# Count the rows for each (name, genres) pair and flatten the result into a
# regular frame with a "count" column
rows_per_game_genre = filtered_df.groupby(["name", "genres"]).size()
genre_counts = rows_per_game_genre.reset_index(name="count")
genre_counts

Unnamed: 0,name,genres,count
0,(the) Gnorp Apologue,Casual,1
1,(the) Gnorp Apologue,Indie,1
2,(the) Gnorp Apologue,Simulation,1
3,(the) Gnorp Apologue,Strategy,1
4,100% Orange Juice,Indie,1
...,...,...,...
5642,👑 Idle Calibur 👑（选王之剑）,Casual,1
5643,👑 Idle Calibur 👑（选王之剑）,Indie,1
5644,👑 Idle Calibur 👑（选王之剑）,RPG,1
5645,👑 Idle Calibur 👑（选王之剑）,Simulation,1


In [21]:
# Sum the per-game counts within each genre ...
genre_counts_aggregated = (
    genre_counts
    .groupby('genres')['count']
    .sum()
    .reset_index(name='total_count')
)
# ... and rank the genres by that total, largest first
sorted_genres = genre_counts_aggregated.sort_values('total_count', ascending=False)

In [25]:
# Show the genres ordered by total_count, largest first
sorted_genres

Unnamed: 0,genres,total_count
1,Action,1054
12,Indie,945
2,Adventure,760
19,Simulation,555
16,RPG,526
22,Strategy,505
5,Casual,409
7,Early Access,278
9,Free to Play,177
13,Massively Multiplayer,134


In [22]:
# Take the ten genres with the highest total_count.
# NOTE(review): `top_to_genres` looks like a typo for `top_10_genres`; if it is
# renamed, the cell that reads it must be updated in the same change.
top_to_genres = sorted_genres.head(10)
top_to_genres

Unnamed: 0,genres,total_count
1,Action,1054
12,Indie,945
2,Adventure,760
19,Simulation,555
16,RPG,526
22,Strategy,505
5,Casual,409
7,Early Access,278
9,Free to Play,177
13,Massively Multiplayer,134


In [27]:
# Names of the top genres as a NumPy array.
# .to_numpy() always returns an ndarray, whereas the return type of .values
# depends on the column's dtype.
top_10_genre = top_to_genres['genres'].to_numpy()

In [28]:
# Show the selected genre names
top_10_genre

array(['Action', 'Indie', 'Adventure', 'Simulation', 'RPG', 'Strategy',
       'Casual', 'Early Access', 'Free to Play', 'Massively Multiplayer'],
      dtype=object)

In [29]:
# Keep only the rows whose genre is one of the selected top genres
in_top_genres = filtered_df['genres'].isin(top_10_genre)
filtered_df_further = filtered_df[in_top_genres]

In [31]:
# Number of rows that belong to the selected top genres
len(filtered_df_further)

5343

In [None]:
# Write the top-genre subset to CSV; index=False leaves the row index out of the file
output_path = DATA_PATH + "/genreAddictivity.csv"
filtered_df_further.to_csv(output_path, index=False)