# Explanatory Data Analysis Notebook

## 1. Packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## 2. Load Data and Preprocessing

In [2]:
# Define data path
DATA_PATH = "./data"
GAME_DATA = DATA_PATH + "/games.csv"

In [3]:
# Load and check data
df = pd.read_csv(GAME_DATA)
df.head(3)

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,DLC count,About the game,Supported languages,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


### 2.1. Rename columns

In [4]:
# List all columns
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [5]:
og_cols = df.columns
col_mapping = dict(zip(og_cols, ["_".join(col.lower().split()) for col in og_cols]))
df = df.rename(columns=col_mapping)
df.head(3)

Unnamed: 0,appid,name,release_date,estimated_owners,peak_ccu,required_age,price,dlc_count,about_the_game,supported_languages,...,average_playtime_two_weeks,median_playtime_forever,median_playtime_two_weeks,developers,publishers,categories,genres,tags,screenshots,movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [6]:
df.columns

Index(['appid', 'name', 'release_date', 'estimated_owners', 'peak_ccu',
       'required_age', 'price', 'dlc_count', 'about_the_game',
       'supported_languages', 'full_audio_languages', 'reviews',
       'header_image', 'website', 'support_url', 'support_email', 'windows',
       'mac', 'linux', 'metacritic_score', 'metacritic_url', 'user_score',
       'positive', 'negative', 'score_rank', 'achievements', 'recommendations',
       'notes', 'average_playtime_forever', 'average_playtime_two_weeks',
       'median_playtime_forever', 'median_playtime_two_weeks', 'developers',
       'publishers', 'categories', 'genres', 'tags', 'screenshots', 'movies'],
      dtype='object')

In [7]:
col_to_keep = ['name', 'peak_ccu', 'price','average_playtime_two_weeks', 'genres']

In [8]:
filtered_df = df[col_to_keep]
filtered_df.head(3)

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres
0,Galactic Bowling,0,19.99,0,"Casual,Indie,Sports"
1,Train Bandit,0,0.99,0,"Action,Indie"
2,Jolt Project,0,4.99,0,"Action,Adventure,Indie,Strategy"


In [9]:
# drop the rows where average_playtime_two_weeks is 0 
filtered_df = filtered_df[filtered_df['average_playtime_two_weeks'] != 0]
filtered_df.head(3)

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres
47,Far Cry® 5,2164,59.99,223,"Action,Adventure"
57,Forza Horizon 4,7571,59.99,230,Racing
66,Max Payne,49,3.49,79,Action


In [10]:
filtered_df.sort_values(by='average_playtime_two_weeks', ascending=False, inplace=True)

In [11]:
filtered_df.head()

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres
58237,America's Army: Proving Grounds,233,0.0,19159,"Action,Free to Play"
33019,GRID,69,19.99,10996,"Action,Casual,Racing,Simulation,Sports"
53432,Fe,7,19.99,10995,"Action,Adventure"
2037,Need for Speed™ Payback,342,29.99,10994,"Action,Adventure,Racing,Sports,Strategy"
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,10993,"Action,Casual,Strategy"


In [12]:
filtered_df['average_play_hours_per_day'] = filtered_df['average_playtime_two_weeks'] / (14 * 60)
filtered_df.head()

Unnamed: 0,name,peak_ccu,price,average_playtime_two_weeks,genres,average_play_hours_per_day
58237,America's Army: Proving Grounds,233,0.0,19159,"Action,Free to Play",22.808333
33019,GRID,69,19.99,10996,"Action,Casual,Racing,Simulation,Sports",13.090476
53432,Fe,7,19.99,10995,"Action,Adventure",13.089286
2037,Need for Speed™ Payback,342,29.99,10994,"Action,Adventure,Racing,Sports,Strategy",13.088095
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,10993,"Action,Casual,Strategy",13.086905


In [13]:
len(filtered_df)

2055

In [14]:
filtered_df.dropna(inplace=True)

In [15]:
len(filtered_df)

2053

In [16]:
cols_to_drop = ['average_playtime_two_weeks', 'genres']

In [None]:
filtered_df.drop(columns=cols_to_drop, inplace=True)

In [19]:
filtered_df.head()

Unnamed: 0,name,peak_ccu,price,average_play_hours_per_day
58237,America's Army: Proving Grounds,233,0.0,22.808333
33019,GRID,69,19.99,13.090476
53432,Fe,7,19.99,13.089286
2037,Need for Speed™ Payback,342,29.99,13.088095
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,13.086905


In [30]:
top_100 = filtered_df.head(100)
top_100

Unnamed: 0,name,peak_ccu,price,average_play_hours_per_day
58237,America's Army: Proving Grounds,233,0.00,22.808333
33019,GRID,69,19.99,13.090476
53432,Fe,7,19.99,13.089286
2037,Need for Speed™ Payback,342,29.99,13.088095
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,13.086905
...,...,...,...,...
72692,The Swordsmen X: Survival,0,29.74,1.852381
45743,FINAL FANTASY XIV Online,31735,19.99,1.848810
12922,EVE Online,4318,0.00,1.842857
34805,Fallout 4,23636,19.99,1.826190


In [31]:
# create a new column called addictivity, which is Too Dangerous if average_play_hours_per_day > 16,
# Dangerous if 10 < average_play_hours_per_day <= 16, Addictive if 5 < average_play_hours_per_day <= 10 and safe if average_play_hours_per_day <= 5
top_100['addictivity'] = pd.cut(top_100['average_play_hours_per_day'], bins=[0, 5, 10, 16, 24], labels=['Safe', 'Addictive', 'Dangerous', 'Too Dangerous'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_100['addictivity'] = pd.cut(top_100['average_play_hours_per_day'], bins=[0, 5, 10, 16, 24], labels=['Safe', 'Addictive', 'Dangerous', 'Too Dangerous'])


In [32]:
top_100

Unnamed: 0,name,peak_ccu,price,average_play_hours_per_day,addictivity
58237,America's Army: Proving Grounds,233,0.00,22.808333,Too Dangerous
33019,GRID,69,19.99,13.090476,Dangerous
53432,Fe,7,19.99,13.089286,Dangerous
2037,Need for Speed™ Payback,342,29.99,13.088095,Dangerous
36135,Plants vs. Zombies: Battle for Neighborville™,281,39.99,13.086905,Dangerous
...,...,...,...,...,...
72692,The Swordsmen X: Survival,0,29.74,1.852381,Safe
45743,FINAL FANTASY XIV Online,31735,19.99,1.848810,Safe
12922,EVE Online,4318,0.00,1.842857,Safe
34805,Fallout 4,23636,19.99,1.826190,Safe


In [34]:
top_100.to_csv(DATA_PATH + "/gameAddictivity.csv", index=False)