In [20]:
# from IPython.display import HTML

# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# The raw code for this IPython notebook is by default hidden for easier reading.
# To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [22]:
# Keep only the first four columns (usecols=[0,1,2,3]); anything after them is dropped,
# since it carries no information for the analysis.
# NOTE: no header/names arguments are passed here, so pandas uses the first line of the
# file as the column labels.
steam = pd.read_csv(r'./data/steam-200k.csv', usecols=[0,1,2,3])

# Dataset description and prep

This dataset is a list of user behaviors, with columns: user-id, game-title, behavior-name, value. The behaviors included are 'purchase' and 'play'. The value indicates the degree to which the behavior was performed - in the case of 'purchase' the value is always 1, and in the case of 'play' the value represents the number of hours the user has played the game.

First, let's check a few lines of the dataset

In [23]:
# Preview the first rows to inspect how the file was parsed.
steam.head()

Unnamed: 0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
0,151603712,The Elder Scrolls V Skyrim,play,273.0
1,151603712,Fallout 4,purchase,1.0
2,151603712,Fallout 4,play,87.0
3,151603712,Spore,purchase,1.0
4,151603712,Spore,play,14.9


We see that our CSV file has **no header**, so pandas is treating the first observation as the header row. Let's fix that by importing it again and adding our column names!

In [24]:
# Reload the file with explicit column labels. header=None states outright that the
# CSV has no header line, so the first record is kept as data instead of being used
# as the column names (this is what passing `names` already implies).
steam = pd.read_csv(
    r'data/steam-200k.csv',
    header=None,
    names=['user_id', 'game', 'behavior', 'status'],
    usecols=[0, 1, 2, 3],
)

In [25]:
# Preview again to confirm the column names were applied and the first record is kept.
steam.head()

Unnamed: 0,user_id,game,behavior,status
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0


Much better!

Now, we see we have duplicate rows of the same game for the same user_id. This happens because of the way the dataset is built. I would recommend considering removing the rows where *behavior* = 'purchase', since they just carry the same information as the *play* rows (that is, the user has acquired the game in order to play it). Or could a user have bought a game and never played it, so that the dataset shows only the purchase line and no play line?

Let's check?

In [26]:
# Count the rows of each behavior type by summing a boolean mask (True counts as 1).
# This replaces the roundabout filter -> value_counts() -> sum() chain and yields the
# same totals.
n_games_purchased = (steam['behavior'] == 'purchase').sum()
n_games_played = (steam['behavior'] == 'play').sum()
# Number of 'play' rows relative to 'purchase' rows, as a fraction (not a percentage).
ratio_played_purchased = n_games_played / n_games_purchased

In [27]:
# Report the two row counts and the played/purchased ratio as a percentage rounded to
# two decimals, one item per output line.
print(
    f'Number of games purchased: {n_games_purchased}',
    f'Number of games played: {n_games_played}',
    f'Ratio Played/Purchased: {round(ratio_played_purchased*100,2)}%',
    sep='\n',
)

Number of games purchased: 129511
Number of games played: 70489
Ratio Played/Purchased: 54.43%


Well, from here we can see that the ratio of games played to games purchased is *54.43%*. This means almost half of the games bought are never played.

In [28]:
# Load the IGN dataset from the local data folder.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like
# an index saved into the CSV — consider index_col=0 if nothing relies on it; confirm first.
ign = pd.read_csv(r'./data/ign.csv')

In [29]:
# Preview the first rows of the ign frame to see which columns are available.
ign.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
0,0,Amazing,LittleBigPlanet PS Vita,/games/littlebigplanet-vita/vita-98907,PlayStation Vita,9.0,Platformer,Y,2012,9,12
1,1,Amazing,LittleBigPlanet PS Vita -- Marvel Super Hero E...,/games/littlebigplanet-ps-vita-marvel-super-he...,PlayStation Vita,9.0,Platformer,Y,2012,9,12
2,2,Great,Splice: Tree of Life,/games/splice/ipad-141070,iPad,8.5,Puzzle,N,2012,9,12
3,3,Great,NHL 13,/games/nhl-13/xbox-360-128182,Xbox 360,8.5,Sports,N,2012,9,11
4,4,Great,NHL 13,/games/nhl-13/ps3-128181,PlayStation 3,8.5,Sports,N,2012,9,11


In [30]:
# Build a PC-only frame from the ign dataset; .copy() makes it independent of `ign`
# so later edits to ign_pc do not touch the original frame.
ign_pc = ign.loc[ign['platform'].eq('PC')].copy()

In [31]:
# Preview the PC-only subset to confirm the platform filter worked.
ign_pc.head()

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
7,7,Amazing,Guild Wars 2,/games/guild-wars-2/pc-896298,PC,9.0,RPG,Y,2012,9,11
9,9,Good,Total War Battles: Shogun,/games/total-war-battles-shogun/pc-142564,PC,7.0,Strategy,N,2012,9,11
14,14,Amazing,Mark of the Ninja,/games/mark-of-the-ninja-135615/pc-143761,PC,9.0,"Action, Adventure",Y,2012,9,7
16,16,Okay,Home: A Unique Horror Adventure,/games/home-a-unique-horror-adventure/pc-137135,PC,6.5,Adventure,N,2012,9,6
23,23,Good,Mass Effect 3: Leviathan,/games/mass-effect-3-leviathan/pc-138919,PC,7.5,RPG,N,2012,8,31


In [32]:
# Count missing values per column in the steam frame.
steam.isna().sum()

user_id     0
game        0
behavior    0
status      0
dtype: int64

In [33]:
# Count missing values per column in the ign frame.
ign.isna().sum()

Unnamed: 0         0
score_phrase       0
title              0
url                0
platform           0
score              0
genre             36
editors_choice     0
release_year       0
release_month      0
release_day        0
dtype: int64

In [34]:
# Boolean mask ("filtro" = filter) marking the rows whose genre is missing, then
# display exactly those rows for inspection.
filtro = ign['genre'].isnull()
ign.loc[filtro]

Unnamed: 0.1,Unnamed: 0,score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day
12,12,Good,Wild Blood,/games/wild-blood/iphone-139363,iPhone,7.0,,N,2012,9,10
113,113,Good,Retro/Grade,/games/retrograde-138590/ps3-21766,PlayStation 3,7.0,,N,2012,8,15
160,160,Good,10000000,/games/10000000/iphone-139135,iPhone,7.5,,N,2012,8,9
176,176,Okay,Colour Bind,/games/colour-bind/pc-143757,PC,6.2,,N,2012,10,15
9375,9375,Great,Duke Nukem Arena,/games/duke-nukem-arena/cell-893821,Wireless,8.0,,Y,2007,6,15
9488,9488,Okay,Rengoku,/games/rengoku/cell-924924,Wireless,6.5,,N,2007,6,26
9767,9767,Good,Super Sketcher,/games/super-sketcher/cell-874054,Wireless,7.5,,N,2007,9,14
9774,9774,Amazing,Critter Crunch,/games/critter-crunch/cell-963486,Wireless,9.0,,Y,2007,9,13
10494,10494,Awful,Clue / Mouse Trap / Perfection / Aggravation,/games/clue-mouse-trap-perfection-aggravation/...,Nintendo DS,3.5,,N,2008,1,23
11367,11367,Painful,Jeep Thrills,/games/jeep-thrills/ps2-14246598,PlayStation 2,2.0,,N,2008,8,18


In [None]:
# Change made on master (translated from Portuguese; this cell contains no code)