In [2]:
import numpy as np
import pandas as pd
import collections
from matplotlib import pyplot as plt
import seaborn as sns

In [64]:
# Read the Steam data set. The file is parsed with header=None, so the column
# labels are supplied explicitly here.
steam = pd.read_csv(
    "steam-200k.csv",
    header=None,
    index_col=None,
    names=['ID', 'GameName', 'Action', 'Hours', 'Empty'],
)
steam.head()

Unnamed: 0,ID,GameName,Action,Hours,Empty
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0,0
1,151603712,The Elder Scrolls V Skyrim,play,273.0,0
2,151603712,Fallout 4,purchase,1.0,0
3,151603712,Fallout 4,play,87.0,0
4,151603712,Spore,purchase,1.0,0


In [65]:
# Keep only the 'play' rows (i.e. drop the 'purchase' rows). That will get rid
# of expansions and other 'games' in this list that shouldn't be counted.
#
# Pretty limited data set and appears to be a couple of years old. For the full
# project I intend to update this data using Steam's API and merge it with
# subscriber data from twitch.tv.
#
# The unused final column is dropped in the same expression; errors='ignore'
# keeps the cell re-runnable (a plain `del steam['Empty']` raises KeyError the
# second time the cell is executed).
steam = (
    steam.loc[steam['Action'] == 'play']
    .drop(['Empty'], axis=1, errors='ignore')
)

In [136]:
# Preview the first ten rows with the 'Action' column left out; reset_index()
# exposes the previous row labels as an 'index' column.
steam.drop('Action', axis=1).reset_index().iloc[:10]

Unnamed: 0,index,ID,GameName,Hours
0,1,151603712,The Elder Scrolls V Skyrim,273.0
1,3,151603712,Fallout 4,87.0
2,5,151603712,Spore,14.9
3,7,151603712,Fallout New Vegas,12.1
4,9,151603712,Left 4 Dead 2,8.9
5,11,151603712,HuniePop,8.5
6,13,151603712,Path of Exile,8.1
7,15,151603712,Poly Bridge,7.5
8,17,151603712,Left 4 Dead,3.3
9,19,151603712,Team Fortress 2,2.8


In [118]:
# Renumber the rows 0..n-1. drop=True discards the old labels instead of
# inserting them as an 'index' column: nothing below reads that column, and
# without it a re-run adds 'level_0' and a further re-run raises ValueError.
steam = steam.reset_index(drop=True)

In [128]:
# Build the user x game matrix of hours: one row per ID, one column per
# GameName. Repeated (ID, GameName) pairs are combined with pivot_table's
# default aggregation (mean); pairs with no record are NaN.
user_item = pd.pivot_table(
    steam,
    index='ID',
    columns='GameName',
    values='Hours',
)

In [129]:
# Preview the matrix with ID as a regular column. Only the first ten rows are
# shown so the cell does not try to render the whole user x game frame.
user_item.reset_index().head(10)

GameName,ID,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
0,5250,,,,,,,,,,...,,,,,,,,,,
1,76767,,,,,,,,,,...,,,,,,,,,,
2,86540,,,,,,,,,,...,,,,,,,,,,
3,144736,,,,,,,,,,...,,,,,,,,,,
4,181212,,,,,,,,,,...,,,,,,,,,,
5,229911,,,,,,,,,,...,,,,,,,,,,
6,298950,,,,,,,,,,...,,,,,,,,,,
7,381543,,,,,,,,,,...,,,,,,,,,,
8,547685,,,,,,,,,,...,,,,,,,,,0.2,
9,554278,,,,,,,,,,...,,,,,,,,,,


In [144]:
# Number of distinct user IDs in the data.
n_gamers = len(steam['ID'].unique())

In [145]:
# Display the number of distinct users.
n_gamers

11350

In [146]:
# Number of distinct game titles in the data.
n_games = len(steam['GameName'].unique())

In [147]:
# Display the number of distinct games.
n_games

3600

In [149]:
# Report both counts on a single line.
print(''.join([
    'Number of Users =', str(n_gamers),
    ' | Number of games ', str(n_games),
]))

Number of Users =11350 | Number of games 3600


In [130]:
# Show the first five rows of the user x game matrix.
user_item.iloc[:5]

GameName,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,,,,,,,,,,,...,,,,,,,,,,
76767,,,,,,,,,,,...,,,,,,,,,,
86540,,,,,,,,,,,...,,,,,,,,,,
144736,,,,,,,,,,,...,,,,,,,,,,
181212,,,,,,,,,,,...,,,,,,,,,,


In [116]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection. The `cv` alias
# is kept so calls such as cv.train_test_split(...) keep working unchanged.
try:
    from sklearn import model_selection as cv
except ImportError:  # scikit-learn older than 0.18
    from sklearn import cross_validation as cv

In [131]:
# Replace all NaN with 0. fillna() returns a new frame, so the result has to be
# assigned back; otherwise user_item still contains NaN when it is split below.
user_item = user_item.fillna(0)

# Bounded preview of the filled matrix.
user_item.head(10)

GameName,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,12 Labours of Hercules III Girl Power,...,rFactor,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,the static speaks my name,theHunter,theHunter Primal
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
181212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
229911,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
298950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
381543,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
547685,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
554278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [132]:
# Hold out 25% of the users for testing. random_state pins the shuffle so the
# split (and everything computed from it) is the same on every run.
train_data, test_data = cv.train_test_split(
    user_item,
    test_size=0.25,
    random_state=42,
)

# Sanity check: (rows, columns) of the training portion.
train_data.shape

(8512, 3600)

In [None]:
# TODO: look at sklearn.decomposition for reducing the user-item matrix.

In [None]:
# TODO: candidate methods:
#   - PCA (via fit_transform)
#   - non-negative matrix factorization (NMF)

In [14]:
# Count of distinct game titles.
steam['GameName'].unique().size

3600

In [17]:
# Count of distinct user IDs.
steam['ID'].unique().size

11350

In [22]:
# Total hours per game title, as a Series indexed by GameName.
Tophours = steam.groupby("GameName")["Hours"].sum()

In [27]:
# The five games with the most total hours, largest first.
Tophours.sort_values(ascending=False).iloc[:5]

GameName
Dota 2                             981684.6
Counter-Strike Global Offensive    322771.6
Team Fortress 2                    173673.3
Counter-Strike                     134261.1
Sid Meier's Civilization V          99821.3
Name: Hours, dtype: float64