In [1]:
import pandas as pd
import numpy as np 
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
games = pd.read_csv('steam-200k.csv')
games = games.rename({'151603712':'userId', 'The Elder Scrolls V Skyrim': 'gameName', '1.0':'Actions'}, axis = 1)
games.drop(['0'],axis = 1, inplace = True)

get a sense of the data

In [3]:
print("number of distinct users = %d" %games['userId'].nunique()) 
print("number of distinct games = %d" %games['gameName'].nunique()) 
games.head()

number of distinct users = 12393
number of distinct games = 5155


Unnamed: 0,userId,gameName,purchase,Actions
0,151603712,The Elder Scrolls V Skyrim,play,273.0
1,151603712,Fallout 4,purchase,1.0
2,151603712,Fallout 4,play,87.0
3,151603712,Spore,purchase,1.0
4,151603712,Spore,play,14.9


In [4]:
# checking missing data in games 
total = games.isnull().sum().sort_values(ascending = False) # total number of missing values
percent = (games.isnull().sum()/games.isnull().count()*100).sort_values(ascending = False)
missing_games_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_games_data.head()

Unnamed: 0,Total,Percent
Actions,0,0.0
purchase,0,0.0
gameName,0,0.0
userId,0,0.0


we want to format data in a way such that each row get distinct pair of userId and gameName

In [5]:
games.head()

Unnamed: 0,userId,gameName,purchase,Actions
0,151603712,The Elder Scrolls V Skyrim,play,273.0
1,151603712,Fallout 4,purchase,1.0
2,151603712,Fallout 4,play,87.0
3,151603712,Spore,purchase,1.0
4,151603712,Spore,play,14.9


In [6]:
# split the purchase column to two dataframes and perform an outer join to group highly duplicated row
games_temp = games[games['purchase'] == 'play']
games_temp =games_temp.rename({'purchase':'play'}, axis = 1)
games = games[games.purchase =='purchase']
games.drop(columns = 'Actions', inplace = True)
result = pd.merge(games,games_temp, how='outer', on=['userId','gameName'])

In [7]:
# reindex to group the data associated with the same user together
reindex_result = result.sort_values(by = 'userId')
reindex_result.set_index(np.arange(len(reindex_result.index)))

Unnamed: 0,userId,gameName,purchase,play,Actions
0,5250,Portal 2,purchase,play,13.6
1,5250,Cities Skylines,purchase,play,144.0
2,5250,Deus Ex Human Revolution,purchase,play,62.0
3,5250,Alien Swarm,purchase,play,4.9
4,5250,Team Fortress 2,purchase,play,0.8
5,5250,Dota 2,purchase,play,0.2
6,5250,Counter-Strike,purchase,,
7,5250,Counter-Strike Source,purchase,,
8,5250,Day of Defeat,purchase,,
9,5250,Half-Life,purchase,,


In [8]:
save the cleaned data to a pickle file

'cleaned data to a pickle file' was not found in history, as a file, url, nor in the user namespace.


In [9]:
reindex_result.to_pickle('clean_steam_data.pkl')