In [25]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

### Ratings file
Each line contains a rated video game:

- a user
- an item
- a rating from 1 to 5

In [26]:
raw_ratings = pd.read_csv('ratings_latest.csv', sep=',')

raw_ratings.head()

Unnamed: 0,id,rate,appid
0,76561197960265729,1.0,10
1,76561197960265729,1.0,20
2,76561197960265729,1.0,30
3,76561197960265729,1.0,40
4,76561197960265729,1.0,50


### Metadata file

This file contains information about each game, specifically:
- item
- name
- genres

In [27]:
items = pd.read_csv('metadata.csv', sep=',')

items.head()

Unnamed: 0,appid,appname,genres
0,10,Counter-Strike,1/
1,20,Team Fortress Classic,1/
2,30,Day of Defeat,1/
3,40,Deathmatch Classic,1/
4,50,Half-Life: Opposing Force,1/


In [28]:
all_ratings = pd.merge(items, raw_ratings)

all_ratings.head()

Unnamed: 0,appid,appname,genres,id,rate
0,10,Counter-Strike,1/,76561197960265729,1.0
1,10,Counter-Strike,1/,76561197960265730,2.05
2,10,Counter-Strike,1/,76561197960265731,1.0
3,10,Counter-Strike,1/,76561197960265733,1.4
4,10,Counter-Strike,1/,76561197960265734,1.0


### Data pre-analysis
Survey the data and analyze its distribution and statistics. The following are computed:

- number of users
- number of items
- rating distribution

In [29]:
all_ratings['rate'].describe()

count    2.573393e+07
mean     1.275410e+00
std      7.960212e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      1.100000e+00
max      5.950000e+00
Name: rate, dtype: float64

In [30]:
all_ratings['id'].value_counts().index.tolist()[0]
all_ratings['id'].value_counts().tolist()[0]

10440

User 76561197973009892 contains the most games in our dataset, with more than 10440 ratings.

In [31]:
all_ratings['appid'].value_counts().index.tolist()[0]

340

### Data pre-processing

In [32]:
users = all_ratings.id.unique()
user_map = {i:val for i,val in enumerate(users)}
inverse_user_map = {val:i for i,val in enumerate(users)}


games = all_ratings.appid.unique()
game_map = {i:val for i,val in enumerate(games)}
inverse_game_map = {val:i for i,val in enumerate(games)}

all_ratings["id"] = all_ratings["id"].map(inverse_user_map)

all_ratings["old_id"] = all_ratings["appid"] # copying for join with metadata
all_ratings["appid"] = all_ratings["appid"].map(inverse_game_map)

print ("A total of: ", users.shape[0], "unique users")
print ("A total of: ", games.shape[0], "unique games")

A total of:  676668 unique users
A total of:  18865 unique games


In [33]:
all_ratings.sort_values('rate', ascending=False).head(10)

Unnamed: 0,appid,appname,genres,id,rate,old_id
543073,2,Day of Defeat,1/,100887,5.95,30
2433028,11,Counter-Strike: Source,1/,364830,5.95,240
13934442,1625,Football Manager 2013™,28/18/,560547,5.95,207890
25295968,13817,PLAYERUNKNOWN'S BATTLEGROUNDS,73/1/25/29/,475864,5.95,578080
6266506,283,BioShock Infinite,1/,428206,5.95,8870
23728290,8791,Raw Data,1/23/2/,280654,5.95,436320
6827805,333,Call of Duty®: Modern Warfare® 2,1/,446221,5.95,10190
24754130,11471,Panzer Warfare,1/4/23/29/28/2/70/,358534,5.95,513880
9434766,811,Arma 2: Operation Arrowhead,1/28/2/,404183,5.95,33930
18807596,2846,XCOM® 2,2/,45380,5.95,268500
