In [1]:
# Packages for data manipulation
import pandas as pd
import numpy as np

# Packages used to create the game recommender 
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

# Data visualization packages
import matplotlib.pyplot as plt 
import seaborn as sns


%matplotlib inline

In [2]:
# Loading the 2016 Steam data from a CSV in the working directory into a pandas dataframe

df = pd.read_csv(filepath_or_buffer='./steam_data.csv')

In [3]:
# Previewing the first five rows to confirm the columns loaded as expected

df.head(n=5)

Unnamed: 0,User ID,Game Title,Hours Played
0,151603712,The Elder Scrolls V Skyrim,273.0
1,151603712,Fallout 4,87.0
2,151603712,Spore,14.9
3,151603712,Fallout New Vegas,12.1
4,151603712,Left 4 Dead 2,8.9


In [4]:
# Checking the shape of the dataframe, reported as (number of rows, number of columns)

df.shape

(70489, 3)

In [5]:
# Keeping only entries with strictly more than 40 hours played
# (rows at exactly 40 hours or fewer are dropped)

df = df.loc[df['Hours Played'].gt(40)]

In [6]:
# Building a user-by-game matrix of hours played: one row per user, one column per game.
# Combinations a user never played come out as NaN.

pivot = df.pivot_table(
    index='User ID',
    columns='Game Title',
    values='Hours Played',
)

In [7]:
# Treating a missing user/game combination as zero hours played

pivot = pivot.fillna(0)

In [8]:
# Checking shape of pivot table: (number of users, number of games)

pivot.shape

(4410, 875)

In [9]:
# Scaling my data: standardize each game's column to zero mean and unit variance.
# fit_transform returns a NEW NumPy array and leaves `pivot` untouched, so the
# result must be captured to be usable; previously it was displayed and discarded.
# NOTE: the cells below still build the similarity matrix from `pivot`. That is
# harmless for metric='correlation', because correlation is unchanged by
# per-column shifting and positive rescaling, but any other metric (e.g.
# euclidean) would need `pivot_scaled` instead.

ss = StandardScaler()
pivot_scaled = ss.fit_transform(pivot)
pivot_scaled

array([[-0.01506017, -0.0501587 , -0.01506017, ..., -0.01506017,
        -0.01506017, -0.01506017],
       [-0.01506017, -0.0501587 , -0.01506017, ..., -0.01506017,
        -0.01506017, -0.01506017],
       [-0.01506017, -0.0501587 , -0.01506017, ..., -0.01506017,
        -0.01506017, -0.01506017],
       ...,
       [-0.01506017, -0.0501587 , -0.01506017, ..., -0.01506017,
        -0.01506017, -0.01506017],
       [-0.01506017, -0.0501587 , -0.01506017, ..., -0.01506017,
        -0.01506017, -0.01506017],
       [-0.01506017, -0.0501587 , -0.01506017, ..., -0.01506017,
        -0.01506017, -0.01506017]])

In [10]:
# Using pairwise distance to calculate correlation between games (each row of pivot.T is
# one game's hours across all users). I tried various metrics including: euclidean, hamming, etc.
# The correlation metric created the most accurate recommendations.
# NOTE: metric='correlation' is the Pearson-based correlation distance, not Spearman's rank
# correlation. Subtracting it from 1 converts the distance back into a correlation
# coefficient, so despite its name `distances` holds similarities (higher = more alike).

distances = 1-pairwise_distances(pivot.T, metric='correlation')

In [11]:
# Wrapping the game-by-game matrix in a dataframe labelled with game titles on both axes

game_titles = pivot.columns
distance_df = pd.DataFrame(data=distances, index=game_titles, columns=game_titles)

In [12]:
# Previewing the first two rows to verify the matrix was labelled correctly

distance_df.head(n=2)

Game Title,3DMark,7 Days to Die,A Game of Thrones - Genesis,APB Reloaded,ARK Survival Evolved,Ace of Spades,Action! - Gameplay Recording and Streaming,AdVenture Capitalist,Aftermath,Agarest Generations of War,...,X3 Terran Conflict,XCOM Enemy Unknown,Xenonauts,You Need A Budget 4 (YNAB),Zombie Panic Source,Zombies Monsters Robots,Zuma's Revenge,liteCam Game 100 FPS Game Capture,theHunter,theHunter Primal
Game Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3DMark,1.0,-0.000755,-0.000227,-0.000392,-0.001419,-0.000317,-0.000373,-0.000844,-0.000317,-0.000227,...,-0.000227,-0.001079,-0.000227,-0.000227,-0.000409,-0.000227,-0.000227,-0.000227,-0.000227,-0.000227
7 Days to Die,-0.000755,1.0,-0.000755,-0.001304,0.069431,-0.001057,0.020927,-0.002811,-0.001056,-0.000755,...,-0.000755,0.007593,-0.000755,-0.000755,-0.001361,-0.000755,-0.000755,-0.000755,-0.000755,-0.000755


In [13]:
# Viewing recommendations for a game

def recommend_games(similarity_df, search_text, n=14):
    """Return the `n` games most similar to the game matching `search_text`.

    Parameters
    ----------
    similarity_df : pandas.DataFrame
        Square game-by-game matrix with game titles as both index and columns;
        higher values mean more similar.
    search_text : str
        Exact game title, or a fragment of one. An exact title wins; otherwise
        the first column whose title contains the fragment is used.
    n : int, default 14
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Scores of the top `n` other games, sorted descending and named after
        the matched title.

    Raises
    ------
    ValueError
        If no column title contains `search_text`.
    """
    titles = similarity_df.columns
    if search_text in titles:
        matched_title = search_text
    else:
        candidates = [title for title in titles if search_text in title]
        if not candidates:
            raise ValueError(f"No game title contains {search_text!r}")
        matched_title = candidates[0]

    # Drop the game's own row by label instead of skipping position 0: when
    # another game ties with it at 1.0 the sort order is arbitrary, and a
    # positional skip could discard a real recommendation.
    scores = similarity_df[matched_title].drop(labels=matched_title)
    return scores.sort_values(ascending=False).head(n)


query = "Fallout 3"
recommend_games(distance_df, query)

Game Title
Red Faction Guerrilla Steam Edition                      0.485015
Sid Meier's Civilization IV                              0.435344
The Elder Scrolls IV Oblivion                            0.406692
Dungeons & Dragons Online                                0.292175
Magic The Gathering - Duels of the Planeswalkers 2013    0.278948
The Last Remnant                                         0.241324
The Elder Scrolls III Morrowind                          0.209115
Marvel Puzzle Quest                                      0.199949
Gems of War                                              0.192506
Kingdoms of Amalur Reckoning                             0.187719
Magic The Gathering  Duels of the Planeswalkers 2012     0.170760
Fallout New Vegas                                        0.139285
FTL Faster Than Light                                    0.127336
The Talos Principle                                      0.118075
Name: Fallout 3, dtype: float64

In [14]:
# Recommendations for "Path of Exile": resolve the search text to the first column
# title containing it, then show ranks 2-15 of its scores in descending order
# (position 0 is skipped on the assumption that the game ranks itself first).

query = "Path of Exile"
matching_titles = [title for title in distance_df.columns if query in title]
query = matching_titles[0]
ranked_scores = distance_df[query].sort_values(ascending=False)
ranked_scores.iloc[1:15]

Game Title
Bound By Flame                                        0.352334
Galactic Civilizations III                            0.321694
Elite Dangerous                                       0.295133
Tom Clancy's Ghost Recon Future Soldier               0.267285
Total War ATTILA                                      0.213423
Lords Of The Fallen                                   0.174506
NOBUNAGA'S AMBITION Sphere of Influence               0.164196
The Walking Dead Season Two                           0.145975
The Incredible Adventures of Van Helsing Final Cut    0.143530
Demigod                                               0.143530
Impire                                                0.143530
Hitman Absolution                                     0.135397
Warframe                                              0.132698
Middle-earth Shadow of Mordor                         0.129922
Name: Path of Exile, dtype: float64

In [15]:
# Recommendations for "Dota 2": resolve the search text to the first column
# title containing it, then show ranks 2-15 of its scores in descending order
# (position 0 is skipped on the assumption that the game ranks itself first).

query = "Dota 2"
matching_titles = [title for title in distance_df.columns if query in title]
query = matching_titles[0]
ranked_scores = distance_df[query].sort_values(ascending=False)
ranked_scores.iloc[1:15]

Game Title
Company of Heroes Opposing Fronts             0.111106
Dark Messiah of Might & Magic Multi-Player    0.098859
I Am Alive                                    0.074222
Everlasting Summer                            0.073742
Alan Wake                                     0.072387
Warhammer End Times - Vermintide              0.068247
Darksiders                                    0.067045
Dungeon Defenders II                          0.065169
Nuclear Dawn                                  0.063248
Nosgoth                                       0.057031
Company of Heroes                             0.053482
Burnout Paradise The Ultimate Box             0.048152
The Walking Dead                              0.042741
Castle Crashers                               0.039353
Name: Dota 2, dtype: float64

In [16]:
# Recommendations for "Counter-Strike Global Offensive": resolve the search text to
# the first column title containing it, then show ranks 2-15 of its scores in
# descending order (position 0 is skipped on the assumption that the game ranks
# itself first).

query = "Counter-Strike Global Offensive"
matching_titles = [title for title in distance_df.columns if query in title]
query = matching_titles[0]
ranked_scores = distance_df[query].sort_values(ascending=False)
ranked_scores.iloc[1:15]

Game Title
Call of Duty Black Ops - Multiplayer OSX    0.236364
Counter-Strike Source                       0.225857
TrackMania United                           0.212724
Pro Cycling Manager 2015                    0.212724
Pro Cycling Manager 2013                    0.197576
Rust                                        0.142166
Two Worlds II                               0.127124
GRID 2                                      0.108669
DayZ                                        0.095763
Call of Duty Advanced Warfare               0.095723
AdVenture Capitalist                        0.095254
Garry's Mod                                 0.095116
Counter-Strike                              0.080612
H1Z1                                        0.080103
Name: Counter-Strike Global Offensive, dtype: float64