In [1]:
# Packages for data manipulation
import pandas as pd
import numpy as np

# Packages used to create the game recommender 
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

# Data visualization packages
import matplotlib.pyplot as plt 
import seaborn as sns


%matplotlib inline

In [2]:
# Loading the 2016 Steam dataset from CSV into a pandas dataframe

steam_csv_path = './steam_data.csv'
df = pd.read_csv(steam_csv_path)

In [3]:
# Inspecting the first rows of the dataframe (head() shows five rows by default)

df.head()

Unnamed: 0,User ID,Game Title,Hours Played
0,151603712,The Elder Scrolls V Skyrim,273.0
1,151603712,Fallout 4,87.0
2,151603712,Spore,14.9
3,151603712,Fallout New Vegas,12.1
4,151603712,Left 4 Dead 2,8.9


In [4]:
# Checking the shape of the dataframe: (number of rows, number of columns)

df.shape

(70489, 3)

In [68]:
# Keeping only entries with strictly more than MIN_HOURS_PLAYED hours played, to remove
# low-playtime outliers (e.g. 1 hour entries).
# Note: the comparison is strict, so entries with exactly 20 hours are removed as well.

MIN_HOURS_PLAYED = 20
df = df[df['Hours Played'] > MIN_HOURS_PLAYED]

In [69]:
# Building a user-by-game matrix: one row per user, one column per game title.
# Cells hold the hours played (pivot_table's default aggregation, the mean, is used).

pivot = df.pivot_table(
    values='Hours Played',
    index='User ID',
    columns='Game Title',
)

In [70]:
# Replacing NaN's (user/game combinations with no entry) with 0's.
# The result is assigned back rather than using inplace=True, so the cell is an
# ordinary rebinding and gives the same result when re-run.

pivot = pivot.fillna(0)

In [71]:
# Checking shape of pivot table: (number of users, number of game titles)

pivot.shape

(4109, 766)

In [80]:
# Scaling my data: StandardScaler standardizes each column (game) of the pivot table.
# The scaled array is kept in `pivot_scaled`; before, the result of fit_transform was
# only displayed and then thrown away, so the scaling had no effect on anything.
# NOTE(review): the cells below still read the unscaled `pivot`. With the correlation
# metric used there this should not matter (correlation is unchanged by centering and
# rescaling each game's vector), but switch to `pivot_scaled` if another metric is tried.

ss = StandardScaler()
pivot_scaled = ss.fit_transform(pivot)
pivot_scaled

array([[-0.01560216, -0.0499805 , -0.02693775, ..., -0.01560216,
        -0.01560216, -0.01560216],
       [-0.01560216, -0.0499805 , -0.02693775, ..., -0.01560216,
        -0.01560216, -0.01560216],
       [-0.01560216, -0.0499805 , -0.02693775, ..., -0.01560216,
        -0.01560216, -0.01560216],
       ...,
       [-0.01560216, -0.0499805 , -0.02693775, ..., -0.01560216,
        -0.01560216, -0.01560216],
       [-0.01560216, -0.0499805 , -0.02693775, ..., -0.01560216,
        -0.01560216, -0.01560216],
       [-0.01560216, -0.0499805 , -0.02693775, ..., -0.01560216,
        -0.01560216, -0.01560216]])

In [81]:
# Using pairwise distance to calculate the correlation between games. Each row of pivot.T
# is one game's vector of hours played across all users.
# metric='correlation' is the correlation distance, i.e. 1 - Pearson correlation, so
# subtracting it from 1 recovers the Pearson correlation itself. This is NOT Spearman's
# correlation (that would require rank-transforming the data first).
# I tried various metrics including: euclidean, hamming, etc.; correlation created the
# most accurate recommendations.
# Despite its name, `distances` holds similarities: a higher value means more alike.

correlation_distance = pairwise_distances(pivot.T, metric='correlation')
distances = 1 - correlation_distance

In [82]:
# Wrapping the game-to-game matrix in a dataframe labelled by game title on both axes

game_titles = pivot.columns
distance_df = pd.DataFrame(data=distances, index=game_titles, columns=game_titles)

In [90]:
# Checking the first two rows of the df to verify everything worked okay
# (a game's value against itself, on the diagonal, should be 1.0)

distance_df.head(2)

Game Title,3DMark,7 Days to Die,APB Reloaded,ARK Survival Evolved,Ace of Spades,Action! - Gameplay Recording and Streaming,AdVenture Capitalist,Aftermath,Agarest Generations of War,Age of Chivalry,...,X3 Reunion,X3 Terran Conflict,XCOM Enemy Unknown,Xenonauts,You Need A Budget 4 (YNAB),Zombie Panic Source,Zuma's Revenge,liteCam Game 100 FPS Game Capture,theHunter,theHunter Primal
Game Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3DMark,1.0,-0.00078,-0.00042,-0.001487,-0.000243,-0.0004,-0.000891,-0.000243,-0.000243,-0.000359,...,-0.00032,-0.000243,-0.001127,-0.000243,-0.000243,-0.000439,-0.000243,-0.000243,-0.000243,-0.000243
7 Days to Die,-0.00078,1.0,-0.001346,0.054382,-0.00078,-0.001282,-0.002856,-0.00078,-0.00078,-0.00115,...,-0.001025,-0.00078,0.007639,-0.00078,-0.00078,-0.001405,-0.00078,-0.00078,-0.00078,-0.00078


In [84]:
# Viewing recommendations for a game

query = "Fallout 3"
# Prefer an exact title match; otherwise fall back to the first title containing the text.
matches = [col for col in distance_df.columns if query in col]
if not matches:
    raise KeyError(f"No game title contains {query!r}")
query = query if query in matches else matches[0]
# Remove the game itself by label instead of assuming it sorts into position 0 (which
# fails when another game ties at similarity 1.0), then keep the 14 most similar games.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Red Faction Guerrilla Steam Edition                      0.484991
Sid Meier's Civilization IV                              0.454930
The Elder Scrolls IV Oblivion                            0.410416
Dungeons & Dragons Online                                0.307679
Magic The Gathering - Duels of the Planeswalkers 2013    0.278915
The Last Remnant                                         0.258306
The Elder Scrolls III Morrowind                          0.214319
Marvel Puzzle Quest                                      0.200070
Gems of War                                              0.199866
Kingdoms of Amalur Reckoning                             0.192437
Magic The Gathering  Duels of the Planeswalkers 2012     0.170702
Fallout New Vegas                                        0.140011
FTL Faster Than Light                                    0.131119
Divinity Original Sin                                    0.078424
Name: Fallout 3, dtype: float64

In [85]:
# Viewing recommendations for a game

query = "Path of Exile"
# Prefer an exact title match; otherwise fall back to the first title containing the text.
matches = [col for col in distance_df.columns if query in col]
if not matches:
    raise KeyError(f"No game title contains {query!r}")
query = query if query in matches else matches[0]
# Remove the game itself by label instead of assuming it sorts into position 0 (which
# fails when another game ties at similarity 1.0), then keep the 14 most similar games.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Tom Clancy's Ghost Recon Future Soldier               0.352744
Bound By Flame                                        0.352744
Galactic Civilizations III                            0.322048
Elite Dangerous                                       0.295867
Total War ATTILA                                      0.214103
Lords Of The Fallen                                   0.174676
NOBUNAGA'S AMBITION Sphere of Influence               0.164355
Hitman Absolution                                     0.164290
The Incredible Adventures of Van Helsing Final Cut    0.143675
The Incredible Adventures of Van Helsing              0.143675
Demigod                                               0.143675
Middle-earth Shadow of Mordor                         0.136792
Warframe                                              0.129916
Total War ROME II - Emperor Edition                   0.112038
Name: Path of Exile, dtype: float64

In [86]:
# Viewing recommendations for a game

query = "Dota 2"
# Prefer an exact title match; otherwise fall back to the first title containing the text.
matches = [col for col in distance_df.columns if query in col]
if not matches:
    raise KeyError(f"No game title contains {query!r}")
query = query if query in matches else matches[0]
# Remove the game itself by label instead of assuming it sorts into position 0 (which
# fails when another game ties at similarity 1.0), then keep the 14 most similar games.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Company of Heroes Opposing Fronts             0.111185
Dark Messiah of Might & Magic Multi-Player    0.098890
Dungeon Defenders II                          0.074264
I Am Alive                                    0.074155
Alan Wake                                     0.072259
Warhammer End Times - Vermintide              0.068007
Darksiders                                    0.066796
SMITE                                         0.056918
Company of Heroes                             0.053185
Nosgoth                                       0.050085
The Walking Dead                              0.043423
Batman Arkham Asylum GOTY Edition             0.040994
Fallout 3 - Game of the Year Edition          0.039018
Castle Crashers                               0.039001
Name: Dota 2, dtype: float64

In [87]:
# Viewing recommendations for a game

query = "Counter-Strike Global Offensive"
# Prefer an exact title match; otherwise fall back to the first title containing the text.
matches = [col for col in distance_df.columns if query in col]
if not matches:
    raise KeyError(f"No game title contains {query!r}")
query = query if query in matches else matches[0]
# Remove the game itself by label instead of assuming it sorts into position 0 (which
# fails when another game ties at similarity 1.0), then keep the 14 most similar games.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Call of Duty Black Ops - Multiplayer OSX      0.236689
Counter-Strike Source                         0.224187
Pro Cycling Manager 2013                      0.203235
Two Worlds II                                 0.146247
Rust                                          0.138779
AdVenture Capitalist                          0.093933
DayZ                                          0.093747
Garry's Mod                                   0.091050
GRID 2                                        0.079809
Counter-Strike                                0.078470
H1Z1                                          0.076947
The Elder Scrolls Online Tamriel Unlimited    0.074776
Arma 2 Operation Arrowhead                    0.074266
Unturned                                      0.067058
Name: Counter-Strike Global Offensive, dtype: float64