In [1]:
# Packages for data manipulation
import pandas as pd
import numpy as np

# Packages used to create the game recommender 
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

# Data visualization packages
import matplotlib.pyplot as plt 
import seaborn as sns


%matplotlib inline

In [2]:
# Load the 2016 Steam data from CSV into a pandas DataFrame

STEAM_DATA_PATH = './steam_data.csv'
df = pd.read_csv(STEAM_DATA_PATH)

In [3]:
# Preview the first five rows to confirm the columns loaded as expected

df.head(n=5)

Unnamed: 0,User ID,Game Title,Hours Played
0,151603712,The Elder Scrolls V Skyrim,273.0
1,151603712,Fallout 4,87.0
2,151603712,Spore,14.9
3,151603712,Fallout New Vegas,12.1
4,151603712,Left 4 Dead 2,8.9


In [4]:
# Dimensions of the loaded data as (rows, columns)

n_rows, n_cols = df.shape
(n_rows, n_cols)

(70489, 3)

In [5]:
# Keep only entries with strictly more than 30 hours played
# (rows at exactly 30 hours or below are dropped)

MIN_HOURS_PLAYED = 30
df = df.loc[df['Hours Played'] > MIN_HOURS_PLAYED]

In [6]:
# Build a user-by-game matrix of hours played:
# one row per 'User ID', one column per 'Game Title'.
# Repeated user/game pairs are averaged (pivot_table's default aggregation).

pivot = df.pivot_table(
    values='Hours Played',
    index='User ID',
    columns='Game Title',
    aggfunc='mean',
)

In [7]:
# Replacing NaN's with 0's: a NaN means the user has no remaining entry for that game.
# The result is rebound to `pivot` rather than using inplace=True, so the cell
# does not mutate an object in place behind the reader's back.

pivot = pivot.fillna(0)

In [8]:
# Dimensions of the pivot table: (number of users, number of games)

n_users, n_games = pivot.shape
(n_users, n_games)

(4806, 1024)

In [9]:
# Scaling my data: standardize every game column with StandardScaler.
# fit_transform returns a new array and leaves `pivot` untouched, so the result
# is stored in `pivot_scaled` to make it available to later cells.
# NOTE: the similarity step that follows uses metric='correlation' on `pivot`;
# Pearson correlation between game columns is unchanged by per-column
# standardization, so that step gives the same result with or without scaling.

ss = StandardScaler()
pivot_scaled = ss.fit_transform(pivot)
pivot_scaled

array([[-0.02036653, -0.01442625, -0.01442625, ..., -0.01442625,
        -0.01819092, -0.01442625],
       [-0.02036653, -0.01442625, -0.01442625, ..., -0.01442625,
        -0.01819092, -0.01442625],
       [-0.02036653, -0.01442625, -0.01442625, ..., -0.01442625,
        -0.01819092, -0.01442625],
       ...,
       [-0.02036653, -0.01442625, -0.01442625, ..., -0.01442625,
        -0.01819092, -0.01442625],
       [-0.02036653, -0.01442625, -0.01442625, ..., -0.01442625,
        -0.01819092, -0.01442625],
       [-0.02036653, -0.01442625, -0.01442625, ..., -0.01442625,
        -0.01819092, -0.01442625]])

In [10]:
# Game-to-game similarity from pairwise distances between the rows of pivot.T (one row per game).
# metric='correlation' is the correlation distance, i.e. 1 - Pearson correlation, so
# subtracting it from 1 recovers the Pearson correlation between games.
# NOTE: this is Pearson, not Spearman (rank) correlation.
# Various other metrics were tried, including euclidean and hamming; this one
# produced the most accurate recommendations.
# Despite its name, `distances` holds similarities (higher = more alike).

correlation_distance = pairwise_distances(pivot.T, metric='correlation')
distances = 1 - correlation_distance

In [11]:
# Wrap the similarity matrix in a DataFrame labelled by game title on both axes

game_titles = pivot.columns
distance_df = pd.DataFrame(data=distances, index=game_titles, columns=game_titles)

In [12]:
# Sanity check: peek at the first two rows of the similarity table

distance_df.head(n=2)

Game Title,100% Orange Juice,3DMark,4 Elements,7 Days to Die,A Game of Thrones - Genesis,A.V.A - Alliance of Valiant Arms,APB Reloaded,ARK Survival Evolved,Ace of Spades,Action! - Gameplay Recording and Streaming,...,XCOM Enemy Unknown,Xenonauts,You Must Build A Boat,You Need A Budget 4 (YNAB),Zombie Panic Source,Zombies Monsters Robots,Zuma's Revenge,liteCam Game 100 FPS Game Capture,theHunter,theHunter Primal
Game Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,1.0,-0.000294,-0.000294,-0.001036,-0.000294,-0.000294,-0.000509,-0.001851,-0.000411,-0.000483,...,-0.001588,-0.000409,-0.000294,-0.000384,-0.000529,-0.000294,-0.000294,-0.000294,-0.00037,-0.000294
3DMark,-0.000294,1.0,-0.000208,-0.000734,-0.000208,-0.000208,-0.000361,-0.001311,-0.000291,-0.000342,...,-0.001125,-0.00029,-0.000208,-0.000272,-0.000375,-0.000208,-0.000208,-0.000208,-0.000262,-0.000208


In [13]:
# Viewing recommendations for a game: the 14 titles most similar to the query

query = "Fallout 3"
# Use the exact title when it exists; otherwise fall back to the first title
# that contains the query text.
if query not in distance_df.columns:
    matches = [col for col in distance_df.columns if query in col]
    if not matches:
        raise IndexError("No game title contains {!r}".format(query))
    query = matches[0]
# Drop the queried game by label rather than assuming it sorts into position 0:
# another title with similarity 1.0 could tie with it and take that slot.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Commandos 2 Men of Courage                               0.513889
Red Faction Guerrilla Steam Edition                      0.437213
Sid Meier's Civilization IV                              0.415664
The Elder Scrolls IV Oblivion                            0.398951
Dungeons & Dragons Online                                0.284860
Magic The Gathering - Duels of the Planeswalkers 2013    0.275663
The Last Remnant                                         0.218871
Marvel Puzzle Quest                                      0.197486
The Elder Scrolls III Morrowind                          0.196513
Gems of War                                              0.190243
Kingdoms of Amalur Reckoning                             0.182772
Magic The Gathering  Duels of the Planeswalkers 2012     0.168754
Deus Ex Game of the Year Edition                         0.152852
Jade Empire Special Edition                              0.152852
Name: Fallout 3, dtype: float64

In [14]:
# Viewing recommendations for a game: the 14 titles most similar to the query

query = "Path of Exile"
# Use the exact title when it exists; otherwise fall back to the first title
# that contains the query text.
if query not in distance_df.columns:
    matches = [col for col in distance_df.columns if query in col]
    if not matches:
        raise IndexError("No game title contains {!r}".format(query))
    query = matches[0]
# Drop the queried game by label rather than assuming it sorts into position 0:
# another title with similarity 1.0 could tie with it and take that slot.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Bound By Flame                                        0.352008
Galactic Civilizations III                            0.318673
Elite Dangerous                                       0.294683
Tom Clancy's Ghost Recon Future Soldier               0.242645
The Wolf Among Us                                     0.222633
Total War ATTILA                                      0.211930
Lords Of The Fallen                                   0.174389
NOBUNAGA'S AMBITION Sphere of Influence               0.164087
The Walking Dead Season Two                           0.145885
Impire                                                0.143427
Age of Wonders                                        0.143427
Krater                                                0.143427
The Incredible Adventures of Van Helsing Final Cut    0.143427
Warframe                                              0.132708
Name: Path of Exile, dtype: float64

In [15]:
# Viewing recommendations for a game: the 14 titles most similar to the query

query = "Dota 2"
# Use the exact title when it exists; otherwise fall back to the first title
# that contains the query text.
if query not in distance_df.columns:
    matches = [col for col in distance_df.columns if query in col]
    if not matches:
        raise IndexError("No game title contains {!r}".format(query))
    query = matches[0]
# Drop the queried game by label rather than assuming it sorts into position 0:
# another title with similarity 1.0 could tie with it and take that slot.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Company of Heroes Opposing Fronts             0.109898
Dark Messiah of Might & Magic Multi-Player    0.098821
Titan Quest                                   0.092131
I Am Alive                                    0.074295
Alan Wake                                     0.072529
Darksiders                                    0.067325
Nuclear Dawn                                  0.063371
Warhammer 40,000 Dawn of War  Soulstorm       0.063371
Warhammer End Times - Vermintide              0.062489
Dungeon Defenders II                          0.060049
Everlasting Summer                            0.057466
Devilian                                      0.056463
Nosgoth                                       0.055829
Sonic & All-Stars Racing Transformed          0.052327
Name: Dota 2, dtype: float64

In [16]:
# Viewing recommendations for a game: the 14 titles most similar to the query

query = "Counter-Strike Global Offensive"
# Use the exact title when it exists; otherwise fall back to the first title
# that contains the query text.
if query not in distance_df.columns:
    matches = [col for col in distance_df.columns if query in col]
    if not matches:
        raise IndexError("No game title contains {!r}".format(query))
    query = matches[0]
# Drop the queried game by label rather than assuming it sorts into position 0:
# another title with similarity 1.0 could tie with it and take that slot.
distance_df[query].drop(labels=query).sort_values(ascending=False).head(14)

Game Title
Call of Duty Black Ops - Multiplayer OSX    0.235975
Counter-Strike Source                       0.228610
TrackMania United                           0.212408
Pro Cycling Manager 2015                    0.212408
KickBeat Steam Edition                      0.199878
Pro Cycling Manager 2013                    0.197444
Rust                                        0.148627
Nidhogg                                     0.141645
Two Worlds II                               0.122707
Garry's Mod                                 0.100130
DayZ                                        0.098952
GRID 2                                      0.098919
AdVenture Capitalist                        0.095746
Call of Duty Advanced Warfare               0.087080
Name: Counter-Strike Global Offensive, dtype: float64