In [1]:
# packages for data manipulation
import pandas as pd
import numpy as np

# packages used to create the game recommender 
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler

# data visualization packages
import matplotlib.pyplot as plt 
import seaborn as sns


%matplotlib inline

In [2]:
# importing Steam data from 2016 into a pandas dataframe

df = pd.read_csv('./steam_data.csv')

In [3]:
# inspecting the dataframe

df.head()

Unnamed: 0,User ID,Game Title,Hours Played
0,151603712,The Elder Scrolls V Skyrim,273.0
1,151603712,Fallout 4,87.0
2,151603712,Spore,14.9
3,151603712,Fallout New Vegas,12.1
4,151603712,Left 4 Dead 2,8.9


In [4]:
# checking the shape of the dataframe

df.shape

(70489, 3)

In [7]:
# creating a pivot table as I find them easy to work with 

pivot = pd.pivot_table(df, index='User ID',columns='Game Title',values='Hours Played')

In [8]:
# replacing NaN's with 0's

pivot.fillna(0,inplace=True)

In [9]:
# checking shape of pivot table

pivot.shape

(11350, 3600)

In [10]:
# scaling my data

ss = StandardScaler()
ss.fit_transform(pivot)

array([[-0.00938688, -0.01532983, -0.01495176, ..., -0.02384842,
        -0.02784376, -0.01123474],
       [-0.00938688, -0.01532983, -0.01495176, ..., -0.02384842,
        -0.02784376, -0.01123474],
       [-0.00938688, -0.01532983, -0.01495176, ..., -0.02384842,
        -0.02784376, -0.01123474],
       ...,
       [-0.00938688, -0.01532983, -0.01495176, ..., -0.02384842,
        -0.02784376, -0.01123474],
       [-0.00938688, -0.01532983, -0.01495176, ..., -0.02384842,
        -0.02784376, -0.01123474],
       [-0.00938688, -0.01532983, -0.01495176, ..., -0.02384842,
        -0.02784376, -0.01123474]])

In [11]:
# using pairwise distance to calculate correlation between users. I tried various metrics including: euclidean,hamming, etc.
# Spearman's correlation created the most accurate recommendations

distances = 1-pairwise_distances(pivot, metric='correlation')

In [12]:
# putting distances in a df for visibility

distance_df = pd.DataFrame(distances,index=pivot.index, columns=pivot.index)

In [13]:
# checking the df to verify everything worked okay

distance_df.head()

User ID,5250,76767,86540,144736,181212,229911,298950,381543,547685,554278,...,309228590,309255941,309262440,309265377,309404240,309434439,309554670,309626088,309824202,309903146
User ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5250,1.0,0.001565,-0.000756,-0.000398,-0.000475,-0.000738,0.014971,-0.000398,-0.000417,-0.000509,...,0.000873,-0.000398,0.004685,-0.000517,0.000288,0.000873,-0.000398,-0.000398,0.000873,0.000873
76767,0.001565,1.0,-0.001089,0.68534,0.66892,0.338205,0.001733,0.68534,0.036828,0.049974,...,-0.000641,-0.000641,-0.000641,-0.000833,-0.00088,-0.000641,-0.000641,0.02398,-0.000641,-0.000641
86540,-0.000756,-0.001089,1.0,-0.000528,-0.00063,-0.000992,0.103222,-0.000528,-0.000553,0.001662,...,-0.000528,-0.000528,-0.000528,-0.000686,-0.000726,-0.000528,-0.000528,0.00469,-0.000528,-0.000528
144736,-0.000398,0.68534,-0.000528,1.0,0.976184,0.216129,-0.000448,1.0,0.044666,-0.000355,...,-0.000278,-0.000278,-0.000278,-0.000361,-0.000382,-0.000278,-0.000278,-0.000278,-0.000278,-0.000278
181212,-0.000475,0.66892,-0.00063,0.976184,1.0,0.211194,-0.000624,0.976184,0.043542,-0.000424,...,-0.000332,-0.000332,-0.000332,-0.000431,-0.000456,-0.000332,-0.000332,-0.000332,-0.000332,-0.000332


In [14]:
distance_df.columns[678]

35489574

In [15]:
# Viewing recommendations for a user

query = "5250"
query = [col for col in distance_df.columns if query in str(col)][0]
distance_df[query].sort_values(ascending=False)[1:15]

User ID
228209477    0.914582
275437638    0.914582
298446224    0.914582
257528104    0.914582
224844255    0.914582
298516674    0.914582
224751217    0.914582
301274389    0.914582
261857176    0.914582
228343275    0.914582
263936784    0.914582
135012938    0.914161
142475478    0.913298
64754418     0.891728
Name: 5250, dtype: float64

In [16]:
# Viewing recommendations for a user

query = "25185866"
query = [col for col in distance_df.columns if query in str(col)][0]
distance_df[query].sort_values(ascending=False)[1:15]

User ID
63020970     1.0
32965833     1.0
36714440     1.0
19925584     1.0
40286953     1.0
19594216     1.0
25185866     1.0
45314861     1.0
15382030     1.0
181344998    1.0
30221587     1.0
12673872     1.0
74966682     1.0
10714039     1.0
Name: 25185866, dtype: float64

In [17]:
# Viewing recommendations for a user

query = "5250"
query = [col for col in distance_df.columns if query in str(col)][0]
distance_df[query].sort_values(ascending=False)[1:15]

User ID
228209477    0.914582
275437638    0.914582
298446224    0.914582
257528104    0.914582
224844255    0.914582
298516674    0.914582
224751217    0.914582
301274389    0.914582
261857176    0.914582
228343275    0.914582
263936784    0.914582
135012938    0.914161
142475478    0.913298
64754418     0.891728
Name: 5250, dtype: float64

In [18]:
# Viewing recommendations for a user

query = "5250"
query = [col for col in distance_df.columns if query in str(col)][0]
distance_df[query].sort_values(ascending=False)[1:15]

User ID
228209477    0.914582
275437638    0.914582
298446224    0.914582
257528104    0.914582
224844255    0.914582
298516674    0.914582
224751217    0.914582
301274389    0.914582
261857176    0.914582
228343275    0.914582
263936784    0.914582
135012938    0.914161
142475478    0.913298
64754418     0.891728
Name: 5250, dtype: float64