# Imports

In [1]:
import numpy as np
import pandas as pd

import os
from datetime import datetime

# Functions

In [2]:
def df_from_indices(df, indices):
    result_df = [df.iloc[i] for i in indices]

    return pd.DataFrame(result_df)

# Data loading

In [3]:
DATA_FOLDER = os.path.join('..', 'data')

In [4]:
df = pd.read_csv(os.path.join(DATA_FOLDER, 'steam.csv'))
description_df = pd.read_csv(os.path.join(DATA_FOLDER, 'steam_description_data.csv'))
media_df = pd.read_csv(os.path.join(DATA_FOLDER, 'steam_media_data.csv'))

In [14]:
description_df.rename(columns={'steam_appid': 'appid'}, inplace=True)
media_df.rename(columns={'steam_appid': 'appid'}, inplace=True)

In [35]:
description_df.columns, media_df.columns

(Index(['appid', 'detailed_description', 'about_the_game', 'short_description'], dtype='object'),
 Index(['appid', 'header_image', 'screenshots', 'background', 'movies'], dtype='object'))

In [19]:
merged_df = pd.merge(df, description_df, on='appid', how='inner')
merged_df2 = pd.merge(merged_df, media_df, on='appid', how='inner')

In [20]:
merged_df2.shape

(27075, 25)

In [49]:
merged_df2.index.names = ['id']

In [50]:
merged_df2.head(2)

Unnamed: 0_level_0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,median_playtime,owners,price,detailed_description,about_the_game,short_description,header_image,screenshots,background,movies
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,317,10000000-20000000,7.19,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,Play the world's number 1 online action game. ...,https://steamcdn-a.akamaihd.net/steam/apps/10/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/10/...,
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,62,5000000-10000000,3.99,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,https://steamcdn-a.akamaihd.net/steam/apps/20/...,"[{'id': 0, 'path_thumbnail': 'https://steamcdn...",https://steamcdn-a.akamaihd.net/steam/apps/20/...,


In [51]:
merged_df2.to_csv(os.path.join(DATA_FOLDER, 'steam_merged.csv'))

# Data preprocessing

In [27]:
df = merged_df2

In [28]:
df = df.fillna(np.mean)

In [40]:
df.columns

Index(['appid', 'name', 'release_date', 'english', 'developer', 'publisher',
       'platforms', 'required_age', 'categories', 'genres', 'steamspy_tags',
       'achievements', 'positive_ratings', 'negative_ratings',
       'average_playtime', 'median_playtime', 'owners', 'price',
       'detailed_description', 'about_the_game', 'short_description',
       'header_image', 'screenshots', 'background', 'movies',
       'includes level editor', 'in-app purchases', 'commentary available',
       'online multi-player', 'partial controller support',
       'cross-platform multiplayer', 'mods', 'steam trading cards', 'co-op',
       'mmo', 'includes source sdk', 'multi-player', 'single-player',
       'vr support', 'local multi-player', 'mods (require hl2)', 'stats',
       'steam achievements', 'full controller support', 'local co-op',
       'valve anti-cheat enabled', 'steam leaderboards', 'steam workshop',
       'shared/split screen', 'steamvr collectibles', 'steam cloud',
       'online

In [29]:
dummy_columns = ['categories', 'genres', 'owners']
dummy_column_values = {}

def make_dummies(df, column, sep=';'):
    categories = set()
    [categories.add(i) for l in df[column].apply(lambda x: x.split(sep)).values for i in l]

    for category in list(categories):
        df[category.lower()] = df[column].str.contains(category).astype(np.uint8)
    
    dummy_column_values[column] = list(categories)

for dummy in dummy_columns:
    make_dummies(df, dummy)

df.head(2)

  return func(self, *args, **kwargs)


Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,100000-200000,10000000-20000000,20000-50000,1000000-2000000,100000000-200000000,20000000-50000000,50000000-100000000,50000-100000,500000-1000000,5000000-10000000
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,1,0,0,0,0,0,0,0,0
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,0,0,0,1


In [30]:
df['oc_windows'] = df['platforms'].str.contains('windows').astype(np.uint8)
df['oc_linux'] = df['platforms'].str.contains('linux').astype(np.uint8)
df['oc_mac'] = df['platforms'].str.contains('mac').astype(np.uint8)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27075 entries, 0 to 27074
Data columns (total 99 columns):
appid                         27075 non-null int64
name                          27075 non-null object
release_date                  27075 non-null object
english                       27075 non-null int64
developer                     27075 non-null object
publisher                     27075 non-null object
platforms                     27075 non-null object
required_age                  27075 non-null int64
categories                    27075 non-null object
genres                        27075 non-null object
steamspy_tags                 27075 non-null object
achievements                  27075 non-null int64
positive_ratings              27075 non-null int64
negative_ratings              27075 non-null int64
average_playtime              27075 non-null int64
median_playtime               27075 non-null int64
owners                        27075 non-null object
price          

# Convert date to discrete value

In [33]:
df['release_date'] = df['release_date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
df['release_date'] = (df['release_date'] - datetime(1970,1,1)).apply(lambda x: x.total_seconds())

# Min-max scaling

In [34]:
columns_to_scale = ['release_date', 'positive_ratings', 'negative_ratings',
                    'average_playtime', 'median_playtime', 'price']
scaling_values = {}

def min_max_scaling(df, column):
    col_min = df[column].min()
    col_max = df[column].max()
    scaling_values[column] = (col_min, col_max)
    df[column] = (df[column] - col_min) / (col_max - col_min)

min_max_scaling(df, 'release_date')
min_max_scaling(df, 'positive_ratings')
min_max_scaling(df, 'negative_ratings')
min_max_scaling(df, 'average_playtime')
min_max_scaling(df, 'median_playtime')
min_max_scaling(df, 'price')

In [195]:
df.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,50000-100000,20000000-50000000,2000000-5000000,200000-500000,20000-50000,100000-200000,10000000-20000000,oc_windows,oc_linux,oc_mac
0,10,Counter-Strike,0.152978,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,1,1,1,1
1,20,Team Fortress Classic,0.080251,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,0,1,1,1
2,30,Day of Defeat,0.26721,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,0,0,0,0,0,0,0,1,1,1
3,40,Deathmatch Classic,0.179561,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,0,1,1,1
4,50,Half-Life: Opposing Force,0.107085,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,...,0,0,0,0,0,0,0,1,1,1


In [38]:
clean_df = df.drop(['appid', 'publisher', 'platforms',
                    'achievements', 'steamspy_tags', 'owners',
                    'developer', 'name'] + dummy_columns
                    + list(description_df.columns.values) + list(media_df.columns.values), axis=1)

In [39]:
clean_df.head(2)

Unnamed: 0,release_date,english,required_age,positive_ratings,negative_ratings,average_playtime,median_playtime,price,includes level editor,in-app purchases,...,1000000-2000000,100000000-200000000,20000000-50000000,50000000-100000000,50000-100000,500000-1000000,5000000-10000000,oc_windows,oc_linux,oc_mac
0,0.152978,1,0,0.047093,0.006855,0.092391,0.001663,0.017038,0,0,...,0,0,0,0,0,0,0,1,1,1
1,0.080251,1,0,0.001255,0.0013,0.001453,0.000325,0.009455,0,0,...,0,0,0,0,0,0,1,1,1,1


In [41]:
X = clean_df.values

In [42]:
def l2_norm(a, b):
    return np.sum(np.sqrt((a - b) ** 2), axis=0)

def cosine(a, b):
    return (a @ b) / (np.sqrt(np.sum(a ** 2, axis=0)) * np.sqrt(np.sum(b ** 2, axis=0)))
    
def knn(vec, X, k=5, metric='l2_norm', weights=[0.5, 0.5]):
    dists = np.empty((X.shape[0],))
    for i, row in enumerate(X):
        if np.array_equal(row, vec):
            continue
        if metric == 'l2_norm':
            dists[i] = -l2_norm(vec, row)
        elif metric == 'cosine':
            dists[i] = cosine(vec, row)
        elif metric == 'combined':
            dists[i] = weights[0] * -l2_norm(vec, row) + weights[1] * cosine(vec, row)
    
    return dists.argsort()[-k:][::-1]

game_id = 0
k = 20
result = knn(X[game_id], X, k=k)
cos_result = knn(X[game_id], X, k=k, metric='cosine')
combined_result = knn(X[game_id], X, k=k, metric='combined', weights=[0.25, 0.75])

In [43]:
df_from_indices(df, cos_result)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,1000000-2000000,100000000-200000000,20000000-50000000,50000000-100000000,50000-100000,500000-1000000,5000000-10000000,oc_windows,oc_linux,oc_mac
3,40,Deathmatch Classic,0.179561,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,1,1,1,1
1,20,Team Fortress Classic,0.080251,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,1,1,1,1
7,80,Counter-Strike: Condition Zero,0.305455,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,...,0,0,0,0,0,0,0,1,1,1
13,320,Half-Life 2: Deathmatch,0.336176,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled;Includes...,Action,...,0,0,0,0,0,0,0,1,1,1
5,60,Ricochet,0.152978,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Valve Anti-Ch...,Action,...,0,0,0,0,0,0,1,1,1,1
9229,475530,Go Go Electric Samurai,0.92163,1,Hexagon Games;NAMI TENTOU,Hexagon Games,windows;mac;linux,0,Multi-player;Online Multi-Player,Action;Indie,...,0,0,0,0,0,0,0,1,1,1
2,30,Day of Defeat,0.26721,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,0,0,0,0,0,0,1,1,1,1
15,360,Half-Life Deathmatch: Source,0.404639,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,0,0,0,0,0,0,1,1,1,1
17102,706960,Super Slime Arena,0.941944,1,JellyTeam,JellyTeam,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action;Casual;Indie,...,0,0,0,0,0,0,0,1,1,1
6,70,Half-Life,0.062194,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Online Multi-Player...,Action,...,0,0,0,0,0,0,1,1,1,1


In [44]:
liked_game_indicies = [0, 10, 22, 6, 7, 1]
disliked_game_indices = [1406, 1510, 1670, 2100]

In [45]:
df_from_indices(df, liked_game_indicies)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,1000000-2000000,100000000-200000000,20000000-50000000,50000000-100000000,50000-100000,500000-1000000,5000000-10000000,oc_windows,oc_linux,oc_mac
0,10,Counter-Strike,0.152978,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,0,1,1,1
10,240,Counter-Strike: Source,0.336176,1,Valve,Valve,windows;mac;linux,0,Multi-player;Cross-Platform Multiplayer;Steam ...,Action,...,0,0,0,0,0,0,0,1,1,1
22,570,Dota 2,0.733918,1,Valve,Valve,windows;mac;linux,0,Multi-player;Co-op;Steam Trading Cards;Steam W...,Action;Free to Play;Strategy,...,0,1,0,0,0,0,0,1,1,1
6,70,Half-Life,0.062194,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Online Multi-Player...,Action,...,0,0,0,0,0,0,1,1,1,1
7,80,Counter-Strike: Condition Zero,0.305455,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,...,0,0,0,0,0,0,0,1,1,1
1,20,Team Fortress Classic,0.080251,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,1,1,1,1


In [46]:
df_from_indices(df, disliked_game_indices)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,1000000-2000000,100000000-200000000,20000000-50000000,50000000-100000000,50000-100000,500000-1000000,5000000-10000000,oc_windows,oc_linux,oc_mac
1406,214360,Tower Wars,0.692665,1,SuperVillain Studios,SuperVillain Studios,windows;mac,0,Single-player;Multi-player;Online Multi-Player...,Action;Indie;Strategy,...,0,0,0,0,0,0,0,1,0,1
1510,221540,DG2: Defense Grid 2,0.789216,1,Hidden Path Entertainment,505 Games,windows,0,Single-player;Multi-player;Co-op;Shared/Split ...,Indie;Strategy,...,0,0,0,0,0,0,0,1,0,0
1670,233310,Avadon 2: The Corruption,0.748088,1,Spiderweb Software,Spiderweb Software,windows;mac,0,Single-player;Steam Achievements,Indie;RPG;Strategy,...,0,0,0,0,1,0,0,1,0,1
2100,257030,Project Nimbus,0.927022,1,GameCrafterTeam,KISS ltd;GameTomo,windows;mac,0,Single-player;Steam Trading Cards;Partial Cont...,Action;Indie,...,0,0,0,0,0,0,0,1,0,1


In [347]:
users = pd.DataFrame([
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
], columns=dummy_column_values['genres'] + dummy_column_values['categories'])
users

Unnamed: 0,Strategy,Software Training,Photo Editing,Gore,Violent,Action,Education,RPG,Tutorial,Free to Play,...,Commentary available,SteamVR Collectibles,Partial Controller Support,Steam Workshop,VR Support,Online Multi-Player,MMO,Valve Anti-Cheat enabled,Steam Leaderboards,Full controller support
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [348]:
def process_user_preferences(user, indices, val=1):
    for index in indices:
        genres = set()
        categories = set()
        [genres.add(i) for i in df['genres'].iloc[index].split(';')]
        [categories.add(i) for i in df['categories'].iloc[index].split(';')]
        for genre in genres:
            user[genre] += val
        for category in categories:
            user[category] += val

In [349]:
process_user_preferences(users.iloc[0], liked_game_indicies, val=1)
process_user_preferences(users.iloc[0], disliked_game_indices, val=-1)


In [350]:
users

Unnamed: 0,Strategy,Software Training,Photo Editing,Gore,Violent,Action,Education,RPG,Tutorial,Free to Play,...,Commentary available,SteamVR Collectibles,Partial Controller Support,Steam Workshop,VR Support,Online Multi-Player,MMO,Valve Anti-Cheat enabled,Steam Leaderboards,Full controller support
0,-2,0,0,0,0,4,0,-1,0,1,...,0,1,-1,-1,0,2,0,5,-2,-1


In [351]:
softmax = np.exp(users.values) / np.sum(np.exp(users.values))
softmax

array([[4.22085863e-04, 3.11881612e-03, 3.11881612e-03, 3.11881612e-03,
        3.11881612e-03, 1.70281590e-01, 3.11881612e-03, 1.14734833e-03,
        3.11881612e-03, 8.47782118e-03, 3.11881612e-03, 3.11881612e-03,
        3.11881612e-03, 3.11881612e-03, 3.11881612e-03, 3.11881612e-03,
        3.11881612e-03, 3.11881612e-03, 3.11881612e-03, 5.71231098e-05,
        3.11881612e-03, 3.11881612e-03, 3.11881612e-03, 3.11881612e-03,
        3.11881612e-03, 3.11881612e-03, 3.11881612e-03, 3.11881612e-03,
        3.11881612e-03, 2.30451073e-02, 3.11881612e-03, 3.11881612e-03,
        8.47782118e-03, 1.70281590e-01, 1.14734833e-03, 1.14734833e-03,
        4.22085863e-04, 3.11881612e-03, 1.14734833e-03, 8.47782118e-03,
        4.22085863e-04, 1.14734833e-03, 1.14734833e-03, 3.11881612e-03,
        3.11881612e-03, 3.11881612e-03, 1.14734833e-03, 4.22085863e-04,
        3.11881612e-03, 8.47782118e-03, 1.14734833e-03, 1.14734833e-03,
        3.11881612e-03, 2.30451073e-02, 3.11881612e-03, 4.628733

In [352]:
genres_df = df[[col.lower() for col in list(dummy_column_values['genres']) + list(dummy_column_values['categories'])]]
genres_df.head(3)

Unnamed: 0,strategy,software training,photo editing,gore,violent,action,education,rpg,tutorial,free to play,...,commentary available,steamvr collectibles,partial controller support,steam workshop,vr support,online multi-player,mmo,valve anti-cheat enabled,steam leaderboards,full controller support
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [356]:
def reccomend(df, recommendation_df, users, k=5):
    res = (recommendation_df.values @ users.values.T).reshape(len(recommendation_df),)
    res_indices = res.argsort(axis=0)[-k:][::-1]
    return df_from_indices(df, res_indices)

In [359]:
reccomend(df, genres_df, pd.DataFrame(users), k=10)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,...,50000-100000,20000000-50000000,2000000-5000000,200000-500000,20000-50000,100000-200000,10000000-20000000,oc_windows,oc_linux,oc_mac
0,10,Counter-Strike,0.152978,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,1,1,1,1
1,20,Team Fortress Classic,0.080251,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,0,1,1,1
3,40,Deathmatch Classic,0.179561,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,...,0,0,0,0,0,0,0,1,1,1
5,60,Ricochet,0.152978,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Valve Anti-Ch...,Action,...,0,0,0,0,0,0,0,1,1,1
13,320,Half-Life 2: Deathmatch,0.336176,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled;Includes...,Action,...,0,0,0,0,0,0,1,1,1,1
1506,221100,DayZ,0.982571,1,Bohemia Interactive,Bohemia Interactive,windows,18,Multi-player;Online Multi-Player;Steam Worksho...,Action;Adventure;Massively Multiplayer,...,0,0,1,0,0,0,0,1,0,0
2,30,Day of Defeat,0.26721,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,0,0,0,0,0,0,0,1,1,1
6,70,Half-Life,0.062194,1,Valve,Valve,windows;mac;linux,0,Single-player;Multi-player;Online Multi-Player...,Action,...,0,0,0,0,0,0,0,1,1,1
15,360,Half-Life Deathmatch: Source,0.404639,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,...,0,0,0,0,0,0,0,1,1,1
23499,895150,Endless Battle,0.985078,1,Netdragon Websoft Inc,Netdragon Websoft Inc,windows,0,Multi-player;Online Multi-Player;In-App Purchases,Action;Casual;Free to Play;Massively Multiplayer,...,1,0,0,0,0,0,0,1,0,0
