In [1]:
import numpy as np
import pandas as pd
import kaggle
import os

# Create the output folders. exist_ok=True makes this a no-op when they are
# already there, so the cell can be re-run safely.
os.makedirs("pickle_files", exist_ok=True)
os.makedirs("data", exist_ok=True)

# Uses the Kaggle API credentials configured on this machine
kaggle.api.authenticate()

# Dataset link: https://www.kaggle.com/datasets/fronkongames/steam-games-dataset/
kaggle.api.dataset_download_files(
    'fronkongames/steam-games-dataset', path='data', unzip=True)

# Removing unnecessary files: only the CSV is read by this notebook.
# Guarded so the notebook does not abort if the archive no longer ships the JSON.
JSON_DUMP = 'data/games.json'
if os.path.exists(JSON_DUMP):
    os.remove(JSON_DUMP)

In [2]:
# Reading the dataset (downloaded into the 'data' folder) into a DataFrame
df = pd.read_csv('data/games.csv')

# List the columns present in this dataset
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Peak CCU',
       'Required age', 'Price', 'DLC count', 'About the game',
       'Supported languages', 'Full audio languages', 'Reviews',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Positive', 'Negative', 'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies'],
      dtype='object')

In [3]:
# Adding a 'PositiveRate' column: share of positive reviews among all reviews.
# Games without any review end up as NaN (0 / 0).
df['PositiveRate'] = df['Positive'] / (df['Positive'] + df['Negative'])

# Dropping rows where any of the text fields used later is missing
df = df.dropna(subset=['About the game', 'Categories', 'Genres', 'Tags'])

# Dropping games which don't support English language.
# na=False counts a missing language list as "no English"; without it the mask
# would contain NaN and the `~` negation below would raise a TypeError.
supports_english = df['Supported languages'].str.contains('English', na=False)
df = df.drop(index=df.index[~supports_english])

In [4]:
# Optional: cap the 'About the game' text at 1800 characters to reduce RAM usage.
# Uncomment the next line to enable it.
# df['About the game'] = df['About the game'].apply(lambda x: x[:1800] if len(x) > 1800 else x)

# Reduce 'Release date' to its last four characters, i.e. the year
release_date = df['Release date']
df['Release date'] = release_date.str.slice(start=-4)

In [5]:
# Dropping labels that won't be used
labels_to_drop = ['Reviews', 'Peak CCU',
       'Required age', 'DLC count',
       'Full audio languages',
       'Header image', 'Website', 'Support url', 'Support email', 'Windows',
       'Mac', 'Linux', 'Metacritic score', 'Metacritic url', 'User score',
       'Score rank', 'Achievements', 'Recommendations',
       'Notes', 'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Screenshots', 'Movies']

# Drop every column in a single call instead of one in-place drop per label:
# the frame is rebuilt once, and a missing label raises a KeyError before
# anything has been removed, so df is never left half-trimmed.
df = df.drop(columns=labels_to_drop)

In [6]:
# Columns that remain after the cleanup
df.columns

Index(['AppID', 'Name', 'Release date', 'Estimated owners', 'Price',
       'About the game', 'Supported languages', 'Positive', 'Negative',
       'Categories', 'Genres', 'Tags', 'PositiveRate'],
      dtype='object')

In [7]:
# Keep only the games whose categories mention 'Co-op', then renumber the rows
# from 0 so that index labels match row positions
is_coop = df['Categories'].str.contains('Co-op')
df = df[is_coop].reset_index(drop=True)

In [8]:
# Import TfidfVectorizer from scikit-learn
# TF-IDF (Term Frequency - Inverse Document Frequency):
# a weighting that scores how relevant a word is to one text relative to the whole corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# Define a TF-IDF Vectorizer object. Remove all English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the 'About the game' column (learning the vocabulary and
# the inverse document frequency of each term) and, in the same call, transform
# every description into its TF-IDF vector: one row per game.
tfidf_matrix = tfidf.fit_transform(df['About the game'])

In [9]:
# Import linear_kernel from scikit-learn
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix: a square matrix where each element represents the cosine similarity between two documents.
# It's a measure of how similar they are, with a higher cosine similarity indicating greater similarity.
# NOTE: linear_kernel is a plain dot product. It equals the cosine similarity here
# only because TfidfVectorizer L2-normalises every row by default (norm='l2');
# if the vectorizer settings change, switch to cosine_similarity instead.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [10]:
# Construct map of game titles -> row numbers of df
indices = pd.Series(df.index, index=df['Name'])

# Series.drop_duplicates() compares the *values* (the row numbers, which are
# already unique after reset_index), so it never removed repeated titles.
# Deduplicate on the index instead, keeping the first game for each title.
indices = indices[~indices.index.duplicated(keep='first')]

# Saving data to files

In [11]:
import pickle

# Saving the computed artifacts lets other code reuse them without downloading
# the dataset or recomputing the similarity matrix again.


def save_pickle(obj, filename):
    """Serialize `obj` with pickle into the file at `filename` (overwriting it)."""
    with open(filename, 'wb') as pickle_file:
        pickle.dump(obj, pickle_file)


# One file per artifact
COSINE_SIM_PICKLE = 'pickle_files/cosine_similarity_matrix.pkl'
DF_PICKLE = 'pickle_files/df.pkl'
INDICES_PICKLE = 'pickle_files/indices.pkl'

save_pickle(cosine_sim, COSINE_SIM_PICKLE)
save_pickle(df, DF_PICKLE)
save_pickle(indices, INDICES_PICKLE)

# Previously a single name was rebound for each file; keep its final value in
# case another cell still reads it.
PICKLE_FILENAME = INDICES_PICKLE

# To load an artifact back, e.g. the DataFrame:
#
# with open(DF_PICKLE, 'rb') as pickle_file:
#     df_loaded = pickle.load(pickle_file)
#
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code stored in the file.


In [12]:
# Example on how to implement searching: titles containing a given substring.
# regex=False matches the text literally; na=False skips games without a name
# (with str.find, a missing name compared as `NaN != -1` and always matched).
indices[indices.index.str.contains("raf", regex=False, na=False)]

Name
Playcraft                                      238
FortressCraft : Chapter 1                      245
Exocraft                                       290
Kingspray Graffiti VR                          805
Crafty                                        1015
FortressCraft Evolved!                        1237
Guncraft                                      1288
Tesla vs Lovecraft                            1578
Lumencraft                                    1947
ConflictCraft                                 1973
Minecraft Dungeons                            2208
Dungeoncraft                                  2473
Craftopia                                     2943
OceanCraft                                    3012
Robocraft                                     3353
Solace Crafting                               3406
Milkcraft                                     3559
Witchcraft                                    3711
RogueCraft Squadron                           3858
World of Aircraft: Glider 

In [13]:
# Example of function that takes in game title as input and outputs the most similar games
def get_recommendations(name, cosine_sim=cosine_sim, top_n=10):
    """Return the games whose descriptions are most similar to the game `name`.

    Parameters
    ----------
    name : str
        Exact game title, looked up in the `indices` title -> row map.
    cosine_sim : 2-D array, optional
        Pairwise similarity matrix; row/column i refers to row i of `df`.
    top_n : int, optional
        Maximum number of games to return (default 10).

    Returns
    -------
    pandas.DataFrame or None
        Rows of `df` ordered from most to least similar, never including the
        queried game itself; None when `name` is not a known title.
    """
    if name not in indices:
        return None

    # Get the row number of the game that matches the title
    idx = indices[name]

    # A title shared by several games yields a Series: use the first match.
    # .iloc is explicit positional access; plain idx[0] on a title-indexed
    # Series relies on a deprecated positional fallback.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Similarity of every game with the requested one
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank by descending similarity. A stable sort on the negated scores keeps
    # tied games in row order, as the previous sorted(..., reverse=True) did.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the queried game by position rather than assuming it ranks first:
    # a game with an identical description (or an all-zero vector) can tie or
    # outrank it.
    ranked = ranked[ranked != idx]

    # Return the top_n most similar games
    return df.iloc[ranked[:max(top_n, 0)]]

In [14]:
# Example query: the games whose descriptions are most similar to this title
get_recommendations("Inside the Backrooms")

Unnamed: 0,AppID,Name,Release date,Estimated owners,Price,About the game,Supported languages,Positive,Negative,Categories,Genres,Tags,PositiveRate
3045,1889640,The Backrooms: Survival,2022,0 - 20000,6.99,Inspired by the popular internet 'creepypasta'...,['English'],446,65,"Single-player,Multi-player,PvP,Online PvP,Co-o...","Action,Adventure,Indie,Massively Multiplayer,S...","Early Access,Horror,Multiplayer,Survival Horro...",0.872798
2609,2006440,Secret Backrooms,2022,0 - 20000,2.99,Secret Backrooms A Different Approach To The B...,['English'],6,1,"Single-player,Multi-player,Co-op,LAN Co-op","Action,Indie","Action,3D,Cinematic,First-Person,Dark,Horror,C...",0.857143
5653,2264460,UnderBackrooms,2023,0 - 20000,3.99,UnderBackrooms is an independent horror game t...,['English'],9,11,"Single-player,Multi-player,Co-op,Online Co-op","Adventure,Indie,Early Access","Adventure,Indie,Early Access,Gore,Violent,Mult...",0.45
5722,2283870,The Backrooms Experiment,2023,0 - 20000,7.99,The Backrooms Experiment The Backrooms Experie...,['English'],4,2,"Single-player,Multi-player,Co-op,Online Co-op,...","Adventure,Indie,Early Access","Adventure,Walking Simulator,3D,First-Person,Re...",0.666667
5538,1879030,Return the Backrooms,2022,0 - 20000,3.74,Return the Backrooms is a co-op horror game wi...,"['English', 'French', 'Italian', 'German', 'Sp...",13,0,"Single-player,Multi-player,Co-op,Online Co-op","Indie,Early Access","Adventure,FPS,3D,First-Person,Psychedelic,Horr...",1.0
5369,1943950,Escape the Backrooms,2022,20000 - 50000,8.99,Explore the seemingly infinite expanse of eeri...,['English'],625,223,"Single-player,Multi-player,Co-op,Online Co-op,...","Indie,Early Access","Horror,Multiplayer,Adventure,Action,First-Pers...",0.737028
5487,2141730,Backrooms: Escape Together,2022,0 - 20000,3.99,"As a team of researchers, you are sent into th...",['English'],109,18,"Single-player,Multi-player,Co-op,Online Co-op","Action,Adventure,Casual,Indie,Simulation,Early...","Mystery,Psychological Horror,Survival Horror,M...",0.858268
2865,1985930,The Backrooms 1998 - Found Footage Survival Ho...,2022,20000 - 50000,9.99,TRIGGER WARNING: This game may be disturbing t...,['English'],303,50,"Single-player,Multi-player,Co-op","Action,Adventure,Indie,Simulation,Early Access","Psychological Horror,Early Access,Survival Hor...",0.858357
5848,2430410,Myth,2023,0 - 20000,5.59,DESCRIPTION If you enjoy games centered around...,"['English', 'Thai']",9,0,"Single-player,Multi-player,Co-op,Online Co-op,...","Action,Adventure","Action,Adventure,Puzzle,Exploration,3D,First-P...",1.0
5648,2251870,The Backrooms - Nightmare Dimension,2023,0 - 20000,6.99,"After long research, the scientists from SCP F...",['English'],0,3,"Single-player,Multi-player,Co-op,Online Co-op","Action,Indie","Horror,Online Co-Op,Multiplayer,Psychological ...",0.0
