# Recommendation System using Item-based Collaborative Filtering
This is a recommendation system that suggests games to users based on their similarities with other items on the list

In [1]:
import pandas as pd 
import numpy as np
import scipy as sp 
from sklearn.metrics.pairwise import cosine_similarity
import operator
import fastparquet as fp 
import pyarrow as pa 
import pyarrow.parquet as pq

In [2]:
df_item_rec = pd.read_parquet('../data/processed/df_item_rec.parquet')
df_item_rec.head()

Unnamed: 0_level_0,title,genres
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10.0,Counter-Strike,Action
20.0,Team Fortress Classic,Action
30.0,Day of Defeat,Action
40.0,Deathmatch Classic,Action
50.0,Half-Life: Opposing Force,Action


In [3]:
#create dummies 
df_item_rec = pd.get_dummies(df_item_rec, columns=['genres'], prefix='')

df_item_rec = df_item_rec.groupby(['id', 'title']).sum().reset_index()

df_item_rec.head()

Unnamed: 0,id,title,_Accounting,_Action,_Adventure,_Animation &amp; Modeling,_Audio Production,_Casual,_Design &amp; Illustration,_Early Access,...,_Photo Editing,_RPG,_Racing,_Simulation,_Software Training,_Sports,_Strategy,_Utilities,_Video Production,_Web Publishing
0,10.0,Counter-Strike,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20.0,Team Fortress Classic,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30.0,Day of Defeat,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40.0,Deathmatch Classic,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,Half-Life: Opposing Force,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
similarities = cosine_similarity(df_item_rec.iloc[:,3:])
print(similarities)

[[1. 1. 1. ... 0. 1. 1.]
 [1. 1. 1. ... 0. 1. 1.]
 [1. 1. 1. ... 0. 1. 1.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [1. 1. 1. ... 0. 1. 1.]
 [1. 1. 1. ... 0. 1. 1.]]


In [5]:
similarities.shape

(27422, 27422)

In [6]:
def game_recommend(id):
    
    id = int(id)
    # Create new dataset filtered by the given ID. 
    selected_game = df_item_rec[df_item_rec['id'] == id]
    # return error in case id  is not found on the dataframe
    if selected_game.empty:
        return "Game Id is not in the dataframe."
    
    # Calculate similarity and add other games in dataframe
    similarity_scores = similarities[df_item_rec[df_item_rec['id'] == id].index[0]]
    
    # Sort games by similarity, excluding the game itself from recommendation list. List top 5
    similar_games = similarity_scores.argsort()[::-1][1:6]
    
    # Add names of the games
    recommended_games = df_item_rec.iloc[similar_games]['title']
    
    return recommended_games

In [7]:
game_recommend(772540)

12658                Tempest: Pirate Action RPG
12701                  GGXrd System Voice - MAY
12700             GGXrd System Voice - KY KISKE
12699           GGXrd System Voice - SOL BADGUY
12698    GGXrd Character Unlock - LEO WHITEFANG
Name: title, dtype: object

In [8]:
df_item_rec

Unnamed: 0,id,title,_Accounting,_Action,_Adventure,_Animation &amp; Modeling,_Audio Production,_Casual,_Design &amp; Illustration,_Early Access,...,_Photo Editing,_RPG,_Racing,_Simulation,_Software Training,_Sports,_Strategy,_Utilities,_Video Production,_Web Publishing
0,10.0,Counter-Strike,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20.0,Team Fortress Classic,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30.0,Day of Defeat,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40.0,Deathmatch Classic,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,Half-Life: Opposing Force,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27417,901805.0,Saints Row: The Third Season Pass DLC Pack,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27418,2028055.0,Tom Clancy's Ghost Recon Future Soldier - Seas...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27419,2028056.0,Worms Revolution Season Pass,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
27420,2028103.0,Assassin’s Creed® III Season Pass,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In order to optimize memory usage for the API in render, we proceed to filter dataset

In [9]:
#Count rows
rows_count= len(df_item_rec)

#Divide dataframe
rows_fraction= rows_count// 10
 
#Selecciono la mitad superior
df_rec= df_item_rec.iloc[:rows_fraction]

In [10]:
df_rec.shape

(2742, 24)

In [11]:
similarities_render = cosine_similarity(df_rec.iloc[:,3:])

In [17]:
df_rec

Unnamed: 0,id,title,_Accounting,_Action,_Adventure,_Animation &amp; Modeling,_Audio Production,_Casual,_Design &amp; Illustration,_Early Access,...,_Photo Editing,_RPG,_Racing,_Simulation,_Software Training,_Sports,_Strategy,_Utilities,_Video Production,_Web Publishing
0,10.0,Counter-Strike,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20.0,Team Fortress Classic,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30.0,Day of Defeat,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,40.0,Deathmatch Classic,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,50.0,Half-Life: Opposing Force,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2737,218040.0,Democracy 2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2738,218060.0,BIT.TRIP Presents... Runner2: Future Legend of...,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2739,218065.0,Runner2 - Good Friends Character Pack,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2740,218090.0,Unity of Command: Stalingrad Campaign,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


load dataset

In [12]:
df_rec.to_parquet('../data/df_rec.parquet')

We now can create a new function called 'game_recommend', that takes the id of a game and  returns an array of recommended games. The algorithm to generate recommendations is as follows:

In [13]:
def game_recommend(id: int):
    
    # Check if game 'id' exists in df_rec
    game = df_rec[df_rec['id'] == id]

    if game.empty:
        return("Game id '{id}' not registered.")
    
    # Find index of given game
    idx = game.index[0]

    # Take random sample of dataframe
    sample_size = 2000  # Define sample size
    df_sample = df_rec.sample(n=sample_size, random_state=42)  

    # Check similarity between game and sample
    sim_scores = cosine_similarity([df_rec.iloc[idx, 3:]], df_sample.iloc[:, 3:])

    # Get similarity score
    sim_scores = sim_scores[0]

    # Sort games based on similarity scores (descending order)
    similar_games = [(i, sim_scores[i]) for i in range(len(sim_scores)) if i != idx]
    similar_games = sorted(similar_games, key=lambda x: x[1], reverse=True)

    # Get top 5 most similar games
    similar_game_indices = [i[0] for i in similar_games[:5]]

    # List  of recommended games
    similar_game_names = df_sample['title'].iloc[similar_game_indices].tolist()

    return {"similar_games": similar_game_names}

In [14]:
game_recommend(3310)

{'similar_games': ['Rocksmith - Foster the People - Pumped Up Kicks',
  'Rocksmith - Albert King with Stevie Ray Vaughan - Born Under a Bad Sign',
  'Chime',
  'Samantha Swift and the Hidden Roses of Athena',
  'Elizabeth Find M.D. - Diagnosis Mystery - Season 2']}

In [18]:
game_recommend(218060)

{'similar_games': ['Magicka: Peculiar Gadgets Item Pack',
  'HOARD: Flame-Broiled SANDwich',
  'Capsized',
  'Steel Storm: Burning Retribution',
  'Sugar Cube: Bittersweet Factory']}

In [19]:
game_recommend(50)

{'similar_games': ['Magicka: Peculiar Gadgets Item Pack',
  'HOARD: Flame-Broiled SANDwich',
  'Capsized',
  'Steel Storm: Burning Retribution',
  'Sugar Cube: Bittersweet Factory']}

In [20]:
game_recommend(24420)

{'similar_games': ['Galcon Fusion',
  "Bad Rats: the Rats' Revenge",
  'Mevo and The Grooveriders',
  'Frozen Synapse',
  'iBomber Defense']}

# Recommendation System using User-based Collaborative Filtering
This is a recommendation system that suggests games to users based on their preferences and the preferences of similar users. 

In [2]:
import pandas as pd
import numpy as np

import scipy as sp
from sklearn.metrics.pairwise import cosine_similarity

import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
df = pd.read_csv('../data/processed/df_recommendation.csv')
df

Unnamed: 0,user_id,rating,item_name
0,76561197970982479,5,Killing Floor
1,76561197970982479,5,Zeno Clash
2,76561197970982479,5,Metro 2033
3,js41637,5,Barbie™ Dreamhouse Party™
4,js41637,5,Euro Truck Simulator 2
...,...,...,...
49038,llDracuwulf,1,Counter-Strike: Global Offensive
49039,76561198223837952,5,Enclave
49040,76561198229845636,3,Counter-Strike: Global Offensive
49041,76561198232478272,5,Counter-Strike: Global Offensive


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49043 entries, 0 to 49042
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   user_id    49043 non-null  object
 1   rating     49043 non-null  int64 
 2   item_name  43278 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.1+ MB


## model training
Model trained with user-based collaborative filtering to generate recommendations.

In [4]:
piv = df.pivot_table(index=['user_id'], columns=['item_name'], values='rating')
piv

item_name,0RBITALIS,"10,000,000",100% Orange Juice,1001 Spikes,12 Labours of Hercules,12 Labours of Hercules II: The Cretan Bull,123 Slaughter Me Street,140,16 Bit Arena,200% Mixed Juice!,...,ibb & obb,inMomentum,liteCam Game: 100 FPS Game Capture,oO,planetarian ~the reverie of a little planet~,resident evil 4 / biohazard 4,sZone-Online,the static speaks my name,theHunter,theHunter: Primal
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,,,,,,,,,,,...,,,,,,,,,,
--ace--,,,,,,,,,,,...,,,,,,,,,,
--ionex--,,,,,,,,,,,...,,,,,,,,,,
-2SV-vuLB-Kg,,,,,,,,,,,...,,,,,,,,,,
-Azsael-,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zvanik,,,,,,,,,,,...,,,,,,,,,,
zwanzigdrei,,,,,,,,,,,...,,,,,,,,,,
zy0705,,,,,,,,,,,...,,,,,,,,,,
zynxgameth,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Normalize 'piv'using min-max scaler
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# Delete columns containing only zeros or no rating at all, fill voids with  0 and transpose
piv_norm.fillna(0, inplace=True)
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]
piv_norm

user_id,-GM-Dragon,-Mad-,-_PussyDestroyer_-,00000000000000000001227,00454211432342,00True,01001000-01101001,01189958889189157253,022899,04061993,...,zimbalor,zombi_anon,zombiehackerbrah,zombieman182,zomgieee,zp3413,zrustz16,zuzuga2003,zv_odd,zvanik
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0RBITALIS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100% Orange Juice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Spikes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Labours of Hercules,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
resident evil 4 / biohazard 4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sZone-Online,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the static speaks my name,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
theHunter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)
piv_sparse

<2640x6512 sparse matrix of type '<class 'numpy.float64'>'
	with 23658 stored elements in Compressed Sparse Row format>

In [7]:
item_similarity = cosine_similarity(piv_sparse)
user_similarity = cosine_similarity(piv_sparse.T)

In [8]:
#item similarity dataframe
item_sim_df = pd.DataFrame(item_similarity, index = piv_norm.index, columns = piv_norm.index)
#user similarity dataframe
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)

## def recommendation_user(user_id:str)
takes in a user ID as input and returns a list of the top 5 recommended game names for that user.

Methodology:
- Check if the user exists in the data. If not, return an empty list.
- Get the top 5 most similar users to the target user based on the user similarity dataframe.
- Get the items rated by the similar users from the user-item rating matrix.
- Remove the items already rated by the target user.
- Fill missing values with 0.
- Calculate the average rating for each item across similar users.
- Sort the items by their average rating and get the top 5.
- Return the top recommended item names.

In [9]:
def recommendation_user(user_id):
  """
  Recommends items (games) to a user based on user-based collaborative filtering.

  Args:
      user_id (str): The ID of the user for whom recommendations are generated.

  Returns:
      list: A list of top 5 recommended item names (games) for the user.
      If the user ID is not found in the data, an empty list is returned.
  """
  # Check if user exists in the data
  if user_id not in piv_norm.columns:
    return ['No data availiable on user{}'.format(user_id)] 

  # Get similar users based on user_similarity dataframe
  similar_users = user_sim_df.sort_values(by=user_id, ascending=False).index[1:11]

  # Get items rated by similar users
  recommended_items = piv_norm.loc[:, similar_users].copy()

  # Remove items already rated by the target user
  if user_id in recommended_items.columns:
    recommended_items.drop(user_id, axis=1, inplace=True)

  # Fill missing values with 0
  recommended_items.fillna(0, inplace=True)

  # Calculate average rating for each item across similar users
  average_ratings = recommended_items.mean(axis=1)

  # Sort items by their average rating and get the top 5
  top_recommendations = average_ratings.sort_values(ascending=False).head(5).index.tolist()

  print(f'Top 5 recommended games for {user_id}: ')

  return top_recommendations

In [10]:
recommendation_user('zombieman182')

Top 5 recommended games for zombieman182: 


['Counter-Strike: Global Offensive',
 'Path of Exile',
 'Stranded Deep',
 'METAL GEAR RISING: REVENGEANCE',
 'Age of Empires® III: Complete Collection']

In [11]:
recommendation_user('zvanik')

Top 5 recommended games for zvanik: 


['Terraria',
 "Garry's Mod",
 'Counter-Strike: Global Offensive',
 'Arma 2: Operation Arrowhead',
 'Hand of Fate']

## Load

In [12]:
piv_norm.to_csv('../data/processed/piv_norm.csv', index=False, encoding='utf-8')
item_sim_df.to_csv('../data/processed/item_sim_df.csv', index=False, encoding='utf-8')
user_sim_df.to_csv('../data/processed/user_sim_df.csv', index=False, encoding='utf-8')

In [13]:
pq.write_table(pa.Table.from_pandas(piv_norm), '../data/piv_norm.parquet')
pq.write_table(pa.Table.from_pandas(user_sim_df), '../data/user_sim_df.parquet')
pq.write_table(pa.Table.from_pandas(item_sim_df), '../data/item_sim_df.parquet')