In [1]:
import numpy as np
import pandas as pd
import data_prep 
import TensorRecommend
import os

# Resolve the raw-data folder relative to the current working directory so the
# notebook runs unchanged on any machine.
base_path = os.getcwd()
raw_data_dir = os.path.join(base_path, "..", "data", "raw")
games_info_path = os.path.join(raw_data_dir, "games_detailed_info2025.csv")
bgg_reviews_path = os.path.join(raw_data_dir, "bgg-26m-reviews.csv")

# Rename the game-info key column to 'ID' so it matches the reviews dataset for merging.
games_info = pd.read_csv(games_info_path).rename(columns={'id': 'ID'})
bgg_reviews = pd.read_csv(bgg_reviews_path)

data_preprocessor = data_prep.DataPreprocessor(games_info, bgg_reviews)

# Discretize the two numeric columns one after the other; the second call
# works on the output of the first, and the original columns are dropped.
# NOTE(review): presumably each call adds a '<column>_bin' column — confirm in data_prep.
game_info_cut = data_preprocessor.discretize_column(
    games_info,
    'usersrated',
    n_bins=10,
    percent=0.01,
    drop_original=True,
)
game_info_final = data_preprocessor.discretize_column(
    game_info_cut,
    'yearpublished',
    n_bins=10,
    drop_original=True,
)


In [2]:
#Merging datasets and adding features to the final dataset
# A second preprocessor is built around the discretized game table (game_info_final)
# so the merge can pick up the '*_bin' columns requested below.
dataprep = data_prep.DataPreprocessor(game_info_final, bgg_reviews)
# NOTE(review): 'featires_to_add' looks like a misspelling of 'features_to_add'; the keyword
# has to match the parameter name declared in data_prep — confirm there before renaming.
data = dataprep.merge_datasets(featires_to_add = ['yearpublished_bin', 'usersrated_bin', 'minplayers', 'maxplayers'])
#dropping columns that are not needed for training and dropping rows with missing values
data.drop(columns=['Unnamed: 0', 'comment', 'name'], inplace=True)
data.dropna(inplace=True)

In [3]:
# Sanity check after cleaning: row count, column dtypes and memory footprint.
data.info()

<class 'pandas.DataFrame'>
Index: 9365559 entries, 0 to 9365581
Data columns (total 7 columns):
 #   Column             Dtype  
---  ------             -----  
 0   user               str    
 1   rating             float64
 2   ID                 int64  
 3   yearpublished_bin  int64  
 4   usersrated_bin     int64  
 5   minplayers         int64  
 6   maxplayers         int64  
dtypes: float64(1), int64(5), str(1)
memory usage: 571.6 MB


# Demo model

In [4]:
"""
data_entries - number of rows in the final dataset
num_users - number of unique users in the dataset
num_games - number of unique games in the dataset
num_features - number of unique values for each feature in the dataset
user2idx - dictionary mapping user to index value in tensor
game2idx - dictionary mapping game to index value in tensor
feature2idx - list of dictionaries mapping feature values to index values in tensor for each feature
"""

'\ndata_entries - number of rows in the final dataset\nnum_users - number of unique users in the dataset\nnum_games - number of unique games in the dataset\nnum_features - number of unique values for each feature in the dataset\nuser2idx - dictionary mapping user to index value in tensor\ngame2idx - dictionary mapping game to index value in tensor\nfeature2idx - list of dictionaries mapping feature values to index values in tensor for each feature\n'

In [5]:
# Only two of the four merged feature columns are passed as features for this demo model.
data_entries, num_users, num_games, num_features, user2idx, game2idx, feature2idx = dataprep.data_to_tuple(data, feature_col=['usersrated_bin', 'minplayers'])

In [6]:
# NOTE(review): test_size=0.6 presumably holds out 60% of the entries for testing,
# leaving only 40% for training — confirm this split is intended.
data_train, data_test = dataprep.train_test_split(data_entries, test_size=0.6)

Training time!

In [7]:
# Hyperparameters for the first training run of the demo model.
epochs = 1 #number of epochs for training
k = 5 #number of latent factors
lambda_ = 0.1 #regularization parameter
eta = 0.1 #learning rate
# The model is built from the training entries only; data_test is not passed in.
tessst = TensorRecommend.TensorRecommend(k, lambda_=lambda_, eta=eta, data_entries=data_train, num_users=num_users, 
                                             num_items=num_games, num_features=num_features)


# t is the 1-based epoch number and eta is passed again as initial_lr.
# NOTE(review): presumably t/initial_lr drive a learning-rate schedule and max_norm
# bounds the update size — confirm in TensorRecommend.train_epoch.
for epoch in range(epochs):
    tessst.train_epoch(t=epoch+1, initial_lr=eta, max_norm=5.0)


Epoch Loss: 21165880.3685, NR Loss: 20870446.1966, avg error: 2.3603


In [8]:
# Save the model so that this works on any computer without needing to retrain
base_path = os.getcwd()
# The model folder lives one level above the working directory.
model_path = os.path.join(os.path.dirname(base_path), "model_26_2")
# exist_ok=True: re-running this cell does not fail when the folder already exists.
os.makedirs(model_path, exist_ok=True)
tessst.save_model(path=model_path)

Additional training for 5 more epochs to see if it improves the results (note: the cell below currently sets `epochs = 1`, so only one extra epoch is run)

In [9]:
# Same hyperparameters as the first run, re-declared so this cell can be run on its own
# (it still needs base_path, data_train and the num_* values from earlier cells).
epochs = 1 #number of epochs for training
k = 5 #number of latent factors
lambda_ = 0.1 #regularization parameter
eta = 0.1 #learning rate
# Same folder the previous cell saved the model to.
path = os.path.join(os.path.dirname(base_path), "model_26_2")
# NOTE(review): loading=True together with path presumably restores the saved
# parameters instead of initialising new ones — confirm in TensorRecommend.
tessst = TensorRecommend.TensorRecommend(k, lambda_=lambda_, eta=eta, data_entries=data_train, num_users=num_users, 
                                             num_items=num_games, num_features=num_features, loading=True, path=path)
# NOTE(review): t starts again at 1 here; if train_epoch derives the learning rate
# from t, the schedule restarts instead of continuing from the first run — confirm.
for epoch in range(epochs):
    tessst.train_epoch(t=epoch+1, initial_lr=eta, max_norm=5.0)

Epoch Loss: 20664840.9345, NR Loss: 20369550.5990, avg error: 2.3318


In [10]:
#save the model again after additional training
# Same folder as the first save (relies on base_path from an earlier cell).
model_path = os.path.join(os.path.dirname(base_path), "model_26_2")
os.makedirs(model_path, exist_ok=True)
tessst.save_model(path=model_path)

In [11]:
def filter_test_set(train_entries, test_entries):
    """Keep only the test records whose user and item both occur in the train set.

    This is important because the model cannot give recommendations for users
    or games it hasn't seen during training.

    Args:
        train_entries: iterable of records whose first two fields are the user
            index and the item index; any remaining fields are ignored.
        test_entries: iterable of indexable records laid out the same way.

    Returns:
        list: the records of ``test_entries``, in their original order, whose
        user index and item index both appear somewhere in ``train_entries``.
    """
    train_users = set()
    train_items = set()

    # Collect every user and item index that occurs in the train set.
    for u_idx, m_idx, *_ in train_entries:
        train_users.add(u_idx)
        train_items.add(m_idx)

    # Filter the test set down to records with a known user and a known item.
    return [
        entry for entry in test_entries
        if entry[0] in train_users and entry[1] in train_items
    ]


# Drop test records that involve a user or a game absent from the training split.
filtered_test = filter_test_set(data_train, data_test)

In [12]:
# NOTE(review): epochs is assigned here but not used by any later cell shown in this notebook.
epochs = 5
k = 5
lambda_ = 0.1
# Same folder the training cells saved the model to.
path_to_model = os.path.join(os.path.dirname(base_path), "model_26_2")
# NOTE(review): loading=True together with path presumably restores the saved
# parameters, so evaluation uses the model as stored on disk — confirm in TensorRecommend.
tessst = TensorRecommend.TensorRecommend(k, lambda_=lambda_, eta=0.1, data_entries=data_train, num_users=num_users, 
                                             num_items=num_games, num_features=num_features, loading=True, path=path_to_model)

In [13]:
import numpy as np
#Evaluate the model using RMSE on the filtered test set
def evaluate_rmse(model, test_entries):
    """Compute the root-mean-square error of ``model`` on ``test_entries``.

    Args:
        model: object exposing ``predict(entry)`` that returns a numeric
            prediction for one record.
        test_entries: iterable of indexable records whose last field is the
            observed rating.

    Returns:
        The square root of the mean squared difference between the observed
        rating and the prediction, or NaN when ``test_entries`` is empty.
    """
    squared_errors = [
        (entry[-1] - model.predict(entry)) ** 2
        for entry in test_entries
    ]

    # np.mean([]) also yields NaN but emits a RuntimeWarning; return it explicitly.
    if not squared_errors:
        return np.float64(np.nan)

    return np.sqrt(np.mean(squared_errors))

In [14]:
import random

# Fixed seed so the same subset is drawn on every run.
random.seed(42)
# evaluation on 20% of the filtered test set for faster evaluation
n_20 = int(0.2 * len(filtered_test))
# random.sample draws without replacement, so no record is evaluated twice.
subset_20 = random.sample(filtered_test, n_20)

print(evaluate_rmse(tessst, subset_20))

2.328291784557648


In [15]:
# Raw recommendations for the user at index 0: (game index, score) pairs.
tessst.top_games(tessst.U[0])

[(23, np.float64(9.904908013108011)),
 (38, np.float64(9.754997343970532)),
 (174, np.float64(9.721251379169761)),
 (144, np.float64(9.718109477804626)),
 (30, np.float64(9.690383559952249)),
 (78, np.float64(9.624102480846275)),
 (42, np.float64(9.62403863121237)),
 (37, np.float64(9.495872843909197)),
 (89, np.float64(9.458724056353415)),
 (121, np.float64(9.422517311437744))]

In [16]:
def get_game_name(m_idx, idx2game, game_info_df):
    """Return the name of the game stored at model index ``m_idx``.

    Args:
        m_idx: game index as used by the model.
        idx2game: mapping from game index to the game's ``ID`` value.
        game_info_df: DataFrame with at least the columns ``"ID"`` and ``"name"``.

    Returns:
        The ``name`` of the first row whose ``ID`` matches, or ``"Unknown"``
        when the index is missing from ``idx2game`` or no row has that ``ID``.
    """
    try:
        game_id = idx2game[m_idx]
    except (KeyError, IndexError):
        # An unmapped index gets the same fallback as an ID with no matching row.
        return "Unknown"

    matching_names = game_info_df.loc[game_info_df["ID"] == game_id, "name"]
    if matching_names.empty:
        return "Unknown"
    return matching_names.iloc[0]

In [17]:
# Invert game2idx so a game index can be mapped back to its game ID.
idx2game = {v: k for k, v in game2idx.items()}

In [18]:
def get_top_games_for_user(user_idx, model, idx2game, game_info_final, top_n=10):
    """Return the top recommended games for one user as ``(name, score)`` pairs.

    Args:
        user_idx: index of the user in ``model.U``.
        model: object exposing ``U`` (indexable by user index) and
            ``top_games(user_vec, top_n=...)`` returning ``(game index, score)`` pairs.
        idx2game: mapping from game index to game ID, forwarded to ``get_game_name``.
        game_info_final: game-info DataFrame, forwarded to ``get_game_name``.
        top_n: number of recommendations to request from the model (default 10).

    Returns:
        list: ``(game name, score)`` tuples in the order returned by the model.
    """
    user_vec = model.U[user_idx]
    return [
        (get_game_name(m_idx, idx2game, game_info_final), score)
        for m_idx, score in model.top_games(user_vec, top_n=top_n)
    ]

In [19]:
# Same recommendations as the raw top_games output for user index 0, with game indices replaced by names.
get_top_games_for_user(0, tessst, idx2game, game_info_final)

[('Gloomhaven', np.float64(9.904908013108011)),
 ('Brass: Birmingham', np.float64(9.754997343970532)),
 ('War of the Ring: Second Edition', np.float64(9.721251379169761)),
 ('Twilight Imperium: Fourth Edition', np.float64(9.718109477804626)),
 ('Pandemic Legacy: Season 1', np.float64(9.690383559952249)),
 ('Gloomhaven: Jaws of the Lion', np.float64(9.624102480846275)),
 ('Ark Nova', np.float64(9.62403863121237)),
 ('Dune: Imperium', np.float64(9.495872843909197)),
 ('Star Wars: Rebellion', np.float64(9.458724056353415)),
 ('Gaia Project', np.float64(9.422517311437744))]