# Recommender System for Board Games from [BoardGameGeek.com](https://boardgamegeek.com/)



1.   Iatrou Manos
2.   Papageorgiou Vasileios
3. Sykianakis Xaralambos




# Dataset Description



*   Games File: Game features like game complexity, category etc
*   User Ratings File: User ratings of board games
*   Mechanics File: More detailed info on game characteristics
*   Themes File: More detailed info on game characteristics



In [None]:
# Install the packages

!pip install alibi
!pip install umap-learn
!pip install optuna
!pip install anchor-exp
!pip install dice_ml
!pip install shap

In [2]:
import warnings
warnings.filterwarnings("ignore", message=".*colsample_bytree.*")
warnings.filterwarnings("ignore", message=".*subsample.*")
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

import pandas as pd
import numpy as np
import os
import glob
from pathlib import Path
from typing import List
import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping, log_evaluation
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cluster import DBSCAN
from sklearn.ensemble import RandomForestClassifier
import shap
import umap
import optuna
from anchor import anchor_tabular
from alibi.explainers import AnchorTabular
import dice_ml
import matplotlib.pyplot as plt


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
folder_path = Path('/content/drive/MyDrive/Colab Notebooks/Advanced Customer Analytics/Interpretable Predictions/Recommender_Files/datasets')
#folder_path = r'C:\Users\maniat\OneDrive - Lyse AS\Desktop\personal\MSc\Customer Analytics\Recommender Classification\Recommender_Files'

In [5]:
def get_description(df: pd.DataFrame):
    """
     Takes a dataframe as argument and collapses many
    binary one-hot encoded column names to text
    """
    cols = df.columns[15:]
    categories = df[cols].gt(0).apply(lambda x: x.index[x].tolist(), axis=1)

    return categories.apply(lambda x: ','.join(x) if x else 'Other')

In [6]:
def downcasting_types(df: pd.DataFrame):
  for column in df.columns:
    colType = df[column].dtype
    if colType == 'float64' :
      df[column] = pd.to_numeric(df[column], downcast='float')
    elif colType == 'int64' :
      df[column] = pd.to_numeric(df[column], downcast='integer')
  return df

In [7]:
def load_data(path: str):
    """
    Load all CSV files in the path folder and return a dictionary of DataFrames.

    """
    folder = Path(path)
    if not folder.exists() or not folder.is_dir():
        raise FileNotFoundError(f"{path} directory was not found")

    dataframes = {}
    for file in folder.glob('*.csv'):
        file_name = file.stem
        df = pd.read_csv(file)
        dataframes[file_name] = df

    return dataframes

In [8]:
def preprocess_and_merge(dataframes: dict):
    """
    Preprocess and merge DataFrames from the given dictionary
    and returns a tuple ,(user_ratings, games_df).
    """

    columns_to_keep = ['BGGId', 'GameWeight', 'MfgPlaytime', 'NumAlternates', 'NumExpansions',
                               'NumImplementations', 'Kickstarted', 'Cat:Thematic', 'Cat:Strategy', 'Cat:War',
                               'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']

    for file_name, df in dataframes.items():
        if file_name == "games":
            # Preprocessing for 'games'
            df = df[columns_to_keep].copy()
            renaming_dict = {col: col.replace('Cat:', '') if col.startswith('Cat:')
                             else col for col in df.columns}
            df.rename(columns=renaming_dict, inplace=True)

            # A small cleaning step
            df = df[(df['MfgPlaytime'] > 0) & (df['MfgPlaytime'] <= 240) & (df['GameWeight'] > 0)]

            # Discretizer
            # discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
            # df[columns_to_discretize] = discretizer.fit_transform(df[columns_to_discretize])


        elif file_name == "themes":
            # Preprocessing for 'themes'
            renaming_dict = {col: col.replace('Theme_', '') if col.startswith('Theme_')
                             else col for col in df.columns}
            df.rename(columns=renaming_dict, inplace=True)

        elif file_name == "mechanics":
            pass

        elif file_name == "user_ratings":

            df['Rating'] = df['Rating'].apply(lambda x: 1 if x >= 7 else -1)

            label_encoder = LabelEncoder()
            df['uid'] = label_encoder.fit_transform(df['Username'])
            df.drop('Username', axis=1, inplace=True)

            df = df.groupby('uid').filter(lambda x: len(x) >= 300).groupby('BGGId').filter(lambda x: len(x) >= 100)
            df.drop_duplicates(subset=['uid', 'BGGId'], keep='last', inplace=True)

        dataframes[file_name] = df

    # Prepare final games dataframe
    merged_df = pd.merge(dataframes['games'], dataframes['mechanics'], on='BGGId', how='left')
    games_df = pd.merge(merged_df, dataframes['themes'], on='BGGId', how='left')
    games_df['Details'] = get_description(games_df)

    #columns_to_drop = dataframes['themes'].columns[1:].tolist() + dataframes['mechanics'].columns[1:].tolist()
    columns_to_drop = [col for col in dataframes['themes'].columns.tolist() + dataframes['mechanics'].columns.tolist() if col != 'BGGId']

    games_df = games_df.drop(columns=columns_to_drop, axis=1)
    games_df = downcasting_types(games_df)

    # Prepare final user_ratings dataframe
    user_ratings_df = dataframes.get('user_ratings', pd.DataFrame())
    # user_ratings_df = dataframes.get('user_ratings', None)
    user_ratings_df = downcasting_types(user_ratings_df)


    return user_ratings_df, games_df

In [9]:
def create_user_clusters(train_df: pd.DataFrame,
                         test_df: pd.DataFrame,
                         uid_col: str, #user id col
                         gameid_col: str, #game id col
                         rating_col: str): #rating col
    """
    Create user clusters for both training and test data
    using UMAP for dimensionality reduction and DBSCAN for clustering
    and returns a tuple of the clusters for the X_train and X_test

    """

    #
    user_game_matrix_train = train_df.pivot(index=uid_col, columns=gameid_col, values=rating_col).fillna(0)

    # UMAP to reduce dimensions on training data
    reduction = umap.UMAP(n_neighbors=20, n_components=100, metric='cosine', min_dist=0.0, random_state=42)
    embedding_train = reduction.fit_transform(user_game_matrix_train)

    # DBSCAN on training data
    dbscan = DBSCAN(eps=0.26, min_samples=15, metric='euclidean')
    clusters_train = dbscan.fit_predict(embedding_train)
    user_game_matrix_train['Cluster'] = clusters_train

    # Transform test data using the trained UMAP model, Cluster with DBSCAN
    user_game_matrix_test = test_df.pivot(index=uid_col, columns=gameid_col, values=rating_col).fillna(0)
    embedding_test = reduction.transform(user_game_matrix_test)
    clusters_test = dbscan.fit_predict(embedding_test)
    user_game_matrix_test['Cluster'] = clusters_test

    return user_game_matrix_train[['Cluster']].reset_index(), user_game_matrix_test[['Cluster']].reset_index()

In [10]:
def prepare_train_test_split(path: str):
    """
    Loads data, preprocesses it, merges, and returns
    the train, test dataframes.
    """
    # Call load_data, preprocess_and_merge
    dataframes = load_data(path)
    user_ratings_df, games_df = preprocess_and_merge(dataframes)

    X = pd.merge(user_ratings_df, games_df, on='BGGId', how='left')
    X = X.dropna()                          # CHECK HERE
    y_model = X['Rating']
    X_model = X.drop(['Details'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X_model, y_model, test_size=0.20)

    train_for_clustering = X_train[['uid', 'BGGId', 'Rating']]
    test_for_clustering = X_test[['uid', 'BGGId', 'Rating']]

    user_clusters_train, user_clusters_test = create_user_clusters(train_for_clustering, test_for_clustering, 'uid', 'BGGId', 'Rating')

    X_train = X_train.merge(user_clusters_train, on='uid', how='left')
    X_test = X_test.merge(user_clusters_test, on='uid', how='left')

    X_train = X_train.drop(['Rating'], axis=1)
    X_train = downcasting_types(X_train)
    X_test = X_test.drop(['Rating'], axis=1)
    X_test = downcasting_types(X_test)
    y_train = y_train.replace(-1, 0)
    y_test = y_test.replace(-1, 0)

    return X_train, X_test, y_train, y_test


In [11]:
def optimize_hyperparameters(X_train: pd.DataFrame,
                             y_train: pd.DataFrame,
                             n_trials: int = 5,
                             cv_folds: int = 5):
    """
    Performs hyperparameter optimization using Optuna for LGBMClassifier.
    Uses X_train and y_train for n_trials and returns the best
    parameters of the optimization.
    """

    study = optuna.create_study(direction="maximize")

    def objective(trial):
        params = {
            "boosting_type": "gbdt",
            "num_leaves": trial.suggest_int("num_leaves", 2, 256),
            "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
            "min_child_samples": trial.suggest_int("min_child_samples", 1, 100)
        }


        model = LGBMClassifier(objective='binary',
                               metric='binary_logloss',
                               **params, n_jobs=-1)

        # CV
        scores = cross_val_score(model, X_train, y_train,
                                 cv=cv_folds, scoring='roc_auc')
        return scores.mean()

    study.optimize(objective, n_trials=n_trials)
    best_params = study.best_params
    return best_params

In [12]:
def train_model_with_evaluation(X_train: pd.DataFrame,
                                y_train: pd.DataFrame,
                                X_test: pd.DataFrame,
                                y_test: pd.DataFrame,
                                n_trials: int, # if optuna = 1 , the number of trials
                                n_estimators: int,  # number of boosting rounds for LGBM,
                                optuna_on=0 #turn to 1 if you want to use
                               ):
    """
    Trains the model using the best parameters found
    by Optuna and evaluates it on the test set and returns the
    trained LightGBM model.
    """

    if optuna_on==1:
      best_params = optimize_hyperparameters(X_train, y_train, n_trials=n_trials)
      best_params["objective"] = "binary"
      best_params["metric"] = "binary_logloss"
      best_params["n_estimators"] = n_estimators
      best_params["n_jobs"] = -1
      model = LGBMClassifier(**best_params)
    else:
      model = LGBMClassifier()

    early_stopping_callback = early_stopping(stopping_rounds=100, first_metric_only=True, verbose=True)
    log_evaluation_callback = log_evaluation(period=50)


    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], callbacks=[early_stopping_callback, log_evaluation_callback])


    y_pred = model.predict_proba(X_test)[:, 1]


    roc_auc = roc_auc_score(y_test, y_pred)
    print("ROC AUC on the test set:", roc_auc)

    return model, roc_auc


In [35]:
X_train, X_test, y_train, y_test = prepare_train_test_split(folder_path)

n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.


In [36]:
columns_to_drop = ["BGGId", "uid"]

X_train_unindexed = X_train.drop(columns=columns_to_drop, axis=1)
X_test_unindexed = X_test.drop(columns=columns_to_drop, axis=1)

In [37]:
trained_model,roc_auc = train_model_with_evaluation(X_train_unindexed, y_train, X_test_unindexed, y_test, n_trials=10, n_estimators=1000,optuna_on=0)

[LightGBM] [Info] Number of positive: 2459243, number of negative: 1703666
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.542971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 516
[LightGBM] [Info] Number of data points in the train set: 4162909, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.590751 -> initscore=0.367071
[LightGBM] [Info] Start training from score 0.367071
Training until validation scores don't improve for 100 rounds
[50]	valid_0's binary_logloss: 0.621179
[100]	valid_0's binary_logloss: 0.61678
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.61678
Evaluated only: binary_logloss
ROC AUC on the test set: 0.6947796943106823


In [38]:
def explain_prediction(user_id:int,
                       board_game_id:int,
                       trained_model: lgb.LGBMModel,
                       X_train: pd.DataFrame,
                       X_test: pd.DataFrame,
                       y_train: pd.Series,
                       categorical_features: List[str]):
    """
    Function to explain a prediction made by a trained LGBM binary classifier model.
    Generates an Anchor explanation for a random pair or user,game
    If the prediction is 0, it also generates a counterfactual explanation using DiCE.

    """

    dataset = pd.concat([X_train, X_test])
    new_user = dataset.drop_duplicates(subset='uid').loc[:, ['uid', 'Cluster']]
    new_game = dataset.drop_duplicates(subset='BGGId').drop(columns=['uid', 'Cluster'])

    # Pick random user and game
    #random_user = new_user['uid'].sample(n=1).iloc[0]
    #random_game = new_game['BGGId'].sample(n=1).iloc[0]

    # Filter dataframes for the user and game pair
    selected_user = new_user[new_user['uid'] == user_id]
    selected_game = new_game[new_game['BGGId'] == board_game_id]
    test_instance = pd.concat([selected_user.reset_index(drop=True), selected_game.reset_index(drop=True)], axis=1)


    test_instance = test_instance[X_train.columns].drop(columns=['uid', 'BGGId'])
    print("Created Test Instance:\n\n",test_instance.to_string(index=False))
    prediction = trained_model.predict(test_instance)[0]

    print('\n\nAnchor Explanation:\n')
    # ANCHOR EXPLANATION
    columns_to_drop = ["BGGId", "uid"]

    X_train_unindexed = X_train.drop(columns=columns_to_drop, axis=1)
    X_test_unindexed = X_test.drop(columns=columns_to_drop, axis=1)

    categorical_names = {}
    headers = list(X_train_unindexed)
    for idx, feature in enumerate(headers):
        if feature in categorical_features:
          categorical_names[idx] = list(map(str,X_train_unindexed[feature].unique()))

    explainer = AnchorTabular(
    trained_model.predict,
    feature_names=X_train_unindexed.columns.tolist(),
    categorical_names=categorical_names)

    explainer.fit(X_train_unindexed.to_numpy())

    # Generate Anchor explanation
    result = explainer.explain(np.array(test_instance.values))

    #result = explainer.explain(test_instance.values)

    print('Prediction:', trained_model.predict(np.array(test_instance.values).reshape(1,-1))[0])
    print('Anchor:', result.data['anchor'])
    print("Precision: {:.2f}".format(result.data['precision']))
    print("Coverage: {:.2f}".format(result.data['coverage']))

#     print('\n\nASHAP Tree explainer')

#     # SHAP Tree explainer
#     explainer = shap.TreeExplainer(trained_model)
#     shap_instance_values = explainer.shap_values(test_instance)
#     shap.force_plot(explainer.expected_value, shap_instance_values, instance_to_explain)


    # Counterfactuals with DiCE (if prediction is 0)
    if prediction == 0:
        print('\n\nCounterfactuals with DiCE:\n')
        # Create the dataset for DiCE
        X_train_labeled = X_train_unindexed.reset_index(drop=True)
        y_train_labeled = y_train.reset_index(drop=True)

        data_new = pd.concat([X_train_labeled,y_train_labeled], axis=1)

        d = dice_ml.Data(
            dataframe=data_new,
            continuous_features=[
                'GameWeight', 'MfgPlaytime', 'NumAlternates',
                'NumExpansions', 'NumImplementations'],
            outcome_name='Rating'
        )

        backend = 'sklearn'
        m = dice_ml.Model(model=trained_model, backend=backend)

        exp = dice_ml.Dice(d, m)

        dice_exp = exp.generate_counterfactuals(test_instance,
                                                total_CFs=4,
                                                desired_class="opposite",
                                                features_to_vary=['GameWeight', 'MfgPlaytime', 'NumAlternates',
                                                                  'NumExpansions', 'NumImplementations'])

        # Visualize the counterfactuals
        dice_exp.visualize_as_dataframe(show_only_changes=True)


        return None


In [39]:
''' Select a pair of (user_id, game_id) to see prediction and explanation
    proposed pairs for testing:
    (195052, 7806)
    (331576, 115)
    (365655, 7806)
    (80816,10206)
'''
categorical_features = ['Kickstarted', 'Thematic', 'Strategy', 'War', 'Family', 'CGS', 'Abstract', 'Party', 'Childrens', 'Cluster']
explain_prediction(80816, 10206,trained_model, X_train, X_test, y_train, categorical_features)

Created Test Instance:

  GameWeight  MfgPlaytime  NumAlternates  NumExpansions  NumImplementations  Kickstarted  Thematic  Strategy  War  Family  CGS  Abstract  Party  Childrens  Cluster
     1.2644         30.0            4.0            0.0                 4.0          0.0       0.0       0.0  0.0     1.0  0.0       0.0    1.0        0.0        0


Anchor Explanation:

Prediction: 0
Anchor: ['GameWeight <= 1.66', 'NumExpansions <= 0.00']
Precision: 0.96
Coverage: 0.13


Counterfactuals with DiCE:



100%|██████████| 1/1 [00:08<00:00,  8.29s/it]

Query instance (original outcome : 0)





Unnamed: 0,GameWeight,MfgPlaytime,NumAlternates,NumExpansions,NumImplementations,Kickstarted,Thematic,Strategy,War,Family,CGS,Abstract,Party,Childrens,Cluster,Rating
0,1.2644,30.0,4.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0



Diverse Counterfactual set (new outcome: 1)


Unnamed: 0,GameWeight,MfgPlaytime,NumAlternates,NumExpansions,NumImplementations,Kickstarted,Thematic,Strategy,War,Family,CGS,Abstract,Party,Childrens,Cluster,Rating
0,2.700000047683716,-,-,-,18.399999618530273,-,-,-,-,-,-,-,-,-,-,1.0
1,2.400000095367432,-,-,66.9000015258789,-,-,-,-,-,-,-,-,-,-,-,1.0
2,3.5999999046325684,144.3000030517578,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
3,2.200000047683716,-,-,-,-,-,-,-,-,-,-,-,-,-,-,1.0
