In [1]:
import pandas as pd

# Load the two raw BoardGameGeek tables this notebook works with:
# the game table and the person (designer) table.
bgg_GameItem, bgg_Person = (
    pd.read_csv(csv_path)
    for csv_path in ('bgg_raw/bgg_GameItem.csv', 'bgg_raw/bgg_Person.csv')
)
# Further raw tables sit next to these files but are currently not loaded:
#   bgg_raw/bgg_Category.csv, bgg_raw/bgg_GameFamily.csv, bgg_raw/bgg_GameType.csv,
#   bgg_raw/bgg_Mechanic.csv, bgg_raw/bgg_Publisher.csv

In [2]:
# Columns of the game table that are carried through the cleaning step.
# The order below is the column order of the cleaned frame.
#
# Candidates that are currently switched off (add them to the list to keep them):
#   num_votes, avg_rating, stddev_rating, rank, novelty, category, publisher,
#   min_players_rec, max_players_rec, min_players_best, max_players_best,
#   min_age_rec, min_time, max_time, cooperative, family, language_dependency,
#   artist, compilation, compilation_of, implementation, integration,
#   bga_id, dbpedia_id, luding_id, spielen_id, wikidata_id, wikipedia_id
game_covariates = [
    # outcome and identification
    "bayes_rating", "name", "year", "bgg_id",
    # set-valued attributes
    "mechanic", "designer",
    # scalar game descriptors
    "complexity", "game_type", "min_players", "max_players", "min_age",
]

<h1>Cleaning</h1>

In [3]:
# --- Cleaning parameters -------------------------------------------------------
UNCREDITED_DESIGNER_ID = "3"        # designer id that stands for "uncredited"
FIRST_YEAR, END_YEAR = 1990, 2024   # keep games with FIRST_YEAR <= year < END_YEAR


def _games_matching_linked_mechanics(games, link_column, mechanic_by_id):
    """Return the bgg_ids of games whose mechanics equal those of a game they link to.

    Parameters
    ----------
    games : DataFrame with columns ``bgg_id``, ``mechanic`` (tuple or NaN) and
        ``link_column`` (comma-separated ids as a string, or NaN).
    link_column : name of the column holding the linked game ids
        (``"implementation"`` or ``"integration"``).
    mechanic_by_id : dict mapping bgg_id -> mechanic tuple (or NaN).

    Returns
    -------
    list of bgg_ids (each id once) for which at least one linked game has
    exactly the same mechanic tuple. Games without mechanics never match,
    because NaN compares unequal to everything.
    """
    linked = games.loc[games[link_column].notna(), ["bgg_id", "mechanic", link_column]].copy()
    if linked.empty:
        # Nothing links anywhere; avoids running the comparison on an empty frame.
        return []
    linked[link_column] = linked[link_column].str.split(",")
    linked = linked.explode(link_column)          # one row per (game, linked game)
    linked[link_column] = linked[link_column].astype(int)
    # -1 is a sentinel for "linked game not in the table"; it never equals a tuple.
    unchanged = [
        mechanic_by_id.get(linked_id, -1) == mechanics
        for linked_id, mechanics in zip(linked[link_column], linked["mechanic"])
    ]
    return linked.loc[unchanged, "bgg_id"].unique().tolist()


core = bgg_GameItem.copy()
# Represent the mechanics of a game as a sorted, duplicate-free tuple so that two
# games with the same set of mechanics compare equal. Missing values stay missing.
core["mechanic"] = core.mechanic.str.split(",").map(
    lambda names: tuple(sorted(set(names))), na_action="ignore"
)
lut_mechanic = core.set_index("bgg_id").mechanic.to_dict()

# Trivial reimplementations: games that reimplement an earlier game without
# changing its mechanics.
primitive_reimplementations = _games_matching_linked_mechanics(core, "implementation", lut_mechanic)
# Trivial expansions: games that integrate with another game without changing
# its mechanics.
primitive_expansions = _games_matching_linked_mechanics(core, "integration", lut_mechanic)
# Apply both filters jointly.
core = core[~core.bgg_id.isin(primitive_reimplementations + primitive_expansions)]

# The design team must contain at least one credited designer.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the previous frame.
core = core[core.designer.notna()].copy()
core["designer"] = core.designer.str.split(",").map(
    lambda ids: sorted(d for d in ids if d != UNCREDITED_DESIGNER_ID)
)
core = core[core.designer.map(len) > 0]   # teams that were only "uncredited" are dropped

keep = (
    core.mechanic.notna()                                # must list mechanics
    & core.bayes_rating.notna()                          # original note: at least 30 ratings
    & (core.year >= FIRST_YEAR) & (core.year < END_YEAR)  # only games between 1990 and 2023
)
core = core[keep].copy()
core["year"] = core.year.astype(int)
core = (
    core[game_covariates]
    .rename(columns={"bayes_rating": "rating"})
    .sort_values("year")
    .reset_index(drop=True)
)
# Optional extra filter, currently disabled: core = core[core.mechanic.map(len) < 5]

In [4]:
# Distribution of design-team sizes: absolute count per team size next to the
# cumulative share of games (both ordered by descending frequency).
team_sizes = core.designer.map(len)
designer_length_counts = team_sizes.value_counts()
designer_length_counts_normalized = team_sizes.value_counts(normalize=True).cumsum()
result = pd.concat(
    [designer_length_counts, designer_length_counts_normalized],
    axis=1,
)

# Show the combined table
result

Unnamed: 0_level_0,count,proportion
designer,Unnamed: 1_level_1,Unnamed: 2_level_1
1,13275,0.699605
2,4315,0.927009
3,980,0.978656
4,242,0.99141
5,88,0.996047
7,28,0.997523
6,25,0.998841
9,6,0.999157
8,6,0.999473
11,4,0.999684


In [5]:
# Limit the number of designers per game to MAX_DESIGNERS (=> 99.9% of games, see
# the cumulative shares in the table above).
# .copy() detaches the result from the unfiltered frame: later cells add columns
# to `core`, which would otherwise raise pandas' SettingWithCopyWarning.
MAX_DESIGNERS = 9
core = core[core.designer.map(len) <= MAX_DESIGNERS].copy()


<h1>Data Preparation</h1>

<h4>Gender Inference</h4>

In [6]:
import sys
import os

# Put the local `infer_gender` directory on the import path (once) so that the
# `gender_inference` module inside it can be imported below.
module_path = os.path.abspath('infer_gender')
if module_path not in sys.path:
    sys.path.append(module_path)

import gender_inference as gi  # type: ignore
import pandas as pd

# Read the person table and attach the inferred gender of every name.
# NOTE(review): judging by the summary below and the later recoding, `is_man`
# presumably is 1 / 0 with missing values for undecidable names -- confirm
# against gender_inference.infer_gender.
bgg_person_path = './bgg_raw/bgg_Person.csv'
bgg_person = pd.read_csv(bgg_person_path)
bgg_person = bgg_person.assign(is_man=gi.infer_gender(bgg_person['name']))
bgg_person['is_man'].convert_dtypes().describe()

count     45251.0
mean     0.796424
std      0.402661
min           0.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: is_man, dtype: Float64

<h4>Compute Novelties</h4>

In [7]:
import pandas as pd
import novelty_computation as nc
import numpy as np

# Hand (bgg_id, year, mechanic) triples to the project's novelty helpers.
# NOTE(review): the exact semantics of the `nc` functions are not visible here;
# the steps appear to be encode mechanics -> pairwise distances -> per-game
# novelty (not normalised) -- confirm in novelty_computation.
novelty_inputs = ["bgg_id", "year", "mechanic"]
data = core[novelty_inputs].values
dummies, lut = nc.compute_dummies(data)
dist_matrix = nc.compute_distance_matrix(dummies)
novelty = nc.compute_novelty(data, dist_matrix, lut, normalize=False)
# Keep only the novelty values, indexed by the game id column "id".
novelty = novelty.set_index("id")["novelty"]

In [8]:
# Attach each game's novelty (looked up by bgg_id) and its natural logarithm.
novelty_dict = novelty.to_dict()
core = core.assign(
    novelty=core.bgg_id.map(novelty_dict),
    log_novelty=lambda games: games["novelty"].map(np.log),
)


In [9]:
# Write the cleaned game table, numeric columns rounded to two decimals and
# rows ordered by year, without the index column.
bgg_export = core.round(2).sort_values("year")
bgg_export.to_csv("bgg.csv", index=False)

<h1>Eventnet Format</h1>


In [10]:
# One "add_designer" event per designer, dated with the year of the first game
# (in year order) on which that designer appears.
designer_by_year = core.set_index("year").sort_index()["designer"]
first_designer_year = designer_by_year.explode().drop_duplicates()
dummy_designer = first_designer_year.reset_index().assign(event="add_designer")
# Disabled variant that dates the event one year earlier:
#dummy_designer["year"] = dummy_designer["year"] - 1
dummy_designer.head(2)

Unnamed: 0,year,designer,event
0,1990,1718,add_designer
1,1990,1747,add_designer


In [11]:
# One "add_mechanic" event per mechanic, dated with the year of the first game
# (in year order) that uses that mechanic.
mechanic_by_year = core.set_index("year").sort_index()["mechanic"]
first_mechanic_year = mechanic_by_year.explode().drop_duplicates()
dummy_mechanic = first_mechanic_year.reset_index().assign(event="add_mechanic")
# Disabled variant that dates the event one year earlier:
#dummy_mechanic["year"] = dummy_mechanic["year"] - 1
dummy_mechanic.head(2)

Unnamed: 0,year,mechanic,event
0,1990,2026,add_mechanic
1,1990,2070,add_mechanic


In [12]:
# Gender attribute events, one per designer.
# Weight coding: 1 where is_man == 1, -1 where is_man == 0, 0 where the
# inferred gender is missing.
gender_dict = bgg_person.set_index("bgg_id")["is_man"].replace(0, -1).fillna(0).to_dict()
dummy_gender = dummy_designer.copy()
dummy_gender["event"] = "add_gender"
# All gender events are dated 1990, regardless of the designer's first year.
dummy_gender["year"] = 1990
# Designers whose id is absent from the person table would map to NaN. The
# export step later replaces every NaN with -1, which would silently code them
# like is_man == 0; mark them as unknown (0) instead.
dummy_gender["weight"] = dummy_gender.designer.astype(int).map(gender_dict).fillna(0)
dummy_gender.head(2)

Unnamed: 0,year,designer,event,weight
0,1990,1718,add_gender,0
1,1990,1747,add_gender,1


In [13]:
import csv 
import seaborn
import matplotlib.pyplot as plt

# Base frame for the game events: one row per game with its designer list and
# mechanic tuple; year and id as plain integers.
game_event = core[["year", "bgg_id", "designer", "mechanic"]].astype({"year": int, "bgg_id": int})

# Write one event file per outcome. Each file contains the node-creation events
# (designers, mechanics, gender attribute) followed, within a year, by the game
# events weighted with the outcome.
for target in ["novelty", "rating"]:
    # core[target] is aligned with game_event through the shared index.
    game = game_event.assign(weight=core[target], event="game")

    # Normalize weights like Lerner => scientific impact: subtract the mean weight
    # of all games of the same year; games without a weight get 0.
    yearly_mean = game.groupby("year")["weight"].transform("mean")
    game["weight"] = (game["weight"] - yearly_mean).fillna(0)

    # One row per (game, mechanic, designer) combination.
    game = game.explode("mechanic").explode("designer")

    events = pd.concat([dummy_designer, dummy_mechanic, dummy_gender, game])
    # Rows that are not game events have no bgg_id; they get the placeholder "-1".
    events["bgg_id"] = events["bgg_id"].fillna(-1).astype(int).astype(str)
    events = events.round(2)
    # Every remaining missing cell (e.g. the mechanic of a designer event) becomes -1.
    events = events.fillna(-1).convert_dtypes()
    # Order by year; within a year the alphabetical event order puts the add_*
    # events before the "game" events. A single two-key sort suffices; rows with
    # the same (year, event) keep the order in which they were concatenated.
    events = events.sort_values(["year", "event"], ascending=(True, True))
    events = events[["year", "bgg_id", "event", "designer", "mechanic", "weight"]]

    events.to_csv(f"bgg_two_mode_{target}.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)