# Notebook to open the GameItem data file, clean it up, and write out a cleaned csv file 


In [20]:
import pandas as pd 
import numpy as np

In [21]:
# Relative paths of the raw input CSV and of the cleaned CSV written at the end
data_file = "../data/bgg_GameItem.csv"
clean_data_file = "../data/bgg_GameItem_clean.csv"

# Load the raw GameItem table and preview its first rows
game_item_df = pd.read_csv(filepath_or_buffer=data_file)
game_item_df.head(n=5)

Unnamed: 0,bgg_id,name,year,game_type,designer,artist,publisher,min_players,max_players,min_players_rec,...,stddev_rating,bayes_rating,complexity,language_dependency,bga_id,dbpedia_id,luding_id,spielen_id,wikidata_id,wikipedia_id
0,1,Die Macher,1986.0,5497.0,1,125174959,1332272615108392491165253828147,3.0,5.0,4.0,...,1.5788,7.11944,4.3245,1.166667,,,,,,
1,2,Dragonmaster,1981.0,5497.0,8384,12424,6420,3.0,4.0,3.0,...,1.45047,5.79084,1.963,,,,,,,
2,3,Samurai,1998.0,5497.0,2,11883,"17,133,267,29,7340,7335,41,2973,4617,1391,8291...",2.0,4.0,2.0,...,1.1878,7.24091,2.4879,1.0,,,,,,
3,4,Tal der Könige,1992.0,,8008,2277,37,2.0,4.0,2.0,...,1.23454,5.68582,2.6667,,,,,,,
4,5,Acquire,1964.0,5497.0,4,1265818317,925487130828582962539246683846227107,2.0,6.0,3.0,...,1.33492,7.14585,2.5038,1.090278,,,,,,


In [22]:
# Remove the columns that are not carried into the cleaned file (modifies game_item_df in place)
game_item_df.drop(
    columns=["designer", "artist", "publisher", "family"],
    inplace=True,
)

In [23]:
# Keep the columns that have data
# Number of non-null values per column, indexed by column name
number_valid_data = game_item_df.count()

# Position 0 is deliberately left out, exactly as before.
# NOTE(review): judging by the printed list, the skipped column is bgg_id -- confirm
# that dropping the id from the cleaned file is intended.
# .iloc / .items() are used because integer-position lookup through [] on a
# label-indexed Series is deprecated in pandas.
cols = []
for col_name, n_valid in number_valid_data.iloc[1:].items():
    if n_valid > 0:
        cols.append(col_name)
        print(n_valid, col_name)

# Independent copy so later cells can modify it without touching game_item_df
game_item_df_col = game_item_df[cols].copy()
game_item_df_col.head()

102325 name
93049 year
21870 game_type
100466 min_players
96919 max_players
100466 min_players_rec
96919 max_players_rec
100466 min_players_best
96919 max_players_best
79906 min_age
930 min_age_rec
80922 min_time
80922 max_time
100245 category
86813 mechanic
5134 cooperative
752 compilation
729 compilation_of
4835 implementation
3379 integration
20828 rank
75154 num_votes
75137 avg_rating
59346 stddev_rating
21145 bayes_rating
44300 complexity
793 language_dependency


Unnamed: 0,name,year,game_type,min_players,max_players,min_players_rec,max_players_rec,min_players_best,max_players_best,min_age,...,compilation_of,implementation,integration,rank,num_votes,avg_rating,stddev_rating,bayes_rating,complexity,language_dependency
0,Die Macher,1986.0,5497.0,3.0,5.0,4.0,5.0,5.0,5.0,14.0,...,,,,296.0,5265.0,7.62063,1.5788,7.11944,4.3245,1.166667
1,Dragonmaster,1981.0,5497.0,3.0,4.0,3.0,4.0,3.0,4.0,12.0,...,,2174.0,,3816.0,556.0,6.64334,1.45047,5.79084,1.963,
2,Samurai,1998.0,5497.0,2.0,4.0,2.0,4.0,3.0,3.0,10.0,...,,,,217.0,14913.0,7.45017,1.1878,7.24091,2.4879,1.0
3,Tal der Könige,1992.0,,2.0,4.0,2.0,4.0,2.0,4.0,12.0,...,,,,5099.0,338.0,6.59769,1.23454,5.68582,2.6667,
4,Acquire,1964.0,5497.0,2.0,6.0,3.0,6.0,4.0,4.0,12.0,...,,,,279.0,18352.0,7.33757,1.33492,7.14585,2.5038,1.090278


In [24]:
## Normalise the columns for mechanic, type and category to only have 1 x 4 digit value in each

# The .str accessor only handles real strings: a value parsed as a number
# (game_type is displayed as 5497.0) would become NaN, or raise on a numeric column.
# So each column is converted to text first, cut to its first four characters,
# and the originally missing entries are put back as missing afterwards.
for code_col in ("category", "game_type", "mechanic"):
    raw_values = game_item_df_col[code_col]
    first_code = raw_values.astype(str).str[:4]
    game_item_df_col[code_col] = first_code.where(raw_values.notna())


In [25]:
# Non-null values per column after the normalisation step
non_null_per_column = game_item_df_col.count()
non_null_per_column

name                   102325
year                    93049
game_type               21870
min_players            100466
max_players             96919
min_players_rec        100466
max_players_rec         96919
min_players_best       100466
max_players_best        96919
min_age                 79906
min_age_rec               930
min_time                80922
max_time                80922
category               100245
mechanic                86813
cooperative              5134
compilation               752
compilation_of            729
implementation           4835
integration              3379
rank                    20828
num_votes               75154
avg_rating              75137
stddev_rating           59346
bayes_rating            21145
complexity              44300
language_dependency       793
dtype: int64

In [26]:
# Convert year and min_age_rec from float to integer.
# Each column is cast on its own (assigning a whole filtered DataFrame to one
# column is not a valid column assignment). The nullable "Int32" dtype keeps
# missing values as <NA> instead of forcing the column back to float.
# np.trunc drops any fractional part first, matching what a plain int cast does.
for int_col in ("year", "min_age_rec"):
    as_float = game_item_df_col[int_col].astype("float64")
    game_item_df_col[int_col] = np.trunc(as_float).astype("Int32")
game_item_df_col.count()

name                   102325
year                    93049
game_type               21870
min_players            100466
max_players             96919
min_players_rec        100466
max_players_rec         96919
min_players_best       100466
max_players_best        96919
min_age                 79906
min_age_rec               930
min_time                80922
max_time                80922
category               100245
mechanic                86813
cooperative              5134
compilation               752
compilation_of            729
implementation           4835
integration              3379
rank                    20828
num_votes               75154
avg_rating              75137
stddev_rating           59346
bayes_rating            21145
complexity              44300
language_dependency       793
dtype: int64

In [27]:
# Select the rows that have fewer than MAX_MISSING_COLS columns with missing data
MAX_MISSING_COLS = 10

# Per-row number of missing values, computed in a single vectorized pass
missing = game_item_df_col.isnull().sum(axis=1)

# Independent copy of the rows that pass the filter; game_item_df_col is left untouched
temp_df = game_item_df_col.loc[missing < MAX_MISSING_COLS].copy()
temp_df.count()

name                   24959
year                   24822
game_type              13380
min_players            24957
max_players            24953
min_players_rec        24957
max_players_rec        24953
min_players_best       24957
max_players_best       24953
min_age                23885
min_age_rec              928
min_time               24788
max_time               24788
category               24745
mechanic               23634
cooperative             2379
compilation              549
compilation_of           543
implementation          3093
integration             2062
rank                   20117
num_votes              24959
avg_rating             24957
stddev_rating          24937
bayes_rating           20426
complexity             24298
language_dependency      793
dtype: int64

In [29]:
# Save the filtered frame to the cleaned-data CSV path defined at the top
temp_df.to_csv(path_or_buf=clean_data_file)