# Notebook to open the GameItem data file, clean it up, and write out a cleaned CSV file


In [1]:
import pandas as pd 

In [2]:
# Load the raw GameItem CSV into a DataFrame and preview the first rows.
data_file = "../data/bga_GameItem.csv"
game_item_df = pd.read_csv(filepath_or_buffer=data_file)
game_item_df.head(n=5)

Unnamed: 0,bgg_id,name,year,game_type,designer,artist,publisher,min_players,max_players,min_players_rec,...,stddev_rating,bayes_rating,complexity,language_dependency,bga_id,dbpedia_id,luding_id,spielen_id,wikidata_id,wikipedia_id
0,,Monopoly: Dragon Ball Super,2020.0,,,,"id,name,url,num_games,score,game,images",2.0,6.0,,...,,,,,05sYpgsuq9,,,,,
1,,Masmorra: Dungeons of Arcadia - Promo Dice Set,2016.0,,"id,num_games,score,game,url,images",,"name,id,url,num_games,score,game,images",1.0,5.0,,...,,,,,06J5nkYAvF,,,,,
2,,Monopoly: Despicable Me,,,,,"id,name,url,num_games,score,game,images",,,,...,,,,,074t93iJ4c,,,,,
3,,Clank! A Deck-Building Adventure,2012.0,,"id,num_games,score,game,url,images","Rayph Beisner,Raul Ramos,Nate Storm,Derrick He...","id,name,url,num_games,score,game,images",2.0,4.0,,...,,,,,07FXm6tX4o,,,,,
4,,El Grande,1995.0,,"id,num_games,score,game,url,images",Doris Matthäus,"id,name,url,num_games,score,game,images",2.0,5.0,,...,,,,,07sKc1TGJS,,,,,


In [3]:
# Drop some obvious columns first. In the preview above, designer and
# publisher show field-name lists such as "id,name,url,..." rather than values.
# NOTE(review): "mechanic" is hidden by the truncated preview — presumably the
# same kind of column; confirm.
#
# The drop is not done in place and missing labels are ignored, so this cell
# can be re-run on the same kernel without raising KeyError once the columns
# are already gone.
game_item_df = game_item_df.drop(
    columns=["designer", "artist", "publisher", "mechanic"],
    errors="ignore",
)

In [4]:
# Keep the columns that have data, i.e. at least one non-null value.
# count() returns a Series of non-null counts labelled by column name.
number_valid_data = game_item_df.count()

# Positional access goes through .iloc: indexing a string-labelled Series with
# plain integers (number_valid_data[ia]) is deprecated in recent pandas.
# NOTE(review): the first column (position 0) is skipped, exactly as the
# original range(1, ...) loop did — confirm that this is intended.
cols = []
for col_name, n_valid in number_valid_data.iloc[1:].items():
    if n_valid > 0:
        cols.append(col_name)
        print(n_valid, col_name)
game_item_df_col = game_item_df[cols]
game_item_df_col.head()

2565 name
1986 year
1982 min_players
1976 max_players
1881 min_age
1905 min_time
1902 max_time
1473 category
1608 num_votes
1606 avg_rating
2565 bga_id


Unnamed: 0,name,year,min_players,max_players,min_age,min_time,max_time,category,num_votes,avg_rating,bga_id
0,Monopoly: Dragon Ball Super,2020.0,2.0,6.0,8.0,60.0,120.0,,,,05sYpgsuq9
1,Masmorra: Dungeons of Arcadia - Promo Dice Set,2016.0,1.0,5.0,13.0,20.0,45.0,v4SfYtS2Lr,6.0,3.333333,06J5nkYAvF
2,Monopoly: Despicable Me,,,,,,,"N0TkEGfEsF,7rV11PKqME",,,074t93iJ4c
3,Clank! A Deck-Building Adventure,2012.0,2.0,4.0,13.0,30.0,60.0,"KUBCKBkGxV,ZTneo8TaIO,tQGLgwdbYH",363.0,3.814161,07FXm6tX4o
4,El Grande,1995.0,2.0,5.0,12.0,60.0,120.0,"QAYkTHK1Dd,nuHYRFmMjU",160.0,3.86875,07sKc1TGJS


In [5]:
# Convert the year from float to integer.
# Rows without a year are filtered out first, so this step also drops those
# rows from the frame before the int32 cast.
has_year = game_item_df_col["year"].notna()
game_item_df_col = game_item_df_col[has_year].astype({"year": "int32"})
game_item_df_col.head()

Unnamed: 0,name,year,min_players,max_players,min_age,min_time,max_time,category,num_votes,avg_rating,bga_id
0,Monopoly: Dragon Ball Super,2020,2.0,6.0,8.0,60.0,120.0,,,,05sYpgsuq9
1,Masmorra: Dungeons of Arcadia - Promo Dice Set,2016,1.0,5.0,13.0,20.0,45.0,v4SfYtS2Lr,6.0,3.333333,06J5nkYAvF
3,Clank! A Deck-Building Adventure,2012,2.0,4.0,13.0,30.0,60.0,"KUBCKBkGxV,ZTneo8TaIO,tQGLgwdbYH",363.0,3.814161,07FXm6tX4o
4,El Grande,1995,2.0,5.0,12.0,60.0,120.0,"QAYkTHK1Dd,nuHYRFmMjU",160.0,3.86875,07sKc1TGJS
5,Gloomhaven: Jaws of the Lion,2020,1.0,4.0,14.0,30.0,120.0,"KUBCKBkGxV,ge8pIhEUGE,yq6hVlbM2R,ZTneo8TaIO,up...",58.0,4.586207,08asLSfoZy


In [10]:
# Select the rows that have fewer than three columns with missing data.
# The per-row null count is computed in a single vectorized pass
# (isnull().sum(axis=1)) instead of a Python loop over iloc rows, and is kept
# as a separate Series so no helper column has to be added and dropped again.
missing_per_row = game_item_df_col.isnull().sum(axis=1)

# .copy() gives an independent frame, so later edits to temp_df cannot touch
# game_item_df_col.
temp_df = game_item_df_col.loc[missing_per_row < 3].copy()
temp_df.head()

Unnamed: 0,name,year,min_players,max_players,min_age,min_time,max_time,category,num_votes,avg_rating,bga_id
1,Masmorra: Dungeons of Arcadia - Promo Dice Set,2016,1.0,5.0,13.0,20.0,45.0,v4SfYtS2Lr,6.0,3.333333,06J5nkYAvF
3,Clank! A Deck-Building Adventure,2012,2.0,4.0,13.0,30.0,60.0,"KUBCKBkGxV,ZTneo8TaIO,tQGLgwdbYH",363.0,3.814161,07FXm6tX4o
4,El Grande,1995,2.0,5.0,12.0,60.0,120.0,"QAYkTHK1Dd,nuHYRFmMjU",160.0,3.86875,07sKc1TGJS
5,Gloomhaven: Jaws of the Lion,2020,1.0,4.0,14.0,30.0,120.0,"KUBCKBkGxV,ge8pIhEUGE,yq6hVlbM2R,ZTneo8TaIO,up...",58.0,4.586207,08asLSfoZy
6,Android: Netrunner,2012,2.0,2.0,14.0,30.0,60.0,"PinhJrhnxU,Ef4oYLHNhI,3B3QpKvXD3",229.0,3.689956,08q3IWC9lC


In [7]:
# Write out the cleaned dataframe.
# to_csv is called with its defaults, so the row index is written as the
# first (unnamed) column of the output file.
clean_file = "../data/bga_GameItem_clean.csv"
temp_df.to_csv(clean_file)