In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
from ast import literal_eval

# functions for pipeline

In [3]:
def drop_categories(df):
    # this function drops all categories of API output which are irrelevant for prediction
    
    # possibly also drop controller_support, fullgame columns
    df.drop("dlc",axis=1, inplace=True)
    df.drop("review_score_desc",axis=1, inplace=True)
    df.drop("legal_notice",axis=1, inplace=True)
    df.drop("recommendations",axis=1, inplace=True)
    df.drop("ext_user_account_notice",axis=1, inplace=True)
    df.drop("reviews",axis=1, inplace=True)
    df.drop("metacritic",axis=1, inplace=True)
    df.drop("demos",axis=1, inplace=True)
    df.drop("drm_notice",axis=1, inplace=True)
    df.drop("alternate_appid",axis=1, inplace=True)
    df.drop("screenshots",axis=1, inplace=True)
    df.drop("movies",axis=1, inplace=True)
    df.drop("support_info",axis=1, inplace=True)
    df.drop("background",axis=1, inplace=True)
    df.drop("background_raw",axis=1, inplace=True)
    df.drop("content_descriptors",axis=1, inplace=True)
    df.drop("packages",axis=1, inplace=True)
    df.drop("package_groups",axis=1, inplace=True)
    df.drop("num_reviews",axis=1, inplace=True)
    df.drop("developers",axis=1, inplace=True)
    df.drop("achievements",axis=1, inplace=True)
    df.drop("detailed_description",axis=1, inplace=True)
    df.drop("about_the_game",axis=1, inplace=True)
    df.drop("supported_languages",axis=1, inplace=True)
    df.drop("header_image",axis=1, inplace=True)
    df.drop("website",axis=1, inplace=True)


In [4]:
def reset_index(df):
    #this function resets the index(first column) of the dataframe. This is need if some rows where dropped 
    #(e.g. because of NaN values in a column)
    #in the returned dataframe, the indices are sequential again -> iloc[idx] can be used without throwing erros
    try:
        df= df.drop("index",axis=1)
    except:
        pass
    try:
        df= df.drop("level_0",axis=1)
    except:
        pass
    df.reset_index(inplace=True)
    return df
    
    

In [5]:
def drop_few_review_games(df, num_reviews=50):
    # drop games with less than num_reviews reviews. also reset the index after dropping rows
    df=  df.loc[lambda df: df['total_reviews'] >= num_reviews, :]
    df = reset_index(df)
    return df

In [6]:
def clean_price(df):
    # Clean up "price_overview" output of API and add new column "price"
    # use price without discounts(initial)
    df.dropna(inplace=True,subset=["price_overview"])
    df = reset_index(df)
    df["price"] = 0
    for idx,row in df.iterrows():
        dict_ = literal_eval(str(row.price_overview).replace("''",'None'))
        df["price"].iloc[idx] = float(dict_['initial'])
    return df


In [7]:
def drop_high_price_games(df, max_price=200):
    # drops few outlier games which have unreasonably high prices. max price is price in euros (price/100)
    df=  df.loc[lambda df: df['price']/100 <= max_price, :]
    df = reset_index(df)
    return df

In [8]:
def clean_genres(df):
    # this function takes the API output of the column genres and cleans it up
    #genres are shown in a list in new column "genres_clean" in output dataframe
    df.dropna(inplace=True,subset=["genres"])
    df = reset_index(df)
    df["genres_clean"] = ""
    
    # for each row, get dictionary of genre column
    for idx,row in df.iterrows():
        dict_ = literal_eval(str(row.genres).replace("''",'None'))
        
        genres = []
        for j in range(len(dict_)):
            genres.append(dict_[j]["description"])
        df["genres_clean"].iloc[idx] = np.array(genres)
    return df

In [9]:
def get_genres_set(df):
    # this function iterates through the new column "genres_clean" and return the set of unique genres contained in the dataframe
    G = []
    for idx,row in df.iterrows():
        for j in range(len(row.genres_clean)):
            G.append(row.genres_clean[j])
    set_G = set(G)
    G=list(set_G)
    return G
        

In [10]:
def one_hot_encode_genres(df,G):
    # this function creates a new column for each unique genre in the dataframe to one-hot-encode it.
    for j in G:
        df[j] = 0
    
    for idx,row in df.iterrows():
        for genre in row.genres_clean:
            df[genre].loc[idx] = 1
    return df

# test pipeline

In [12]:
df = pd.read_csv("clean_data.csv")

In [13]:
drop_categories(df)

In [14]:
df = drop_few_review_games(df, num_reviews=50)

In [16]:
#df

In [17]:
df = clean_price(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [18]:
df

Unnamed: 0.1,index,Unnamed: 0,appid,name,review_score,total_positive,total_negative,total_reviews,type,required_age,...,linux_requirements,publishers,price_overview,platforms,categories,genres,release_date,controller_support,fullgame,price
0,0,1,1648390,ScooterFlow,8,446,21,467,game,0,...,[],['UncannyKiwi'],"{'currency': 'EUR', 'initial': 1379, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}]","[{'id': '4', 'description': 'Casual'}, {'id': ...","{'coming_soon': False, 'date': '29 Nov, 2021'}",,,1379
1,1,6,1648610,Malum,6,146,53,199,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['MalumGames'],"{'currency': 'EUR', 'initial': 399, 'final': 1...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '2 Sep, 2021'}",,,399
2,2,7,1647962,Hell Let Loose – Lethal Tide,8,70,6,76,dlc,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Team17'],"{'currency': 'EUR', 'initial': 499, 'final': 2...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '27 Jul, 2021'}",,"{'appid': '686810', 'name': 'Hell Let Loose'}",499
3,3,20,1647550,NEO: The World Ends with You,8,224,11,235,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Square Enix'],"{'currency': 'EUR', 'initial': 5999, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '19 Oct, 2022'}",full,,5999
4,4,21,1647730,Zool Redimensioned,8,68,2,70,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Secret Mode'],"{'currency': 'EUR', 'initial': 799, 'final': 2...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '18 Aug, 2021'}",,,799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,885,5685,1349400,Planet Zoo: Australia Pack,5,77,34,111,dlc,0,...,[],['Frontier Developments'],"{'currency': 'EUR', 'initial': 999, 'final': 4...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '28', 'description': 'Simulation'}, {'...","{'coming_soon': False, 'date': '25 Aug, 2020'}",,"{'appid': '703080', 'name': 'Planet Zoo'}",999
854,886,5686,1683340,Kayak VR: Mirage,8,320,38,358,game,0,...,[],['Better Than Life'],"{'currency': 'EUR', 'initial': 2299, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '4', 'description': 'Casual'}, {'id': ...","{'coming_soon': False, 'date': '12 Jul, 2022'}",,,2299
855,887,5687,1422420,Dodgeball Academia,8,138,11,149,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Humble Games'],"{'currency': 'EUR', 'initial': 2499, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '5 Aug, 2021'}",full,,2499
856,888,5689,359650,Star Trek™: 25th Anniversary,8,87,7,94,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Interplay Entertainment Corp.'],"{'currency': 'EUR', 'initial': 999, 'final': 6...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}]","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '7 May, 2015'}",,,999


In [19]:
df = drop_high_price_games(df, max_price=200)

In [20]:
df

Unnamed: 0.1,index,Unnamed: 0,appid,name,review_score,total_positive,total_negative,total_reviews,type,required_age,...,linux_requirements,publishers,price_overview,platforms,categories,genres,release_date,controller_support,fullgame,price
0,0,1,1648390,ScooterFlow,8,446,21,467,game,0,...,[],['UncannyKiwi'],"{'currency': 'EUR', 'initial': 1379, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}]","[{'id': '4', 'description': 'Casual'}, {'id': ...","{'coming_soon': False, 'date': '29 Nov, 2021'}",,,1379
1,1,6,1648610,Malum,6,146,53,199,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['MalumGames'],"{'currency': 'EUR', 'initial': 399, 'final': 1...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '2 Sep, 2021'}",,,399
2,2,7,1647962,Hell Let Loose – Lethal Tide,8,70,6,76,dlc,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Team17'],"{'currency': 'EUR', 'initial': 499, 'final': 2...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '27 Jul, 2021'}",,"{'appid': '686810', 'name': 'Hell Let Loose'}",499
3,3,20,1647550,NEO: The World Ends with You,8,224,11,235,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Square Enix'],"{'currency': 'EUR', 'initial': 5999, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '19 Oct, 2022'}",full,,5999
4,4,21,1647730,Zool Redimensioned,8,68,2,70,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Secret Mode'],"{'currency': 'EUR', 'initial': 799, 'final': 2...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '18 Aug, 2021'}",,,799
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
850,853,5685,1349400,Planet Zoo: Australia Pack,5,77,34,111,dlc,0,...,[],['Frontier Developments'],"{'currency': 'EUR', 'initial': 999, 'final': 4...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '28', 'description': 'Simulation'}, {'...","{'coming_soon': False, 'date': '25 Aug, 2020'}",,"{'appid': '703080', 'name': 'Planet Zoo'}",999
851,854,5686,1683340,Kayak VR: Mirage,8,320,38,358,game,0,...,[],['Better Than Life'],"{'currency': 'EUR', 'initial': 2299, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '4', 'description': 'Casual'}, {'id': ...","{'coming_soon': False, 'date': '12 Jul, 2022'}",,,2299
852,855,5687,1422420,Dodgeball Academia,8,138,11,149,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Humble Games'],"{'currency': 'EUR', 'initial': 2499, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '5 Aug, 2021'}",full,,2499
853,856,5689,359650,Star Trek™: 25th Anniversary,8,87,7,94,game,0,...,{'minimum': '<strong>Minimum:</strong><br><ul ...,['Interplay Entertainment Corp.'],"{'currency': 'EUR', 'initial': 999, 'final': 6...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}]","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '7 May, 2015'}",,,999


In [21]:
df = clean_genres(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [22]:
df

Unnamed: 0.1,index,Unnamed: 0,appid,name,review_score,total_positive,total_negative,total_reviews,type,required_age,...,publishers,price_overview,platforms,categories,genres,release_date,controller_support,fullgame,price,genres_clean
0,0,1,1648390,ScooterFlow,8,446,21,467,game,0,...,['UncannyKiwi'],"{'currency': 'EUR', 'initial': 1379, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}]","[{'id': '4', 'description': 'Casual'}, {'id': ...","{'coming_soon': False, 'date': '29 Nov, 2021'}",,,1379,"[Casual, Indie, Simulation, Sports, Early Access]"
1,1,6,1648610,Malum,6,146,53,199,game,0,...,['MalumGames'],"{'currency': 'EUR', 'initial': 399, 'final': 1...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '2 Sep, 2021'}",,,399,"[Action, Adventure, Indie]"
2,2,7,1647962,Hell Let Loose – Lethal Tide,8,70,6,76,dlc,0,...,['Team17'],"{'currency': 'EUR', 'initial': 499, 'final': 2...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 1, 'description': 'Multi-player'}, {'i...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '27 Jul, 2021'}",,"{'appid': '686810', 'name': 'Hell Let Loose'}",499,"[Action, Indie, Massively Multiplayer, Simulat..."
3,3,20,1647550,NEO: The World Ends with You,8,224,11,235,game,0,...,['Square Enix'],"{'currency': 'EUR', 'initial': 5999, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '19 Oct, 2022'}",full,,5999,"[Action, RPG]"
4,4,21,1647730,Zool Redimensioned,8,68,2,70,game,0,...,['Secret Mode'],"{'currency': 'EUR', 'initial': 799, 'final': 2...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '18 Aug, 2021'}",,,799,"[Action, Adventure]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,850,5685,1349400,Planet Zoo: Australia Pack,5,77,34,111,dlc,0,...,['Frontier Developments'],"{'currency': 'EUR', 'initial': 999, 'final': 4...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '28', 'description': 'Simulation'}, {'...","{'coming_soon': False, 'date': '25 Aug, 2020'}",,"{'appid': '703080', 'name': 'Planet Zoo'}",999,"[Simulation, Strategy]"
849,851,5686,1683340,Kayak VR: Mirage,8,320,38,358,game,0,...,['Better Than Life'],"{'currency': 'EUR', 'initial': 2299, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '4', 'description': 'Casual'}, {'id': ...","{'coming_soon': False, 'date': '12 Jul, 2022'}",,,2299,"[Casual, Indie, Racing, Simulation]"
850,852,5687,1422420,Dodgeball Academia,8,138,11,149,game,0,...,['Humble Games'],"{'currency': 'EUR', 'initial': 2499, 'final': ...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}, {'...","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '5 Aug, 2021'}",full,,2499,"[Action, Adventure, Indie, RPG, Sports]"
851,853,5689,359650,Star Trek™: 25th Anniversary,8,87,7,94,game,0,...,['Interplay Entertainment Corp.'],"{'currency': 'EUR', 'initial': 999, 'final': 6...","{'windows': True, 'mac': False, 'linux': False}","[{'id': 2, 'description': 'Single-player'}]","[{'id': '1', 'description': 'Action'}, {'id': ...","{'coming_soon': False, 'date': '7 May, 2015'}",,,999,"[Action, Adventure, Simulation]"


In [23]:
G = get_genres_set(df)

In [24]:
G

['Indie',
 'Early Access',
 'Utilities',
 'Sexual Content',
 'Sports',
 'Video Production',
 'Gore',
 'Web Publishing',
 'Nudity',
 'Strategy',
 'Violent',
 'Simulation',
 'Racing',
 'Audio Production',
 'Software Training',
 'Animation & Modeling',
 'Casual',
 'Massively Multiplayer',
 'Design & Illustration',
 'Action',
 'Adventure',
 'RPG',
 'Free to Play']

In [25]:
df = one_hot_encode_genres(df,G)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [26]:
df

Unnamed: 0.1,index,Unnamed: 0,appid,name,review_score,total_positive,total_negative,total_reviews,type,required_age,...,Audio Production,Software Training,Animation & Modeling,Casual,Massively Multiplayer,Design & Illustration,Action,Adventure,RPG,Free to Play
0,0,1,1648390,ScooterFlow,8,446,21,467,game,0,...,0,0,0,1,0,0,0,0,0,0
1,1,6,1648610,Malum,6,146,53,199,game,0,...,0,0,0,0,0,0,1,1,0,0
2,2,7,1647962,Hell Let Loose – Lethal Tide,8,70,6,76,dlc,0,...,0,0,0,0,1,0,1,0,0,0
3,3,20,1647550,NEO: The World Ends with You,8,224,11,235,game,0,...,0,0,0,0,0,0,1,0,1,0
4,4,21,1647730,Zool Redimensioned,8,68,2,70,game,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,850,5685,1349400,Planet Zoo: Australia Pack,5,77,34,111,dlc,0,...,0,0,0,0,0,0,0,0,0,0
849,851,5686,1683340,Kayak VR: Mirage,8,320,38,358,game,0,...,0,0,0,1,0,0,0,0,0,0
850,852,5687,1422420,Dodgeball Academia,8,138,11,149,game,0,...,0,0,0,0,0,0,1,1,1,0
851,853,5689,359650,Star Trek™: 25th Anniversary,8,87,7,94,game,0,...,0,0,0,0,0,0,1,1,0,0
