# Creation of Main Dataset
## Features:
* No. common title words matched
* Fame of publisher
* Fame of developer
* Number of platforms supported
* Multiplayer or not
* No. common genres matched
* No. common tags matched
## Predicting:
* Average number of concurrent players

In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

### List of common title words from 40k games

In [2]:
TITLE_COMMON_THRESHOLD = 40

In [3]:
df1 = pd.read_csv('data/name_developer_publisher_tags/steam_games.csv')

In [4]:
# remove special chars
def check_chars(value):
    if value == " ":
        return True
    else:
        return value.isalpha()
def remove_special_chars(value):
    return ''.join(filter(check_chars, str(value)))
df1['name'] = df1['name'].map(remove_special_chars)

In [5]:
# create common title word dict
title_word_dict = {}
def create_word_dict(value):
    words = value.split(' ')
    for word in words:
        if word not in title_word_dict:
            title_word_dict[word] = 1
        else:
            title_word_dict[word] += 1
df1['name'].map(create_word_dict)
common_word_list = []
for name in title_word_dict:
    if title_word_dict[name] > TITLE_COMMON_THRESHOLD:
        common_word_list.append(name)

### Publisher & developer fame

In [6]:
# add dict of publisher & developer appearance count
publisher_dict = {}
developer_dict = {}
def create_pub_dict(value):
    if value not in publisher_dict:
        publisher_dict[value] = 1
    else:
        publisher_dict[value] += 1
def create_dev_dict(value):
    if value not in developer_dict:
        developer_dict[value] = 1
    else:
        developer_dict[value] += 1
df1['publisher'].map(create_pub_dict)
df1['developer'].map(create_dev_dict)
print('done')

done


In [7]:
# replace with publisher & developer fame
def replace_with_pub_val(value):
    if value in publisher_dict:
        return publisher_dict[value]
    else:
        return 0
def replace_with_dev_val(value):
    if value in developer_dict:
        return developer_dict[value]
    else:
        return 0    
df1['publisher'] = df1['publisher'].map(replace_with_pub_val)
df1['developer'] = df1['developer'].map(replace_with_dev_val)

### List of common tags

In [8]:
TAG_THRESHOLD = 40

In [9]:
# create common tag dict
tag_dict = {}
def create_tag_dict(value):
    words = str(value).split(',')
    for word in words:
        if word not in tag_dict:
            tag_dict[word] = 1
        else:
            tag_dict[word] += 1
df1['popular_tags'].map(create_tag_dict)
common_tag_list = []
for name in tag_dict:
    if tag_dict[name] > TAG_THRESHOLD:
        common_tag_list.append(name)

In [10]:
# replace with common tag count
def replace_with_tag_count(value):
    words = str(value).split(',')
    gcount = 0
    for word in words:
        if word in common_tag_list:
            gcount += 1
    return gcount
df1['popular_tags'] = df1['popular_tags'].map(replace_with_tag_count)

### Multiplayer or not

In [11]:
# replace with multiplayer value
def replace_with_mult(value):
    if "Multiplayer" in str(value):
        return 1
    return 0
df1['game_details'] = df1['game_details'].map(replace_with_mult)

### List of common genres

In [12]:
GENRE_THRESHOLD = 10

In [13]:
df2 = pd.read_csv('data/name_genres_platforms/games.csv')

In [14]:
# remove special chars
df2['Name'] = df2['Name'].map(remove_special_chars)

In [15]:
# create common genre dict
genre_dict = {}
def create_genre_dict(value):
    words = str(value).split(',')
    for word in words:
        if word not in genre_dict:
            genre_dict[word] = 1
        else:
            genre_dict[word] += 1
df2['Genres'].map(create_genre_dict)
common_genre_list = []
for name in genre_dict:
    if genre_dict[name] > GENRE_THRESHOLD:
        common_genre_list.append(name)

In [16]:
# replace with common genre count
def replace_with_genre_count(value):
    words = str(value).split(',')
    gcount = 0
    for word in words:
        if word in common_genre_list:
            gcount += 1
    return gcount
df2['Genres'] = df2['Genres'].map(replace_with_genre_count)

### Number of platform supported

In [17]:
# replace with platform count
def replace_with_plat_count(value):
    return len(str(value).split(','))
df2['Platform'] = df2['Platform'].map(replace_with_plat_count)

### Average number of concurrent players

In [18]:
df3 = pd.read_csv('data/name_players/SteamCharts.csv', encoding = 'unicode_escape', engine ='python')

In [19]:
df3 = df3.groupby('gamename').mean().reset_index().drop(["year", "gain", "peak"], axis=1)

In [20]:
# remove special chars from name
df3['gamename'] = df3['gamename'].map(remove_special_chars)

In [21]:
# remove empty name
df3['gamename'].map(lambda x: x.strip())
df3 = df3[df3.gamename != ""]

### Add features

In [22]:
# aggregate all features together into one dataset
pub_list = []
dev_list = []
tag_list = []
mult_list = []
genre_list = []
plat_list = []
for _,row in df3.iterrows():
    found = df1[df1.name == row.gamename]
    if found.shape[0] > 0:
        found_1 = found.iloc[[0]]
        pub_list.append(found_1['publisher'].iloc[0])
        dev_list.append(found_1['developer'].iloc[0])
        tag_list.append(found_1['popular_tags'].iloc[0])
        mult_list.append(found_1['game_details'].iloc[0])
    else:
        pub_list.append(-1)
        dev_list.append(-1)
        tag_list.append(-1)
        mult_list.append(-1)

    found2 = df2[df2.Name == row.gamename]
    if found2.shape[0] > 0:
        found_2 = found2.iloc[[0]]
        genre_list.append(found2['Genres'].iloc[0])
        plat_list.append(found2['Platform'].iloc[0])
    else:
        genre_list.append(-1)
        plat_list.append(-1)
maindf = df3.assign(publisher=pub_list).assign(developer=dev_list).assign(tag_common=tag_list).assign(multi=mult_list).assign(genre_common=genre_list).assign(plat_count=plat_list)
maindf.head()

Unnamed: 0,gamename,avg,publisher,developer,tag_common,multi,genre_common,plat_count
0,Orange Juice,203.529286,37,21,20,0,2,1
1,is Better Than,25.405938,-1,-1,-1,-1,2,4
3,Seconds,71.822714,2,3,20,0,5,3
4,Days to Die,9405.545172,1,1,20,1,7,5
5,Operator,114.976182,8,7,20,0,3,2


In [23]:
# change name column to name_common
def replace_with_name_count(value):
    words = str(value).split(',')
    gcount = 0
    for word in words:
        if word in common_word_list:
            gcount += 1
    return gcount
maindf['gamename'] = maindf['gamename'].map(replace_with_name_count)

In [28]:
# remove rows with -1
maindf = maindf[maindf.publisher >= 0]
maindf = maindf[maindf.developer >= 0]
maindf = maindf[maindf.tag_common >= 0]
maindf = maindf[maindf.genre_common >= 0]
maindf = maindf[maindf.multi >= 0]
maindf = maindf[maindf.plat_count >= 0]
maindf.head()

Unnamed: 0,gamename,avg,publisher,developer,tag_common,multi,genre_common,plat_count
0,0,203.529286,37,21,20,0,2,1
3,0,71.822714,2,3,20,0,5,3
4,0,9405.545172,1,1,20,1,7,5
5,0,114.976182,8,7,20,0,3,2
15,0,273.227833,4,5,20,0,2,6


## Export Main Dataset

In [29]:
maindf.to_csv('data/main_dataset.csv')