# Creation of Main Dataset
## Features:
* No. common title words matched
* Fame of publisher
* Fame of developer
* Number of platforms supported
* Multiplayer or not
* No. common genres matched
* No. common tags matched
## Predicting:
* Average number of concurrent players

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json

## Data preprocessing

In [43]:
# Steam store listing; later cells read its name, developer, publisher,
# popular_tags and game_details columns.
df1 = pd.read_csv(filepath_or_buffer='data/name_developer_publisher_tags/steam_games.csv')

In [44]:
# remove special chars
def check_chars(value):
    """Tell whether *value* should be kept when cleaning a title.

    Returns True for a single space, otherwise the result of
    ``value.isalpha()`` -- so digits and punctuation are rejected.
    """
    return value == " " or value.isalpha()
def remove_special_chars(value):
    """Return ``str(value)`` with every character rejected by ``check_chars`` removed."""
    text = str(value)
    kept_chars = [char for char in text if check_chars(char)]
    return ''.join(kept_chars)
# Clean the Steam titles; they are later compared for equality with the
# chart titles.
cleaned_steam_names = df1['name'].map(remove_special_chars)
df1['name'] = cleaned_steam_names

In [45]:
# Genre/platform table; later cells read its Name, Genres and Platform columns.
df2 = pd.read_csv(filepath_or_buffer='data/name_genres_platforms/games.csv')

In [46]:
# keep only letters and spaces in the titles
cleaned_game_names = df2['Name'].map(remove_special_chars)
df2['Name'] = cleaned_game_names

In [47]:
# Player-count history, parsed with the pure-Python engine and decoded with
# the 'unicode_escape' codec.
df3 = pd.read_csv(
    filepath_or_buffer='data/name_players/SteamCharts.csv',
    engine='python',
    encoding='unicode_escape',
)

In [48]:
# average number of players for each game
# numeric_only=True restricts the mean to numeric columns. pandas >= 2.0
# raises TypeError when mean() meets a text column, whereas older releases
# silently dropped such columns -- this keeps the older result on both.
df3 = (
    df3.groupby('gamename')
    .mean(numeric_only=True)
    .reset_index()
    .drop(["year", "gain", "peak"], axis=1)
)

In [49]:
# keep only letters and spaces in the chart titles
cleaned_chart_names = df3['gamename'].map(remove_special_chars)
df3['gamename'] = cleaned_chart_names

In [50]:
# remove empty name
# Compare the stripped title so that names consisting only of spaces are
# dropped as well. The stored names are deliberately left unstripped: the
# titles in df1/df2 are cleaned the same way and matched by exact equality.
has_name = df3['gamename'].str.strip() != ""
df3 = df3[has_name]

### List of common title words from 40k games

In [51]:
# Generated from Generate_list_for_webapp
# Read the raw JSON text, then decode it into the common title words.
with open('outputs/common_words.json', mode='r') as words_file:
    data = words_file.read()
common_word_list = json.loads(data)

### Dictionary of appearance count for publisher and developer

In [52]:
# Read the raw JSON text, then decode it into the publisher lookup.
with open('outputs/publisher_dict.json', mode='r') as publisher_file:
    data = publisher_file.read()
publisher_dict = json.loads(data)

In [53]:
# Read the raw JSON text, then decode it into the developer lookup.
with open('outputs/developer_dict.json', mode='r') as developer_file:
    data = developer_file.read()
developer_dict = json.loads(data)

In [54]:
# replace with publisher & developer fame
def replace_with_pub_val(value):
    """Return the ``publisher_dict`` entry for *value*, or 0 when it has none."""
    return publisher_dict.get(value, 0)
def replace_with_dev_val(value):
    """Return the ``developer_dict`` entry for *value*, or 0 when it has none."""
    return developer_dict.get(value, 0)
# Swap the publisher/developer names for their looked-up values.
publisher_fame = df1['publisher'].map(replace_with_pub_val)
developer_fame = df1['developer'].map(replace_with_dev_val)
df1['publisher'] = publisher_fame
df1['developer'] = developer_fame

### List of common tags

In [55]:
# Read the raw JSON text, then decode it into the common tags.
with open('outputs/common_tags.json', mode='r') as tags_file:
    data = tags_file.read()
common_tag_list = json.loads(data)

In [56]:
# replace with common tag count
def replace_with_tag_count(value):
    """Count the comma-separated entries of *value* found in ``common_tag_list``.

    *value* is converted with ``str`` first, and each entry is compared
    exactly as split (no whitespace trimming).
    """
    tags = str(value).split(',')
    return sum(1 for tag in tags if tag in common_tag_list)
# Replace each tag string with its number of common tags.
tag_match_counts = df1['popular_tags'].map(replace_with_tag_count)
df1['popular_tags'] = tag_match_counts

### Multiplayer or not

In [57]:
# replace with multiplayer value
def replace_with_mult(value):
    """Return 1 if the text of *value* contains "Multiplayer", otherwise 0.

    NOTE(review): the substring test is case-sensitive and literal, so
    spellings such as "Multi-player" do not count -- confirm this matches
    the values found in the data.
    """
    return 1 if "Multiplayer" in str(value) else 0
# Replace each game_details string with its 0/1 multiplayer flag.
multiplayer_flags = df1['game_details'].map(replace_with_mult)
df1['game_details'] = multiplayer_flags

### List of common genres

In [58]:
# Read the raw JSON text, then decode it into the common genres.
with open('outputs/common_genres.json', mode='r') as genres_file:
    data = genres_file.read()
common_genre_list = json.loads(data)

In [59]:
# replace with common genre count
def replace_with_genre_count(value):
    """Count the comma-separated entries of *value* found in ``common_genre_list``.

    *value* is converted with ``str`` first, and each entry is compared
    exactly as split (no whitespace trimming).
    """
    genres = str(value).split(',')
    return sum(1 for genre in genres if genre in common_genre_list)
# Replace each genre string with its number of common genres.
genre_match_counts = df2['Genres'].map(replace_with_genre_count)
df2['Genres'] = genre_match_counts

### Number of platforms supported

In [60]:
# replace with platform count
def replace_with_plat_count(value):
    """Return how many platforms are listed in a comma-separated *value*.

    Missing values (``None`` or NaN) and blank entries count as zero
    platforms. Without the guard, ``str(NaN)`` yields the text "nan", which
    would be counted as one platform.
    """
    # NaN is the only float that is not equal to itself.
    if value is None or (isinstance(value, float) and value != value):
        return 0
    return sum(1 for platform in str(value).split(',') if platform.strip())
# Replace each platform string with the number of platforms it lists.
platform_counts = df2['Platform'].map(replace_with_plat_count)
df2['Platform'] = platform_counts

## Create final dataset

In [61]:
# aggregate all features together into one dataset
# Every chart title in df3 is matched by exact name against the Steam listing
# (df1) and the genre/platform table (df2). When a title occurs more than
# once the first row wins; a title with no match gets -1 for those features.
def _first_value_by_name(frame, name_column, value_column):
    """Map each name in *frame* to *value_column* taken from its first row."""
    first_rows = frame.drop_duplicates(subset=name_column, keep='first')
    return dict(zip(first_rows[name_column], first_rows[value_column]))


def _lookup_feature(frame, name_column, value_column, names):
    """Return *value_column* for each entry of *names*, or -1 when unmatched."""
    by_name = _first_value_by_name(frame, name_column, value_column)
    return [by_name.get(name, -1) for name in names]


# One dictionary lookup per title replaces a full-frame scan per title.
chart_names = df3['gamename'].tolist()
pub_list = _lookup_feature(df1, 'name', 'publisher', chart_names)
dev_list = _lookup_feature(df1, 'name', 'developer', chart_names)
tag_list = _lookup_feature(df1, 'name', 'popular_tags', chart_names)
mult_list = _lookup_feature(df1, 'name', 'game_details', chart_names)
genre_list = _lookup_feature(df2, 'Name', 'Genres', chart_names)
plat_list = _lookup_feature(df2, 'Name', 'Platform', chart_names)

maindf = df3.assign(
    publisher=pub_list,
    developer=dev_list,
    tag_common=tag_list,
    multi=mult_list,
    genre_common=genre_list,
    plat_count=plat_list,
)

In [62]:
# replace each game name with its number of common title words (the column keeps the name 'gamename')
def replace_with_name_count(value):
    """Count the space-separated words of *value* found in ``common_word_list``.

    *value* is converted with ``str`` first and split on single spaces.
    """
    title_words = str(value).split(' ')
    return sum(1 for title_word in title_words if title_word in common_word_list)
# Replace each title with its number of common title words.
name_word_counts = maindf['gamename'].map(replace_with_name_count)
maindf['gamename'] = name_word_counts

In [63]:
# remove rows with -1
# A row is kept only when every looked-up feature is non-negative.
feature_columns = [
    'publisher',
    'developer',
    'tag_common',
    'genre_common',
    'multi',
    'plat_count',
]
all_matched = (maindf[feature_columns] >= 0).all(axis=1)
maindf = maindf[all_matched]

### Deal with outliers

In [71]:
# drop games whose average player count is 50000 or more
below_player_cap = maindf['avg'].lt(50000)
maindf = maindf[below_player_cap]

In [83]:
# adjust developer outlier
def get_normal_pub_value(value):
    """Return 200 when *value* is greater than 1000; otherwise return *value* unchanged."""
    return 200 if value > 1000 else value
# maindf is the result of boolean filtering, so setting a column on it in
# place can raise SettingWithCopyWarning; assign() returns a new frame with
# the adjusted column in the same position.
maindf = maindf.assign(developer=maindf['developer'].map(get_normal_pub_value))

## Export Main Dataset

In [91]:
# Write the final feature table to disk (to_csv includes the row index by default).
maindf.to_csv(path_or_buf='outputs/main_dataset.csv')