# Sentiment Analysis For Steam Games

The data was collected with a Scrapy web spider.

We need to organize the data and remove unnecessary elements such as punctuation and new line characters. 

In [75]:
import numpy as np
import tensorflow
import json

## Reading and Preparing Data

We create the genre combinations as one-hot vectors so that every combination is treated as its own class during classification.

* This makes it possible to differentiate between, for example, "Action" and "Action Adventure" as genres.

In [76]:
# Read the raw JSON text of the genre list. The encoding is given explicitly
# because JSON text is UTF-8 and the default of open() depends on the locale.
with open('Steam-Data/steam_genres.json', 'r', encoding='utf-8') as f:
    genres = f.read()

In [77]:
# Parse the raw JSON text read from steam_genres.json; `genres` is rebound
# from the text to the decoded Python objects.
genres = json.loads(genres)

In [106]:
# Alphabetically ordered list of the 'genre' field of every genre record.
genre_list = sorted(entry['genre'] for entry in genres)

In [107]:
# Games usually combine two or three genres; in rare instances they combine as many as five.
# NOTE(review): the previous expression tested `genre_list == 2` (a list compared with an
# int), which is always False, so genre_length was always 2 regardless of the data.
# genre_length is used as the *exclusive* upper bound of range(1, genre_length) when the
# labels are built, so 2 means "labels made of exactly one genre". That effective
# behaviour is kept here and stated explicitly; raise max_genres_per_label (e.g. to 5)
# to also generate combined-genre labels, at the cost of many more labels.
import itertools
max_genres_per_label = 1
genre_length = min(len(genre_list), max_genres_per_label) + 1

In [108]:
# Build one label per genre combination: for every combination size from 1 up to
# (but not including) genre_length, join the member genres with single spaces.
total_labels = []
for size in range(1, genre_length):
    total_labels.extend(
        ' '.join(genre_combo)
        for genre_combo in itertools.combinations(genre_list, size)
    )

In [113]:
# Number of generated genre labels; the bare name on the last line only
# displays the value in the notebook.
labels_length = len(total_labels)
labels_length

12

In [114]:
# One-hot encoded labels: the identity matrix of size labels_length x labels_length,
# whose row k has a single 1 in column k, i.e. the one-hot vector for total_labels[k].
output = np.identity(labels_length)

In [115]:
output

array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [116]:
# Preparing the game data
# Read the raw JSON text of the game records. The encoding is given explicitly
# because JSON text is UTF-8 and the default of open() depends on the locale.
with open('Steam-Data/steam_game_data.json', 'r', encoding='utf-8') as f:
    games = f.read()

In [117]:
# Parse the raw JSON text read from steam_game_data.json; `games` is rebound
# from the text to the decoded Python objects.
games = json.loads(games)

In [118]:
# The 'url' field of every game record, in the same order as `games`.
game_url = [game['url'] for game in games]

In [119]:
# Extracting appid from the url
import re
# Each game is kept together with its url while filtering. The earlier version ran
# `del games[i]` with `i` counting positions in game_url; after the first deletion
# `games` is one element shorter, so every later index pointed at the wrong record
# and the wrong games were removed.
app_url_pattern = re.compile(r"http://store.steampowered.com/app/(.+?)/")
game_app_id = []
games_with_app_id = []
for url, game in zip(game_url, games):
    match = app_url_pattern.search(url)
    # Urls that do not match (sub products) are dropped.
    if match:
        game_app_id.append(match.group(1))
        games_with_app_id.append(game)
# Replace the contents in place so that game_app_id[k] belongs to games[k].
games[:] = games_with_app_id

In [120]:
len(games) == len(game_app_id)

True

In [121]:
len(games)

14776

In [122]:
# Number of game records left after the sub products were removed.
game_length = len(games)

### Combine all description and about page of the game

The description is a short summary of the game, and the about page lists some of its key features; combining the two will help us categorize the games correctly.

In [123]:
# Pull the 'description' and 'about' fields out of every game record.
game_description_information = [record['description'] for record in games]
game_about_information = [record['about'] for record in games]

# Trim every element of each 'about' value and glue the elements back
# together with single spaces.
joined_about_array = [
    ' '.join(fragment.strip() for fragment in about_fragments)
    for about_fragments in game_about_information
]

# One text per game: the stripped description, a space, then the joined about text.
combine_words = [
    str(game_description_information[idx]).strip() + ' ' + joined_about_array[idx]
    for idx in range(game_length)
]

combine_words[0]

"PLAYERUNKNOWN'S BATTLEGROUNDS is a last-man-standing shooter being developed with community feedback. Players must fight to locate weapons and supplies in a massive 8x8 km island to be the lone survivor. This is BATTLE ROYALE.   is a last-man-standing shooter being developed with community feedback. Starting with nothing, players must fight to locate weapons and supplies in a battle to be the lone survivor. This realistic, high tension game is set on a massive 8x8 km island with a level of detail that showcases Unreal Engine 4's capabilities. aka Brendan Greene, is a pioneer of the Battle Royale genre. As the creator of the Battle Royale game-mode found in the ARMA series and H1Z1 : King of the Kill, Greene is co-developing the game with veteran team at Bluehole to create the most diverse and robust Battle Royale experience to date "

In [124]:
import re
# Compiled once outside the loop: matches every run of characters that is not
# an ASCII letter or digit (punctuation, new lines, ...).
non_alphanumeric = re.compile('[^0-9a-zA-Z]+')
game_information = []
# Clean the text of every game. This loop used to iterate over combine_words[:1],
# i.e. the first game only, which left game_information with a single entry.
for combine in combine_words:
    # Split on single spaces so the original spacing survives the re-join.
    words_in_combine = combine.split(' ')
    individual_game_info = ' '.join([non_alphanumeric.sub('', word) for word in words_in_combine])
    game_information.append(individual_game_info)

game_information[0]

'PLAYERUNKNOWNS BATTLEGROUNDS is a lastmanstanding shooter being developed with community feedback Players must fight to locate weapons and supplies in a massive 8x8 km island to be the lone survivor This is BATTLE ROYALE   is a lastmanstanding shooter being developed with community feedback Starting with nothing players must fight to locate weapons and supplies in a battle to be the lone survivor This realistic high tension game is set on a massive 8x8 km island with a level of detail that showcases Unreal Engine 4s capabilities aka Brendan Greene is a pioneer of the Battle Royale genre As the creator of the Battle Royale gamemode found in the ARMA series and H1Z1  King of the Kill Greene is codeveloping the game with veteran team at Bluehole to create the most diverse and robust Battle Royale experience to date '

### Creating word dictionary for games

After combining all of the description and about pages, let's create a dictionary for all the words

In [125]:
all_text = ' '.join(game_information)
# split() without an argument splits on runs of whitespace. Splitting on ' '
# produced an empty string for every extra space, and those empty strings were
# counted as a word in the vocabulary. The per-game encoding tokenises with
# each.split(), so both steps now produce the same tokens.
all_words = all_text.split()
all_words[:25]

['PLAYERUNKNOWNS',
 'BATTLEGROUNDS',
 'is',
 'a',
 'lastmanstanding',
 'shooter',
 'being',
 'developed',
 'with',
 'community',
 'feedback',
 'Players',
 'must',
 'fight',
 'to',
 'locate',
 'weapons',
 'and',
 'supplies',
 'in',
 'a',
 'massive',
 '8x8',
 'km',
 'island']

### Encoding the words

All of the words need to be converted into numbers that the neural network (NN) can understand.

In [126]:
from collections import Counter
word_counter = Counter(all_words)
# Words ordered from most to least frequent; ties keep first-seen order.
vocab = [word for word, _count in word_counter.most_common()]
# Word ids start at 1, so the most frequent word gets id 1.
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))

# Every game's text becomes the list of ids of its whitespace-separated words.
game_info_to_ints = [
    [vocab_to_int[token] for token in info.split()]
    for info in game_information
]


### Encoding Genres

All of the genres must be encoded as numbers so that the network can use them.

In [141]:
# Position of every genre inside genre_list.
genres_to_int = dict(zip(genre_list, range(len(genre_list))))

# Sorted 'genres' field of every game record.
game_genres = [sorted(record['genres']) for record in games]

# One row per game with one entry per genre in genre_list:
# 1 when the game lists that genre, 0 otherwise.
game_genres_to_int = [
    [int(name in owned_genres) for name in genre_list]
    for owned_genres in game_genres
]

In [142]:
game_genres_to_int[0]

[1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]

In [144]:
game_genres[0]

['Action', 'Adventure', 'Early Access', 'Massively Multiplayer', 'Violent']

In [146]:
len(game_genres_to_int)

14776