# Packages importieren

In [1]:
import pickle
import string

import numpy as np
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


# Steam Reviews Datensatz einlesen

In [2]:
# Load the raw Steam reviews export; the path is relative to the notebook's working directory.
data = pd.read_csv('./data/steam_reviews.csv')

# Show the first ten rows as a quick check of the parsed columns.
data.head(10)

Unnamed: 0,date_posted,funny,helpful,hour_played,is_early_access_review,recommendation,review,title
0,2019-02-10,2,4,578,False,Recommended,&gt Played as German Reich&gt Declare war on B...,Expansion - Hearts of Iron IV: Man the Guns
1,2019-02-10,0,0,184,False,Recommended,yes.,Expansion - Hearts of Iron IV: Man the Guns
2,2019-02-07,0,0,892,False,Recommended,Very good game although a bit overpriced in my...,Expansion - Hearts of Iron IV: Man the Guns
3,2018-06-14,126,1086,676,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
4,2017-06-20,85,2139,612,False,Recommended,Disclaimer I survivor main. I play games for f...,Dead by Daylight
5,2016-12-12,4,55,2694,False,Recommended,ENGLISH After playing for more than two years ...,Dead by Daylight
6,2017-09-17,12,228,48,False,Recommended,Out of all the reviews I wrote This one is pro...,Dead by Daylight
7,2018-12-24,295,219,71,False,Recommended,I have never been told to kill myself more tha...,Dead by Daylight
8,2018-09-21,2,54,400,False,Recommended,Any longtime Dead by Daylight player knows tha...,Dead by Daylight
9,2018-12-05,380,271,414,False,Recommended,if you think cs go is toxic try this game,Dead by Daylight


# Reviews, die mehr als 150 Zeichen haben

In [3]:
# Character count of every review text
data['review length'] = data['review'].str.len()
# Keep only reviews longer than 150 characters and renumber the rows from 0
data_temp = data.loc[data['review length'] > 150.0].reset_index(drop=True)

# Language Detection

### Nur Reviews, die auf Englisch geschrieben sind

In [None]:
# Fix langdetect's random seed so repeated runs assign the same language to the
# same review (its detection is randomised by default).
DetectorFactory.seed = 0

# Any exception raised by detect() marks the review as "unknown" instead of
# aborting the whole pass.
lang = []
for rev in data_temp["review"]:
    try:
        lang.append(detect(rev))
    except Exception:
        lang.append("unknown")

data_temp['lang'] = lang

In [None]:
data_temp.head()

In [None]:
# Reviews whose detected language is anything other than English (including "unknown"), kept for inspection.
data_not_en = data_temp[data_temp['lang'] != 'en']

In [None]:
data_not_en.head(20)

In [None]:
# Keep only the reviews detected as English. The explicit copy makes data_en an
# independent frame, so the column assignments in later cells modify it directly
# instead of triggering pandas' SettingWithCopyWarning on a slice of data_temp.
data_en = data_temp[data_temp['lang'] == 'en'].copy()

# Fix Funny

In [None]:
def fix_funny(count, threshold=100000):
    """Reset implausibly large 'funny' vote counts to zero.

    Parameters
    ----------
    count : number
        Raw 'funny' value of a single review.
    threshold : number, optional
        Largest value still accepted as a real count. Defaults to 100000,
        the cut-off that used to be hard-coded here.

    Returns
    -------
    number
        0 when ``count`` is strictly greater than ``threshold``, otherwise
        ``count`` unchanged.
    """
    # NOTE(review): values above the threshold are presumably corrupt entries in
    # the export; confirm against the data source.
    return 0 if count > threshold else count

In [None]:
# Replace out-of-range 'funny' counts with 0, one value at a time via fix_funny.
data_en['funny'] = data_en['funny'].apply(fix_funny)

# Reviews kombinieren und Kennzahlen berechnen

In [None]:
# Convert the textual recommendation into a boolean flag (True for 'Recommended').
# NOTE(review): the column is overwritten in place, so running this cell a second
# time compares booleans with 'Recommended' and turns every value into False.
data_en['recommendation'] = data_en['recommendation'] == 'Recommended'

In [None]:
# Reviews per game: count the non-null posting dates inside each title group and
# expose the result as a one-column frame named 'review_count'.
games_reviewed = (
    data_en
    .groupby('title')['date_posted']
    .count()
    .to_frame('review_count')
)
games_reviewed.head()

In [None]:
# Per-game aggregates. Each statistic selects its column before aggregating, so
# the text and date columns are never summed along the way.
reviews_by_title = data_en.groupby('title')

# Number of positive recommendations (sum of the boolean flag)
games_reviewed['recommendation_count'] = reviews_by_title['recommendation'].sum()

# Share of positive recommendations among all reviews of the game
games_reviewed['recommendation_ratio'] = games_reviewed['recommendation_count'] / games_reviewed['review_count']

# Average hours played per review of the game
games_reviewed['avg_playtime'] = reviews_by_title['hour_played'].sum() / games_reviewed['review_count']

# All review texts of a game joined into one document, separated by spaces
games_reviewed['reviews_combined'] = reviews_by_title['review'].apply(' '.join)

# Turn the title index back into a regular column
games_reviewed = games_reviewed.reset_index()

# Sequential document ids starting at 1
games_reviewed['docid'] = range(1, 1 + len(games_reviewed))

games_reviewed.head()

# Datensatz abspeichern

In [None]:
# with open('./data/games_reviewed_en.pickle', 'wb') as handle:
#     pickle.dump(games_reviewed, handle, protocol=pickle.HIGHEST_PROTOCOL)

## TEMPORARY - Games Reviewed English - einlesen, um von hier weiterzuarbeiten

In [4]:
# Reload the cached per-game table so the notebook can be resumed from this point.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that were produced by this project.
with open('./data/games_reviewed_en.pickle', 'rb') as handle:
    games_reviewed = pickle.load(handle)

In [97]:
games_reviewed.head()

Unnamed: 0,title,review_count,recommendation_count,recommendation_ratio,avg_playtime,reviews_combined,docid
0,ACE COMBAT™ 7: SKIES UNKNOWN,8,8.0,1.0,19.75,Oh man it's been at least a decade since Ace C...,1
1,ARK: Survival Evolved,129,25.0,0.193798,1013.790698,Before i start the review THE GAMEPLAY OF THIS...,2
2,ASTRONEER,1431,1363.0,0.952481,54.39413,i have not played much of this game and i dont...,3
3,Battlefleet Gothic: Armada 2,8,6.0,0.75,62.75,Dear Devs thanks for taking your time and maki...,4
4,Beat Saber,6,6.0,1.0,80.5,When I pull off a really difficult section I g...,5


# Tags einlesen, verknüpfen und abspeichern

In [98]:
# Read the semicolon-separated genre/tag table.
# NOTE(review): the file name says "utf" but the file is decoded as latin-1;
# confirm the real encoding, otherwise non-ASCII tag names would be garbled.
tags = pd.read_csv('./data/genres_utf.csv', sep = ";",encoding='latin-1')

# One plain Python list per row of the table
tags_list = tags.values.tolist()

In [138]:
# Per row of the tag table: element 1 is used as the document id, elements 2-19
# as the tag list; every game additionally gets the catch-all tag 'All'.
id_temp = [row[1] for row in tags_list]
list_temp = [row[2:20] + ['All'] for row in tags_list]


In [139]:
# One row per entry of the tag table: its docid and the corresponding tag list.
tags_df = pd.DataFrame({'docid': id_temp, 'tags': list_temp})

In [127]:
# Attach the tag list to each game by matching on docid (DataFrame.join defaults
# to a left join, so every row of games_reviewed is kept).
games_reviewed2 = games_reviewed.join(tags_df.set_index('docid'), on='docid')

In [152]:
# Flatten the tags of all games (row elements 2 onwards) into one list and add
# the catch-all tag 'All' once per game. The rows of tags_list are only read:
# appending 'All' to them in place would grow every row again on each re-run of
# this cell and leak into any other cell that slices tags_list.
tags_temp = []
for item in tags_list:
    tags_temp.extend(item[2:])
    tags_temp.append('All')

In [155]:
# Distinct tags across all games. Missing values (presumably NaN from empty cells
# of the tag table) are filtered out explicitly; slicing off the first element of
# list(set(...)) would remove an arbitrary entry, because sets are unordered.
unique_tags = [tag for tag in set(tags_temp) if pd.notna(tag)]

In [156]:
# Sort the tags so the list has a deterministic order.
unique_tags = sorted(unique_tags)

In [157]:
games_reviewed2.head()

Unnamed: 0,title,review_count,recommendation_count,recommendation_ratio,avg_playtime,reviews_combined,docid,tags
0,ACE COMBAT™ 7: SKIES UNKNOWN,8,8.0,1.0,19.75,Oh man it's been at least a decade since Ace C...,1,"[Flugsimulation, Jet, Military, War, Shooter, ..."
1,ARK: Survival Evolved,129,25.0,0.193798,1013.790698,Before i start the review THE GAMEPLAY OF THIS...,2,"[Open-World-Survival-Craft, Survival, Open-Wor..."
2,ASTRONEER,1431,1363.0,0.952481,54.39413,i have not played much of this game and i dont...,3,"[Open-World-Survival-Craft, Open-World, Multip..."
3,Battlefleet Gothic: Armada 2,8,6.0,0.75,62.75,Dear Devs thanks for taking your time and maki...,4,"[Strategy, Warhammer 40k, Space, Real-timeTact..."
4,Beat Saber,6,6.0,1.0,80.5,When I pull off a really difficult section I g...,5,"[VR, Rhythmus, Musik, Good soundtrack, Indie, ..."


In [158]:
# with open('./data/unique_tags.pickle', 'wb') as handle:
#     pickle.dump(unique_tags, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [131]:
# with open('./data/games_reviewed.pickle', 'wb') as handle:
#     pickle.dump(games_reviewed2, handle, protocol=pickle.HIGHEST_PROTOCOL)

# 1) Documents

In [5]:
# Columns needed for indexing: title, combined review text and document id.
corpus = games_reviewed[['title', 'reviews_combined', 'docid']]

# Build {title: {'reviews_combined': text, 'docid': id}}. add_document() unpacks
# the inner dict by position, so the column order above must stay as it is.
df_games = corpus.set_index('title').to_dict(orient='index')

## 1.1) Dict pro Wort - mit Stopwords & Stemming

In [6]:
def add_document(doc):
    """
    Add one document to the inverted index.

    Parameters
    ----------
    doc : dict
        Mapping with exactly two values, in this order: the document text and
        its document id.

    Side effects
    ------------
    Stores the text in the module-level ``documents`` (docid -> text),
    increments ``docid_counter`` and updates ``the_index``. For every stemmed
    token that is not a stopword or punctuation, ``the_index[term]`` holds
    ``'df'`` (number of documents containing the term) and ``'docs'``
    (docid -> term frequency in that document).
    """

    global documents, docid_counter, the_index

    # Positional unpacking: relies on the insertion order of the dict's values.
    text, docid = doc.values()

    if text in documents.values():
        # NOTE(review): this only warns; the document is still indexed below, so
        # re-adding an already indexed docid adds to its existing term counts.
        print('document already included!')
        print(text)

    documents[docid] = text
    docid_counter += 1

    stemmer = SnowballStemmer("english")

    # Built once per document instead of once per token; a set also gives
    # constant-time membership tests.
    stop = set(stopwords.words('english')) | set(string.punctuation) | {'\n'}

    for term in word_tokenize(text.lower().strip()):
        if term in stop:
            continue

        term = stemmer.stem(term)
        entry = the_index.get(term)
        if entry is None:
            # first occurrence of this term in any document
            the_index[term] = {'df': 1, 'docs': {docid: 1}}
        elif docid in entry['docs']:
            # term already seen in this document: raise its term frequency
            entry['docs'][docid] += 1
        else:
            # known term, new document: start its tf and raise the document frequency
            entry['docs'][docid] = 1
            entry['df'] += 1


In [7]:
def fill_dict(dict):
    """
    Index every document contained in the given mapping.

    Each key is reported on stdout and its value is handed to add_document().
    """
    for key, entry in dict.items():
        print("adding document: ", key)
        add_document(entry)



In [14]:
# docid -> full text of the document (filled by add_document)
documents = {}
# Running counter, incremented once for every added document
docid_counter = 1
# Inverted index: term -> {'df': document frequency, 'docs': {docid: term frequency}}
the_index = {}

In [15]:
# takes a lot of time
fill_dict(df_games)

adding document:  ACE COMBAT™ 7: SKIES UNKNOWN
adding document:  ARK: Survival Evolved
adding document:  ASTRONEER
adding document:  Battlefleet Gothic: Armada 2
adding document:  Beat Saber
adding document:  Cold Waters
adding document:  Dead by Daylight
adding document:  Divinity: Original Sin 2 - Definitive Edition
adding document:  Don't Starve Together
adding document:  Euro Truck Simulator 2
adding document:  Expansion - Hearts of Iron IV: Man the Guns
adding document:  Factorio
adding document:  Farming Simulator 19
adding document:  Football Manager 2019
adding document:  Foundation
adding document:  GOD EATER 3
adding document:  Garry's Mod
adding document:  Grand Theft Auto V
adding document:  Human: Fall Flat
adding document:  Insurgency: Sandstorm
adding document:  Kenshi
adding document:  Left 4 Dead 2
adding document:  MONSTER HUNTER: WORLD
adding document:  Moonlighter
adding document:  My Time At Portia
adding document:  NBA 2K19
adding document:  Overcooked! 2
adding d

In [1]:
# print(the_index)

# Index, Documents und Docid_counter abspeichern

In [16]:
# Persist the inverted index (built with stopword removal and stemming).
with open('./data/the_index_stem.pickle', 'wb') as handle:
    pickle.dump(the_index, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [17]:
# Persist the docid -> document text mapping.
with open('./data/documents.pickle', 'wb') as handle:
    pickle.dump(documents, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Persist the running document counter alongside the index.
with open('./data/docid_counter.pickle', 'wb') as handle:
    pickle.dump(docid_counter, handle, protocol=pickle.HIGHEST_PROTOCOL)