In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import json
import altair as alt
import re
from sklearn.preprocessing import MinMaxScaler

game_details = pd.read_csv("steam_game_details.csv")
game_stats = pd.read_csv("steam_game_stats.csv")
game_reviews = pd.read_csv("steam_review_data.csv")

game_reviews = game_reviews.dropna()

combined_details = game_details.merge(game_stats, on="app_id")
combined_details['owners_upper'] = combined_details['owners'].apply(
    lambda x: int(re.search(r"\.\. ([\d,]+)", x).group(1).replace(',', '')) if pd.notnull(x) else None
)

combined_details['total'] = combined_details['positive'] + combined_details['negative']

combined_details['positive_ratio'] = combined_details['positive'] / combined_details['total']

scaler = MinMaxScaler()

combined_details['ranked_positive'] = combined_details['positive_ratio'].rank(pct=True)
combined_details['ranked_owners'] = combined_details['owners_upper'].rank(pct=True)

combined_details['popularity_score_ranked'] = (
    0.3 * combined_details['ranked_positive'] +
    0.7 * combined_details['ranked_owners']
)

combined_details['popularity_rating'] = (combined_details['popularity_score_ranked'] * 5).round(1)

In [2]:
agg_reviews = game_reviews.groupby(by="app_id")['review_text'].apply(lambda x: ' '.join(x))
agg_reviews

app_id
10        Ruined my life. This will be more of a ''my ex...
20        When I got this for Christmas in 1999 along wi...
30        Even though its old, its better than Call of D...
40        Buy this game and join the community of one Th...
50        This is the first game created by Gearbox - th...
                                ...                        
562600    I must be the most ♥♥♥♥♥♥♥♥ person in the enti...
562700    its not fun and it takes multiple shots to tag...
563180    Surprisingly varied, if slightly too exacting ...
563400    Ludu is a pretty neat 'adventure' puzzle game....
563430    Do you like RPGs? Do you like exploring dungeo...
Name: review_text, Length: 9368, dtype: object

In [19]:
with_reviews = combined_details.merge(agg_reviews,on='app_id')

In [13]:
game_reviews

Unnamed: 0,app_id,app_name,review_text,review_score,review_votes
0,10,Counter-Strike,Ruined my life.,1,0
1,10,Counter-Strike,This will be more of a ''my experience with th...,1,1
2,10,Counter-Strike,This game saved my virginity.,1,0
3,10,Counter-Strike,• Do you like original games? • Do you like ga...,1,0
4,10,Counter-Strike,"Easy to learn, hard to master.",1,1
...,...,...,...,...,...
6417101,99910,Puzzle Pirates,I really ove this game but it needs somethings...,-1,0
6417102,99910,Puzzle Pirates,"Used to play Puzzel Pirates 'way back when', b...",-1,0
6417103,99910,Puzzle Pirates,"This game was aright, though a bit annoying. W...",-1,0
6417104,99910,Puzzle Pirates,"I had a nice review to recommend this game, bu...",-1,0


In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download(['wordnet', 'stopwords', 'punkt'])
nltk.download('punkt_tab')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

en_stopwords = stopwords.words('english')
lmr = WordNetLemmatizer()

def findTopics(text):
    topics = []
    
    for t in word_tokenize(text):
        if t.isalpha():
            t = lmr.lemmatize(t.lower())
            if t not in en_stopwords:
                topics.append(t)
    return topics

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/davidlee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davidlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/davidlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/davidlee/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['ruined',
 'life',
 'experience',
 'game',
 'type',
 'review',
 'saying',
 'thing',
 'like',
 'great',
 'gameplay',
 'suit',
 'something',
 'experienced',
 'go',
 'remember',
 'back',
 'wa',
 'friend',
 'house',
 'wa',
 'playing',
 'game',
 'know',
 'name',
 'game',
 'internet',
 'find',
 'week',
 'passed',
 'another',
 'friend',
 'came',
 'computer',
 'brought',
 'disc',
 'game',
 'told',
 'wa',
 'one',
 'best',
 'game',
 'moment',
 'knew',
 'going',
 'game',
 'saw',
 'friend',
 'house',
 'saw',
 'logo',
 'wa',
 'filled',
 'gamegasm',
 'wa',
 'happy',
 'wa',
 'playing',
 'hardcore',
 'made',
 'friend',
 'clan',
 'wa',
 'involved',
 'community',
 'even',
 'made',
 'two',
 'first',
 'game',
 'played',
 'competitively',
 'wa',
 'experience',
 'playing',
 'public',
 'server',
 'mod',
 'fun',
 'playing',
 'competitively',
 'made',
 'intense',
 'stressful',
 'pleasant',
 'way',
 'ofcourse',
 'looking',
 'current',
 'scene',
 'might',
 'seem',
 'like',
 'much',
 'back',
 'wa',
 'different',

In [24]:
test = findTopics(with_reviews['review_text'][0])
from gensim.corpora.dictionary import Dictionary

doc_dict = Dictionary([test])

In [32]:
article_bow = doc_dict.doc2bow(test)

most_frequent = sorted(article_bow, key=lambda x: x[1], reverse=True)

term_ids, counts = zip(*most_frequent)

top_terms = [doc_dict[id] for id in term_ids[:20]]
top_term_counts = counts[:5]

top_terms

['game',
 'best',
 'c',
 'play',
 'good',
 'still',
 'old',
 'ever',
 'one',
 'fps',
 'wa',
 'time',
 'played',
 'like',
 'go',
 'playing',
 'great',
 'classic',
 'fun',
 'lt']