In [1]:
from bs4 import BeautifulSoup
import numpy as np
import requests
import tqdm
import platform
import types
import bs4
import os
import json
import time
import pandas as pd
import zipfile
import io

def imports():
    """Yield every module that is bound to a name in the global namespace.

    Yields:
        ``(name, version)`` when the module exposes a ``__version__``
        attribute, otherwise just the module's ``__name__`` string.
    """
    # Work on a snapshot: the generator can be consumed lazily while new
    # globals are created, which would otherwise abort the iteration with
    # "dictionary changed size during iteration".
    for val in list(globals().values()):
        if not isinstance(val, types.ModuleType):
            continue
        try:
            version = val.__version__
            has_version = True
        except Exception:
            # Best effort: a module without a readable version is still listed.
            version = None
            has_version = False
        # The yields live outside the try block on purpose. A bare ``except``
        # wrapped around a ``yield`` also catches GeneratorExit, so closing
        # the generator early ended in "generator ignored GeneratorExit".
        if has_version:
            yield val.__name__, version
        else:
            yield val.__name__

list(imports())

['builtins',
 'builtins',
 ('numpy', '1.26.3'),
 ('requests', '2.31.0'),
 ('tqdm', '4.66.1'),
 ('platform', '1.0.8'),
 'types',
 ('bs4', '4.12.2'),
 'os',
 ('json', '2.0.9'),
 'time',
 ('pandas', '2.1.4'),
 'zipfile',
 'io']

In [2]:
def append_text(file, content, encoding='utf-8'):
    """Append ``content`` to the text file ``file`` (created if it is missing).

    Args:
        file: path of the file to append to.
        content: string written at the end of the file, unchanged.
        encoding: text encoding used for writing. Fixed to UTF-8 by default so
            the result does not depend on the platform's locale.
    """
    with open(file, 'a', encoding=encoding) as f:
        f.write(content)
def replace_text(file, content, encoding='utf-8'):
    """Overwrite the text file ``file`` with ``content`` (created if missing).

    Args:
        file: path of the file to (re)write.
        content: string that becomes the complete content of the file.
        encoding: text encoding used for writing. Fixed to UTF-8 by default so
            the result does not depend on the platform's locale.
    """
    with open(file, 'w', encoding=encoding) as f:
        f.write(content)

In [3]:
# Directory that holds every file this notebook reads or writes. The other
# paths are built by plain string concatenation (data_location + 'name'),
# so the trailing slash is required.
data_location = '../data/raw_data/'

In [4]:
# code based on https://andrew-muller.medium.com/scraping-steam-user-reviews-9a43f9e38c92
def get_n_appids(tag_no, n=4000, timeout=30):
    """Collect up to ``n`` distinct app ids from Steam store search pages.

    Result pages 1, 2, ... are requested while ``page * 25 < n`` (the paging
    arithmetic assumes 25 results per page). The whole crawl is repeated until
    one complete pass finds no id that was not already known.

    Args:
        tag_no: value placed verbatim into the ``tags=`` query parameter.
        n: maximum number of ids to return. According to Steam there are
            about ~3600 FPS games, hence the default of 4000.
        timeout: seconds to wait for each HTTP response before giving up,
            so a stalled connection cannot block the crawl forever.

    Returns:
        List of app-id strings in order of discovery, at most ``n`` long.
    """
    url = f'https://store.steampowered.com/search/?category1=998&tags={tag_no}&page='
    appids = []
    seen = set()  # constant-time membership test instead of scanning the list
    added_in_pass = 1  # non-zero so that the first pass always runs
    while added_in_pass != 0:
        page, added_in_pass = 0, 0
        while page * 25 < n:
            page += 1
            response = requests.get(
                url=url + str(page),
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=timeout,
            )
            soup = BeautifulSoup(response.text, 'html.parser')
            for row in soup.find_all(class_='search_result_row'):
                # Rows without an app id are skipped instead of raising KeyError.
                appid = row.get('data-ds-appid')
                if appid is None or appid in seen:
                    continue
                seen.add(appid)
                appids.append(appid)
                added_in_pass += 1
            # Short random pause between requests.
            time.sleep(np.random.uniform() / 2)
    return appids[:n]

In [5]:
game_id_filepath = data_location + 'games_list.txt'
if os.path.isfile(game_id_filepath):
    # A cached id list exists: reuse it instead of crawling the store again.
    with open(game_id_filepath, 'r') as f:
        single_FPS_appids = f.read().splitlines()
else:
    # First run: crawl the store, de-duplicate the ids and cache them, one
    # id per line, so that later runs take the branch above.
    single_FPS_appids = np.unique(get_n_appids(tag_no="1663%2C4182"))
    append_text(file=game_id_filepath, content='\n'.join(single_FPS_appids))
    single_FPS_appids = np.array(single_FPS_appids)
print(len(single_FPS_appids), "unique game IDs have been extracted.")

3676 unique game IDs have been extracted.


In [6]:
def get_one_review(appid, params=None, timeout=30):
    """Request one page from the Steam ``appreviews`` endpoint.

    Args:
        appid: id of the game; converted with ``str`` so that integer ids
            work as well as strings.
        params: query parameters for the request. Defaults to ``{'json': 1}``;
            a fresh dict is created per call rather than sharing one mutable
            default between calls.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body of the response.
    """
    if params is None:
        params = {'json': 1}
    url = 'https://store.steampowered.com/appreviews/'
    response = requests.get(
        url=url + str(appid),
        params=params,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    return response.json()

def get_all_reviews(appid, failed_file_location):
    """Download the reviews of one game page by page.

    A first request reads ``query_summary.total_reviews``; that number bounds
    how many pages of up to 100 reviews are requested afterwards. Paging also
    stops as soon as a page holds fewer than 100 reviews.

    Args:
        appid: id of the game (string or integer).
        failed_file_location: file that receives the id, one per line, when
            no review at all could be collected for the game.

    Returns:
        List with the review dicts of all downloaded pages.
    """
    # Normalise once so every request and the failure log see the same string
    # (previously only the first request converted the id).
    appid = str(appid)
    remaining = int(get_one_review(appid)['query_summary']['total_reviews'])
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        # Largest signed 64-bit integer.
        # NOTE(review): presumably meant to lift any day-range limit - confirm
        # against the Steam API documentation.
        'day_range': 9223372036854775807,
        'review_type': 'all',
        'purchase_type': 'all',
    }
    reviews = []
    cursor = '*'  # start marker; every response supplies the next cursor
    while remaining > 0:
        params['cursor'] = cursor.encode()
        params['num_per_page'] = min(100, remaining)
        remaining -= 100
        page = get_one_review(appid, params)
        cursor = page['cursor']
        batch = page['reviews']
        reviews += batch
        if len(batch) < 100:
            # A short page means there is nothing left to fetch.
            break
        # Short random pause between requests.
        time.sleep(np.random.uniform() / 2)
    if not reviews:
        append_text(file=failed_file_location, content=appid + '\n')
    return reviews

def scrape_reviews(game_IDs_to_scrape, failed_file_location):
    """Scrape each game and append its reviews to the raw review file.

    For every id, one line is appended to ``raw_review_location``: a JSON
    array of the game's reviews. Each review has the fields of its nested
    ``author`` dict merged into the top level and a ``game_id`` field added.
    The id is then appended to ``scraped_games_location`` so an interrupted
    run can be resumed. Both paths are read from module-level variables that
    must be defined before this function is called.

    Args:
        game_IDs_to_scrape: iterable of game ids.
        failed_file_location: passed on to ``get_all_reviews``, which records
            there the ids for which no review was collected.
    """
    for game in tqdm.tqdm(game_IDs_to_scrape, ncols=80):
        appid = str(game)
        reviews_flattened = []
        for review in get_all_reviews(appid, failed_file_location):
            # A review without author data is kept as it is; previously the
            # missing key produced ``dict | None`` and raised TypeError.
            author = review.get('author') or {}
            body = {key: value for key, value in review.items() if key != 'author'}
            reviews_flattened.append({**body, **author, 'game_id': appid})
        append_text(file=raw_review_location, content=json.dumps(reviews_flattened) + '\n')
        append_text(file=scraped_games_location, content=appid + '\n')

In [7]:
# Progress file: one line per game id whose reviews have been written.
scraped_games_location = f'{data_location}games_list_scraped.txt'
# Ids of the games for which no review could be collected.
location_games_to_rescrape = f'{data_location}games_list_to_rescrape.txt'
# Raw scraping result: one JSON array of reviews per line.
raw_review_location = f'{data_location}reviews.txt'

In [8]:
# Load the ids that were already scraped so that a re-run resumes where the
# previous one stopped.
try:
    scraped_games = np.loadtxt(scraped_games_location, dtype=str)
except OSError:
    # No readable progress file yet (e.g. first run): nothing scraped so far.
    # Any other problem, such as a malformed file, is no longer hidden.
    scraped_games = []
# Ids from the full game list that are not recorded as scraped.
game_IDs_to_scrape = np.setdiff1d(single_FPS_appids, scraped_games)

In [9]:
# Scrape only the games that are not yet listed in the progress file.
# scrape_reviews records every finished id there, so this cell can be re-run
# after an interruption without starting over.
scrape_reviews(game_IDs_to_scrape, failed_file_location=location_games_to_rescrape)

0it [00:00, ?it/s]


In [10]:
# The raw scrape is read back from a zipped copy of reviews.txt.
# NOTE(review): presumably the archive was created by hand outside this
# notebook - confirm.
# The archive path gets its own name on purpose: scrape_reviews appends text
# to `raw_review_location`, so rebinding that name to the zip file would make
# a later re-run of the scraping cell write into the archive.
zipped_review_location = data_location + 'reviews.txt.zip'
with zipfile.ZipFile(zipped_review_location) as zf:
    with io.TextIOWrapper(zf.open('reviews.txt'), encoding='utf-8') as f:
        all_reviews_raw = f.readlines()
# Every non-blank line is one JSON array with the reviews of a single game.
all_reviews = [json.loads(line) for line in all_reviews_raw if line.strip()]

In [11]:
# saving the reviews in CSV
# Output column -> key under which the value is stored in a scraped review.
# The order of this mapping is the column order of the CSV file.
review_fields = {
    'recommendation_id': 'recommendationid',
    'game_id': 'game_id',
    'score': 'weighted_vote_score',
    'review': 'review',
    'timestamp_created': 'timestamp_created',
    'timestamp_updated': 'timestamp_updated',
    'votes_up': 'votes_up',
    'voted_up': 'voted_up',
}

# One tuple per review, built in a single pass over the per-game review
# lists. Games without reviews contribute no rows.
review_rows = [
    tuple(review[key] for key in review_fields.values())
    for game_reviews in tqdm.tqdm(all_reviews, ncols=100)
    for review in game_reviews
]

df = pd.DataFrame(review_rows, columns=list(review_fields))
df.to_csv(f"{data_location}FPS_reviews.csv.zip", index=False)

100%|█████████████████████████████████████████████████████████| 6498/6498 [00:03<00:00, 1770.98it/s]
