In [4]:
import json
import requests
import pandas as pd
from pandas import json_normalize
import numpy as np
from bs4 import BeautifulSoup

# Fetch the current game listing (the query string asks for up to 200 rows)
# from Roblox's JSON endpoint and flatten it into a DataFrame.
# The timeout stops a stalled connection from hanging the notebook forever.
response = requests.get(
  "https://www.roblox.com/games/list-json?sortFilter=1&MaxRows=200",
  timeout=30,
)

# Fail loudly on an HTTP error status. Previously a non-200 reply silently
# skipped the assignment below, so the final `dfGames` line either raised a
# confusing NameError or displayed a stale frame left over in the kernel.
response.raise_for_status()

listOfGames = response.json()
dfGames = json_normalize(listOfGames)
dfGames

Unnamed: 0,CreatorID,CreatorName,CreatorUrl,Plays,Price,ProductID,IsOwned,IsVotingEnabled,TotalUpVotes,TotalDownVotes,...,UniverseID,HasErrorOcurred,GameDetailReferralUrl,Url,RetryUrl,Final,Name,PlaceID,PlayerCount,ImageId
0,9213039,Sulley,https://www.roblox.com/groups/9213039,206794392,0,0,False,True,382752,23935,...,2324662457,False,https://www.roblox.com/games/6299805723/UPDATE...,https://t0.rbxcdn.com/5be59735f4268699029c6e7b...,,True,[UPDATE 8] Anime Fighters Simulator,6299805723,121268,7297572866
1,5774246,Easy.gg,https://www.roblox.com/groups/5774246,687418481,0,0,False,True,293101,89071,...,2619619496,False,https://www.roblox.com/games/6872265039/BedWar...,https://t1.rbxcdn.com/8d1e30854fd12367dcbe5a15...,,True,BedWars 👑 [BATTLE PASS!],6872265039,123795,7345414099
2,295182,DreamCraft,https://www.roblox.com/groups/295182,24694730552,0,0,False,True,4782421,927076,...,383310974,False,https://www.roblox.com/games/920587237/Adopt-Me,https://t6.rbxcdn.com/2ac673dd7ec5072c8b86120a...,,True,Adopt Me!,920587237,207804,7335544287
3,3194064,Bizarre Studios®,https://www.roblox.com/groups/3194064,540558583,0,0,False,True,578592,61846,...,1016936714,False,https://www.roblox.com/games/2809202155/NEW-UP...,https://t5.rbxcdn.com/3d95fe7cb86e0c11ad441e3a...,,True,『NEW UPDATE』 Your Bizarre Adventure,2809202155,46223,7293568117
4,4372130,go play eclipsis,https://www.roblox.com/groups/4372130,2134531943,0,0,False,True,922043,65481,...,994732206,False,https://www.roblox.com/games/2753915549/UPDATE...,https://t1.rbxcdn.com/796304f71a0dd825c4807aba...,,True,[UPDATE 14] Blox Fruits,2753915549,92597,6542116584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,326871177,iannel123,https://www.roblox.com/users/326871177/profile/,1419263,0,0,False,True,6103,1047,...,2642182531,False,https://www.roblox.com/games/6914337402/NEW-FI...,https://t5.rbxcdn.com/ae7ccbdedf9477d6a517d8c2...,,True,[ NEW FIGHTING STYLE | 10x EVENT ] Tokyo Rove...,6914337402,1058,7312825321
196,1235888,The MLG Gang,https://www.roblox.com/groups/1235888,146591961,0,0,False,True,284543,26964,...,2492481398,False,https://www.roblox.com/games/6610021055/Noob-A...,https://t3.rbxcdn.com/8cc54c815fe1da2759bd7e05...,,True,Noob Army Tycoon,6610021055,3126,7252253335
197,5929470,Playful Club,https://www.roblox.com/groups/5929470,32137513,0,0,False,True,95737,6043,...,2345362906,False,https://www.roblox.com/games/6339160453/Update...,https://t1.rbxcdn.com/4ff41d952be96e49970b763a...,,True,[Update4]Naruto War Tycoon,6339160453,3417,7093494011
198,3461453,Nosniy Games,https://www.roblox.com/groups/3461453,56142746,0,0,False,True,105311,28769,...,1424449565,False,https://www.roblox.com/games/4468711919/Super-...,https://t2.rbxcdn.com/7bfe0c7f490b26cb0263970a...,,True,Super Golf!,4468711919,1417,7027357233


This is a great start, but looking through our columns, we notice we're missing Genre and Description. The API endpoint doesn't seem to get this for us, so we'll need to scrape each game's webpage to find this information. Luckily, the API endpoint gives us the URL for each of the games' detail pages, so we can just use that.

In [None]:
# Scrape every game's detail page for the two fields the list endpoint lacks.
# `genre` and `desc` are filled in the same order as dfGames' rows so they can
# be attached as columns afterwards; a page that lacks a field contributes
# None instead of aborting the whole (slow) loop.

GENRE_STAT_INDEX = 6     # which "game-stat" <li> is read for the genre -- NOTE(review): position-based, confirm against the live page
GENRE_HTML_OFFSET = 150  # character offset into that <li>'s HTML where the genre slice starts -- NOTE(review): brittle, confirm
REQUEST_TIMEOUT_S = 30   # per-request timeout so one stalled page cannot hang the loop

genre = []
desc = []

# A single Session reuses the connection across the per-game requests.
with requests.Session() as session:
  for game in dfGames["GameDetailReferralUrl"]:
    gameResponse = session.get(game, timeout=REQUEST_TIMEOUT_S)
    soup = BeautifulSoup(gameResponse.content, "html.parser")

    # The attribute filter is a dict; the original passed a set literal
    # ({"class", "game-stat"}) where a dict was intended.
    gameGenres = soup.find_all("li", {"class": "game-stat"})
    if len(gameGenres) > GENRE_STAT_INDEX:
      listItemString = str(gameGenres[GENRE_STAT_INDEX])
      tail = listItemString[GENRE_HTML_OFFSET:]
      end = tail.find('<li')
      # Same result as the original slice: when '<li' is not found, find()
      # returns -1 and the slice is empty.
      # NOTE(review): confirm an empty genre is the intended outcome there.
      genreSliced = tail[:end] if end >= 0 else ""
    else:
      genreSliced = None  # page has fewer stat entries than expected
    genre.append(genreSliced)

    # Take the element's text directly. The original called
    # str(tag).strip('<pre class=...>'), but str.strip() removes any of the
    # listed *characters* from both ends, so it ate leading/trailing letters
    # of the description and left a dangling "</" from the closing tag.
    descriptionTag = soup.find("pre", {"class": "text game-description linkify"})
    description = descriptionTag.get_text() if descriptionTag is not None else None
    desc.append(description)

dfGames['Genre'] = genre
dfGames['Description'] = desc
dfGames.head(3)

## Data Cleaning

Looking at our dataframe, we notice there are a lot of columns with information we probably don't need. The columns we'll be removing are:


* CreatorID
* IsOwned
* IsVotingEnabled (all games have this as True)
* HasErrorOcurred
* Url (this Url holds the game's Icon image)
* RetryUrl
* Final
* ImageId

In [None]:
# Columns listed in the markdown above as not needed for the analysis.
COLUMNS_TO_DROP = ['CreatorID', 'IsOwned', 'IsVotingEnabled', 'HasErrorOcurred', 'Url', 'RetryUrl', 'Final', 'ImageId']

# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone, and the previous in-place drop raised KeyError in that case.
dfGames = dfGames.drop(columns=COLUMNS_TO_DROP, errors="ignore")
dfGames

## Scraping rolimons

The rolimons website is a bit more complicated to scrape than Roblox. Since the data we want to grab is hidden behind some mouse presses and mouse hovers, we'll use Selenium to simulate these events.

In [None]:
!pip install selenium
!apt-get update
!apt install chromium-chromedriver

In [None]:
# Start this stage from the game table stored on disk.
# NOTE(review): nothing visible in this notebook writes games.csv -- confirm
# which step produces it, otherwise a fresh "Run All" fails here.
dfGames = pd.read_csv("games.csv")

import sys
# NOTE(review): this adds the chromedriver location to Python's *module*
# search path, not to the executable PATH -- confirm it is actually needed.
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains

# Chrome flags needed for Selenium to be usable in Google Colab.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(chrome_flag)

In [None]:
from datetime import datetime

def getDays(s):
  """Convert a rolimons date string into a datetime for easy arithmetic.

  Everything up to and including the first space is discarded (the leading
  token is presumably a weekday name -- confirm against the tooltip text),
  and the remainder is parsed as '<Mon> <day> <year>, <hour>:<minute> <AM/PM>'.

  Raises IndexError if `s` contains no space, and ValueError if the
  remainder does not match the expected format.
  """
  date_format = '%b %d %Y, %I:%M %p'
  pieces = s.split(' ', 1)
  without_first_token = pieces[1]
  return datetime.strptime(without_first_token, date_format)

In [None]:
import os.path
from os import path

if not path.exists("games_playercount.csv"):      # a finished games_playercount.csv means the slow rolimons scrape can be skipped
  # Locators go through find_element(By.X, ...): the find_element_by_* helpers
  # were removed in Selenium 4.3, while the By form works on 3.x and 4.x alike.
  from selenium.webdriver.common.by import By

  # Progress is checkpointed to a separate file. Writing the checkpoint to
  # games_playercount.csv itself would make an interrupted run look finished
  # to the guard above, and the partial data would never be completed.
  PARTIAL_CSV = "games_playercount.partial.csv"

  # One column per day, named '0'..'90', initially empty.
  for i in range(91):
    dfGames.loc[:,str(i)] = np.nan

  # Driver is located via its default lookup; newer Selenium releases no
  # longer accept the executable name as a positional argument.
  wd = webdriver.Chrome(options=options)
  try:
    for index, row in dfGames.iterrows():
      print("Now scraping rolimons for " + str(index) + ": " + row['Name'])

      place_id = str(row['PlaceID'])
      wd.get("https://www.rolimons.com/game/" + place_id)

      # Find Daily button to switch to Daily tab on rolimons
      dailyButton = wd.find_element(By.LINK_TEXT, 'Daily')

      a = ActionChains(wd)
      a.move_to_element(dailyButton).perform()
      a.click().perform()

      # Find player count chart
      frame = wd.find_element(By.ID, 'players_chart_container_daily')

      # Hovering over the chart to get the tooltip to appear
      a.move_to_element_with_offset(frame, 200, 100).perform()
      a.click().perform()
      a.reset_actions()

      # Locating tooltip and 3m buttons (picked by position among the chart's <g> tags)
      gtags = frame.find_elements(By.TAG_NAME, 'g')
      threeMonthButton = gtags[-4]
      tooltip = gtags[-1]

      # Click on the 3m button. If the game doesn't have 3 month data, we'll stick to using the default All (which will be < 3 months of data)
      threeMonthButton.click()

      lastDay = ""
      firstDay = 0

      # Poll several locations on the chart, moving the cursor 50 px to the
      # left each time (x = 353, 303, ..., 53), and read the day and player count
      for x in range(353, 20, -50):
        a.move_to_element_with_offset(frame, x, 100).perform()
        a.reset_actions()

        tspans = tooltip.find_elements(By.TAG_NAME, 'tspan')
        if not tspans: # No tooltip text at this position: skip it instead of crashing on tspans[0]
          continue

        # Locating which day this is and the player count on that day
        day = tspans[0].get_attribute('innerHTML')
        count = tspans[-1].get_attribute('innerHTML')

        if day != lastDay: # Do this to avoid polling the same day more than once
          # getDays turns the rolimons' date format to a python datetime
          thisDay = getDays(day)

          if lastDay == "": # Set the first day
            firstDay = thisDay

          lastDay = day

          delta = firstDay - thisDay # Get difference between first and current day

          # The first polled day lands in column '90'; earlier days count down from there
          dfGames.loc[index,str(90 - delta.days)] = int(count.replace(',', '')) # Cleaning commas from player count values

      # Checkpoint after every game so an interrupted run leaves its progress on disk
      dfGames.to_csv(path_or_buf=PARTIAL_CSV, index=False)
  finally:
    # Always shut the browser down, even if a page fails mid-scrape
    wd.quit()

  # Reached only when every game was scraped: publish the finished file
  dfGames.to_csv(path_or_buf="games_playercount.csv", index=False)
  if path.exists(PARTIAL_CSV):
    os.remove(PARTIAL_CSV)