In [None]:
import requests

# Browser-like request headers (the user agent is a Firefox 101 string).
# They are passed explicitly to the page requests in get_additional_info;
# they are NOT installed on the session, so other session.get calls that
# omit `headers=` go out without them.
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'accept-language': 'en-US,en;q=0.5',
    'connection' : 'keep-alive',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:101.0) Gecko/20100101 Firefox/101.0',
}

# Shared session object used by the request helpers defined below.
session = requests.session()

In [None]:
def get_links():
  """Collect the ``id -> url`` mapping of every product in the paginated listing.

  Pages are requested one at a time, starting at 1, until the page just
  fetched is the last one according to the response's ``totalPages`` field.

  Returns:
    dict mapping each product's ``id`` to its ``url`` value, exactly as the
    endpoint reports them.

  Raises:
    requests.HTTPError: if a page request comes back with an error status.
  """
  base_url = r'https://embed.gog.com/games/ajax/filtered?mediaType=game'
  links = {}
  page = 1

  while True:
    response = session.get(f'{base_url}&page={page}')
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError later.
    response.raise_for_status()
    gog_response = response.json()

    for product in gog_response['products']:
      links[product['id']] = product['url']

    # `>=` rather than `>`: stop right after the final page instead of
    # issuing one more request for a page past the end.
    if page >= gog_response['totalPages']:
      return links

    page += 1

In [None]:
# Build the id -> url mapping with get_links(); the bare expression on the
# last line displays how many entries were collected.
links = get_links()
len(links)

6496

In [None]:
def get_genres(element):
  """Return the genre names found in ``element.text``.

  The text is split into lines and each line is stripped; blank lines and
  lines consisting solely of ``-`` are discarded. Order is preserved.
  """
  stripped_lines = (line.strip() for line in element.text.splitlines())
  return [genre for genre in stripped_lines if genre and genre != '-']

In [None]:
def get_tags(element):
  """Return the tag names found in ``element.text``.

  The text is split into lines; when there is more than one line the second
  one is dropped (it is expected to be the `show {n} more...` toggle shown
  when a game has many tags). Every remaining line is split on commas, and
  each non-empty, stripped piece becomes a tag. Order is preserved.
  """
  lines = element.text.splitlines()
  # Keep line 0 and everything from line 2 onwards, i.e. drop index 1 if present.
  kept_lines = lines[:1] + lines[2:]
  pieces = (piece.strip() for line in kept_lines for piece in line.split(','))
  return [tag for tag in pieces if tag]

In [None]:
from bs4 import BeautifulSoup
import pandas as pd

def get_additional_info(links):
  """Scrape genres and tags from each game's store page.

  Args:
    links: mapping of game id to the page path; each path is appended to
      ``https://www.gog.com`` to form the URL that is fetched.

  Returns:
    pandas.DataFrame indexed by ``id`` with two columns, ``genres`` and
    ``tags``, each cell holding a list of strings. A page that does not
    contain at least two matching detail rows gets empty lists for both
    (previously such a page aborted the whole run with ``ValueError``).
    Empty ``links`` yields an empty frame.
  """
  records = []
  for game_id, url in links.items():
    response = session.get(f'https://www.gog.com{url}', headers=headers)

    soup = BeautifulSoup(response.text, 'lxml')
    rows = soup.find_all('div', class_='details__content table__row-content')

    # Genres and tags are taken positionally from the first two detail rows.
    # Values are computed fresh per game so nothing leaks from the previous
    # iteration when a page lacks those rows.
    if len(rows) >= 2:
      genres = get_genres(rows[0])
      tags = get_tags(rows[1])
    else:
      genres, tags = [], []

    records.append({'id': game_id, 'genres': genres, 'tags': tags})

  # Build the frame once: DataFrame.append was removed in pandas 2.0 and
  # re-copied the whole frame on every iteration.
  return pd.DataFrame(records, columns=['id', 'genres', 'tags']).set_index('id')

In [None]:
from itertools import zip_longest

def grouper(iterable, n, *, incomplete='fill', fillvalue=None):
    "Collect data into non-overlapping fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, fillvalue='x') --> ABC DEF Gxx
    # grouper('ABCDEFG', 3, incomplete='strict') --> ABC DEF ValueError
    # grouper('ABCDEFG', 3, incomplete='ignore') --> ABC DEF
    args = [iter(iterable)] * n
    if incomplete == 'fill':
        return zip_longest(*args, fillvalue=fillvalue)
    if incomplete == 'strict':
        return zip(*args, strict=True)
    if incomplete == 'ignore':
        return zip(*args)
    else:
        raise ValueError('Expected fill, strict, or ignore')

In [None]:
def group_to_string(group):
  """Join the first element of every truthy item in ``group`` with commas.

  Falsy items (such as the ``None`` padding produced by ``grouper``'s default
  fill) are skipped; each remaining item's ``[0]`` is converted with ``str``.
  """
  return ','.join(str(entry[0]) for entry in group if entry)

In [None]:
def get_games_info(links):
  """Download product details for every id in ``links`` in batches.

  The ids (the keys of ``links``) are sent to the products endpoint at most
  ``MAX_IDS`` per request, and the JSON of each response is turned into a
  DataFrame.

  Args:
    links: mapping whose keys are the product ids to look up.

  Returns:
    pandas.DataFrame of all batches combined, indexed by ``id``. Empty
    ``links`` yields an empty frame indexed by ``id``.

  Raises:
    requests.HTTPError: if a batch request comes back with an error status.
  """
  base_url = r'https://api.gog.com/products?expand=downloads,expanded_dlcs,description,screenshots,videos,related_products,changelog'
  MAX_IDS = 50

  frames = []
  for links_group in grouper(links.items(), MAX_IDS):
    ids = group_to_string(links_group)

    response = session.get(f'{base_url}&ids={ids}')
    # Surface HTTP failures directly rather than building a frame from an
    # error payload.
    response.raise_for_status()

    frames.append(pd.DataFrame(response.json()))

  if not frames:
    # pd.concat refuses an empty list; return an empty, correctly indexed frame.
    return pd.DataFrame(columns=['id']).set_index('id')

  # Concatenate once: DataFrame.append was removed in pandas 2.0 and
  # re-copied the accumulated frame on every batch.
  return pd.concat(frames).set_index('id')

In [None]:
# Fetch the product details for every id in `links` (batched API requests).
games_info = get_games_info(links)

In [None]:
# Colab-only: mount Google Drive at /content/gdrive so the results can be
# written to it by the cells that follow.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Persist the API results as JSON on the mounted Drive.
games_info.to_json('/content/gdrive/MyDrive/Графы/games_info.json')

In [None]:
# Scrape genres and tags from each game's page (one HTTP request per entry in `links`).
additional_info = get_additional_info(links)

In [None]:
# Persist the scraped genres/tags as JSON on the mounted Drive.
additional_info.to_json('/content/gdrive/MyDrive/Графы/additional_info.json')