In [None]:
import datetime
import json
import pandas as pd
import os
import re
import requests

In [None]:
# Kudos to https://nodes.guru/ - all project data used in this notebook is fetched from that site.

In [None]:
# --- Local cache layout ------------------------------------------------------
# Directory (relative to the working directory) that holds every downloaded file.
RELATIVE_LOCAL_PATH = os.path.join('data', 'nodes_guru')
# File names inside RELATIVE_LOCAL_PATH.
ALL_PROJECTS_FILE = "nodes-guru-all-projects.json"
NODESGURU_BUILD_ID_VERSION = "nodes-guru.buildid"
FINAL_CSV = "nodes-guru-project-summary.csv"

# --- Download behaviour ------------------------------------------------------
# True: re-download everything even when a cached copy exists.
force_download = False
# True: re-download when the site's buildId differs from the one stored locally.
force_download_on_build_id_change = True

# Make sure the cache directory exists before any cell writes into it.
os.makedirs(RELATIVE_LOCAL_PATH, exist_ok=True)

In [None]:
# Fetch the list of all projects, reusing the cached copy when it is still valid.
#
# The cache is refreshed when any of these holds:
#   * the cached file does not exist,
#   * force_download is True,
#   * force_download_on_build_id_change is True and the buildId found on the
#     nodes.guru homepage differs from the one stored locally.
#
# NOTE: this cell sets force_download_on_build_id_change to False when the
# stored buildId matches the current one; the per-project download cell reads
# that flag afterwards.
all_projects_filepath = os.path.join(RELATIVE_LOCAL_PATH, ALL_PROJECTS_FILE)
build_id_filepath = os.path.join(RELATIVE_LOCAL_PATH, NODESGURU_BUILD_ID_VERSION)
# Without a timeout, requests waits indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 30

# buildId to persist once the download below has succeeded (None = nothing to persist).
build_id_to_store = None
if not force_download and force_download_on_build_id_change:
    # check the current page to get the buildId
    page = requests.get('https://nodes.guru', timeout=REQUEST_TIMEOUT_SECONDS)
    m = re.search('"buildId":"([^"]+)"', page.text)
    if m:
        current_build_id = m.group(1)
    else:
        print("buildID not found ...")
        current_build_id = "HNqu_x6wipeTmVq9jcFp1"
    local_build_id = None
    if os.path.exists(build_id_filepath):
        with open(build_id_filepath) as r:
            # strip() tolerates a trailing newline added by a manual edit
            local_build_id = r.read().strip()
    if local_build_id == current_build_id:
        # downloaded with same buildId
        force_download_on_build_id_change = False
    else:
        build_id_to_store = current_build_id

if not os.path.exists(all_projects_filepath) or force_download or force_download_on_build_id_change:
    r = requests.get('https://nodes.guru/api/search', timeout=REQUEST_TIMEOUT_SECONDS)
    r.raise_for_status()
    # Parse BEFORE opening the file: opening with 'w' creates/truncates it, so a
    # failed parse would otherwise leave an empty file that later runs treat as
    # a valid cache.
    all_projects = r.json()
    with open(all_projects_filepath, 'w') as w:
        json.dump(all_projects, w)

# Record the new buildId only after the download succeeded, so a failed run is
# not mistaken for an up-to-date cache next time.
if build_id_to_store is not None:
    with open(build_id_filepath, 'w') as w:
        w.write(build_id_to_store)

In [None]:
# Load the cached project list from disk into `data`.
with open(all_projects_filepath) as projects_file:
    data = json.load(projects_file)

In [None]:
# Download the detail JSON of every project that is not cached yet (or of all
# projects when a forced refresh is requested).
#
# The detail URL contains the site's buildId. It is looked up on the homepage
# the first time a download is actually needed instead of being hard-coded,
# because a hard-coded id stops working as soon as the site's buildId changes.
REQUEST_TIMEOUT_SECONDS = 30
FALLBACK_BUILD_ID = "HNqu_x6wipeTmVq9jcFp1"  # used only if the homepage lookup fails

download_build_id = None  # resolved lazily: no network call when everything is cached
for project in data:
    project_slug = project['slug']
    path = os.path.join(RELATIVE_LOCAL_PATH, f'{project_slug}.json')
    if os.path.exists(path) and not force_download and not force_download_on_build_id_change:
        continue

    if download_build_id is None:
        homepage = requests.get('https://nodes.guru', timeout=REQUEST_TIMEOUT_SECONDS)
        build_id_match = re.search('"buildId":"([^"]+)"', homepage.text)
        if build_id_match:
            download_build_id = build_id_match.group(1)
        else:
            print("buildID not found ...")
            download_build_id = FALLBACK_BUILD_ID

    project_data = requests.get(
        f'https://nodes.guru/_next/data/{download_build_id}/{project_slug}.json',
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    project_data.raise_for_status()
    # Parse BEFORE opening the file: opening with 'w' creates/truncates it, so a
    # failed parse would otherwise leave an empty file that is never retried
    # (the path exists) and breaks the cell that reads it back.
    project_payload = project_data.json()
    with open(path, 'w') as w:
        json.dump(project_payload, w)

In [None]:
# Build a one-row-per-project summary from the cached project files and save
# it as a CSV.

# Matches a leading "twitter.com/" (optionally with scheme and "www.").
_TWITTER_URL_PREFIX = re.compile(r'^(?:https?://)?(?:www\.)?twitter\.com/', re.IGNORECASE)


def _twitter_handle(url):
    """Return the part of a Twitter profile URL after the ``twitter.com/`` prefix.

    Any query string (everything from the first ``?``) is dropped first. A
    value that does not start with the prefix is returned unchanged apart from
    the query-string removal.

    ``str.lstrip('https://twitter.com/')`` must not be used for this: its
    argument is a *set of characters*, so it also eats the beginning of the
    handle itself (``.../solana`` would become ``lana``).
    """
    return _TWITTER_URL_PREFIX.sub('', url.split('?')[0])


projects_data = []
for project in data:
    project_slug = project['slug']
    path = os.path.join(RELATIVE_LOCAL_PATH, f'{project_slug}.json')
    with open(path, 'r') as r:
        project_data = json.load(r)

    details = project_data['pageProps']['data']
    additional_info = details['additionalInfo']
    social_links = details['socialLinks']

    startDate = additional_info['startDate']
    if startDate is not None:
        # The second ';'-separated field is parsed as an integer Unix timestamp
        # in seconds (presumably the first field is a label - TODO confirm).
        # fromtimestamp() converts using the machine's local timezone.
        startDate = str(datetime.datetime.fromtimestamp(int(startDate.split(';')[1].strip())).date())

    web = social_links['website']
    twitter = social_links['twitter']
    if twitter:
        twitter = _twitter_handle(twitter)
    projects_data.append([project_slug, web, twitter, startDate, additional_info['rewards']])

# generate the pandas DataFrame
df = pd.DataFrame(data=projects_data, columns=['project', 'web', 'twitter', 'rewards_start_date', 'rewards_details']).sort_values('project')
df.to_csv(os.path.join(RELATIVE_LOCAL_PATH, FINAL_CSV), index=False)
df.head()