In [10]:
# Output files for the three pipeline stages, in the order they are produced:
#   pypi_data_path        - all the data retrieved from PyPI
#   pypi_github_pkgs_path - only the PyPI packages that link to a GitHub repo
#   github_data_path      - GitHub API data for the packages identified in step 2
pypi_data_path, pypi_github_pkgs_path, github_data_path = (
    'retrieved_data/pypi_data.json',
    'retrieved_data/pypi_github_data.json',
    'retrieved_data/github_data.json',
)

## 1.) Retrieve All PyPI Data

In [2]:
from async_pypi_retrieval import get_all_packages

# Retrieves the PyPI package data and saves it to `pypi_data_path`.
# There will be many 404 errors, but this is expected - watch for any strange errors
get_all_packages(save_to=pypi_data_path)

  0%|          | 0/1 [00:00<?, ?it/s]

Scanning 100 packages
Getting all packages...
404 Client Error: Not Found for url: https://pypi.org/pypi/pymemo/json
404 Client Error: Not Found for url: https://pypi.org/pypi/mercury-scraper/json
404 Client Error: Not Found for url: https://pypi.org/pypi/jfftools/json


100%|██████████| 1/1 [00:11<00:00, 11.09s/it]

There were 3 exceptions out of 100 requests.
Saving data to /data...
Saved data to: retrieved_data/pypi_data.json





## 2.) Find All PyPI Data That Links to a GitHub Repository

In [3]:
import ujson
import time
from urllib.parse import urlparse

In [4]:
# Read the PyPI dump collected in step 1 back in and parse it.
# NOTE: the result is bound to the name `json` (the parsed data, not the stdlib module).
with open(pypi_data_path, 'r', encoding='utf-8') as pypi_file:
    json = ujson.load(pypi_file)

In [6]:
verbose = False
github_packages = []


def _github_repo_link(url):
    """Return the ``https://github.com/<user>/<repo>/`` link for ``url``, or ``None``.

    ``None`` is returned when ``url`` is not a string, cannot be parsed, is not
    hosted on github.com (or one of its subdomains), or does not name both a
    user and a repository.
    """
    if not isinstance(url, str):
        return None
    try:
        parsed = urlparse(url)
        # `hostname` is lower-cased and excludes any port / userinfo
        host = parsed.hostname or ''
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. unbalanced IPv6 brackets)
        return None

    # Compare the real host instead of searching the netloc for a substring,
    # so look-alike hosts such as "notgithub.com" are not treated as GitHub.
    if host != 'github.com' and not host.endswith('.github.com'):
        return None

    # Ignore empty path segments: a user-only URL like "https://github.com/user/"
    # splits into ['', 'user', ''] and must not count as "user + repo".
    segments = [part for part in parsed.path.split('/') if part]
    if len(segments) < 2:
        return None

    # Build link to just the repo if the URL goes any further
    return 'https://github.com/' + segments[0] + '/' + segments[1] + '/'


# `json` is the parsed PyPI data loaded in the previous cell (not the stdlib module)
for package_data in json['data']:
    urls = package_data.get('project_urls')
    if urls is None:
        continue

    # The first project URL that resolves to a GitHub repository wins
    github_link = next(
        (link for link in map(_github_repo_link, urls.values()) if link is not None),
        None,
    )

    if github_link is not None:
        package_data['github_link'] = github_link
        # Save
        github_packages.append(package_data)
    elif verbose:
        # View all links associated with packages NOT identified as having a GitHub link
        # Helpful to ensure we don't miss any
        print(f'Is not github: {urls} from {package_data["name"]}')

# Log and save at completion
print(f'There were  {len(github_packages)}/{len(json["data"])} packages with github links found.')
print('Saving data to /data...')
with open(pypi_github_pkgs_path, 'w', encoding='utf-8') as f:
    ujson.dump({
        "data": github_packages,
        "timestamp": time.time()
    }, f)

print(f'Saved data to: {pypi_github_pkgs_path}')

There were  78/97 packages with github links found.
Saving data to /data...
Saved data to: retrieved_data/pypi_github_data.json


## 3.) Retrieve data from GitHub API

This is the trickiest data to retrieve because GitHub's API is rate limited. To avoid manually re-running the script every hour, the while loop below keeps running: it checks whether requests will be served or rate limited, fetches data when it can, and waits otherwise - until it is stopped.

This means **the looping cell will need to be terminated by hand after a couple of days or so** (it takes that long to go through all the data).

In [7]:
from environs import Env

# Read the GitHub credential from the environment rather than hardcoding it in the notebook.
env = Env()
# NOTE(review): read_env() presumably loads variables from a local .env file - confirm one exists
env.read_env()

# Value of the GITHUB_AUTH variable; handed to the GitHub helper objects created later
github_auth = env("GITHUB_AUTH")

In [15]:
import time
from async_github_retrieval import RetrieveGitHubData
from github_utils import GitHubUtils

# Helper whose within_github_rate_limit() check gates each retrieval pass in the loop further down
github_utils = GitHubUtils(github_auth)
# Retrieval object, constructed with the GitHub output path, the step-2 package list path, and the credential
github_retrieval = RetrieveGitHubData(github_data_path, pypi_github_pkgs_path, github_auth)

In [13]:
# Initialize file with structure to allow us to update a state
# Will only need to run this cell once!
# Running it again once the data map exists raises
# "The data map has already been initialized!" - delete the existing data map
# first if you are sure you want to start over.
github_retrieval.init_data_map()

Exception: The data map has already been initialized! Delete it if you are sure you want to start over.

The cell below **will need to be terminated by hand after a couple of days or so** (it takes that long to go through all the data).

Expect to see errors like "loading", "403", and "rate_limited"

In [14]:
# Number of retrieval passes to run between error resets
PASSES_PER_ERROR_RESET = 20
# How long to back off when GitHub reports we are rate limited
RATE_LIMIT_WAIT_SECONDS = 60 * 5

# Runs forever: this cell has to be interrupted by hand once the data is complete.
while True:
    github_retrieval.clear_error()
    # Count down the passes left before errors are cleared again
    passes_left = PASSES_PER_ERROR_RESET
    while passes_left > 0:
        if not github_utils.within_github_rate_limit():
            # Rate limited: back off, then re-check without using up a pass
            print('Waiting 5 Minutes...')
            time.sleep(RATE_LIMIT_WAIT_SECONDS)
            continue

        print('\nWITHIN RATE LIMIT!')
        passes_left -= 1
        github_retrieval.get_all_github_data()

CLEARING ERRORS...
Saving data to /data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------



WITHIN RATE LIMIT!


100%|██████████| 1/1 [00:00<00:00, 7738.57it/s]

Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 7913.78it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 4228.13it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 6297.75it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 2839.75it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 5801.25it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 6636.56it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 6553.60it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 3407.23it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 1749.08it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 6061.13it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 7194.35it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 1842.84it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 2087.76it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 4848.91it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 5645.09it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 6462.72it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 10280.16it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 7384.34it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 6442.86it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------


CLEARING ERRORS...
Saving data to /data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 7121.06it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------





100%|██████████| 1/1 [00:00<00:00, 7169.75it/s]


WITHIN RATE LIMIT!
Setting up packages to be distributed to workers...
Getting all github data...
There were 0 exceptions out of 78 requests.
Saving data...
Saved data to: retrieved_data/github_data.json

------------- SOME STATS -------------------
There were 65 packages with data
There were 13 packages that throw errors
There are 78 packages total
--------------------------------------------







WEIRD ERROR: 
Waiting 5 Minutes...


KeyboardInterrupt: 