# New CML workflow query

This notebook retrieves the URLs of the CML workflows, following the approach suggested by the Iterative.ai developers.

In [None]:
import calendar
import configparser
import json
import time
import re
from pathlib import Path

from github import Github
from github.PaginatedList import PaginatedList

# Directory where the JSON files with the query results are written.
DATA_FOLDER: Path = Path("../data")
# Seconds to sleep for each search result processed by `get_workflow_data`.
SLEEP_TIME: float = 2.0

Define utility functions

In [None]:
def check_rate_limit(github: Github):
    """Pause until the GitHub rate limit resets when it is nearly exhausted.

    Only the `core` resource of the rate limit is inspected. When 5 or fewer
    requests remain, the function sleeps until the reset time reported by the
    API (plus a 5-second safety margin) and then returns. Otherwise it returns
    immediately.

    Args:
        github (Github): the GitHub object used to connect to the API
    """

    core_rate_limit = github.get_rate_limit().core
    if core_rate_limit.remaining > 5:
        return

    print("Rate limit reached...")
    reset_timestamp = calendar.timegm(core_rate_limit.reset.timetuple())
    now_timestamp = calendar.timegm(time.gmtime())
    # Add 5 seconds to be sure the rate limit has been reset. Clamp at zero:
    # if the reported reset time is already in the past (e.g. because of
    # clock skew), a negative value would make `time.sleep` raise ValueError.
    sleep_time = max(0, reset_timestamp - now_timestamp + 5)
    print(f"Sleeping for {sleep_time} seconds...")
    time.sleep(sleep_time)
    print("Resuming after sleep...")


def load_gh_token(env_file_path: str = "../env.ini") -> str:
    """Read the first GitHub token listed in an INI file.

    The file must contain a `GITHUB` section with a `TOKEN_LIST` option whose
    value is a JSON array of tokens; the first element is returned.

    Args:
        env_file_path (str, optional): path of the INI file.
            Defaults to "../env.ini".

    Raises:
        ValueError: if the file does not exist.

    Returns:
        str: the first token of the list
    """

    ini_path = Path(env_file_path)
    if not ini_path.exists():
        raise ValueError(f"The env file '{env_file_path}' does not exist.")

    parser = configparser.ConfigParser()
    parser.read(ini_path)

    raw_token_list = parser.get("GITHUB", "TOKEN_LIST")
    tokens = json.loads(raw_token_list)
    return tokens[0]

def search_workflows(github: Github, extension: str) -> PaginatedList:
    """Run a GitHub code search for workflow files that set up CML.

    The query looks for the string `iterative/setup-cml` in files located
    under `.github/workflows` that have the given extension.

    Args:
        github (Github): the GitHub object used to connect to the API
        extension (str): file extension to restrict the search to,
            without the leading dot (e.g. `yml`)

    Returns:
        PaginatedList: the result of the code search
    """

    return github.search_code(
        f'iterative/setup-cml path:.github/workflows extension:{extension}'
    )


# Compiled once because `extract_workflow_data` is called for every search result.
# - the repository is restricted to exactly two path segments (`owner/repo`),
#   so a later path segment starting with `blob` cannot be mistaken for the
#   `/blob/` separator;
# - the dot of `.github` is escaped so that it only matches a literal dot.
_WORKFLOW_URL_PATTERN = re.compile(
    r"https://github\.com/([^/]+/[^/]+)/blob/[^/]+/(?:.*/)?\.github/workflows/(.*)"
)


def extract_workflow_data(workflow_html_url: str) -> tuple[str, str]:
    """Extracts the repository full name and the workflow file name from the workflow URL.

    For instance, given the URL:
        `https://github.com/Etheredge-Works/gloves/blob/4d9bb10bdd294bcdd5213e6b8181ad83538cb22a/.github/workflows/training.yaml`

    the function returns:
        - Etheredge-Works/gloves
        - training.yaml

    Args:
        workflow_html_url (str): the URL of the workflow file

    Raises:
        ValueError: if the URL does not look like a GitHub URL of a file
            located in a `.github/workflows` folder.

    Returns:
        tuple[str, str]: the repository full name (`owner/repo`) and the
        workflow file name (everything after `.github/workflows/`)
    """

    match = _WORKFLOW_URL_PATTERN.search(workflow_html_url)
    if match is None:
        raise ValueError(
            f"Could not extract repository URL and workflow file name from URL: {workflow_html_url}"
        )

    return match.group(1), match.group(2)

def get_workflow_data(github: Github, query_result: PaginatedList, sleep_time: float = 1.0) -> dict:
    """Group the workflow files of a code search result by repository.

    Each item's `html_url` is parsed with `extract_workflow_data`, and the
    workflow file name is appended to the list stored under its repository.

    Args:
        github (Github): the GitHub object used to connect to the API
            (currently not used by this function)
        query_result (PaginatedList): the query result to parse
        sleep_time (float, optional): Time to wait before processing each item.
            Defaults to 1, avoiding secondary rate limit errors.

    Returns:
        dict: dictionary with the repository (as returned by
        `extract_workflow_data`) as key and the list of relevant
        workflow files as value.
    """

    workflows_by_repo: dict = {}

    for item in query_result:
        # Throttle the iteration over the paginated result.
        time.sleep(sleep_time)

        repository, workflow_filename = extract_workflow_data(item.html_url)

        # Create the list on first sight of the repository, then add the file.
        workflows_by_repo.setdefault(repository, []).append(workflow_filename)

    return workflows_by_repo

## Perform queries on GitHub.

Since a single query matches more than 1000 files, while the GitHub API returns at most 1000 results per search, we need to perform multiple queries to get all the results.
By restricting the search by file extension, we can be sure to retrieve all relevant files with two queries: one for `.yml` files and one for `.yaml` files.

In [None]:
# Authenticate against the GitHub API with the token read from the env file.
GITHUB_TOKEN = load_gh_token()
github = Github(GITHUB_TOKEN)

# Fetching the authenticated user doubles as a check that the token works.
authenticated_login = github.get_user().login
print(f"Connected to GitHub as `{authenticated_login}`")

### Query 1: .yml files

In [None]:
# Search the `.yml` workflow files that use the CML setup action.
print("\nSEARCHING `.yml` FILES...")
result = search_workflows(github, extension='yml')
print(f'Total number of retrieved files: {result.totalCount}')

# Group the retrieved workflow files by repository.
print("\nRetrieving workflow data...")
yml_workflows = get_workflow_data(github, query_result=result, sleep_time=SLEEP_TIME)
print("Workflow data retrieved.")

# Save the `.yml` results as JSON in the data folder.
print("Writing workflow data to file...")
filename = "yml_CML_workflows.json"
output_path = DATA_FOLDER / filename
with output_path.open("w") as f:
    json.dump(yml_workflows, f, indent=4)
print(f"Data written to file: `{filename}`.\n")

### Query 2: .yaml files

In [None]:
# Search the `.yaml` workflow files that use the CML setup action.
print("\nSEARCHING `.yaml` FILES...")
result = search_workflows(github, 'yaml')
print(f'Total number of retrieved files: {result.totalCount}')

# Group the retrieved workflow files by repository.
print("\nRetrieving workflow data...")
yaml_workflows = get_workflow_data(github, result, sleep_time=SLEEP_TIME)
print("Workflow data retrieved.")

# Save the `.yaml` results as JSON in the data folder.
# NOTE: the data written here must be `yaml_workflows` (the result of this
# query), not the `yml_workflows` computed by the previous query.
print("Writing workflow data to file...")
filename = "yaml_CML_workflows.json"
with open(DATA_FOLDER / filename, "w") as f:
    json.dump(yaml_workflows, f, indent=4)
print(f"Data written to file: `{filename}`.\n")

## Merge results

In [None]:
from copy import deepcopy

# Start from a deep copy of the `.yml` results so that merging does not
# modify `yml_workflows`.
merge = deepcopy(yml_workflows)

# The keys of both dictionaries identify repositories; the values are the
# lists of workflow files found in each repository.
for workflow in yaml_workflows:
    # `setdefault` creates a fresh list for repositories that only have
    # `.yaml` workflows, so `merge` never shares a list with `yaml_workflows`.
    merge.setdefault(workflow, []).extend(yaml_workflows[workflow])

# Save the merged results (not just the `.yml` ones) as JSON in the data folder.
print("Writing workflow data to file...")
filename = "merged_CML_workflows.json"
with open(DATA_FOLDER / filename, "w") as f:
    json.dump(merge, f, indent=4)
print(f"Data written to file: `{filename}`.")