In [None]:
pip install python-dotenv pandas

In [1]:
import pandas as pd
import requests
import time
from dotenv import load_dotenv
import os

load_dotenv() 

True

In [None]:
# === Configuration ===
CSV_FILE = "git_repo_filtered_js_commit_date.csv"
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
GRAPHQL_URL = "https://api.github.com/graphql"
HEADERS = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json"
}

repos_with_CI = set()
invalid_repos = set()
repos_with_no_workflows = set()
repos_with_network_error = set()
repo_name_to_branch = {}

In [None]:
# === Step 1: Read CSV and convert to list of repos ===
df = pd.read_csv(CSV_FILE)

# Split 'name' into owner and repo
repos = []
for _, row in df.iterrows():
    if '/' not in row['name']:
        print(f"Skipping invalid repo name: {row['name']}")
        continue
    owner, repo_name = row['name'].split('/', 1)
    repos.append({
        "owner": owner.strip(),
        "name": repo_name.strip(),
        "default_branch": row['default_branch'].strip()
    })
    repo_name_to_branch[f"{owner.strip()}/{repo_name.strip()}"] = row['default_branch'].strip()

print(repos[:5])

[{'owner': 'bigbluebutton', 'name': 'bigbluebutton', 'default_branch': 'v3.0.x-release'}, {'owner': 'zabinx', 'name': 'duskrpg', 'default_branch': 'master'}, {'owner': 'apache', 'name': 'cordova-android', 'default_branch': 'master'}, {'owner': 'aws-samples', 'name': 'aws-dynamodb-examples', 'default_branch': 'master'}, {'owner': 'dgarijo', 'name': 'widoco', 'default_branch': 'master'}]


In [None]:
# === Step 2: Build GraphQL query for multiple repos ===
def build_query(repos, start, end):
    query_parts = []
    for i, repo in enumerate(repos[start:end]):
        query_parts.append(f"""
        repo{i}: repository(owner: "{repo['owner']}", name: "{repo['name']}") {{
            workflows: object(expression: "{repo['default_branch']}:.github/workflows") {{
                ... on Tree {{
                    entries {{
                        name
                        type
                    }}
                }}
            }}
        }}
        """)
    full_query = "query { " + " ".join(query_parts) + " }"
    # print("query:", full_query)
    return full_query

In [None]:
# === Step 3: Execute query and parse results ===
def check_workflows(repos, start, end):
    try:
        query = build_query(repos, start, end)
        response = requests.post(GRAPHQL_URL, json={"query": query}, headers=HEADERS)
        response.raise_for_status()
        data = response.json()
        # print("data:", data)

        # If GitHub responds with an error (403, 502, etc.)
        if response.status_code != 200:
            print(f"Skipping batch {start}:{end} (HTTP {response.status_code})")
            for repo in repos[start:end]:
                repos_with_network_error.add(f"{repo['owner']}/{repo['name']}")
            return

        data = response.json()

        for i, repo in enumerate(repos[start:end]):
            key = f"repo{i}"
            repo_data = data.get("data", {}).get(key, {})

            if not repo_data:
                invalid_repos.add(f"{repo['owner']}/{repo['name']}")
                continue
            workflows = repo_data.get("workflows")

            if workflows and workflows.get("entries"):
                repos_with_CI.add(f"{repo['owner']}/{repo['name']}")
            else:
                repos_with_no_workflows.add(f"{repo['owner']}/{repo['name']}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed for batch {start}:{end}: {e}")
        for repo in repos[start:end]:
            repos_with_network_error.add(f"{repo['owner']}/{repo['name']}")

In [None]:
start = 0
batch_size = 100
limit = 24612

repos_with_CI = set()
invalid_repos = set()
repos_with_no_workflows = set()
repos_with_network_error = set()

for start in range(start, limit, batch_size):
    end = min(start + batch_size, limit)
    check_workflows(repos, start, end)
    print(f"Processed {start} to {end}. repos_with_CI:{len(repos_with_CI)} repos_with_network_error:{len(repos_with_network_error)} repos_with_no_workflows:{len(repos_with_no_workflows)} invalid_repos:{len(invalid_repos)}")
    time.sleep(10)

Processed 0 to 100. repos_with_CI:54 repos_with_network_error:0 repos_with_no_workflows:45 invalid_repos:1
Processed 100 to 200. repos_with_CI:120 repos_with_network_error:0 repos_with_no_workflows:77 invalid_repos:3
Processed 200 to 300. repos_with_CI:152 repos_with_network_error:0 repos_with_no_workflows:144 invalid_repos:4
Processed 300 to 400. repos_with_CI:177 repos_with_network_error:0 repos_with_no_workflows:219 invalid_repos:4
Processed 400 to 500. repos_with_CI:194 repos_with_network_error:0 repos_with_no_workflows:301 invalid_repos:5
Processed 500 to 600. repos_with_CI:228 repos_with_network_error:0 repos_with_no_workflows:366 invalid_repos:6
Processed 600 to 700. repos_with_CI:259 repos_with_network_error:0 repos_with_no_workflows:435 invalid_repos:6
Processed 700 to 800. repos_with_CI:283 repos_with_network_error:0 repos_with_no_workflows:511 invalid_repos:6
Processed 800 to 900. repos_with_CI:310 repos_with_network_error:0 repos_with_no_workflows:583 invalid_repos:7
Proces

In [None]:
# Save each set to a separate file
with open("repos_with_CI.txt", "w") as f:
    for repo in (repos_with_CI):
        f.write(repo + "\n")

with open("repos_with_no_workflows.txt", "w") as f:
    for repo in (repos_with_no_workflows):
        f.write(repo + "\n")

with open("invalid_repos.txt", "w") as f:
    for repo in (invalid_repos):
        f.write(repo + "\n")

with open("repos_with_network_error.txt", "w") as f:
    for repo in (repos_with_network_error):
        f.write(repo + "\n")

In [None]:
# retrying repos with network error
unchecked_repos = []
for error_repo in repos_with_network_error:
    if '/' not in row['name']:
        print(f"Skipping invalid repo name: {row['name']}")
        continue
    owner, repo_name = error_repo.split('/', 1)
    unchecked_repos.append({
        "owner": owner.strip(),
        "name": repo_name.strip(),
        "default_branch": repo_name_to_branch[f"{owner.strip()}/{repo_name.strip()}"].strip()
    })

print(unchecked_repos[:5])

batch_size = 100
repos_with_network_error = set()

for start in range(0, len(repos_with_network_error), batch_size):
    end = min(start + batch_size, limit)
    check_workflows(repos_with_network_error, start, end)
    print(f'repos_with_CI:{len(repos_with_CI)}, repos_with_network_error: {len(repos_with_network_error)}')


[{'owner': 'filtersheroes', 'name': 'polishcookieconsent', 'default_branch': 'master'}, {'owner': 'amitmerchant1990', 'name': 'notepad', 'default_branch': 'master'}, {'owner': 'an0na', 'name': 'r', 'default_branch': 'master'}, {'owner': 'frog23', 'name': 'tomscottmap', 'default_branch': 'master'}, {'owner': 'vasturiano', 'name': 'three-fatline', 'default_branch': 'master'}]


In [None]:
# Save each set to a separate file again!
with open("repos_with_CI.txt", "w") as f:
    for repo in (repos_with_CI):
        f.write(repo + "\n")

with open("repos_with_no_workflows.txt", "w") as f:
    for repo in (repos_with_no_workflows):
        f.write(repo + "\n")

with open("invalid_repos.txt", "w") as f:
    for repo in (invalid_repos):
        f.write(repo + "\n")

with open("repos_with_network_error.txt", "w") as f:
    for repo in (repos_with_network_error):
        f.write(repo + "\n")

In [None]:
print(f"repos_with_CI:{len(repos_with_CI)} repos_with_network_error:{len(repos_with_network_error)} repos_with_no_workflows:{len(repos_with_no_workflows)} invalid_repos:{len(invalid_repos)}")

repos_with_CI:12160 repos_with_network_error:0 repos_with_no_workflows:11921 invalid_repos:431
