In [5]:
%pip install python-dotenv pandas requests


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [73]:
import pandas as pd
import requests
import time
from dotenv import load_dotenv
import os
import json

load_dotenv() 

True

In [83]:
# Configuration
CSV_FILE = "files/git_repo_filtered_js_commit_date.csv"  # input CSV; the 'name' and 'default_branch' columns are read
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")  # taken from the environment (populated by load_dotenv())
GRAPHQL_URL = "https://api.github.com/graphql"

# Without a token the header below silently becomes "Bearer None".
# Warn right away instead of discovering it through failed requests later.
if not GITHUB_TOKEN:
    print("WARNING: GITHUB_TOKEN is not set; GitHub API requests will fail authentication.")

HEADERS = {
    "Authorization": f"Bearer {GITHUB_TOKEN}",
    "Accept": "application/vnd.github+json"
}

# Repositories parsed from CSV_FILE, as {"owner", "name", "default_branch"} dicts.
repos_from_csv = []

# Classification buckets filled by check_workflows(); entries are "owner/name" strings.
repos_with_CI = set()
invalid_repos = set()
repos_with_no_workflows = set()
repos_with_network_error = set()

# Lower-cased "owner/name" -> default branch, filled by read_filtered_repo_csvlist().
repo_name_to_branch = {}

In [109]:

def read_filtered_repo_csvlist():
    """Load the repositories listed in CSV_FILE into the module-level collections.

    Reads the ``name`` ("owner/repo") and ``default_branch`` columns and
    rebuilds ``repos_from_csv`` (list of owner/name/default_branch dicts) and
    ``repo_name_to_branch`` (lower-cased "owner/repo" -> default branch).

    Rows with a missing or malformed name, or without a default branch, are
    reported and skipped. Both collections are emptied first, so running the
    cell again does not append duplicate entries. Prints the first five
    parsed repositories.
    """
    # Force both columns to text so a numeric-looking branch (e.g. "2.0") is not parsed as a float.
    df = pd.read_csv(CSV_FILE, dtype={'name': str, 'default_branch': str})

    # Clear in place rather than rebinding, so existing references to these objects stay valid.
    repos_from_csv.clear()
    repo_name_to_branch.clear()

    for full_name, branch in zip(df['name'], df['default_branch']):
        # Empty CSV cells are read as NaN (not a str), so check the type before any string operation.
        if not isinstance(full_name, str) or '/' not in full_name:
            print(f"Skipping invalid repo name: {full_name}")
            continue
        if not isinstance(branch, str) or not branch.strip():
            print(f"Skipping repo without default branch: {full_name}")
            continue

        # Split 'name' into owner and repo
        owner, repo_name = (part.strip() for part in full_name.split('/', 1))
        branch = branch.strip()
        repos_from_csv.append({
            "owner": owner,
            "name": repo_name,
            "default_branch": branch
        })
        repo_name_to_branch[f"{owner}/{repo_name}".lower()] = branch

    print(repos_from_csv[:5])

read_filtered_repo_csvlist()

[{'owner': 'bigbluebutton', 'name': 'bigbluebutton', 'default_branch': 'v3.0.x-release'}, {'owner': 'zabinx', 'name': 'duskrpg', 'default_branch': 'master'}, {'owner': 'apache', 'name': 'cordova-android', 'default_branch': 'master'}, {'owner': 'aws-samples', 'name': 'aws-dynamodb-examples', 'default_branch': 'master'}, {'owner': 'dgarijo', 'name': 'widoco', 'default_branch': 'master'}]


In [29]:
# Step 2: Build GraphQL query for multiple repos
def build_query_to_check_workflows(repos, start, end):
    """Build one aliased GraphQL query listing ``.github/workflows`` for a slice of repos.

    Args:
        repos: sequence of dicts with "owner", "name" and "default_branch" keys.
        start: first index of the slice to query.
        end: index one past the last repository to query.

    Returns:
        The query text. Each repository in ``repos[start:end]`` is aliased
        ``repo{i}``, where ``i`` is its position within the slice (not within
        ``repos``).
    """
    query_parts = []
    for i, repo in enumerate(repos[start:end]):
        # json.dumps yields a double-quoted literal with quotes and backslashes escaped,
        # so a value such as a branch name containing '"' cannot break out of the string.
        # For ordinary names the output is identical to plain interpolation.
        owner = json.dumps(str(repo['owner']))
        name = json.dumps(str(repo['name']))
        expression = json.dumps(f"{repo['default_branch']}:.github/workflows")
        query_parts.append(f"""
        repo{i}: repository(owner: {owner}, name: {name}) {{
            workflows: object(expression: {expression}) {{
                ... on Tree {{
                    entries {{
                        name
                        type
                    }}
                }}
            }}
        }}
        """)
    full_query = "query { " + " ".join(query_parts) + " }"
    return full_query

In [18]:
# Step 3: Execute query and parse results
def check_workflows(repos, start, end, timeout=30):
    """Query GitHub for one batch of repositories and classify each of them.

    Sends the query built by ``build_query_to_check_workflows`` for
    ``repos[start:end]`` and records every repository (as "owner/name") in one
    of the module-level sets:

    * ``repos_with_CI``: the workflows tree has at least one entry.
    * ``repos_with_no_workflows``: the repository resolved, but no entries.
    * ``invalid_repos``: the repository alias is missing or null in the result.
    * ``repos_with_network_error``: the request failed, or the response carried
      no usable ``data`` object. The whole batch is recorded for a later retry.

    Args:
        repos: sequence of dicts with "owner", "name" and "default_branch" keys.
        start: first index of the batch.
        end: index one past the last repository of the batch.
        timeout: seconds to wait for the HTTP request before giving up.
    """
    batch = repos[start:end]

    def full_name(repo):
        return f"{repo['owner']}/{repo['name']}"

    def mark_batch_for_retry():
        repos_with_network_error.update(full_name(repo) for repo in batch)

    try:
        query = build_query_to_check_workflows(repos, start, end)
        response = requests.post(GRAPHQL_URL, json={"query": query}, headers=HEADERS, timeout=timeout)
        response.raise_for_status()

        # raise_for_status() only raises for 4xx/5xx; any other non-200 answer is unusable too.
        if response.status_code != 200:
            print(f"Skipping batch {start}:{end} (HTTP {response.status_code})")
            mark_batch_for_retry()
            return

        payload = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Request failed for batch {start}:{end}: {e}")
        mark_batch_for_retry()
        return

    results = payload.get("data") if isinstance(payload, dict) else None
    if not isinstance(results, dict):
        # No "data" object means the query as a whole failed. Do not mistake
        # that for "every repository in the batch is invalid"; retry it instead.
        errors = payload.get("errors") if isinstance(payload, dict) else payload
        print(f"Skipping batch {start}:{end} (no data in response: {errors})")
        mark_batch_for_retry()
        return

    for i, repo in enumerate(batch):
        repo_data = results.get(f"repo{i}")

        if not repo_data:
            invalid_repos.add(full_name(repo))
            continue

        workflows = repo_data.get("workflows")
        if workflows and workflows.get("entries"):
            repos_with_CI.add(full_name(repo))
        else:
            repos_with_no_workflows.add(full_name(repo))

In [20]:
start = 0
batch_size = 100
limit = len(repos_from_csv)

# Start from empty buckets so re-running this cell does not mix in results of an earlier run.
repos_with_CI = set()
invalid_repos = set()
repos_with_no_workflows = set()
repos_with_network_error = set()

for start in range(start, limit, batch_size):
    end = min(start + batch_size, limit)
    check_workflows(repos_from_csv, start, end)
    print(f"Processed {start} to {end}. repos_with_CI:{len(repos_with_CI)} repos_with_network_error:{len(repos_with_network_error)} repos_with_no_workflows:{len(repos_with_no_workflows)} invalid_repos:{len(invalid_repos)}")
    # Pause between consecutive requests; nothing follows the final batch, so skip the wait there.
    if end < limit:
        time.sleep(5)

Processed 0 to 100. repos_with_CI:54 repos_with_network_error:0 repos_with_no_workflows:45 invalid_repos:1
Processed 100 to 200. repos_with_CI:120 repos_with_network_error:0 repos_with_no_workflows:77 invalid_repos:3
Processed 200 to 300. repos_with_CI:152 repos_with_network_error:0 repos_with_no_workflows:144 invalid_repos:4
Processed 300 to 400. repos_with_CI:177 repos_with_network_error:0 repos_with_no_workflows:219 invalid_repos:4
Processed 400 to 500. repos_with_CI:194 repos_with_network_error:0 repos_with_no_workflows:301 invalid_repos:5
Processed 500 to 600. repos_with_CI:228 repos_with_network_error:0 repos_with_no_workflows:366 invalid_repos:6
Processed 600 to 700. repos_with_CI:259 repos_with_network_error:0 repos_with_no_workflows:435 invalid_repos:6
Processed 700 to 800. repos_with_CI:283 repos_with_network_error:0 repos_with_no_workflows:511 invalid_repos:6
Processed 800 to 900. repos_with_CI:310 repos_with_network_error:0 repos_with_no_workflows:583 invalid_repos:7
Proces

In [21]:
# Save each set to a separate file
def save_repo_names_to_file():
    """Write each classification set to its own text file under ``files/``.

    One "owner/name" entry is written per line. Entries are sorted so the
    files are identical across runs (set iteration order is not stable), and
    the ``files`` directory is created when it does not exist yet.
    """
    # Resolved at call time, so the sets' current contents are what gets written.
    outputs = {
        "files/repos_with_CI.txt": repos_with_CI,
        "files/repos_with_no_workflows.txt": repos_with_no_workflows,
        "files/invalid_repos.txt": invalid_repos,
        "files/repos_with_network_error.txt": repos_with_network_error,
    }

    os.makedirs("files", exist_ok=True)
    for path, repo_names in outputs.items():
        with open(path, "w", encoding="utf-8") as f:
            f.writelines(f"{repo}\n" for repo in sorted(repo_names))

In [22]:
save_repo_names_to_file()

In [47]:
# retrying repos with network error
unchecked_repos = []
for error_repo in repos_with_network_error:
    owner, repo_name = (part.strip() for part in error_repo.split('/', 1))
    unchecked_repos.append({
        "owner": owner,
        "name": repo_name,
        # repo_name_to_branch is keyed by the lower-cased "owner/name", so lower-case the lookup too.
        "default_branch": repo_name_to_branch[f"{owner}/{repo_name}".lower()].strip()
    })

batch_size = 100
# Reset the bucket: check_workflows() re-adds whatever fails again.
repos_with_network_error = set()

for start in range(0, len(unchecked_repos), batch_size):
    # Bound the batch by the retry list itself, not by the size of the original CSV list.
    end = min(start + batch_size, len(unchecked_repos))
    check_workflows(unchecked_repos, start, end)

print(f'repos_with_CI:{len(repos_with_CI)}, repos_with_network_error: {len(repos_with_network_error)}')


repos_with_CI:12215, repos_with_network_error: 0


In [51]:
save_repo_names_to_file()
print(f"repos_with_CI:{len(repos_with_CI)} repos_with_network_error:{len(repos_with_network_error)} repos_with_no_workflows:{len(repos_with_no_workflows)} invalid_repos:{len(invalid_repos)}")

repos_with_CI:12215 repos_with_network_error:0 repos_with_no_workflows:11965 invalid_repos:432


In [50]:
# read from saved files
repos_list_with_CI = []

with open("files/repos_with_CI.txt", "r") as file:
    for line in file:
        # Strip the trailing newline so it does not end up in repo names
        # (and from there in folder names and membership checks); skip blank lines.
        repo = line.strip().lower()
        if repo:
            repos_list_with_CI.append(repo)

# Report what was actually loaded from disk.
print(f"repos_with_CI:{len(repos_list_with_CI)}")

repos_with_CI:12215


In [52]:
def build_query_for_download_workflows(start, end, repos_list):
    """Compose the GraphQL query that fetches workflow file contents for a slice of repos.

    Args:
        start: first index of the slice.
        end: index one past the last repository of the slice.
        repos_list: sequence of "owner/name" strings; surrounding whitespace
            around either part is ignored.

    Returns:
        Query text with one ``repo{n}`` alias per entry of
        ``repos_list[start:end]`` (``n`` counts from 0 within the slice), each
        requesting name, type and blob text of the entries under
        ``HEAD:.github/workflows``.

    Raises:
        ValueError: if an entry does not split into exactly two parts on "/".
    """
    def repo_fragment(alias_index, repo_full):
        owner, name = map(str.strip, repo_full.split("/"))
        return f"""
        repo{alias_index}: repository(owner: "{owner}", name: "{name}") {{
            object(expression: "HEAD:.github/workflows") {{
                ... on Tree {{
                    entries {{
                        name
                        type
                        object {{
                            ... on Blob {{
                                text
                            }}
                        }}
                    }}
                }}
            }}
        }}
        """

    fragments = [
        repo_fragment(position, full_name)
        for position, full_name in enumerate(repos_list[start:end])
    ]
    body = "\n".join(fragments)
    return f"query {{\n{body}\n}}"

In [53]:
def download_workflows(repos_list, batch_size=50, sleep_time=3, workflow_files="workflow_files"):
    """Download the workflow files of every repository in ``repos_list``.

    Repositories are queried in batches through the GraphQL API. Every blob
    entry that comes back with text is written to
    ``<workflow_files>/<owner>/<repo>/<entry name>``. Repositories without a
    workflows object, or with no entries, are reported and skipped. An error
    inside a batch is printed and processing continues with the next batch.

    Args:
        repos_list: sequence of "owner/name" strings (surrounding whitespace is ignored).
        batch_size: number of repositories per GraphQL request.
        sleep_time: seconds to wait before each request.
        workflow_files: root directory the files are saved under.
    """
    limit = len(repos_list)

    for start in range(0, limit, batch_size):
        end = min(start + batch_size, limit)
        time.sleep(sleep_time)  # To respect rate limits
        print(f"Processing chunk {start} to {end}")
        try:
            query = build_query_for_download_workflows(start, end, repos_list)
            response = requests.post(
                GRAPHQL_URL,
                json={"query": query},
                headers=HEADERS,
                timeout=30  # avoid hanging forever on a stalled connection
            )
            response.raise_for_status()
            # "data" may be present but null; treat that like an empty result.
            data = response.json().get("data") or {}

            for idx, repo_full in enumerate(repos_list[start:end]):
                # Strip first: a trailing newline would otherwise become part of the folder name.
                repo_full = repo_full.strip()
                owner, repo_name = (part.strip() for part in repo_full.split("/", 1))

                repo_data = data.get(f"repo{idx}") or {}
                tree = repo_data.get("object")
                if tree is None:
                    print(f"Data missing {repo_full}")
                    continue

                entries = tree.get("entries", [])
                if not entries:
                    print(f"No workflows found in {repo_full}")
                    continue

                save_folder = os.path.join(workflow_files, owner, repo_name)
                os.makedirs(save_folder, exist_ok=True)

                for entry in entries:
                    if entry["type"] != "blob":
                        continue
                    # A blob can come back without an object or without text; skip those.
                    content = (entry.get("object") or {}).get("text")
                    if content is None:
                        continue
                    file_path = os.path.join(save_folder, entry["name"])
                    # Explicit encoding: the platform default may not be able to encode the text.
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(content)

        except Exception as e:
            print(f"Error processing chunk {start}-{end}: {e}")

In [54]:
download_workflows(repos_list_with_CI)

Processing chunk 0 to 50
Processing chunk 50 to 100
Processing chunk 100 to 150
Processing chunk 150 to 200
Processing chunk 200 to 250
Processing chunk 250 to 300
Processing chunk 300 to 350
Processing chunk 350 to 400
Processing chunk 400 to 450
Processing chunk 450 to 500
Processing chunk 500 to 550
Processing chunk 550 to 600
Processing chunk 600 to 650
Processing chunk 650 to 700
Processing chunk 700 to 750
Processing chunk 750 to 800
Processing chunk 800 to 850
Processing chunk 850 to 900
Processing chunk 900 to 950
Processing chunk 950 to 1000
Processing chunk 1000 to 1050
Processing chunk 1050 to 1100
Processing chunk 1100 to 1150
Processing chunk 1150 to 1200
Processing chunk 1200 to 1250
Data missing monsternone/tmall-miao

Processing chunk 1250 to 1300
Processing chunk 1300 to 1350
Processing chunk 1350 to 1400
Processing chunk 1400 to 1450
Processing chunk 1450 to 1500
Processing chunk 1500 to 1550
Processing chunk 1550 to 1600
Processing chunk 1600 to 1650
Processing chunk

In [95]:
def total_downloaded_workflows(root_folder):
    """List the repositories that have a folder under ``root_folder``.

    Walks exactly two directory levels (``root_folder/<parent>/<child>``) and
    returns one "<parent>/<child>" string, lower-cased and stripped, for every
    child directory. Plain files on either level are ignored.
    """
    def subdirectories(folder):
        # Yield (name, path) for each directory directly inside `folder`, in os.listdir order.
        for entry in os.listdir(folder):
            entry_path = os.path.join(folder, entry)
            if os.path.isdir(entry_path):
                yield entry, entry_path

    return [
        f"{parent}/{child}".lower().strip()
        for parent, parent_path in subdirectories(root_folder)
        for child, _ in subdirectories(parent_path)
    ]

# Example usage:
root = "workflow_files"
total_downloaded_workflow_list = total_downloaded_workflows(root)

print("\nTotal:", len(total_downloaded_workflow_list))


Total: 12210


In [None]:
def get_unretrieved_repos():
    """Return the entries of ``repos_list_with_CI`` that have no downloaded folder.

    Names are compared stripped and lower-cased on both sides, so a trailing
    newline or a difference in letter case does not make a downloaded
    repository look missing. Returned entries are the unmodified items of
    ``repos_list_with_CI``, in their original order.
    """
    # Build the lookup once; set membership avoids rescanning the whole list per repository.
    downloaded = {name.strip().lower() for name in total_downloaded_workflow_list}
    return [r for r in repos_list_with_CI if r.strip().lower() not in downloaded]


In [70]:
unretrieved_repos_list = get_unretrieved_repos()

In [72]:
# download unretrieved repos: for network issues or other issues
download_workflows(unretrieved_repos_list)

Processing chunk 0 to 5
Data missing monsternone/tmall-miao

Data missing hjyssg/shigureader

Data missing jayofelony/pwnagotchi

Data missing nulldev/spendenr-ai-d

Data missing cpinitiative/ide



In [146]:
# Filled by get_build_data_rest_optimized(): one dict per repository with a workflow run.
build_datas = []
# "owner/name" of repositories for which no run could be retrieved (no runs, or an error).
repos_with_missing_build_datas = []

def get_build_data_rest_optimized(pause=1, timeout=30):
    """Collect data about one workflow run per downloaded repository.

    For every entry of ``total_downloaded_workflow_list`` the REST endpoint
    ``/repos/{owner}/{repo}/actions/runs`` is requested with ``per_page=1`` and
    the first run returned is appended to ``build_datas`` (presumably the most
    recent run; confirm against the API's ordering). Repositories without
    runs, or whose processing raises, are appended to
    ``repos_with_missing_build_datas``.

    Args:
        pause: seconds to wait before each request.
        timeout: seconds to wait for each HTTP response.
    """
    total = len(total_downloaded_workflow_list)
    for index, repo in enumerate(total_downloaded_workflow_list, start=1):

        if index % 20 == 0:
            print(f"Processing {index} out of {total}")

        try:
            owner, repo_name = repo.split('/', 1)
            default_branch = repo_name_to_branch[f"{owner}/{repo_name}".lower()]

            time.sleep(pause)

            # Get workflow runs directly (this gets the latest commit info too)
            runs_url = f"https://api.github.com/repos/{owner}/{repo_name}/actions/runs"
            params = {"per_page": 1} # Ignoring main/master branch, as latest commit could be in any branch

            runs_resp = requests.get(runs_url, headers=HEADERS, params=params, timeout=timeout)
            runs_resp.raise_for_status()
            runs_data = runs_resp.json()

            if "workflow_runs" in runs_data and runs_data["workflow_runs"]:
                run = runs_data["workflow_runs"][0]

                build_datas.append({
                    "repo": f"{owner}/{repo_name}",
                    "branch": run.get("head_branch"),
                    "default_branch": default_branch,
                    "commit": run.get("head_sha"),  # Get SHA from workflow run
                    "workflow_name": run.get("name"),
                    "run_id": run.get("id"),
                    "status": run.get("status"),
                    "conclusion": run.get("conclusion"),
                    "event": run.get("event"),
                    "url": run.get("html_url"),
                    "start_time": run.get("run_started_at"),
                    "end_time": run.get("updated_at"),
                    "runner_environment": run.get("runner_environment", "N/A"),
                })
            else:
                print(f"No workflow runs found: {owner}/{repo_name}, Potentially default branch changed")
                repos_with_missing_build_datas.append(f"{owner}/{repo_name}")

        except Exception as e:
            # Report the loop's own `repo`: owner/repo_name may be unset, or left over
            # from the previous iteration, if the failure happened while splitting the name.
            print(f"Error processing {repo}: {e}")
            repos_with_missing_build_datas.append(repo)

In [147]:
get_build_data_rest_optimized()

# Persist the collected run records as CSV, without the index column.
df_build_data = pd.DataFrame(build_datas)
df_build_data.to_csv("files/build_data.csv", index=False)

# One repository per line for everything that produced no build data.
with open("files/repos_with_missing_build_datas.txt", "w") as f:
    f.writelines(repo + "\n" for repo in repos_with_missing_build_datas)

No workflow runs found: code-collabo/node-mongo-cli, Potentially default branch changed
No workflow runs found: asaitoshiya/brostr, Potentially default branch changed
Processing 20 out of 12210
No workflow runs found: glennfaison/mtn-momo, Potentially default branch changed
No workflow runs found: vkcom/appearance, Potentially default branch changed
Processing 40 out of 12210
No workflow runs found: kubeclipper/console, Potentially default branch changed
Processing 60 out of 12210
No workflow runs found: yarnpkg/yarn, Potentially default branch changed
No workflow runs found: asifsha/react-native-picker-cascader, Potentially default branch changed
Processing 80 out of 12210
No workflow runs found: vzze/volume-control, Potentially default branch changed
No workflow runs found: arival/kilocolor, Potentially default branch changed
Processing 100 out of 12210
No workflow runs found: siyuan-community/siyuan-developer-docs, Potentially default branch changed
No workflow runs found: lizongyin

In [None]:
build_datas = []
repos_with_missing_build_datas = []