In [2]:
import requests
from requests.auth import HTTPBasicAuth 
import pandas as pd
import time
import os
import math
import datetime
import subprocess

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
# Load the project spreadsheet; the first column holds the GitHub project URLs.
projects = pd.read_excel('project_5000Up.xlsx')

# Only the first 10 rows of that column are used by the rest of the notebook.
project_list = projects.iloc[:10, 0].tolist()

# Set view of the same URLs, used for fast membership tests during the search.
project_set = set(project_list)

In [7]:
def _owner_repo_from_url(project):
    """Return ``"owner/repo"`` for a GitHub project URL, or ``None`` if it cannot be parsed.

    Accepts ``http``/``https`` URLs (with or without ``www.``), a trailing
    slash, a trailing ``.git`` and a bare ``owner/repo`` string. Anything after
    the repository name (e.g. ``/tree/main``) is ignored.
    """
    if not isinstance(project, str):
        # e.g. NaN coming from an empty spreadsheet cell
        return None

    path = project.strip()
    marker = "github.com/"
    marker_at = path.find(marker)
    if marker_at != -1:
        path = path[marker_at + len(marker):]
    path = path.strip("/")
    if path.endswith(".git"):
        path = path[:-len(".git")]

    parts = path.split("/")
    if len(parts) < 2 or not parts[0] or not parts[1]:
        return None
    return f"{parts[0]}/{parts[1]}"


def collect_sbom(project_list, dir_path):
    """Download the dependency-graph SBOM of each project into ``dir_path``.

    Args:
        project_list: iterable of GitHub project URLs
            (e.g. ``https://github.com/owner/repo``).
        dir_path: existing directory that receives one
            ``<owner>_<repo>_sbom.json`` file per successful download.

    The GitHub token is read from the ``GITHUB_TOKEN`` environment variable;
    when it is unset no ``Authorization`` header is sent. Failures (unparsable
    URL, network error, non-200 status) are reported on stdout and the loop
    moves on to the next project. Returns ``None``.
    """
    # Headers are identical for every request, so build them once.
    headers = {
        'Accept': 'application/vnd.github+json',
        'X-GitHub-Api-Version': '2022-11-28',
    }
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        headers['Authorization'] = f"Bearer {token}"

    for project in project_list:
        owner_repo = _owner_repo_from_url(project)
        if owner_repo is None:
            print(f"{project}: SBOM download failed")
            continue

        owner, repo = owner_repo.split('/')
        sbom_url = f"https://api.github.com/repos/{owner_repo}/dependency-graph/sbom"
        file_name = f"{owner}_{repo}_sbom.json"
        print(file_name)

        try:
            # The timeout keeps one stalled connection from hanging the whole batch.
            response = requests.get(sbom_url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f"{project}: SBOM download failed ({exc})")
            continue

        if response.status_code == 200:
            with open(os.path.join(dir_path, file_name), "wb") as file:
                file.write(response.content)
            print(f"{project}: SBOM downloaded")
        else:
            print(f"{project}: SBOM download failed")

In [8]:
# Download today's SBOMs into <cwd>/sbom_<YYYY-MM-DD>.
current_dir = os.getcwd()
date = datetime.date.today()
dir_name = f"sbom_{date}"
sbom_dir = os.path.join(current_dir, dir_name)

# exist_ok=True makes re-running the cell on the same day a no-op and avoids
# the check-then-create race; the directory created is the same absolute path
# that is handed to collect_sbom.
os.makedirs(sbom_dir, exist_ok=True)

collect_sbom(project_list, sbom_dir)

freeCodeCamp_freeCodeCamp_sbom.json
https://github.com/freeCodeCamp/freeCodeCamp: SBOM downloaded
EbookFoundation_free-programming-books_sbom.json
https://github.com/EbookFoundation/free-programming-books: SBOM downloaded
sindresorhus_awesome_sbom.json
https://github.com/sindresorhus/awesome: SBOM downloaded
public-apis_public-apis_sbom.json
https://github.com/public-apis/public-apis: SBOM downloaded
jwasham_coding-interview-university_sbom.json
https://github.com/jwasham/coding-interview-university: SBOM downloaded
996icu_996.ICU_sbom.json
https://github.com/996icu/996.ICU: SBOM downloaded
kamranahmedse_developer-roadmap_sbom.json
https://github.com/kamranahmedse/developer-roadmap: SBOM downloaded
donnemartin_system-design-primer_sbom.json
https://github.com/donnemartin/system-design-primer: SBOM downloaded
codecrafters-io_build-your-own-x_sbom.json
https://github.com/codecrafters-io/build-your-own-x: SBOM downloaded
facebook_react_sbom.json
https://github.com/facebook/react: SBOM dow

In [62]:
# Module-level accumulators, one list per repository attribute.
# get_github_repo_info() appends one value to every list for each matched
# repository, so index i across all lists describes the same repository.
# Re-running this cell resets them to empty.

# Identity and links
repo_url, repo_ssh_url, repo_homepage, repo_org = [], [], [], []

# Size and popularity counters
repo_stars, repo_forks, repo_open_issues, repo_size = [], [], [], []

# Timestamps
repo_created_date, repo_last_update, repo_last_push = [], [], []

# Boolean feature flags
repo_wiki, repo_discussions, repo_pages, repo_projects, repo_archived = [], [], [], [], []

# Descriptive metadata (repo_topics holds the number of topics, not their names)
repo_language, repo_license, repo_topics = [], [], []

In [46]:
def get_github_repo_info(search_filter, page_number, project_set,
                         max_retries=5, retry_delay_seconds=60):
    """Fetch one page of GitHub repository-search results and record wanted repos.

    Every result whose ``https://github.com/<full_name>`` URL is in
    ``project_set`` is removed from the set and has one value appended to each
    of the module-level ``repo_*`` lists.

    Args:
        search_filter: already URL-encoded search query (the ``q=`` value).
        page_number: 1-based result page to request (100 results per page).
        project_set: set of project URLs still to be found; mutated in place.
        max_retries: how many times a rate-limited (403/429) or server-side
            (5xx) response is retried before giving up.
        retry_delay_seconds: pause between retries.

    Returns:
        ``None`` when the page contains no results, ``0`` when the request
        ultimately fails, otherwise the ``total_count`` reported by the API
        (callers use it to work out how many pages to request).

    The GitHub token is read from the ``GITHUB_TOKEN`` environment variable;
    when it is unset no ``Authorization`` header is sent. Exceptions raised by
    ``requests`` (connection errors, timeouts) propagate to the caller.
    """
    api_url = (
        "https://api.github.com/search/repositories"
        f"?q={search_filter}&page={page_number}&per_page=100"
    )

    headers = {"Accept": "application/vnd.github.v3+json"}
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        headers["Authorization"] = f"Bearer {token}"

    response = requests.get(api_url, headers=headers, timeout=30)

    # Retry only failures that can clear up on their own, and only a bounded
    # number of times. Other client errors (e.g. 422 for an invalid query or an
    # out-of-range page) will never succeed, so retrying them would loop forever.
    retries = 0
    while response.status_code != 200 and retries < max_retries:
        status = response.status_code
        if status not in (403, 429) and status < 500:
            break
        print(f"Request failed on page {page_number}")
        time.sleep(retry_delay_seconds)
        response = requests.get(api_url, headers=headers, timeout=30)
        retries += 1

    if response.status_code != 200:
        print("Error:", response.status_code)
        print("Response:", response.text)
        return 0

    # Parse the JSON body once.
    payload = response.json()
    repo_list = payload['items']
    num_repos = payload['total_count']

    if not repo_list:
        return None

    for repo_info in repo_list:
        if not project_set:
            # Every wanted project has been found; nothing left to match.
            break

        project_url = f"https://github.com/{repo_info.get('full_name', 'Name not found')}"
        if project_url not in project_set:
            continue
        project_set.remove(project_url)

        # Resolve the nested / nullable fields first so that nothing can raise
        # halfway through the appends and leave the lists with unequal lengths.
        owner_info = repo_info.get("owner") or {}
        license_info = repo_info.get("license")
        license_id = license_info.get("spdx_id", "None") if license_info else "None"
        homepage = repo_info.get("homepage", "No Homepage")
        # A missing or null topic list counts as zero topics.
        topic_count = len(repo_info.get("topics") or [])

        repo_url.append(repo_info.get("html_url", "URL not found"))
        repo_stars.append(repo_info.get("stargazers_count", "Stargazers count not found"))
        repo_wiki.append(repo_info.get("has_wiki", "Wiki not found"))
        repo_open_issues.append(repo_info.get("open_issues_count", "Open issues count not found"))
        repo_forks.append(repo_info.get("forks_count", "Forks count not found"))
        repo_last_update.append(repo_info.get("updated_at", "Last update not found"))
        repo_size.append(repo_info.get("size", "size not found"))
        repo_created_date.append(repo_info.get("created_at", "Created date not found"))
        repo_last_push.append(repo_info.get("pushed_at", "Last push not found"))
        repo_language.append(repo_info.get("language", "Language not found"))
        repo_discussions.append(repo_info.get("has_discussions", "Discussions not found"))
        repo_pages.append(repo_info.get("has_pages", "Pages not found"))
        repo_archived.append(repo_info.get("archived", "Archived not found"))
        repo_projects.append(repo_info.get("has_projects", "Projects not found"))
        repo_topics.append(topic_count)
        repo_ssh_url.append(repo_info.get("ssh_url", "Projects not found"))
        repo_org.append(owner_info.get("type", "No type"))
        repo_license.append(license_id)
        repo_homepage.append(homepage if homepage else "None")

    return num_repos

In [57]:
# Width of the queried stars range, keyed by the upper bound at which it
# changes. The widths shrink as star counts drop so that each query stays
# under the number of results the search API will return.
_DECREMENT_AT = {
    400000: 375000,
    25000: 5000,
    15000: 500,
    5000: 100,
    3000: 10,
    1000: 10,
    680: 5,
    500: 1,
}

_RESULTS_PER_PAGE = 100
# GitHub's search API only serves the first 1,000 results of a query,
# i.e. 10 pages of 100; requesting later pages can never succeed.
_MAX_SEARCH_PAGES = 10
# Below this many stars a single stars value has too many repositories, so
# the query is additionally split by creation year.
_SPLIT_BY_YEAR_BELOW = 179
# Years queried one by one (2024 down to 2016); older repos share one query.
_CREATED_YEARS = range(2024, 2015, -1)


def _collect_all_pages(search_filter, project_set, label):
    """Request every result page of one search query via get_github_repo_info.

    ``label`` is a tuple of values printed before the page number as progress
    output. Returns ``False`` when the first page has no results, ``True``
    otherwise.
    """
    print(*label, 1)
    total = get_github_repo_info(search_filter, 1, project_set)
    if total is None:
        return False

    last_page = min(math.ceil(total / _RESULTS_PER_PAGE), _MAX_SEARCH_PAGES)
    for page_number in range(2, last_page + 1):
        print(*label, page_number)
        if get_github_repo_info(search_filter, page_number, project_set) is None:
            break
    return True


def get_projects(low, high, project_set):
    """Walk the stars range from ``high`` down to ``low`` and collect repo features.

    The range ``[low, high)`` is split into consecutive stars windows; each
    window is searched page by page with get_github_repo_info, which records
    the repositories listed in ``project_set`` and removes them from it.

    Args:
        low: inclusive lower bound of the stars range.
        high: exclusive upper bound of the stars range.
        project_set: set of project URLs still to be found; mutated in place.

    The walk stops early when ``project_set`` becomes empty or when a window's
    first page comes back without results. Returns ``None``.
    """
    decrement = 500  # default window width until a threshold says otherwise

    # `>` rather than `!=`: the loop must end even when the window widths do
    # not land exactly on `low`.
    while high > low:
        if not project_set:
            break

        decrement = _DECREMENT_AT.get(high, decrement)
        split_by_year = high <= _SPLIT_BY_YEAR_BELOW
        if split_by_year:
            decrement = 1

        # Never query below the requested lower bound.
        range_low = max(high - decrement, low)
        range_high = high - 1
        stars_filter = f"stars%3A{range_low}..{range_high}"

        if split_by_year:
            for year in _CREATED_YEARS:
                created_date = f"+created%3A{year}-01-01..{year}-12-31"
                if not _collect_all_pages(stars_filter + created_date, project_set,
                                          (range_low, range_high, year)):
                    # An empty year ends the per-year queries; the catch-all
                    # query below still runs.
                    break

            # One more request for all projects created in 2015 or earlier.
            created_date = "+created%3A<=2015-12-31"
            if not _collect_all_pages(stars_filter + created_date, project_set,
                                      (range_low, range_high, 2015)):
                break
        else:
            if not _collect_all_pages(stars_filter, project_set,
                                      (range_low, range_high)):
                break

        high -= decrement

In [63]:
# Search on a copy: matched URLs are removed from the set passed in, and
# project_set itself should keep the full list of wanted projects.
project_set_copy = set(project_set)
get_projects(low=5000, high=400000, project_set=project_set_copy)

25000 399999 1
25000 399999 2
25000 399999 3
25000 399999 4
25000 399999 5
25000 399999 6
25000 399999 7
25000 399999 8
25000 399999 9


In [64]:
# Assemble the per-attribute lists filled by get_github_repo_info() into one
# table: one row per matched repository, one column per attribute.
projects_df = pd.DataFrame(
    {
        # Identity and links
        "Project URL": repo_url,
        "Clone SSH URL": repo_ssh_url,
        "Organization": repo_org,
        "Homepage": repo_homepage,
        # Timestamps and state
        "Last Update": repo_last_update,
        "Last Push": repo_last_push,
        "Created Date": repo_created_date,
        "Archived": repo_archived,
        # Size and popularity
        "Size": repo_size,
        "Number of Stars": repo_stars,
        "Number of Open Issues": repo_open_issues,
        "Number of forks": repo_forks,
        # Enabled features
        "Has a Wiki": repo_wiki,
        "Has Discussions": repo_discussions,
        "Has Projects": repo_projects,
        "Has Pages": repo_pages,
        # Descriptive metadata
        "License": repo_license,
        "Language": repo_language,
        "Topics": repo_topics,
    }
)

In [65]:
# Show the assembled table as this cell's output.
projects_df

Unnamed: 0,Project URL,Clone SSH URL,Organization,Homepage,Last Update,Last Push,Created Date,Archived,Size,Number of Stars,Number of Open Issues,Number of forks,Has a Wiki,Has Discussions,Has Projects,Has Pages,License,Language,Topics
0,https://github.com/freeCodeCamp/freeCodeCamp,git@github.com:freeCodeCamp/freeCodeCamp.git,Organization,http://contribute.freecodecamp.org/,2024-04-02T22:47:27Z,2024-04-02T19:41:53Z,2014-12-24T17:49:19Z,False,426726,385871,352,35061,False,False,True,False,BSD-3-Clause,TypeScript,16
1,https://github.com/EbookFoundation/free-progra...,git@github.com:EbookFoundation/free-programmin...,Organization,https://ebookfoundation.github.io/free-program...,2024-04-02T22:48:28Z,2024-04-01T19:49:53Z,2013-10-11T06:50:37Z,False,18415,317368,41,59555,False,True,False,True,CC-BY-4.0,,5
2,https://github.com/sindresorhus/awesome,git@github.com:sindresorhus/awesome.git,User,,2024-04-02T22:53:41Z,2024-04-02T03:31:18Z,2014-07-11T13:42:37Z,False,1450,296595,47,26669,False,False,False,True,CC0-1.0,,5
3,https://github.com/public-apis/public-apis,git@github.com:public-apis/public-apis.git,Organization,http://public-apis.org,2024-04-02T22:48:36Z,2024-04-02T22:41:24Z,2016-03-20T23:49:42Z,False,5088,288750,249,31457,False,False,False,False,MIT,Python,13
4,https://github.com/jwasham/coding-interview-un...,git@github.com:jwasham/coding-interview-univer...,User,,2024-04-02T22:46:40Z,2024-04-01T15:14:04Z,2016-06-06T02:34:12Z,False,22092,280901,56,72120,False,False,False,False,CC-BY-SA-4.0,,12
5,https://github.com/kamranahmedse/developer-roa...,git@github.com:kamranahmedse/developer-roadmap...,User,https://roadmap.sh,2024-04-02T22:52:34Z,2024-04-02T21:33:00Z,2017-03-15T13:45:52Z,False,2630566,271310,1129,36618,True,False,False,True,NOASSERTION,TypeScript,18
6,https://github.com/996icu/996.ICU,git@github.com:996icu/996.ICU.git,User,https://996.icu,2024-04-02T21:27:35Z,2024-03-01T07:34:01Z,2019-03-26T07:31:14Z,False,187804,268927,16712,21273,False,False,False,False,NOASSERTION,,0
7,https://github.com/codecrafters-io/build-your-...,git@github.com:codecrafters-io/build-your-own-...,Organization,https://codecrafters.io,2024-04-02T22:48:53Z,2024-04-01T03:40:36Z,2018-05-09T12:03:18Z,False,992,253333,319,24043,False,False,False,False,,,6
8,https://github.com/donnemartin/system-design-p...,git@github.com:donnemartin/system-design-prime...,User,,2024-04-02T22:52:17Z,2024-03-12T03:11:27Z,2017-02-26T16:15:28Z,False,11187,250375,418,42915,True,False,True,False,NOASSERTION,Python,13
9,https://github.com/facebook/react,git@github.com:facebook/react.git,Organization,https://react.dev,2024-04-02T22:24:43Z,2024-04-02T22:50:32Z,2013-05-24T16:15:54Z,False,424781,220903,1589,45135,True,False,True,True,MIT,JavaScript,6


In [21]:
# Append the collected rows to the results workbook, creating it on first use.
# NOTE(review): the file name says "100..177" while the table built above was
# collected with get_projects(5000, 400000, ...) — confirm this is the intended
# target before running.
OUTPUT_XLSX = "project_100..177.xlsx"
OUTPUT_SHEET = "Sheet1"

try:
    with pd.ExcelWriter(
        OUTPUT_XLSX,
        mode="a",
        engine="openpyxl",
        if_sheet_exists="overlay",
    ) as writer:
        existing_sheet = writer.sheets.get(OUTPUT_SHEET)
        if existing_sheet is None:
            # The workbook exists but has no such sheet yet: start it with a
            # header row instead of failing on the sheet lookup.
            projects_df.to_excel(writer, sheet_name=OUTPUT_SHEET, index=False)
        else:
            # Write below the last used row; the header is already in place.
            projects_df.to_excel(
                writer,
                sheet_name=OUTPUT_SHEET,
                startrow=existing_sheet.max_row,
                index=False,
                header=False,
            )
except FileNotFoundError:
    # Append mode needs an existing file; on the first run write a fresh one.
    projects_df.to_excel(OUTPUT_XLSX, index=False)