In [145]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import pandas as pd
import time
from bs4 import BeautifulSoup
import threading
from concurrent.futures import ThreadPoolExecutor

In [157]:
# Module-level accumulators: one list per scraped feature. Entry i of each
# list is meant to describe the same repository, so the lists can later be
# zipped column-wise into a DataFrame.
repo_url, repo_watches, repo_sponsors = [], [], []
repo_open_issues, repo_closed_issues = [], []
repo_labels, repo_milestones = [], []
repo_open_prs, repo_closed_prs = [], []

# Shared lock, presumably intended to guard the lists above when scraping
# from several threads.
lock = threading.Lock()

In [158]:
def watch_sponsors(project_url, project, driver):
    """Load a repository's main page and record its watcher count and sponsor status.

    Args:
        project_url: Full URL of the repository page to load.
        project: The "OWNER/REPO" path used to build the hrefs searched for.
        driver: Selenium WebDriver used to fetch the page.

    Side effects:
        Appends exactly one entry to ``repo_watches`` (the watcher count text,
        or None when the watchers link or its <strong> counter is missing) and
        one entry to ``repo_sponsors`` ("Yes"/"No").
    """
    driver.get(project_url)

    # Wait (up to 10 seconds) until a <body> element is present in the DOM.
    # Note this only checks for the element, not for document.readyState.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'body'))
    )

    # Parse the rendered HTML
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Number of watches: text of the <strong> inside the ".../watchers" link.
    # Look the pieces up one at a time so a missing element yields None
    # instead of an AttributeError that would leave the result lists uneven.
    watchers_link = soup.find(href=f"/{project}/watchers")
    watchers_count = watchers_link.find("strong") if watchers_link is not None else None
    num_watches = watchers_count.text if watchers_count is not None else None

    # Sponsored? The page links to /sponsors/<owner> when it is.
    creator = project.split('/')[0]
    sponsored = "Yes" if soup.find(href=f"/sponsors/{creator}") is not None else "No"

    repo_watches.append(num_watches)
    repo_sponsors.append(sponsored)

In [159]:
def issues(project_url, project, driver):
    """Load a repository's issues page and record issue, label and milestone counts.

    Args:
        project_url: Full URL of the repository; "/issues" is appended to it.
        project: The "OWNER/REPO" path used to build the hrefs searched for.
        driver: Selenium WebDriver used to fetch the page.

    Side effects:
        Appends exactly one entry each to ``repo_open_issues``,
        ``repo_closed_issues``, ``repo_labels`` and ``repo_milestones``.
        A value is None when the corresponding element is not on the page.
    """
    driver.get(project_url + "/issues")
    soup = BeautifulSoup(driver.page_source, "html.parser")

    def leading_token(href):
        # First whitespace-separated token of the link text (e.g. "22" from
        # "22 Open"), or None if the link is missing or has no text.
        link = soup.find(href=href)
        if link is None:
            return None
        tokens = link.text.split()
        return tokens[0] if tokens else None

    def counter_text(href):
        # Text of the <span> counter nested in the link, or None if either
        # the link or the span is missing.
        link = soup.find(href=href)
        badge = link.find("span") if link is not None else None
        return badge.text if badge is not None else None

    open_issues = leading_token(f"/{project}/issues?q=is%3Aopen+is%3Aissue")
    closed_issues = leading_token(f"/{project}/issues?q=is%3Aissue+is%3Aclosed")
    num_labels = counter_text(f"/{project}/labels")
    num_milestones = counter_text(f"/{project}/milestones")

    repo_open_issues.append(open_issues)
    repo_closed_issues.append(closed_issues)
    repo_labels.append(num_labels)
    repo_milestones.append(num_milestones)

In [160]:
def pull_requests(project_url, project, driver):
    """Load a repository's pull-request page and record open/closed PR counts.

    Args:
        project_url: Full URL of the repository; "/pulls" is appended to it.
        project: The "OWNER/REPO" path used to build the hrefs searched for.
        driver: Selenium WebDriver used to fetch the page.

    Side effects:
        Appends exactly one entry each to ``repo_open_prs`` and
        ``repo_closed_prs``. A value is None when the corresponding link is
        not on the page, mirroring how ``issues`` reports missing elements.
    """
    driver.get(project_url + "/pulls")
    soup = BeautifulSoup(driver.page_source, "html.parser")

    def leading_token(href):
        # First whitespace-separated token of the link text (e.g. "136" from
        # "136 Open"), or None if the link is missing or has no text. Without
        # this guard a missing link raised AttributeError and nothing was
        # appended, leaving the PR lists shorter than the other result lists.
        link = soup.find(href=href)
        if link is None:
            return None
        tokens = link.text.split()
        return tokens[0] if tokens else None

    open_prs = leading_token(f"/{project}/pulls?q=is%3Aopen+is%3Apr")
    closed_prs = leading_token(f"/{project}/pulls?q=is%3Apr+is%3Aclosed")

    repo_open_prs.append(open_prs)
    repo_closed_prs.append(closed_prs)

In [161]:
# Potential Scrapes:
# ../issues => Open/closed issues, # of labels, # of milestones
# ../pulls
# ../actions => # of workflow runs
# ../pulse => Active pr and Active issues
# ../network/dependencies
# Sponsored
# Watches
# 
# Ones in Bash Script:
# Commits
# Tags/Releases
# Branches
# Languages
# Contributors

# Headless mode for Chrome.
# Each flag is listed once; the original list added --disable-gpu and
# --disable-extensions twice.
chrome_options = Options()
for chrome_flag in (
    "--disable-extensions",
    "--disable-gpu",
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--disable-renderer-backgrounding",
    "--disable-background-timer-throttling",
    "--disable-backgrounding-occluded-windows",
    "--disable-client-side-phishing-detection",
    "--disable-crash-reporter",
    "--disable-oopr-debug-crash-dump",
    "--no-crash-upload",
    "--disable-low-res-tiling",
    "--log-level=3",
    "--silent",
):
    chrome_options.add_argument(chrome_flag)

def scrape_page(project_url):
    """Scrape one GitHub repository and record its features in the result lists.

    Starts a dedicated Chrome WebDriver, runs ``watch_sponsors``, ``issues``
    and ``pull_requests`` against the repository, and always shuts the driver
    down again, even when a scrape step raises.

    Args:
        project_url: Repository URL, e.g. "https://github.com/OWNER/REPO".

    Side effects:
        Prints the URL and the OWNER/REPO path, and adds exactly one entry to
        each of the nine module-level ``repo_*`` lists. Features that could
        not be scraped because a step raised are recorded as None, and the
        exception is then re-raised to the caller.
    """
    from urllib.parse import urlparse

    print(project_url)

    # Get the OWNER/REPO path from the URL itself rather than slicing a fixed
    # number of characters, so http://, www. and trailing-slash variants work.
    project = urlparse(project_url).path.strip("/")
    print(project)

    columns = (repo_url, repo_watches, repo_sponsors, repo_open_issues,
               repo_closed_issues, repo_labels, repo_milestones,
               repo_open_prs, repo_closed_prs)

    def pad_columns(target_length):
        # Fill any list shorter than target_length with None placeholders.
        for column in columns:
            column.extend([None] * (target_length - len(column)))

    # Set up Web Driver (outside the lock, so browser start-up still overlaps
    # across worker threads).
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # The scrape steps append to shared lists at different moments. When
        # several threads run this function at once, their appends interleave
        # and entry i of one list no longer belongs to the same repository as
        # entry i of another. Holding the lock for the whole row keeps every
        # repository's values at one common index. The cost is that page
        # loads are serialized; real parallelism would need the scrape steps
        # to return values instead of appending to shared lists.
        with lock:
            row = max(len(column) for column in columns)
            pad_columns(row)  # realign leftovers from an earlier partial run
            repo_url.append(project_url)
            try:
                # Get number of watches and sponsors
                watch_sponsors(project_url, project, driver)

                # Issues
                issues(project_url, project, driver)

                # Pull Requests
                pull_requests(project_url, project, driver)
            finally:
                # Keep every list the same length even if a step failed.
                pad_columns(row + 1)
    finally:
        # Clean close the Web Session and window(s)
        driver.quit()

In [162]:
# Load the spreadsheet of projects into a DataFrame. The following cell reads
# its 'Project URL' column.
project_list = pd.read_excel('project_5000Up.xlsx')

In [163]:
# Reduce the loaded table to a plain list of repository URLs.
# `project_list` is rebound from a DataFrame to a list here, so the guard
# keeps this cell safe to re-run: on a second run the name already holds the
# list, and indexing it with a column name would raise TypeError.
if isinstance(project_list, pd.DataFrame):
    project_list = project_list['Project URL'].tolist()
project_list[:10]

['https://github.com/freeCodeCamp/freeCodeCamp',
 'https://github.com/EbookFoundation/free-programming-books',
 'https://github.com/sindresorhus/awesome',
 'https://github.com/public-apis/public-apis',
 'https://github.com/jwasham/coding-interview-university',
 'https://github.com/996icu/996.ICU',
 'https://github.com/kamranahmedse/developer-roadmap',
 'https://github.com/donnemartin/system-design-primer',
 'https://github.com/codecrafters-io/build-your-own-x',
 'https://github.com/facebook/react']

In [164]:
# Scrape the first 10 projects on a pool of worker threads.
# (Sequential alternative for debugging:
#     for url in project_list[:10]: scrape_page(url) )
#
# Each job is submitted individually and its outcome inspected afterwards.
# With a bare `pool.map(...)` whose results are never consumed, an exception
# raised inside a worker is silently discarded.
with ThreadPoolExecutor(max_workers=10) as pool:
    scrape_jobs = {pool.submit(scrape_page, url): url for url in project_list[:10]}
    for job, url in scrape_jobs.items():
        # Future.exception() waits for the job to finish and returns the
        # exception it raised, or None on success.
        failure = job.exception()
        if failure is not None:
            print(f"Scrape failed for {url}: {failure!r}")

https://github.com/freeCodeCamp/freeCodeCamp
freeCodeCamp/freeCodeCamp
https://github.com/EbookFoundation/free-programming-books
EbookFoundation/free-programming-books
https://github.com/sindresorhus/awesome
sindresorhus/awesome
https://github.com/public-apis/public-apis
public-apis/public-apis
https://github.com/jwasham/coding-interview-university
jwasham/coding-interview-university
https://github.com/996icu/996.ICU
996icu/996.ICU
https://github.com/kamranahmedse/developer-roadmap
kamranahmedse/developer-roadmap
https://github.com/donnemartin/system-design-primer
donnemartin/system-design-primer
https://github.com/codecrafters-io/build-your-own-x
codecrafters-io/build-your-own-x
https://github.com/facebook/react
facebook/react


In [165]:
# Sanity check: print the length of every result list, one per line, in
# column order. All nine numbers must match for the DataFrame to be built.
for feature_values in (
    repo_url,
    repo_watches,
    repo_sponsors,
    repo_open_issues,
    repo_closed_issues,
    repo_labels,
    repo_milestones,
    repo_open_prs,
    repo_closed_prs,
):
    print(len(feature_values))

10
10
10
10
10
10
10
10
10


In [166]:
# Assemble the scraped lists column-wise: each list becomes one named column
# and position i across the lists becomes row i.
feature_columns = {
    'Project URL': repo_url,
    'Number of Watches': repo_watches,
    'Sponsored': repo_sponsors,
    'Open Issues': repo_open_issues,
    'Closed Issues': repo_closed_issues,
    'Number of Labels': repo_labels,
    'Number of Milestones': repo_milestones,
    'Open Pull Requests': repo_open_prs,
    'Closed Pull Requests': repo_closed_prs,
}
projects_df = pd.DataFrame(feature_columns)

In [167]:
# Display the assembled table as this cell's output.
projects_df

Unnamed: 0,Project URL,Number of Watches,Sponsored,Open Issues,Closed Issues,Number of Labels,Number of Milestones,Open Pull Requests,Closed Pull Requests
0,https://github.com/freeCodeCamp/freeCodeCamp,4.2k,No,,,14,3,3,1970
1,https://github.com/EbookFoundation/free-progra...,7.6k,Yes,22.0,312.0,8,0,136,239
2,https://github.com/sindresorhus/awesome,9.7k,Yes,177.0,426.0,1,0,273,2655
3,https://github.com/public-apis/public-apis,4.9k,No,24.0,1038.0,73,0,16,9836
4,https://github.com/jwasham/coding-interview-un...,6.6k,No,1170.0,11550.0,67,1,21,1980
5,https://github.com/996icu/996.ICU,6.9k,Yes,731.0,1068.0,17,0,11,933
6,https://github.com/kamranahmedse/developer-roa...,8.5k,Yes,270.0,17414.0,44,4,436,14158
7,https://github.com/donnemartin/system-design-p...,8.6k,Yes,45.0,369.0,12,0,54,35527
8,https://github.com/codecrafters-io/build-your-...,6.6k,No,197.0,80.0,16,0,219,297
9,https://github.com/facebook/react,4.1k,No,11.0,547.0,21,0,241,2866


# Persist the scraped rows to project_HTMLfeatures.xlsx.
# If the workbook already exists, append the rows (without a header) below the
# last used row of "Sheet1". Opening a missing file in append mode raises
# FileNotFoundError, in which case a new workbook is written with the header.
workbook_path = "project_HTMLfeatures.xlsx"
append_settings = dict(mode="a", engine="openpyxl", if_sheet_exists="overlay")
try:
    with pd.ExcelWriter(workbook_path, **append_settings) as writer:
        # max_row is the sheet's last used row (1-based), while startrow is
        # 0-based, so this lands on the first free row.
        first_free_row = writer.sheets["Sheet1"].max_row
        projects_df.to_excel(
            writer,
            sheet_name="Sheet1",
            startrow=first_free_row,
            index=False,
            header=False,
        )
except FileNotFoundError:
    projects_df.to_excel(workbook_path, index=False)