In [1]:
import pandas as pd
import time
from bs4 import BeautifulSoup
import asyncio
from playwright.async_api import async_playwright, Playwright

In [2]:
# Module-level accumulators, one list per scraped feature.
# run() appends one entry per repository to each of these, so re-run this
# cell before scraping again; otherwise rows from the previous run remain.

# Repository identity and overview-page data
repo_url = list()
repo_watches = list()
repo_sponsors = list()

# Issues-page data
repo_open_issues = list()
repo_closed_issues = list()
repo_labels = list()
repo_milestones = list()

# Pull-requests-page data
repo_open_prs = list()
repo_closed_prs = list()

In [3]:
def _child_text(soup, href, child_tag):
    """Return the text of the first ``child_tag`` inside the element with ``href``.

    Returns None when the element or the child tag is not present, so that a
    page lacking it does not abort the whole scrape.
    """
    link = soup.find(href=href)
    if link is None:
        return None
    child = link.find(child_tag)
    return child.text if child is not None else None


def _first_word(soup, href):
    """Return the first whitespace-separated token of the text of the element with ``href``.

    Returns None when the element is not present or its text is empty.
    """
    link = soup.find(href=href)
    if link is None:
        return None
    words = link.text.split()
    return words[0] if words else None


async def _load_soup(page, url):
    """Navigate ``page`` to ``url`` and return the page HTML parsed with BeautifulSoup."""
    await page.goto(url)
    html = await page.content()
    return BeautifulSoup(html, "html.parser")


async def run(playwright, project_urls):
    """Scrape repository statistics for each URL and record them in the module-level lists.

    For every URL three pages are visited (the repository page, ``/issues``
    and ``/pulls``) and one entry is appended to each of ``repo_url``,
    ``repo_watches``, ``repo_sponsors``, ``repo_open_issues``,
    ``repo_closed_issues``, ``repo_labels``, ``repo_milestones``,
    ``repo_open_prs`` and ``repo_closed_prs``.

    Args:
        playwright: The Playwright object yielded by ``async_playwright()``.
        project_urls: Iterable of repository URLs of the form
            ``https://github.com/OWNER/REPO``.

    Notes:
        * A value whose element cannot be found on the page is recorded as
          ``None`` instead of raising ``AttributeError``.
        * All values for a repository are appended together, only after every
          page was scraped, so the lists always have equal lengths even if
          navigation fails part-way through.
        * The page and the browser are closed even when an exception occurs.
    """
    # Local import keeps this cell self-contained.
    from urllib.parse import urlparse

    chromium = playwright.chromium  # Use Chromium browser
    browser = await chromium.launch(headless=True)
    try:
        context = await browser.new_context()

        for url in project_urls:
            # Tolerate a trailing slash so the derived hrefs and sub-page URLs stay well-formed.
            base_url = url.rstrip("/")

            # Get the OWNER/REPO from the URL path instead of a fixed-width slice.
            project = urlparse(base_url).path.strip("/")
            creator = project.split("/")[0]

            page = await context.new_page()
            try:
                # Repository page: number of watches and whether it is sponsored
                soup = await _load_soup(page, base_url)
                print(f"Parsing {url}")

                num_watches = _child_text(soup, f"/{project}/watchers", "strong")
                has_sponsor_link = soup.find(href=f"/sponsors/{creator}") is not None
                sponsored = "Yes" if has_sponsor_link else "No"

                # Issues
                soup = await _load_soup(page, base_url + "/issues")

                open_issues = _first_word(soup, f"/{project}/issues?q=is%3Aopen+is%3Aissue")
                closed_issues = _first_word(soup, f"/{project}/issues?q=is%3Aissue+is%3Aclosed")
                num_labels = _child_text(soup, f"/{project}/labels", "span")
                num_milestones = _child_text(soup, f"/{project}/milestones", "span")

                # Pull Requests
                soup = await _load_soup(page, base_url + "/pulls")

                open_prs = _first_word(soup, f"/{project}/pulls?q=is%3Aopen+is%3Apr")
                closed_prs = _first_word(soup, f"/{project}/pulls?q=is%3Apr+is%3Aclosed")

                # Record the complete row in one go to keep all lists aligned.
                repo_url.append(url)
                repo_watches.append(num_watches)
                repo_sponsors.append(sponsored)
                repo_open_issues.append(open_issues)
                repo_closed_issues.append(closed_issues)
                repo_labels.append(num_labels)
                repo_milestones.append(num_milestones)
                repo_open_prs.append(open_prs)
                repo_closed_prs.append(closed_prs)
            finally:
                # Close current page instance, also on failure
                await page.close()
    finally:
        await browser.close()

In [4]:
async def main(project_list=None):
    """Start Playwright and scrape the given repositories via ``run``.

    Args:
        project_list: Optional list of repository URLs to scrape. When omitted
            (or None), the built-in list of repositories below is used.
    """
    if project_list is None:
        project_list = [
            "https://github.com/twbs/bootstrap",
            "https://github.com/freeCodeCamp/freeCodeCamp",
            "https://github.com/geekyutao/Inpaint-Anything",
            "https://github.com/spring-attic/spring-mvc-showcase",
            "https://github.com/raspberrypi/firmware",
            "https://github.com/negomi/react-burger-menu",
            "https://github.com/yyhsong/iDataV",
            "https://github.com/yipianfengye/android-zxingLibrary",
            "https://github.com/nikic/FastRoute",
            "https://github.com/vercel/platforms",
            "https://github.com/thinkingjimmy/Learning-Prompt",
        ]

    async with async_playwright() as playwright:
        await run(playwright, project_list)

In [5]:
# Run the scraper. A bare top-level ``await`` is used here, which IPython/Jupyter
# cells support; in a plain script this would need asyncio.run(main()).
await main()

Parsing https://github.com/twbs/bootstrap
Parsing https://github.com/freeCodeCamp/freeCodeCamp


In [6]:
# Assemble the scraped lists into one table: one row per repository,
# one column per feature (column order follows the list below).
projects_df = pd.DataFrame(
    dict(
        zip(
            [
                'Project URL',
                'Number of Watches',
                'Sponsored',
                'Open Issues',
                'Closed Issues',
                'Number of Labels',
                'Number of Milestones',
                'Open Pull Requests',
                'Closed Pull Requests',
            ],
            [
                repo_url,
                repo_watches,
                repo_sponsors,
                repo_open_issues,
                repo_closed_issues,
                repo_labels,
                repo_milestones,
                repo_open_prs,
                repo_closed_prs,
            ],
        )
    )
)

In [7]:
# Display the assembled table as the cell output.
projects_df

Unnamed: 0,Project URL,Number of Watches,Sponsored,Open Issues,Closed Issues,Number of Labels,Number of Milestones,Open Pull Requests,Closed Pull Requests
0,https://github.com/twbs/bootstrap,6.8k,Yes,394,21901,60,0,138,14899
1,https://github.com/freeCodeCamp/freeCodeCamp,8.5k,Yes,270,17414,44,4,54,35527
