In [69]:
import requests
import pandas as pd
import os
from bs4 import BeautifulSoup

def get_topics_page():
    """Download the GitHub topics listing page and parse it.

    Returns:
        BeautifulSoup: the parsed HTML of https://github.com/topics.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Must format `topics_url` (the local defined above); the previous
        # `topic_url` was undefined here and turned a failed request into a
        # NameError that hid the real problem.
        raise Exception('Failed to load page {}'.format(topics_url))
    return BeautifulSoup(response.text, 'html.parser')

In [70]:
# Fetch and parse the GitHub topics listing page once; later cells reuse `doc`.
doc = get_topics_page()

In [71]:
def get_topic_titles(doc):
    """Return the text of every topic-title paragraph found in `doc`.

    `doc` only needs a `find_all(name, class_=...)` method; each returned tag's
    `.text` is collected as-is (no stripping).
    """
    title_css = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    return [title_tag.text for title_tag in doc.find_all('p', class_=title_css)]
def get_topic_descs(doc):
    """Return the whitespace-stripped text of every topic-description paragraph in `doc`."""
    desc_css = 'f5 color-text-secondary mb-0 mt-1'
    return [desc_tag.text.strip() for desc_tag in doc.find_all('p', class_=desc_css)]
def get_topic_urls(doc):
    """Return absolute URLs built from the `href` of every topic link in `doc`.

    Each `href` is appended to 'https://github.com' without any normalisation.
    """
    link_tags = doc.find_all('a', class_='d-flex no-underline')
    return ['https://github.com' + link_tag['href'] for link_tag in link_tags]

In [72]:
# Pull titles, descriptions and URLs out of the parsed topics page.
# Only the last expression of a cell is displayed, so the bare `titles`,
# `len(titles)`, `desc`, `len(desc)` and `url` lines below evaluate their
# values and discard them; just `len(url)` is shown.
# The three names are left at module level; a later cell indexes into `url`.
titles = get_topic_titles(doc)
titles
len(titles)
desc = get_topic_descs(doc)
desc
len(desc)
url = get_topic_urls(doc)
url
len(url)

30

In [73]:
def scrape_topics():
    """Fetch the GitHub topics page and return its topics as a DataFrame.

    The titles, descriptions and URLs are parsed from the page downloaded by
    this call, so the function works on a fresh kernel and never returns data
    left over from an earlier cell.

    Returns:
        pandas.DataFrame: columns 'title', 'description' and 'url', one row
        per topic found on the page.

    Raises:
        Exception: if the response status code is anything other than 200.
    """
    topics_url = 'https://github.com/topics'
    response = requests.get(topics_url)
    if response.status_code != 200:
        # Format the local `topics_url`; `topic_url` is not defined in this
        # scope and previously caused a NameError on the failure path.
        raise Exception('Failed to load page {}'.format(topics_url))
    doc = BeautifulSoup(response.text, 'html.parser')
    # Parse the freshly fetched document instead of reading the notebook
    # globals `titles` / `desc` / `url`.
    topics_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_descs(doc),
        'url': get_topic_urls(doc)
    }
    return pd.DataFrame(topics_dict)

In [74]:
# Build the topics table and display it as the cell's output.
dataframe=scrape_topics()
dataframe

Unnamed: 0,title,description,url
0,3D,3D modeling is the process of virtually develo...,https://github.com/topics/3d
1,Ajax,Ajax is a technique for creating interactive w...,https://github.com/topics/ajax
2,Algorithm,Algorithms are self-contained sequences that c...,https://github.com/topics/algorithm
3,Amp,Amp is a non-blocking concurrency framework fo...,https://github.com/topics/amphp
4,Android,Android is an operating system built by Google...,https://github.com/topics/android
5,Angular,Angular is an open source web application plat...,https://github.com/topics/angular
6,Ansible,Ansible is a simple and powerful automation en...,https://github.com/topics/ansible
7,API,An API (Application Programming Interface) is ...,https://github.com/topics/api
8,Arduino,Arduino is an open source hardware and softwar...,https://github.com/topics/arduino
9,ASP.NET,ASP.NET is a web framework for building modern...,https://github.com/topics/aspnet


In [75]:
# Write the topics table to a file named 'topics' (no extension) relative to
# the current working directory.
# NOTE(review): index=None is presumably treated as falsy so the row index is
# not written; index=False is the documented spelling -- confirm.
dataframe.to_csv('topics', index=None)

In [76]:
def get_topic_page(topic_url):
    """Download `topic_url` and return it parsed as a BeautifulSoup document.

    Raises Exception when the response status code is anything but 200.
    """
    page = requests.get(topic_url)
    if page.status_code == 200:
        return BeautifulSoup(page.text, 'html.parser')
    raise Exception('Failed to load page {}'.format(topic_url))

In [77]:
def parse_star_count(stars_tag):
    """Convert a star-count label such as '50.6k', '1,234' or '987' to an int.

    Args:
        stars_tag (str): the counter text. Surrounding whitespace and comma
            thousands separators are ignored; a trailing 'k' or 'm' (any case)
            multiplies the number by 1,000 or 1,000,000.

    Returns:
        int: the star count.

    Raises:
        ValueError: if the text is empty or is not a number.
    """
    text = stars_tag.strip().replace(',', '')
    if not text:
        # Previously an empty string failed with an opaque IndexError on [-1].
        raise ValueError('Empty star count')
    multipliers = {'k': 1000, 'm': 1000000}
    suffix = text[-1].lower()
    if suffix in multipliers:
        # round() rather than int(): a float product can land a hair below the
        # intended integer, and int() would truncate it to one less.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(text)

In [78]:
def get_repo_info(h1_tag, star_tag):
    """Extract one repository's details from its heading and star-counter tags.

    The first <a> inside `h1_tag` supplies the username and the second supplies
    the repository name and link; the stripped text of `star_tag` is converted
    with `parse_star_count`.

    Returns:
        tuple: (username, repo_name, stars, repo_url).
    """
    anchors = h1_tag.find_all('a')
    owner = anchors[0].text.strip()
    repo_anchor = anchors[1]
    name = repo_anchor.text.strip()
    link = 'https://github.com' + repo_anchor['href']
    star_total = parse_star_count(star_tag.text.strip())
    return owner, name, star_total, link


In [79]:
def get_topic_repos(topic_doc):
    """Collect the repositories listed on a parsed topic page into a DataFrame.

    Repository headings and star counters are looked up separately and paired
    by position, so the i-th heading is combined with the i-th star counter.

    Returns:
        pandas.DataFrame: columns 'username', 'repo_name', 'stars', 'repo_url'.
    """
    headings = topic_doc.find_all(
        'h1', class_='f3 color-text-secondary text-normal lh-condensed')
    star_links = topic_doc.find_all('a', class_='social-count float-none')

    # Column order matches the tuple returned by get_repo_info.
    columns = ('username', 'repo_name', 'stars', 'repo_url')
    collected = {column: [] for column in columns}

    for position, heading in enumerate(headings):
        info = get_repo_info(heading, star_links[position])
        for field_no, column in enumerate(columns):
            collected[column].append(info[field_no])

    return pd.DataFrame(collected)

In [62]:
# Try the repo scraper on a single topic: `url[7]` is the eighth topic link.
# NOTE(review): in the recorded topics output row 7 is the API topic, but the
# index depends on the live page order. This cell's execution count (62) is
# also out of sequence with its neighbours, so it came from an earlier run.
topic_url=url[7]
topic_df = get_topic_repos(get_topic_page(topic_url))
topic_df

Unnamed: 0,username,repo_name,stars,repo_url
0,public-apis,public-apis,118000,https://github.com/public-apis/public-apis
1,httpie,httpie,50600,https://github.com/httpie/httpie
2,neovim,neovim,43200,https://github.com/neovim/neovim
3,strapi,strapi,36000,https://github.com/strapi/strapi
4,slatedocs,slate,32400,https://github.com/slatedocs/slate
5,tiangolo,fastapi,30000,https://github.com/tiangolo/fastapi
6,hoppscotch,hoppscotch,28600,https://github.com/hoppscotch/hoppscotch
7,littlecodersh,ItChat,21400,https://github.com/littlecodersh/ItChat
8,encode,django-rest-framework,20800,https://github.com/encode/django-rest-framework
9,YMFE,yapi,20700,https://github.com/YMFE/yapi


In [80]:
# Re-fetch the page for `topic_url` (set in an earlier cell) and write its
# repositories to a file named 'API' (no extension) in the working directory.
topic_df = get_topic_repos(get_topic_page(topic_url))
topic_df.to_csv('API', index=None)

In [81]:
def scrape_topic(topic_url, path):
    """Scrape one topic page into a CSV at `path`, unless that path already exists.

    When `path` exists, a notice is printed and nothing is fetched or written.
    Always returns None.
    """
    if not os.path.exists(path):
        repos = get_topic_repos(get_topic_page(topic_url))
        repos.to_csv(path, index=None)
        return
    print("The file {} already exists. Skipping...".format(path))

In [82]:
def scrape_topics_repos():
    """Scrape the topics list, then the repositories of every topic.

    Creates the 'data' directory if needed and writes one
    'data/<title>.csv' per topic via `scrape_topic`, printing progress
    as it goes.
    """
    print('Scraping list of topics')
    topics_table = scrape_topics()

    os.makedirs('data', exist_ok=True)
    for record in topics_table.to_dict('records'):
        topic_title = record['title']
        print('Scraping top repositories for "{}"'.format(topic_title))
        scrape_topic(record['url'], 'data/{}.csv'.format(topic_title))

In [83]:
scrape_topics_repos()

Scraping list of topics
Scraping top repositories for "3D"
Scraping top repositories for "Ajax"
Scraping top repositories for "Algorithm"
Scraping top repositories for "Amp"
Scraping top repositories for "Android"
Scraping top repositories for "Angular"
Scraping top repositories for "Ansible"
Scraping top repositories for "API"
Scraping top repositories for "Arduino"
Scraping top repositories for "ASP.NET"
Scraping top repositories for "Atom"
Scraping top repositories for "Awesome Lists"
Scraping top repositories for "Amazon Web Services"
Scraping top repositories for "Azure"
Scraping top repositories for "Babel"
Scraping top repositories for "Bash"
Scraping top repositories for "Bitcoin"
Scraping top repositories for "Bootstrap"
Scraping top repositories for "Bot"
Scraping top repositories for "C"
Scraping top repositories for "Chrome"
Scraping top repositories for "Chrome extension"
Scraping top repositories for "Command line interface"
Scraping top repositories for "Clojure"
Scrapin

Exception: Failed to load page https://github.com/topics/code-review