# Outline:
- Scrape https://github.com/topics
- Get the list of topics. For each topic, get the title, topic URL, and topic description
- For each topic, get the top 25 repositories
- For each repository, get the repo name, username, stars, and repo URL
- For each topic, create a CSV file

In [None]:
#Read the star_count
def parse_star_count(star_str):
    """Convert a star-count label such as ``'1.2k'`` into an integer.

    Args:
        star_str: Text of a star counter, e.g. ``'987'``, ``'1,234'``,
            ``'45.6k'`` or ``'2m'``. Surrounding whitespace and thousands
            separators are ignored; the suffix is case-insensitive.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: If the text is empty or is not a number (optionally
            followed by a ``k`` or ``m`` suffix).
    """
    multipliers = {'k': 1000, 'm': 1000000}
    text = star_str.strip().replace(',', '').lower()
    if not text:
        # Fail with a clear message instead of an IndexError on text[-1].
        raise ValueError('Empty star count')
    multiplier = multipliers.get(text[-1])
    if multiplier is None:
        return int(text)
    # round() rather than int(): a float product can land a hair below the
    # exact value, and truncation would then drop one star.
    return round(float(text[:-1]) * multiplier)

In [None]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

def get_topic_page(topic_url, timeout=30):
    """Download a page and return it parsed with BeautifulSoup.

    Args:
        topic_url: URL of the page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        A ``BeautifulSoup`` document built from the response text with the
        ``'html.parser'`` parser.

    Raises:
        Exception: If the response status code is not 200. ``requests.get``
            may also raise its own exceptions (for example on timeout).
    """
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server and the whole scrape hangs.
    response = requests.get(topic_url, timeout=timeout)
    if response.status_code != 200:
        raise Exception('Failed to load page {}'.format(topic_url))
    return BeautifulSoup(response.text, 'html.parser')

# Extract the username, repo name, star count and repo URL from one repository's tags
def get_repo_info(h1_tag, star_tags):
    """Extract one repository's details from its heading and star tags.

    Args:
        h1_tag: Heading tag; the first two ``<a>`` tags found inside it are
            read as the owner link and the repository link, in that order.
        star_tags: The single tag whose text is this repository's star count.

    Returns:
        A tuple ``(username, repo_name, stars, repo_url)`` where ``username``
        and ``repo_name`` are stripped strings, ``stars`` is the value
        returned by ``parse_star_count`` and ``repo_url`` is the repository
        link's ``href`` prefixed with the GitHub base URL.
    """
    # Defined locally (as get_topic_urls does) so the function does not rely
    # on a module-level global being set beforehand.
    base_url = 'https://github.com'
    a_tags = h1_tag.find_all('a')
    # strip(), not split(): split() yields a list of words, which would put
    # list objects rather than plain strings into the resulting table.
    username = a_tags[0].text.strip()
    repo_name = a_tags[1].text.strip()
    repo_url = base_url + a_tags[1]['href']
    stars = parse_star_count(star_tags.text.strip())
    return username, repo_name, stars, repo_url

def get_topic_repos(topic_doc):
    """Build a DataFrame of the repositories listed in a parsed topic page.

    Args:
        topic_doc: Parsed page exposing ``find_all``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``username``, ``repo_name``,
        ``stars`` and ``repo_url``, one row per repository heading found.
    """
    # Repository headings are <h3> tags carrying this exact class string.
    heading_class = 'f3 color-fg-muted text-normal lh-condensed'
    headings = topic_doc.find_all('h3', {'class': heading_class})
    counters = topic_doc.find_all('span', {'class': 'Counter js-social-count'})

    columns = ('username', 'repo_name', 'stars', 'repo_url')
    collected = {column: [] for column in columns}

    # Headings and star counters are paired by position.
    for position, heading in enumerate(headings):
        details = get_repo_info(heading, counters[position])
        for slot, column in enumerate(columns):
            collected[column].append(details[slot])

    return pd.DataFrame(collected)
    
# Scrape one topic's top repositories and save them to a CSV file
def scrape_topic(topic_url, path):
    """Scrape one topic page and save its repositories as a CSV file.

    Args:
        topic_url: URL of the topic page to scrape.
        path: Destination file path. A ``.csv`` extension is appended only
            when ``path`` does not already end with one.

    Returns:
        None. If the destination file already exists, a message is printed
        and nothing is downloaded or written.
    """
    # Resolve the final file name once so the existence check and the write
    # refer to the same file (otherwise 'x.csv' is checked but 'x.csv.csv'
    # is written, and the skip never triggers).
    csv_path = path if path.endswith('.csv') else path + '.csv'
    if os.path.exists(csv_path):
        print("The file {} already exists.".format(csv_path))
        return
    topic_df = get_topic_repos(get_topic_page(topic_url))
    topic_df.to_csv(csv_path, index=False)
    
    
    

In [None]:
#get the title, description, url and turn it to a dataframe for each page

def get_topic_titles(doc):
    """Return the text of every topic-title ``<p>`` tag found in ``doc``.

    The text is returned as-is (not stripped), in the order ``find_all``
    yields the tags.
    """
    title_class = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    title_tags = doc.find_all('p', {'class': title_class})
    return [title_tag.text for title_tag in title_tags]

def get_topic_desc(doc):
    """Return the stripped text of every topic-description ``<p>`` in ``doc``.

    Descriptions come back in the order ``find_all`` yields the tags.
    """
    paragraphs = doc.find_all('p', {'class': 'f5 color-fg-muted mb-0 mt-1'})
    return [paragraph.text.strip() for paragraph in paragraphs]

def get_topic_urls(doc):
    """Return the absolute URL of every topic link found in ``doc``.

    Each matching ``<a>`` tag's ``href`` is appended to the GitHub base URL;
    results follow the order ``find_all`` yields the tags.
    """
    site_root = 'https://github.com'
    anchors = doc.find_all('a', {'class': 'no-underline flex-grow-0'})
    return [site_root + anchor['href'] for anchor in anchors]
    
def scrape_topics():
    """Scrape the GitHub topics index into a DataFrame.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``description``
        and ``url``, filled from ``get_topic_titles``, ``get_topic_desc``
        and ``get_topic_urls`` respectively.

    Raises:
        Exception: If the topics page does not return status code 200
            (raised by ``get_topic_page``).
    """
    topics_url = 'https://github.com/topics'
    # get_topic_page downloads the page, raises on a non-200 response and
    # returns the parsed document that the extractors below require.
    doc = get_topic_page(topics_url)
    topic_dict = {
        'title': get_topic_titles(doc),
        'description': get_topic_desc(doc),
        'url': get_topic_urls(doc),
    }
    return pd.DataFrame(topic_dict)

In [None]:
#Putting everything together
import os
def scrape_topics_repos():
    """Scrape the topic list, then scrape each topic's repositories.

    Creates the ``data`` directory if needed and calls ``scrape_topic`` once
    per row of the topics table, passing a ``data/<title>.csv`` path.
    """
    print('Scraping list of topics')
    topics = scrape_topics()
    os.makedirs('data', exist_ok=True)
    for _, topic in topics.iterrows():
        title = topic['title']
        print('Scrapping top repositories for "{}"'.format(title))
        scrape_topic(topic['url'], 'data/{}.csv'.format(title))